Example #1
def remove_and_correct_outliers(data):
    ## assumes the data follows a normal distribution
    b_constant = 1.4826  ##constant used for normal distribution
    factor = 10 #3 ##factor to multiply for the range
    count = 0
    for i in range(0, len(data[0].values)):  ## iterate through all features, in our case 6125
        d_s, d_ns, _, _ = utils.get_utterance_values_of_ith_utterance(data, i)  ##get all feature values
        d = d_s + d_ns ## join them together, since the function returns separate arrays for stress and non-stress
        f_vals = np.array(d, dtype=float) ##transform list into np array
        median = np.median(f_vals) ##get the median
        diff = (f_vals - median)**2 ## subtract the median from every element and square to make all values positive
        diff = np.sqrt(diff) ## undo the squaring, leaving absolute deviations
        med_abs_deviation = np.median(diff) ## get the median absolute deviation (MAD)
        threshold = med_abs_deviation * b_constant ## range of values to be accepted
        max_range = median + threshold * factor
        min_range = median - threshold * factor
        for j in range(0, len(f_vals)):  ## mark values that are outside the boundaries as outliers
            if f_vals[j] < min_range or f_vals[j] > max_range:
                count += 1
                f_vals[j] = np.nan
        imp = Imputer(missing_values=np.nan, strategy='mean', axis=1)
        f_vals = imp.fit_transform(f_vals)[0]
        for j in range(0, len(f_vals)):
            data[j].values[i] = round(f_vals[j],6)
    print "Detected ", count, " outliers"
    return data
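For reference, the MAD-based thresholding used above can be reproduced on a plain NumPy array. A minimal sketch; the toy values and the factor of 3 are illustrative, not taken from the project above:

import numpy as np

vals = np.array([9.8, 10.1, 10.0, 9.9, 55.0])  # one obvious outlier
median = np.median(vals)
mad = np.median(np.abs(vals - median))         # median absolute deviation
threshold = mad * 1.4826 * 3                   # b_constant for normal data, factor of 3
outliers = np.abs(vals - median) > threshold   # True where a value is an outlier
vals[outliers] = np.nan                        # mark outliers for later imputation
print(outliers)  # [False False False False  True]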
Example #2
def get_modelKmeans():
    # Connect to a pre-existing cluster
    # connect to localhost:54321

    # Log.info("Importing benign.csv data...\n")
    benign_h2o = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/benign.csv"))
    # benign_h2o.summary()

    benign_sci = np.genfromtxt(pyunit_utils.locate("smalldata/logreg/benign.csv"), delimiter=",")
    # Impute missing values with column mean
    imp = Imputer(missing_values="NaN", strategy="mean", axis=0)
    benign_sci = imp.fit_transform(benign_sci)

    for i in range(2, 7):
        # Log.info("H2O K-Means")
        km_h2o = H2OKMeansEstimator(k=i)
        km_h2o.train(x=range(benign_h2o.ncol), training_frame=benign_h2o)
        km_h2o.show()
        model = h2o.get_model(km_h2o._id)
        model.show()

        km_sci = KMeans(n_clusters=i, init="k-means++", n_init=1)
        km_sci.fit(benign_sci)
        print "sckit centers"
        print km_sci.cluster_centers_
Example #3
def get_features(frame):
    '''
    Transforms and scales the input data and returns a numpy array that
    is suitable for use with scikit-learn.

    Note that in unsupervised learning there are no labels.
    '''

    # Replace missing values with 0.0
    # or we can use scikit-learn to calculate missing values below
    #frame[frame.isnull()] = 0.0

    # Convert values to floats
    arr = np.array(frame, dtype=np.float)

    # Impute missing values from the mean of their entire column
    from sklearn.preprocessing import Imputer
    imputer = Imputer(strategy='mean')
    arr = imputer.fit_transform(arr)
    
    # Normalize the entire data set to mean=0.0 and variance=1.0
    from sklearn.preprocessing import scale
    arr = scale(arr)

    return arr
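A minimal usage sketch for get_features; the toy frame is illustrative and assumes an older scikit-learn release that still ships Imputer:

import numpy as np
import pandas as pd

frame = pd.DataFrame({'a': [1.0, 2.0, np.nan], 'b': [10.0, np.nan, 30.0]})
arr = get_features(frame)  # NaNs imputed with column means, then standardized
print(arr.shape)           # (3, 2)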
def data_organizer( instances, outcomes ):
   """
   Operations to organize data as desired
   """
   
   # Remove instances without GPA data
   new_instances = []
   new_outcomes = []
   for instance,outcome in zip(instances,outcomes):
      u1,u2,gpa = outcome
      if not math.isnan( gpa ):
         new_instances.append( [value for value in instance] )
         new_outcomes.append( [value for value in outcome] )
         
   instances = new_instances
   outcomes = new_outcomes
  
  
   # Fill in NaN values with median
   instance_list = []
   for idx,instance in enumerate(instances):
      instance_list.append( [ value for value in instance ] ) 
   bandaid = Imputer( strategy='median' )
   instances = bandaid.fit_transform( instance_list )
   

   return instances, outcomes
Example #5
def run_whole_video(exp_folder, lims_ID):
    #initializes video pointer for video of interest based on lims ID
    file_string = get_file_string(exp_folder, lims_ID)
    video_pointer = cv2.VideoCapture(file_string)

    # import wheel data
    wheel = joblib.load('dxds2.pkl')
    first_non_nan = next(x for x in wheel if not isnan(x))
    first_index = np.where(wheel == first_non_nan)[0]
    k = first_index[0]
    imp = Imputer(missing_values='NaN', strategy='mean')
    wheel = imp.fit_transform(wheel)
    wheel = preprocessing.MinMaxScaler((-1, 1)).fit(wheel).transform(wheel)

    # self.video_pointer.set(1, 41000)
    ret, frame = video_pointer.read()

    # crops and converts frame into desired format
    frame = cv2.cvtColor(frame[160:400, 100:640], cv2.COLOR_BGR2GRAY)

    prvs = frame
    nex = frame

    # initialize vectors to keep track of data
    count = 0
    mod = 0
    opticals = []
    angles = []
    frames = []

    # length of movie
    limit = int(video_pointer.get(cv2.cv.CV_CAP_PROP_FRAME_COUNT))


    # create hdf file
    hf = h5py.File('data_' + str(lims_ID) + '.h5', 'w')
    g = hf.create_group('feature space')
    vector = np.zeros((limit, 4321))
    table = g.create_dataset('features', data = vector, shape =(limit, 4321))


    while count <= limit:

        prvs = nex
        frames = process_input(prvs)

        ret, frame = video_pointer.read()
        nex = cv2.cvtColor(frame[160:400, 100:640], cv2.COLOR_BGR2GRAY)

        optical = optical_flow(prvs, nex)
        opticals = optical['mag']
        angles= optical['ang']
        vector_data = np.concatenate((np.reshape(wheel[k], (1)), frames, opticals, angles))

        table[count, :] = vector_data

        count += 1

        if count%1000 == 0:
            print (count)
Example #6
def process(discrete, cont):
  # Create discrete and continuous data matrices
  discrete_X = np.array(discrete)
  cont_X = np.array(cont)

  # Impute discrete values
  imp = Imputer(strategy='most_frequent')
  discrete_X = imp.fit_transform(discrete_X)

  # Impute continuous values
  imp_c = Imputer(strategy='mean')
  cont_X = imp_c.fit_transform(cont_X)

  # Discrete basis representation
  enc = OneHotEncoder()
  enc.fit(discrete_X)
  discrete_X = enc.transform(discrete_X).toarray()

  # Continuous scaling
  scaler = StandardScaler()
  scaler.fit(cont_X)
  cont_X = scaler.transform(cont_X)

  # Merge to one array
  X = np.concatenate((discrete_X, cont_X), axis=1)
  return X
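A minimal usage sketch for process, assuming the imports used above (Imputer, OneHotEncoder and StandardScaler from an older scikit-learn) are in scope; the toy matrices are illustrative:

import numpy as np

discrete = [[0, 1], [1, np.nan], [0, 2]]         # categorical codes with one missing entry
cont = [[1.5, np.nan], [2.0, 3.0], [2.5, 4.0]]   # continuous features with one missing entry
X = process(discrete, cont)
print(X.shape)  # one-hot columns for the discrete part plus two scaled continuous columns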
Example #7
def impute_and_scale(df, scaling='std'):
    """Impute missing values with mean and scale data included in pandas dataframe.

    Parameters
    ----------
    df : pandas dataframe
        dataframe to impute and scale
    scaling : 'maxabs' [-1,1], 'minmax' [0,1], 'std', or None, optional (default 'std')
        type of scaling to apply
    """

    df = df.dropna(axis=1, how='all')

    imputer = Imputer(strategy='mean', axis=0)
    mat = imputer.fit_transform(df)

    if scaling is None or scaling.lower() == 'none':
        return pd.DataFrame(mat, columns=df.columns)

    if scaling == 'maxabs':
        scaler = MaxAbsScaler()
    elif scaling == 'minmax':
        scaler = MinMaxScaler()
    else:
        scaler = StandardScaler()

    mat = scaler.fit_transform(mat)

    df = pd.DataFrame(mat, columns=df.columns)

    return df
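A minimal usage sketch for impute_and_scale; the toy dataframe is illustrative and the scaling options are the ones named in the docstring:

import numpy as np
import pandas as pd

df = pd.DataFrame({'x': [1.0, np.nan, 3.0], 'y': [0.5, 1.5, np.nan]})
df_std = impute_and_scale(df)                    # mean-impute, then standardize
df_01 = impute_and_scale(df, scaling='minmax')   # mean-impute, then scale to [0, 1]
df_raw = impute_and_scale(df, scaling=None)      # mean-impute only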
Example #8
def benignKmeans():
  # Connect to a pre-existing cluster
  # connect to localhost:54321


  #  Log.info("Importing benign.csv data...\n")
  benign_h2o = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/benign.csv"))
  #benign_h2o.summary()

  benign_sci = np.genfromtxt(pyunit_utils.locate("smalldata/logreg/benign.csv"), delimiter=",")
  # Impute missing values with column mean
  imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
  benign_sci = imp.fit_transform(benign_sci)

  # Log.info(paste("H2O K-Means with ", i, " clusters:\n", sep = ""))

  from h2o.estimators.kmeans import H2OKMeansEstimator

  for i in range(1,7):
    benign_h2o_km = H2OKMeansEstimator(k=i)
    benign_h2o_km.train(x = range(benign_h2o.ncol), training_frame=benign_h2o)
    print "H2O centers"
    print benign_h2o_km.centers()

    benign_sci_km = KMeans(n_clusters=i, init='k-means++', n_init=1)
    benign_sci_km.fit(benign_sci)
    print "sckit centers"
    print benign_sci_km.cluster_centers_
Example #9
def learn():
	global classifier, INPUT
	print 1
	data = np.genfromtxt(INPUT, delimiter=' ', dtype='f8')
	np.random.shuffle(data)
	n = len(data)
	y = data[:,1]
	x = data[:][:,range(2,54)]
	# test_x = []
	# test_y = []
	train_x = []
	train_y = []
	print 2
	imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
	x = imp.fit_transform(x)
	print 3
	for i in range(0, n):
		if y[i] == 0:
			continue
		train_x.append(x[i])
		train_y.append(y[i])
		# if i%100==0:
		# 	test_x.append(x[i])
		# 	test_y.append(y[i])
		# else:
		# 	train_x.append(x[i])
		# 	train_y.append(y[i])
	print 4
	classifier.fit(train_x, train_y)
	print 5
Example #10
    def fit(self, train_x, train_y=None, is_norm=True):
        # Normalization
        if is_norm:
            train_x_min = train_x.min(0)
            train_x_ptp = train_x.ptp(axis=0)

            train_x = (train_x.astype(float) - train_x_min) / train_x_ptp

            if np.any(train_y):
                train_y = (train_y.astype(float) - train_x_min) / train_x_ptp

        imp = Imputer(missing_values='NaN', strategy='mean', axis=1)
        imp.fit(train_x)
        if np.isnan(train_x).any():
            log("Found {} NaN values in train_x, so try to transform them to 'mean'".format(np.isnan(train_x).sum()), WARN)
            train_x = imp.transform(train_x)

        if np.any(train_y) and np.isnan(train_y).any():
            log("Found {} NaN values in train_y, so try to transform them to 'mean'".format(np.isnan(train_y).sum()), WARN)
            train_y = imp.transform(train_y)

        if np.any(train_y):
            self.model.fit(train_x, train_y)
        else:
            self.model.fit(train_x)
Example #11
def preprocess(data):

    non_sparse_only = True
    use_all_category_only = False
    use_all_impute_mean_mode = False


    if non_sparse_only:
        nominal_samples = data.ix[:,['var4','dummy']] 
        onehot_samples = onehot.transform(nominal_samples,['var4','dummy'])
        onehot_samples = pd.DataFrame(onehot_samples.toarray())
        numbered_samples = data.ix[:,['var7','var8','var10','var11','var13','var15','var17']]
        numbered_samples[['var7','var8']] = numbered_samples[['var7','var8']].convert_objects(convert_numeric=True)
        #(var7 and 8 are ordinal, converting to floats which includes NaNs will allow mean imputing of missing values)
        other_samples = data.ix[:,'crimeVar1':'weatherVar236'] #all the continuous vars
        other_samples = other_samples.drop(['weatherVar115'], axis=1) #nothing in this feature
        samples = pd.concat([onehot_samples,numbered_samples,other_samples],axis=1) #combine w/ the cleaned up other vars
        imp_nan = Imputer(missing_values=np.nan, strategy='mean', axis=0)
        samples_imp = imp_nan.fit_transform(samples)
    
    if use_all_category_only:
        raise NotImplementedError  # TODO: handle the category-only preprocessing path
    
    if use_all_impute_mean_mode:
        raise NotImplementedError  # TODO: handle the impute-mean/mode preprocessing path
    
    return samples_imp
def run_main(new_file, start, stop, dat):
    with open(new_file, 'a') as file:
        imp = Imputer(missing_values='NaN', strategy='most_frequent', axis=1)
        import itertools
        with open(dat, "r") as text_file:
            for line in itertools.islice(text_file, start, stop):
                line            = line.replace("NA", "NaN")
                content         = line.rstrip('\n').split('\t')
                CpG             = content.pop(0)
                flag, CpG_location    = get_location(CpG)
                if flag == 'F':
                    continue
                genotype_matrix = get_genotypes(CpG_location)
                genotype_matrix = imp.transform(genotype_matrix)
                genotype_matrix = genotype_matrix.transpose()
            
                #run PCA
                try:
                    PCA_matrix      = run_pca(genotype_matrix)
                except ValueError:
                    print "value error"
                    continue

                #run linear regression
                meth_values   = pd.Series(content, name="meth_val", dtype=float)
                model         = sm.OLS(meth_values, PCA_matrix)
                results       = model.fit()
                MethValResids = results.resid
                final         = pd.Series(CpG)
                final         = final.append(MethValResids)
                fline         = final.tolist()
                fline         = '\t'.join(str(x) for x in fline)
                fline         = fline + "\n"
                file.write(fline)
Example #13
    def test_3_stage(self):
        from sklearn.preprocessing import Imputer

        infile_name = path_of_data('missing_vals.csv')

        p = Pipeline()

        csv_read_node = p.add(CSVRead(infile_name))
        csv_write_node = p.add(CSVWrite(self._tmp_files.get('out.csv')))
        impute_node = p.add(wrap_and_make_instance(Imputer))

        csv_read_node['output'] > impute_node['X_train']
        impute_node['X_new'] > csv_write_node['input']

        self.run_pipeline(p)

        ctrl_imputer = Imputer()
        ctrl_X_sa = np.genfromtxt(infile_name, dtype=None, delimiter=",",
                                  names=True)
        num_type = ctrl_X_sa[0][0].dtype
        ctrl_X_nd, ctrl_X_sa_type = np_sa_to_nd(ctrl_X_sa)
        ctrl_X_new_nd = ctrl_imputer.fit_transform(ctrl_X_nd)
        control = ctrl_X_new_nd

        result = self._tmp_files.csv_read('out.csv', True)

        self.assertTrue(np.allclose(result, control))
Example #14
def imputed_data(df, colname, strategy="mean"):
    from sklearn.preprocessing import Imputer
    imr = Imputer(missing_values="NaN", strategy=strategy, axis=0)
    imr = imr.fit(df[colname].values.reshape(-1,1))
    imputed_data = imr.transform(df[colname].values.reshape(-1,1))
    df[colname] = imputed_data
    print("Data has been imputed to \"{}\"".format(colname))
Example #15
class ImputeCategorical(BaseEstimator, TransformerMixin):
    """
    Encodes a specified list of columns or all columns if None.
    """

    def __init__(self, columns=None):
        self.columns = columns
        self.imputer = None

    def fit(self, data, target=None):
        """
        Expects a data frame with named columns to impute.
        """
        # Impute all columns if columns is None
        if self.columns is None:
            self.columns = data.columns

        # Fit a single imputer over the selected columns of the data frame
        self.imputer = Imputer(missing_values=0, strategy='most_frequent')
        self.imputer.fit(data[self.columns])

        return self

    def transform(self, data):
        """
        Uses the encoders to transform a data frame.
        """
        output = data.copy()
        output[self.columns] = self.imputer.transform(output[self.columns])

        return output
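A minimal usage sketch for ImputeCategorical, assuming the columns hold label-encoded integers where 0 marks a missing value (as the missing_values=0 setting above implies); the toy frame is illustrative:

import pandas as pd

df = pd.DataFrame({'workclass': [1, 0, 2, 1], 'education': [3, 3, 0, 2]})
imputer = ImputeCategorical(columns=['workclass', 'education'])
df_filled = imputer.fit(df).transform(df)  # zeros replaced by each column's most frequent value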
def impute_missing_data(datapoints, strategy='mean'):
    """ Inputes values for the 8 features missing data

    Arguments:
    datapoints -- X, a dataset with missing values represented 999.0 and 9999.0
    strategy [optional] -- an imputation strategy,
        e.g., mean, median, or most_frequent

    Returns:
    X_imputed -- a dataset with missing values imputed according to the
        provided or default (mean) strategy.

    Uses the scikit-learn Imputer class.
    """
    # First we will replace our placeholder values with NaN to only have
    # to run one imputation.
    np.putmask(datapoints, datapoints == 999.0, np.NaN)
    np.putmask(datapoints, datapoints == 9999.0, np.NaN)

    # Now create an imputer over NaN values, and average over axis=0 (columns)
    # Then, fit the imputer to the dataset.
    imp = Imputer(strategy=strategy, axis=0)
    X_imputed = imp.fit_transform(datapoints)

    return X_imputed
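A minimal usage sketch for impute_missing_data; the toy matrix is illustrative and must be a float array, since np.putmask cannot write NaN into an integer array:

import numpy as np

X = np.array([[1.0, 999.0],
              [2.0, 4.0],
              [3.0, 9999.0]])
X_imputed = impute_missing_data(X)  # both placeholders become the column mean, 4.0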
Example #17
def load_datasets(feature_paths, label_paths):
    '''
    Read the feature files and the label files and return them.
    '''
    # define an empty feature array whose column count matches the feature dimension (41),
    # and an empty label array with a single column
    feature = np.ndarray(shape=(0,41))
    label = np.ndarray(shape=(0,1))
    for file in feature_paths:
        # read one feature file with pandas read_table: comma separator, '?' as the missing value, no header row
        #df = pd.read_table(file, delimiter=',', na_values='?', header=None)
        
        # pandas.read_csv(source, encoding='utf-8', parse_dates=[0] parses column 0 as dates, index_col=0 uses it as the row index)
        data=pd.read_csv(file,encoding='utf-8',parse_dates=[0],index_col=0)
        # DataFrame.sort_index(axis=0, ascending=True, inplace=False) would sort the rows by time in ascending order
        #data.sort_index(0,ascending=True,inplace=True)
        
        # use Imputer with strategy='mean' to fill missing values with the column mean
        imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
        # fit() trains the preprocessor, transform() produces the preprocessed result
        imp.fit(data)
        data = imp.transform(data)
        # append the preprocessed data to feature, iterating over all feature files
        feature = np.concatenate((feature, data))

    # read the label files
    for file in label_paths:
        df = pd.read_table(file, header=None)
        label = np.concatenate((label, df))
    # flatten the labels into a one-dimensional vector
    label = np.ravel(label)
    return feature, label
Example #18
def impute_missing_train(dataframe, missing_values='NaN', strategy='mean'):
    '''
    Given a dataframe, imputes missing values with a given strategy.
    Supported strategies: 'mean', 'median', 'most_frequent'.
    Returns dictionary mapping transformed columns to its imputer value.
    '''
    from sklearn.preprocessing import Imputer
    imp = Imputer(missing_values=missing_values, strategy=strategy, axis=0)
    imputed = imp.fit_transform(dataframe)
    df = pd.DataFrame(imputed)
    df.columns = list(dataframe.columns)
    
    imputers = {}
    if strategy == 'mean':
        for col in df.columns:
            mean = df[col].mean()
            imputers[col] = mean
    if strategy == 'median':
        for col in df.columns:
            median = df[col].median()
            imputers[col] = median
    if strategy == 'most_frequent':
        for col in df.columns:
            mode = df[col].mode()
            imputers[col] = mode
    return df, imputers
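A minimal usage sketch for impute_missing_train; the toy dataframe is illustrative, and the returned dictionary records, per column, the value used to fill the gaps:

import numpy as np
import pandas as pd

df = pd.DataFrame({'a': [1.0, np.nan, 3.0], 'b': [2.0, 2.0, np.nan]})
df_imputed, imputers = impute_missing_train(df)
print(imputers)  # {'a': 2.0, 'b': 2.0}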
Example #19
 def Train_And_Test(self):
     HOG_data=np.loadtxt('dataset.csv',delimiter=",")
     tmpdata=HOG_data[:,0:-2]
     target=HOG_data[:,-2]
     print(target)
     tmpdata[tmpdata==0]=np.nan
     imp=Imputer(missing_values='NaN',strategy='mean')
     data=imp.fit_transform(tmpdata)
     data_train,data_test,target_train,target_test=train_test_split(data,target,test_size=0.3)
     model=SVC(C=1.0,gamma=0.0,kernel='linear', class_weight='auto')
     model.fit(data_train,target_train)
     print(data_train)
     print(target_train)    
     opencv_data_train=np.float32(data_train)
     opencv_target_train=np.float32(target_train)     
     svm_params = dict( kernel_type = cv2.SVM_LINEAR,
                 svm_type = cv2.SVM_C_SVC,
                 C=2.67, gamma=5.383)
     svm = cv2.SVM()
     svm.train(opencv_data_train,opencv_target_train, params=svm_params)
     svm.save("hog_classifier.xml")  
     print(model)
     expected=target_test
     predicted=model.predict(data_test)
     target_names = ['Not Human', 'Human']
     
     print(metrics.classification_report(expected,predicted,target_names=target_names))
     print(metrics.confusion_matrix(expected,predicted))
     print(metrics.roc_curve(expected,predicted))
     pickle.dump(model, open( "svm.p", "wb" ) )
def avg_message_count_by_group(df_users, df_messages, df_user_features):
    
    columns = ["f1", "f2", "f3", "f4", "f5", "f6", "f7", "f8", "f9", "f10"]
 
    features = df_user_features[list(columns)].values

    # Impute missing values to retain all sample data
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    X = imp.fit_transform(features)

    # Preprocess dataset and standardize features to have normally distributed data
    # MaxAbsScaler allows scaled features to lie between -1 and +1
    X = MaxAbsScaler().fit_transform(X)

    # Apply PCA decomposition and use first 3 components that explain 75% of variance
    reduced_data = decomposition.PCA(n_components=3).fit_transform(X)
    kmeans = KMeans(init='k-means++', n_clusters=5, n_init=10)
    
    # Predict which group each user belongs to
    cluster_labels = kmeans.fit_predict(reduced_data)    
    df_user_features['group.id'] = cluster_labels
    
    # Call utility function to join the two dataframes
    df_joined_users_messages = get_merged_dataframes(df_users, df_messages)
    df_joined_users_messages_features = get_merged_dataframes(df_user_features, df_joined_users_messages)
      
    # Only keep messages that were received since signing up
    df_joined_users_messages_features = df_joined_users_messages_features[df_joined_users_messages_features['message.date'] 
                                                                          >= df_joined_users_messages_features['signup.date']]
        
    # Get the average message count grouped by group.id
    avg_message_count = df_joined_users_messages_features.groupby('group.id')['message.count'].mean()
    
    # Return the average message count grouped by user groups and rounded to 2 decimals
    return np.round(avg_message_count.tolist(), decimals=2)
Example #21
    def fill_and_remove(self, s_strategy="zeros", l_features = False, 
        b_remove = True):
        '''
        Fill all NaN values in the numerical data and then remove data
        points whose features are all equal to zero.
        l_features: a list of features to be processed. If empty, all features
        will be used
        b_remove: boolean indicating if keys where all data is 0 should be removed
        s_strategy: string with the strategy used to fill NaNs. Can be "mean",
        "median" or "zeros"
        '''
        df = self.getData()
        #pre-process data
        if not l_features:
            l_features = self.payments_features + self.stock_features 
            l_features+= self.email_features
        df.loc[:, l_features] = df.loc[:, l_features].astype(float)
        #filling Nan with the strategy selected
        if s_strategy == "zeros":
            df.loc[:, l_features] = df.loc[:, l_features].fillna(0)
        else:
            na_X = df.loc[:, l_features].values
            imp = Imputer(missing_values='NaN', strategy=s_strategy, axis=0)
            df.loc[:, l_features] = imp.fit_transform(na_X)

        #exclude datapoint where every number is equal to 0
        if b_remove:
            df = df.ix[((df.loc[:, l_features]!=0).sum(axis=1)!=0),:]
        #saving the new dataframe       
        self.setData(df)
        #correct scaled df
        if type(self.df_scaled)!=list:
            df2 = self.df_scaled
            df2 = df2.ix[((df.loc[:, l_features]!=0).sum(axis=1)!=0).index,:]
            self.df_scaled = df2             
def run_importance(clf, data, labels, feature_labels=[""], string=""):
    """
    Fit a classifier using all the data and plot the feature importances
    :param clf: Classifier object that has feature_importances_ member
    :param feature_labels: names of the features
    :param string: classifier name
    :return: (void) plot Gini importance vs feature
    """
    num_features = data.shape[1]
    importances = [0]*num_features

    imp = Imputer(missing_values=np.NaN, strategy="mean")
    data = imp.fit_transform(data)

    # run the classifier 100 times and average the importance found after each fit
    for r in range(100):
        clf.fit(data, labels)
        importances = [importances[i]+clf.feature_importances_[i] for i in range(num_features)]
    importances = [importance/100 for importance in importances]

    # Filter out the features that have 0 importance (e.g. values are all 0)
    # non_zeros are the indices in feature_importances that are not 0
    non_zeros = [i for i in range(num_features) if not importances[i] == 0]
    importances = [importances[i] for i in non_zeros]
    feature_labels = [feature_labels[i] for i in non_zeros]

    # Plot the features
    bar_width = 0.7
    plt.bar(range(len(feature_labels)), importances, bar_width)
    plt.xticks([ind + float(bar_width)/2 for ind in range(len(feature_labels))], feature_labels, rotation="vertical")
    plt.gcf().subplots_adjust(bottom=0.35)
    plt.xlabel("Feature")
    plt.ylabel("Gini Importance")
    plt.title("Gini Importance v. Features for "+string+" Classifier")
    plt.show()
def test():
    vec = DictVectorizer()
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    for filename in glob.glob(r'../dataset/UCI/*.arff'):
        basename = re.sub(r'(\..*?)$','',os.path.basename(filename))
        print basename
        if basename != DS:
            continue
        # cost_matrix = pickle.load(open('../dataset/UCI/'+basename+'_cost_matrix.pkl', 'rb'))
        data = arff.loadarff(filename)[0]
        X = vec.fit_transform(np.array([{str(i):value for i,value in enumerate(list(row)[:-1])} for row in data])).toarray()
        imp.fit(X)
        X = imp.transform(X)
        labels = np.array([row[-1] for row in data])
        y = np.array([{v:k for k,v in enumerate(list(set(labels)))}[label] for label in labels])
        random = np.random.permutation(range(len(X)))
        print 'dataset ratio\t%s'%('\t'.join([alg+" "*(12-len(alg)) for alg in sorted(ALG.keys())]))
        for iteration in xrange(10):
            X, y, class_num, kf = X[random], y[random], set(labels), KFold(len(X), n_folds=10)
            for train, test in kf:
                length, train_size = len(train), 0.1
                X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]
                X_label, X_unlabel, y_label, y_unlabel = train_test_split(X_train, y_train, test_size=1.0-train_size, random_state=0)
                for R in xrange(2,10):
                    ones_matrix, cost_matrix = np.array([[1,1],[1,1]]), np.array([[1,1],[R,R]])            
                    # print "%s R=%d"%(basename,R),
                    cross_validation("%s R=%d"%(basename,R), X_label, X_unlabel, y_label, y_unlabel, ones_matrix, cost_matrix)
                exit()
Example #24
def computePearson(args):
  filter(args)

  with open(args.feature_file, 'r') as fp:
    features = [line for line in fp.read().splitlines()
                if not line.startswith('#')]

  X = loadtxt(TMP_DATA_FILE)
  y = loadtxt(TMP_LABEL_FILE)

  assert X.shape[0] == y.shape[0]
  assert X.shape[1] == len(features)

  imputer = Imputer(strategy='median', copy=False)
  X = imputer.fit_transform(X)

  if args.output_file:
    with open(args.output_file, 'w') as fp:
      print >> fp, '\t'.join(['feature', 'coeff', 'pvalue'])
      for i in range(len(features)):
        coeff, pvalue = pearsonr(X[:, i], y)
        print >> fp, '%s\t%f\t%f' % (features[i], coeff, pvalue)

  if args.group_output_file:
    groups = getGroups(features)
    index = {features[i]: i for i in range(len(features))}
    with open(args.group_output_file, 'w') as fp:
      print >> fp, '\t'.join(['prefix', 'feature1', 'feature2', 'coeff', 'pvalue'])
      for prefix, group in groups.iteritems():
        for i in range(len(group)):
          for j in range(i+1, len(group)):
            coeff, pvalue = pearsonr(X[:, index[group[i]]], X[:, index[group[j]]])
            print >> fp, '%s\t%s\t%s\t%f\t%f' % (
                prefix, group[i], group[j], coeff, pvalue)
Example #25
def gettestdata(fil) :
	data = np.genfromtxt(fil,delimiter=',')
	imp = Imputer(missing_values='NaN', strategy='median', axis=0)
	X = imp.fit_transform(data[:,2:])
	X = scale(X).copy()
	#spr.eliminate_zeros()
	return np.array(X)
Example #26
def calcEdges(data):
    n = len(data)        
    usersDic = {}
    usersId = 0
    moviesDic = {}
    moviesId = 0
    for i in range(n):
        r = data[i]
        if r[0] not in moviesDic:
            moviesDic[r[0]] = moviesId
            moviesId += 1
        if r[1] not in usersDic:
            usersDic[r[1]] = usersId
            usersId += 1
    E = np.zeros((moviesId, usersId))
    #E = np.full((moviesId, usersId), np.nan)
    for i in range(n):
        user = usersDic[data[i][1]]
        movie = moviesDic[data[i][0]]
        E[movie, user] = data[i][2]
    estimator = Imputer(0, strategy='mean')
    #estimator = SoftImpute()    
    #estimator.fit(E)
    #E = estimator.predict(E)
    E = estimator.fit_transform(E)
    return E, usersDic, moviesDic
def get_some_data():
    data = melbourne_data
    y = data.Price
    X = data[cols_to_use]
    my_imputer = Imputer()
    imputed_X = my_imputer.fit_transform(X)
    return imputed_X, y
def plot_ROCList(clfList, data, labels, stringList=""):
    """
    Plot an ROC curve for each classifier in clfList, training on a single 80/20 split
    :param clfList:
    :param data:
    :param labels:
    :param stringList:
    :return:
    """
    if stringList == "":
        stringList = ["" for i in range(len(labels))]
    imp = Imputer(missing_values=np.NaN, strategy="mean")
    data = imp.fit_transform(data)

    # Cross-validate on the data once using each model to get a ROC curve
    AUCs, fprs, tprs, threshs = cvList(data, labels, clfList)

    # Plot an ROC curve for each clf in clfList
    for i in range(len(clfList)):
        fpr = fprs[i]
        tpr = tprs[i]
        plt.plot(fpr, tpr)
        plt.xlabel("False Positive Rate")
        plt.ylabel("True Positive Rate")
        plt.title(stringList[i]+" ROC Curve, AUC = "+str(AUCs[i]))
        plt.savefig(stringList[i]+"_ROC.png")
        plt.close()
        print stringList[i] + ":" + str(AUCs[i])
Example #29
def bnp_svm(train, test):
	print('bnpsvm')
	## If a value is missing, set it to the average
	imp = Imputer(missing_values='NaN', strategy='mean', axis=0)

	#print("cleaning data")
	train = train.sample(1000)
	## set up training data
	train1 = train.select_dtypes(include=['float64'])
	imp.fit(train1)
	train1 = imp.transform(train1)
	train1 = np.array(train1).astype(float)
	## set up real y
	target = np.array(train['target']).astype(int)


	## set up testing data
	test1 = test.select_dtypes(include=['float64'])
	test1 = imp.transform(test1)
	test1 = np.array(test1).astype(float)



	#print("training...")
	clf = svm.SVC(gamma=0.001, C=100, probability=True)
	#print("testing")
	clf.fit(train1, target)
	#print("predicting")
	yhat = clf.predict_proba(test1)
	return yhat


#print(bnp_svm(train, test))
def run_clfList(clfList, stringList="", normalize=False):
    """
    Run 100-fold 80/20 cross-validation on each classifier in clfList
    print the average AUC for each classifier
    :param clfList: list of classifiers to run
    :param stringList: names of the classifiers
    :param normalize: whether or not to normalize the data
    :return: the average AUC for each classifier in clfList
    """
    # data, labels = six_features(force=False)
    # data, labels = six_and_time_features(force=False)
    # data, labels = five_features(force=False)
    # data, labels = five_and_rts(force=False)
    data, labels = new_features()
    if normalize:
        data = normalize_data(data)

    imp = Imputer(missing_values=np.NaN, strategy="mean")
    data = imp.fit_transform(data)

    # Cross-validate all clfs 100 times
    means = kfoldcvList(data, labels, clfList, 100)
    if stringList == "":
        stringList = ["" for i in range(len(labels))]

    # Print out the mean AUCs
    for i, mean in enumerate(means):
        print stringList[i]+": "+str(mean)

    for mean in means:
        sys.stdout.write(str(mean) + " & ")
    sys.stdout.write("\n")
    return means
# Importing the libraries

import numpy as np  #contains mathematical tools
import matplotlib.pyplot as plt  #plot charts
import pandas as pd  #to import and manage datasets

# Importing dataset
dataset = pd.read_csv('Data.csv')  #reading dataset
# iloc -> integer-location based indexing for selection by position.
X = dataset.iloc[:, :-1].values  #taking all columns except the last one, which is the output label
Y = dataset.iloc[:, 3].values  #taking the column of the output label

# Taking care of missing data
from sklearn.preprocessing import Imputer  #for completing missing values. Select and press Ctrl+I to see syntax
imputer = Imputer(missing_values='NaN', strategy='mean',
                  axis=0)  #Replace missing values by mean
imputer = imputer.fit(X[:, 1:3])  #since index 1 and 2 contains missing columns
X[:, 1:3] = imputer.transform(X[:, 1:3])

#Encoding categorical data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder  #LabelEncoder to encode values and OneHotEncoder to give dummy values
labelEncoder_X = LabelEncoder()
X[:, 0] = labelEncoder_X.fit_transform(
    X[:, 0])  #to categorize the Country column with encoded values
onehotencoder = OneHotEncoder(
    categorical_features=[0])  #specify which column to categorize
X = onehotencoder.fit_transform(X).toarray()

#the output label won't need OneHotEncoder since it is the dependent variable with only 2 labels, Yes or No
labelEncoder_Y = LabelEncoder()
Y = labelEncoder_Y.fit_transform(Y)
Example #32
# Data Preprocessing Template

# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset
dataset = pd.read_csv('Data.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 3].values

# filling the missing values
from sklearn.preprocessing import Imputer
imputer = Imputer(missing_values='NaN', strategy='median',
                  axis=0)  # 0 for along columns and 1 for along rows
imputer.fit(X[:, 1:3])  # 3 is exclude (1 and 2 have missing data)
X[:, 1:3] = imputer.transform(X[:, 1:3])  # missing data will be filled

# Encoding categorical data (countries here)
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder_X = LabelEncoder()
X[:, 0] = labelencoder_X.fit_transform(X[:, 0])

# As we can't rank the countries, we will create dummy variables (matrix form)
onehotencoder = OneHotEncoder(categorical_features=[0])
X = onehotencoder.fit_transform(X).toarray()

labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y)  # 0 for no and 1 for yes
Example #33
def processData():

    train = pd.read_csv("train.csv")

    catFeatures = []
    numFeatures = []
    for name, val in zip(train.columns, train.dtypes):
        if val in [np.dtype('O'), np.dtype('int64')]:
            if name not in [
                    'GTIME', 'GSTATUS_THREE_MONTHS', 'GSTATUS_SIX_MONTHS',
                    'GSTATUS_ONE_YEAR', 'GSTATUS_THREE_YEARS'
            ]:
                catFeatures.append(name)
        else:
            numFeatures.append(name)

    # catFeatures = ['GENDER', 'ABO', 'LIFE_SUP_TCR', 'MALIG_TCR', 'EXC_HCC', 'EXC_CASE', 'PERM_STATE', 'PREV_AB_SURG_TCR', 'BACT_PERIT_TCR', 'PORTAL_VEIN_TCR', 'TIPSS_TCR', 'WORK_INCOME_TCR', 'INIT_DIALYSIS_PRIOR_WEEK', 'INIT_MELD_OR_PELD', 'FINAL_DIALYSIS_PRIOR_WEEK', 'FINAL_MELD_OR_PELD', 'PERM_STATE_TRR', 'WORK_INCOME_TRR', 'MALIG_TRR', 'LIFE_SUP_TRR', 'PORTAL_VEIN_TRR', 'PREV_AB_SURG_TRR', 'TIPSS_TRR', 'HBV_CORE', 'HBV_SUR_ANTIGEN', 'HCV_SEROSTATUS', 'EBV_SEROSTATUS', 'HIV_SEROSTATUS', 'CMV_STATUS', 'CMV_IGG', 'CMV_IGM', 'TXLIV', 'PREV_TX', 'DDAVP_DON', 'CMV_DON', 'HEP_C_ANTI_DON', 'HBV_CORE_DON', 'HBV_SUR_ANTIGEN_DON', 'DON_TY', 'GENDER_DON', 'HOME_STATE_DON', 'NON_HRT_DON', 'ANTIHYPE_DON', 'PT_DIURETICS_DON', 'PT_STEROIDS_DON', 'PT_T3_DON', 'PT_T4_DON', 'VASODIL_DON', 'VDRL_DON', 'CLIN_INFECT_DON', 'EXTRACRANIAL_CANCER_DON', 'HIST_CIG_DON', 'HIST_COCAINE_DON', 'DIABETES_DON', 'HIST_HYPERTENS_DON', 'HIST_OTH_DRUG_DON', 'ABO_DON', 'INTRACRANIAL_CANCER_DON', 'SKIN_CANCER_DON', 'HIST_CANCER_DON', 'PT_OTH_DON', 'HEPARIN_DON', 'ARGININE_DON', 'INSULIN_DON', 'DIAL_TX', 'ABO_MAT', 'AGE_GROUP', 'MALIG', 'RECOV_OUT_US', 'TATTOOS', 'LI_BIOPSY', 'PROTEIN_URINE', 'CARDARREST_NEURO', 'INOTROP_SUPPORT_DON', 'CDC_RISK_HIV_DON', 'HISTORY_MI_DON', 'CORONARY_ANGIO_DON', 'LT_ONE_WEEK_DON']
    # numFeatures = ['WGT_KG_DON_CALC', 'INIT_INR', 'ETHCAT_DON', 'ETHNICITY', 'DGN_TCR', 'REM_CD', 'INIT_AGE', 'ALBUMIN_TX', 'BMI_DON_CALC', 'EXC_EVER', 'OTH_LIFE_SUP_TCR', 'FINAL_ASCITES', 'WGT_KG_CALC', 'END_BMI_CALC', 'LISTYR', 'DDR1', 'FINAL_ALBUMIN', 'DB2', 'INIT_BMI_CALC', 'CITIZENSHIP', 'DB1', 'EDUCATION', 'DAYSWAIT_CHRON', 'OTH_LIFE_SUP_TRR', 'MED_COND_TRR', 'INIT_WGT_KG', 'MELD_PELD_LAB_SCORE', 'NUM_PREV_TX', 'INIT_SERUM_SODIUM', 'VENTILATOR_TCR', 'TX_PROCEDUR_TY', 'LITYP', 'INIT_SERUM_CREAT', 'WGT_KG_TCR', 'TBILI_DON', 'HGT_CM_CALC', 'SGOT_DON', 'ASCITES_TX', 'INIT_MELD_PELD_LAB_SCORE', 'ECD_DONOR', 'CREAT_TX', 'INIT_ENCEPH', 'INIT_HGT_CM', 'PRI_PAYMENT_TRR', 'INIT_STAT', 'ARTIFICIAL_LI_TCR', 'PT_CODE', 'WL_ID_CODE', 'INIT_ALBUMIN', 'ARTIFICIAL_LI_TRR', 'AGE_DON', 'ON_VENT_TRR', 'PRI_PAYMENT_TCR', 'BLOOD_INF_DON', 'CREAT_DON', 'REGION', 'INIT_ASCITES', 'HEMATOCRIT_DON', 'DIAB', 'TBILI_TX', 'FINAL_INR', 'AGE', 'FUNC_STAT_TRR', 'ETHCAT', 'CITIZENSHIP_DON', 'DEATH_MECH_DON', 'FUNC_STAT_TCR', 'FINAL_SERUM_SODIUM', 'COD_CAD_DON', 'FINAL_BILIRUBIN', 'BUN_DON', 'END_STAT', 'BMI_CALC', 'DDR2', 'FINAL_SERUM_CREAT', 'HIST_DIABETES_DON', 'ENCEPH_TX', 'SHARE_TY', 'DA1', 'PH_DON', 'FINAL_MELD_PELD_LAB_SCORE', 'BMI_TCR', 'INIT_BILIRUBIN', 'DISTANCE', 'SGPT_DON', 'PULM_INF_DON', 'HGT_CM_TCR', 'TRANSFUS_TERM_DON', 'FINAL_ENCEPH', 'DIAG', 'DA2', 'HGT_CM_DON_CALC', 'URINE_INF_DON', 'COLD_ISCH', 'INR_TX', 'DEATH_CIRCUM_DON', 'CANCER_SITE_DON']

    #Categorical pipeline
    cat_pipeline = Pipeline([
        ('selector', DataFrameSelector(catFeatures)),
        ('imputer', CategoricalImputer()),
        ('cat_encoder',
         CategoricalEncoder("onehot-dense", handle_unknown='ignore')),
    ])

    #Numerical pipeline
    num_pipeline = Pipeline([
        ('selector', DataFrameSelector(numFeatures)),
        ('imputer', Imputer(strategy="median")),
        ('std_scaler', StandardScaler()),
    ])

    #Full pipeline
    full_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline),
    ])

    # train = pd.read_csv("train.csv")
    X_train = full_pipeline.fit_transform(train.loc[:,
                                                    catFeatures + numFeatures])
    gstatusSixMonths_train = train["GSTATUS_SIX_MONTHS"].values
    gstatusOneYear_train = train["GSTATUS_ONE_YEAR"].values
    gstatusThreeYears_train = train["GSTATUS_THREE_YEARS"].values
    gstatus_train = train["GSTATUS_THREE_YEARS"].values
    gtime_train = train["GTIME"].values
    Y_train = np.array([[gstatus_train[i], gtime_train[i]]
                        for i in range(len(gtime_train))
                        ])  #[is_not_censored, survival time]

    test = pd.read_csv("test.csv")
    X_test = full_pipeline.transform(test.loc[:, catFeatures + numFeatures])
    gstatusSixMonths_test = test["GSTATUS_SIX_MONTHS"].values
    gstatusOneYear_test = test["GSTATUS_ONE_YEAR"].values
    gstatusThreeYears_test = test["GSTATUS_THREE_YEARS"].values
    gstatus_test = test["GSTATUS_THREE_YEARS"].values
    gtime_test = test["GTIME"].values
    Y_test = np.array([[gstatus_test[i], gtime_test[i]]
                       for i in range(len(gtime_test))
                       ])  #[is_not_censored, survival time]

    return X_train, Y_train, X_test, Y_test
Example #34
import pandas as pd

base = pd.read_csv('./datasets/credit-data.csv')  #load the file
base.loc[base.age < 0, 'age'] = base['age'][
    base.age > 0].mean()  #replace all negative ages with the mean of the positive ages

base.loc[pd.isnull(base['age'])]  #select all rows in base with a null age

forecasts = base.iloc[:, 1:4].values  #forecasts receives columns 1, 2 and 3 of base; ":" selects all rows
classes = base.iloc[:, 4].values

#Using sklearn to localize all missing_values and replace using a strategy
from sklearn.preprocessing import Imputer
imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
imputer = imputer.fit(forecasts[:, 1:4])
forecasts[:, 1:4] = imputer.transform(forecasts[:, 1:4])

#when using knn algorithms, standardisation or normalization is necessary
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
forecasts = scaler.fit_transform(forecasts)

#Divide database in training data and test data
from sklearn.cross_validation import train_test_split
forecasts_training, forecasts_testing, classes_training, classes_testing = train_test_split(
    forecasts, classes, test_size=0.25, random_state=0)

from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(criterion='entropy', random_state=0)
classifier.fit(forecasts_training, classes_training)
Example #35
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Sep 18 20:07:04 2018

@author: diego
"""

import pandas as pd
import numpy as np
from sklearn.preprocessing import Imputer, MinMaxScaler

data = pd.read_csv('../data/pacientes_ucic.csv', sep=';')

imputer = Imputer()
minmaxscaler = MinMaxScaler()

data['SAPS-3'] = imputer.fit_transform(data[['SAPS-3']])
"""
data['SAPS-3'] = minmaxscaler.fit_transform(data[['SAPS-3']])
"""

print(data['SAPS-3'])
Example #36
#print(corr_matrix["median_house_value"].sort_values(ascending=False))

housing = strat_train_set.drop("median_house_value", axis=1)
housing_labels = strat_train_set["median_house_value"].copy()

sample_incomplete_rows = housing[housing.isnull().any(axis=1)].head()
#print(sample_incomplete_rows)
sample_incomplete_rows.dropna(subset=["total_bedrooms"])

sample_incomplete_rows.drop("total_bedrooms", axis=1)

median = housing["total_bedrooms"].median()
sample_incomplete_rows["total_bedrooms"].fillna(median, inplace=True)
#print(sample_incomplete_rows)

imputer = Imputer(strategy="median")

housing_num = housing.drop('ocean_proximity', axis=1)

imputer.fit(housing_num)

#print(imputer.statistics_)

#print(housing_num.median().values)

X = imputer.transform(housing_num)
housing_tr = pd.DataFrame(X,
                          columns=housing_num.columns,
                          index=list(housing.index.values))
housing_tr.loc[sample_incomplete_rows.index.values]
Example #37
import pandas as pd

dataset = pd.read_csv('data1.csv')
X = dataset.iloc[:, 1:11].values
y = dataset['Class']
'''
# Encoding categorical data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder = LabelEncoder()
X[:, 1:11] = labelencoder.fit_transform(X[:,1:11])
onehotencoder = OneHotEncoder(categorical_features = [10])
X = onehotencoder.fit_transform(X).toarray()'''

# Handling missing values
from sklearn.preprocessing import Imputer
imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
imputer = imputer.fit(X[:, 1:11])
X[:, 1:11] = imputer.transform(X[:, 1:11])

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=0)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
Example #38
from sklearn.preprocessing import LabelEncoder

labelencoder_X = LabelEncoder()
X_tr[:, 34] = labelencoder_X.fit_transform(X_tr[:, 34].astype(str))
X_tr[:, 35] = labelencoder_X.fit_transform(X_tr[:, 35].astype(str))
X_tr[:, 68] = labelencoder_X.fit_transform(X_tr[:, 68].astype(str))
X_tr[:, 93] = labelencoder_X.fit_transform(X_tr[:, 93].astype(str))
X_ts[:, 34] = labelencoder_X.fit_transform(X_ts[:, 34].astype(str))
X_ts[:, 35] = labelencoder_X.fit_transform(X_ts[:, 35].astype(str))
X_ts[:, 68] = labelencoder_X.fit_transform(X_ts[:, 68].astype(str))
X_ts[:, 93] = labelencoder_X.fit_transform(X_ts[:, 93].astype(str))

# missing data
from sklearn.preprocessing import Imputer

imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
imputer = imputer.fit(X_tr[:, :])
X_tr[:, :] = imputer.transform(X_tr[:, :])
imputer = imputer.fit(X_ts[:, :])
X_ts[:, :] = imputer.transform(X_ts[:, :])

# Encoding categorical data: OneHotEncoder
from sklearn.preprocessing import OneHotEncoder

a = np.concatenate((X_tr, X_ts))
onehotencoder = OneHotEncoder(categorical_features=[34, 35, 68, 93],
                              sparse=True)
a = onehotencoder.fit_transform(a).toarray()
X_tr = a[:len(X_tr), :]
X_ts = a[len(X_tr):, :]
Example #39
import seaborn as sns
sns.boxplot(x="Pclass", y="Age", hue="Survived",data=dataset, palette="Set3")

''' 
--------------------------------------------------------------------------------------------------
        DATA PREP 
--------------------------------------------------------------------------------------------------
'''
# SELECTING X and y
X = dataset.iloc[:,[3,4]].values
# X = dataset.iloc[:,[1,3]].values

# HANDLING MISSING DATA
# fillnan with mean/median/most_frequent
from sklearn.preprocessing import Imputer
imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
imputer = imputer.fit(X.iloc[:,0].values.reshape((len(X),1)))
X.iloc[:,0] = imputer.transform(X.iloc[:,0].values.reshape((len(X),1)))
X.info()
X.drop(0, axis=1, inplace=True)
# Drop Nan
X.dropna(inplace = True)
y.dropna(inplace = True)

# ENCODING CATEGORICAL FEATURES
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
# The values 0,1,2,etc into categorical values
labelencoder_X = LabelEncoder()
X.iloc[:,1] = labelencoder_X.fit_transform(X.iloc[:, 1])
# Here we create the dummies
onehotencoder = OneHotEncoder(categorical_features=[1])
Example #40
import pandas as pd
import numpy as np
from sklearn import model_selection
import pickle

# Importing the dataset for training
dataset = pd.read_csv('train.csv')
new_data = dataset.iloc[:, [0, 1, 2, 4, 5, 6, 7, 9, 11]]
##encoding the training data
data_dummy = pd.get_dummies(new_data)
##
X_train = data_dummy.iloc[:, [0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]].values
y_train = data_dummy.iloc[:, 1].values

#missing values handling for the training data
from sklearn.preprocessing import Imputer
imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
imputer = imputer.fit(X_train[:, [2]])
X_train[:, [2]] = imputer.transform(X_train[:, [2]])
np.set_printoptions(threshold=np.nan)

test_dataset = pd.read_csv('test.csv')
test_verify = pd.read_csv('gender_submission.csv')
new_test_data = test_dataset.iloc[:, [0, 1, 3, 4, 5, 6, 8, 10]]
##encoding the training data
test_data_dummy = pd.get_dummies(new_test_data)
##
X_test = test_data_dummy.iloc[:, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]].values
y_test = test_verify.iloc[:, [1]].values

#missing values handling for the test data
from sklearn.preprocessing import Imputer
Example #41
#test_X = test_data.drop(['Id'], axis=1)
low_cardinality_cols = [
    cname for cname in train_X.columns
    if train_X[cname].nunique() < 10 and train_X[cname].dtype == "object"
]
numeric_cols = [
    cname for cname in train_X.columns
    if train_X[cname].dtype in ['int64', 'float64']
]
my_cols = low_cardinality_cols + numeric_cols

train_predictors = train_X[my_cols]
#test_predictors = test_X[my_cols]
#print(train_predictors.shape)
one_hot_encoded_training_predictors = pd.get_dummies(train_predictors)
#one_hot_encoded_test_predictors = pd.get_dummies(test_predictors)
#print(*one_hot_encoded_training_predictors.columns, sep=',')
#my_submission = pd.DataFrame({'Id': test_data.Id, 'SalePrice': rf_val_predictions})
#one_hot_encoded_training_predictors.to_csv('TRAINING_1.csv', index=False)
#final_train, final_test = one_hot_encoded_training_predictors.align(one_hot_encoded_test_predictors,
#                                                                    join='left',
#                                                                    axis=1)
my_pipeline = make_pipeline(Imputer(), RandomForestRegressor())
#my_pipeline.fit(final_train, y)
#print(my_pipeline.predict(final_test))
scores = cross_val_score(my_pipeline,
                         one_hot_encoded_training_predictors,
                         y,
                         scoring='neg_mean_absolute_error')
print('Mean Absolute Error %.2f' % (-1 * scores.mean()))
import pandas as pd

base = pd.read_csv('credit-data.csv')
base.loc[base.age < 0, 'age'] = 40.92

previsores = base.iloc[:, 1:4].values
classe = base.iloc[:, 4].values

from sklearn.preprocessing import Imputer
imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
imputer = imputer.fit(previsores[:, 1:4])
previsores[:, 1:4] = imputer.transform(previsores[:, 1:4])

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
previsores = scaler.fit_transform(previsores)

from sklearn.model_selection import train_test_split
previsores_treinamento, previsores_teste, classe_treinamento, classe_teste = train_test_split(
    previsores, classe, test_size=0.25, random_state=0)

from sklearn.neural_network import MLPClassifier
classificador = MLPClassifier(verbose=True,
                              max_iter=1000,
                              tol=0.0000010,
                              solver='adam',
                              hidden_layer_sizes=(100),
                              activation='relu')
classificador.fit(previsores_treinamento, classe_treinamento)
previsoes = classificador.predict(previsores_teste)
Example #43
File: knn.py Project: hetianch/UMAP
import numpy as np
# import data and labels using python array handling package numpy
data = np.loadtxt(
    "/Users/071cht/Desktop/programming_language_tutorial/Python/scikit/Ye_thesis/data_scikit/data.txt",
    delimiter=',')
labels = np.loadtxt(
    "/Users/071cht/Desktop/programming_language_tutorial/Python/scikit/Ye_thesis/data_scikit/labels.txt"
)
intLabels = labels.astype(int)
# scikit-learn has an Imputer class which provides basic strategies for imputing missing values, using the mean, median, or most frequent value
# of the row or column in which the missing values are located.

#the following code returns an np.array, data_nonmissing. It is a non-missing-value version of data
from sklearn.preprocessing import Imputer

imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
imp.fit(data)
data_nonmissing = imp.transform(data)

#rename data and intLabels to X and y for model-fitting convenience
X = data_nonmissing
y = intLabels

#set aside training and testing data (a 60/40 split)
seperateIdx = len(X) * 60 / 100
X_train = X[0:seperateIdx]
y_train = y[0:seperateIdx]
X_test = X[seperateIdx:]
y_test = y[seperateIdx:]

print X_train.shape, X_test.shape
Example #44
# Data Preprocessing

# Importing the Libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset
dataset = pd.read_csv('Data.csv')
x = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 3].values

# Take care of missing data
from sklearn.preprocessing import Imputer

imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
imputer = imputer.fit(x[:, 1:3])
x[:, 1:3] = imputer.transform(x[:, 1:3])

# Encoding categorical data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

labelencoder_x = LabelEncoder()
x[:, 0] = labelencoder_x.fit_transform(x[:, 0])
onehotencoder = OneHotEncoder(categorical_features=[0])
x = onehotencoder.fit_transform(x).toarray()
labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y)

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
    if(np.isnan(df_test['Age'][i])):
        if(df_test['Title'][i] == 'Mr'):
            df_test['Age'][i] = 25
        if(df_test['Title'][i] == 'Mrs'):
            df_test['Age'][i] = 25
        if(df_test['Title'][i] == 'Miss'):
            df_test['Age'][i] = 5
        if(df_test['Title'][i] == 'Master'):
            df_test['Age'][i] = 5


# In[ ]:


#After removing these features, it's time to fill the missing values
imputer = Imputer(missing_values = np.nan, strategy = 'median', axis = 0)
df_train[['Age']] = imputer.fit_transform(df_train[['Age']])
df_test[['Age']] = imputer.fit_transform(df_test[['Age']])

df_train.loc[ df_train['Age'] <= 16, 'Age'] = 0
df_train.loc[(df_train['Age'] > 16) & (df_train['Age'] <= 32), 'Age'] = 1
df_train.loc[(df_train['Age'] > 32) & (df_train['Age'] <= 48), 'Age'] = 2
df_train.loc[(df_train['Age'] > 48) & (df_train['Age'] <= 64), 'Age'] = 3
df_train.loc[ df_train['Age'] > 64, 'Age'] = 4

df_test.loc[ df_test['Age'] <= 16, 'Age'] = 0
df_test.loc[(df_test['Age'] > 16) & (df_test['Age'] <= 32), 'Age'] = 1
df_test.loc[(df_test['Age'] > 32) & (df_test['Age'] <= 48), 'Age'] = 2
df_test.loc[(df_test['Age'] > 48) & (df_test['Age'] <= 64), 'Age'] = 3
df_test.loc[ df_test['Age'] > 64, 'Age'] = 4
Example #46
#coding:utf8

import numpy as np
import pandas as pd
from sklearn.preprocessing import Imputer, LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split

df = pd.read_csv("./resource/1.csv")

X = df.iloc[:, :-1].values
Y = df.iloc[:, -1].values

# dealing with missing data
imputer = Imputer(missing_values="NaN", strategy="mean")
imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])

# print(X)

# deal with class y data
encoder_X = LabelEncoder()
X[:, 0] = encoder_X.fit_transform(X[:, 0])

# print(X)

one_hot_encoder = OneHotEncoder(categorical_features=[0])
X = one_hot_encoder.fit_transform(X).toarray()
# print(one_hot_encoder.n_values_)
# print(one_hot_encoder.feature_indices_)
# print(X)
Example #47
import seaborn as sns
#Importing train data
train = pd.read_csv("titanic_train.csv")

#Splitting the train dataset
train.drop('Cabin', axis=1, inplace=True)
x_train = train.drop('Survived', axis=1)
y_train = train['Survived']

#Visualising train data for null values
sns.heatmap(x_train.isnull())

#Filling the missing values of Age column in train dataset with its mean.

from sklearn.preprocessing import Imputer
imputer_train = Imputer(missing_values="NaN", strategy="mean", axis=0)
imputer_train = imputer_train.fit(x_train['Age'].values.reshape(-1, 1))
x_train['Age'] = imputer_train.transform(x_train['Age'].values.reshape(-1, 1))

#Visualising train data for null values
sns.heatmap(x_train.isnull())

#Importing test data
test = pd.read_csv('titanic_test.csv')
x_test = train.drop('Survived', axis=1)
y_test = train['Survived']

#Visualising test data for null values
sns.heatmap(x_test.isnull())

#Filling the missing values of Age column in test dataset with its mean.
import pandas as pd
import numpy as np

data = pd.read_excel("/home/karthik/Cauvery Study/dataset_rainfall_whole.xlsx")
X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values

from sklearn.preprocessing import Imputer

imputer = Imputer(missing_values=0, strategy='most_frequent', axis=0)
imputer = imputer.fit(X[:, [5, 6]])
X[:, [5, 6]] = imputer.transform(X[:, [5, 6]])

X1 = data.iloc[:, [0, 1, 2, 3, 4, 5, 6, 8]]
#imputer = imputer.fit(y[:,0:1])
#y[:,0:1] = imputer.transform(y[:,0:1])

#from sklearn.model_selection import train_test_split
#X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.1)

from sklearn.ensemble import RandomForestRegressor

regressor = RandomForestRegressor(n_estimators=10)
regressor.fit(X, y)

from tkinter import *

master = Tk()

y_pred = regressor.predict(X1).astype(int)
Example #49
def imputer(X):
    #fill in empty values
    imp = Imputer(missing_values="NaN", strategy="most_frequent", axis=0)
    imp = imp.fit(X[:, [1, 2, 3, -1]])
    X[:, [1, 2, 3, -1]] = imp.transform(X[:, [1, 2, 3, -1]])
    return X
Example #50
import pandas as pd
# assuming Keras is the library providing the Sequential model used below
from keras.models import Sequential
from keras.layers import Dense, Activation
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, Imputer, StandardScaler

train_data = pd.read_csv('train_data/data.csv', delimiter=',')

x = train_data[train_data.keys()[:-1]].values
y = train_data['result'].values

result_encoder = LabelEncoder()
result_encoder.fit(y)
y = result_encoder.transform(y)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# Data normalization
imputer = Imputer(strategy='mean')
imputer.fit(x_train)
X_train = imputer.transform(x_train)
X_test = imputer.transform(x_test)
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

model = Sequential([
    Dense(26, input_dim=x.shape[1]),
    Activation('relu'),
    Dense(1),
    Activation('sigmoid')
])
Example #51
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

np.set_printoptions(threshold=np.nan)

dataset = pd.read_csv('Data.csv')
X = dataset.iloc[:, :-1].values
Y = dataset.iloc[:, 3].values

from sklearn.preprocessing import Imputer, LabelEncoder, OneHotEncoder

imputer = Imputer(missing_values='NaN', strategy="mean", axis=0)
imputer = imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])

labelEncoder = LabelEncoder()
X[:, 0] = labelEncoder.fit_transform(X[:, 0])
oneHotEncoder_X = OneHotEncoder(categorical_features=[0])
X = oneHotEncoder_X.fit_transform(X).toarray()

labelEncoder_Y = LabelEncoder()
Y = labelEncoder_Y.fit_transform(Y)
Ejemplo n.º 52
0
dfg.sum()
df[df['R6'] > 1]
# Note: df[mask] = value overwrites every column of the matching rows with that single value
df[df.R6 > 1] = df.R6.mean()
df[df.R5 > 1] = df.R5.mean()
df[df.R19 > 1] = df.R19.mean()
df.max()
df.columns
X = df.drop(['object'], axis=1)
Y = df['object']
Y.iloc[12] = 'R'
Y.iloc[19] = 'R'
Y.iloc[200] = 'M'
#Removing NA Values
#Take care of Missing data
from sklearn.preprocessing import Imputer
imputer = Imputer(missing_values="NaN", strategy='mean', axis=0)
imputer = imputer.fit(X)
X = imputer.transform(X)

#Converting Categorical data
#Encoding categorical data using dummy variables for X
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder_Y = LabelEncoder()
Y = labelencoder_Y.fit_transform(Y)
onehotencoder = OneHotEncoder(categorical_features=[0])
Y = onehotencoder.fit_transform(Y.reshape(-1, 1)).toarray()  # reshape: OneHotEncoder expects a 2-D array

from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.3)

# Replacing Categorical Values with the Encoded Values
for col in col_names:
    if df[col].dtype == 'O':
        df[col] = df[col].replace(enc[col])

# Creating Separate Dataframes for Features and Class
X = df.iloc[:, :-1].values
y = df.iloc[:, 12].values

# Removing Loan_ID Column from the Dataset
X = np.delete(X, 0, 1)

# Creating Instances of Imputer Class for Missing Value Management
imputer_mode = Imputer(missing_values='NaN', strategy='most_frequent', axis=0)
imputer_mean = Imputer(missing_values='NaN', strategy='mean', axis=0)

# Replacing 'NaN' Values with Mode of the Values in the Respective Columns
X[:, :7] = imputer_mode.fit_transform(X[:, :7])   # columns 0-6
X[:, 9:] = imputer_mode.fit_transform(X[:, 9:])   # columns 9 onwards

# Replacing 'NaN' Values present in "LoanAmount" Column with the Mean of the Values of that Column
X_temp = X[:, 7].reshape(-1, 1)
X_temp = imputer_mean.fit_transform(X_temp)
X_temp = X_temp.reshape(1, -1)
X = np.delete(X, 7, 1)
X = np.insert(X, 7, X_temp, axis=1)
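
# Hedged alternative (sketch): the delete/insert round trip above can be avoided by
# imputing the single column in place; column index 7 is taken from the code above.
X[:, 7] = imputer_mean.fit_transform(X[:, 7].reshape(-1, 1)).ravel()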

#----------------------------------------Data Preprocessing and Data Cleaning----------------------------------------
    action='store_true',
    help='Whether to use scikit data balancing by changing penalties '
    'in learning algorithm formulation or manually balance by '
    'undersampling majority class and oversampling minority class')
args = parser.parse_args()

if __name__ == "__main__":
    # Let numpy know that NA corresponds to our missing value
    data = numpy.genfromtxt(args.input_filename,
                            delimiter=",",
                            skip_header=1,
                            missing_values="NA",
                            filling_values="NaN")
    # Impute the data and replace missing values
    imputer = Imputer(missing_values="NaN",
                      strategy='mean',
                      axis=0,
                      copy=False)
    imputer.fit(data)
    data = imputer.transform(data)
    features = data[:, 0:args.label_column]
    labels = data[:, args.label_column].astype(int)

    train_features, test_features, train_labels, test_labels = (
        model_selection.train_test_split(features, labels, test_size=0.20))

    # scale data only if the model is linear (svm, logistic regression) or the scales
    # of the features are relevant (knn)
    if args.algorithm in ['linear-svm', 'kernel-svm', 'logistic', 'knn']:
        (train_features,
         test_features) = utils.scale_data(train_features, test_features,
                                           'minmax')
Ejemplo n.º 55
0
 def __init__(self):
     self.reg = Pipeline([
         ('imputer', Imputer(strategy='median')),
         ('regressor', RandomForestRegressor(n_estimators = 500, max_features=0.5, min_samples_leaf = 5))
     ])
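
# Hedged usage sketch: the same imputer-plus-forest pipeline as a standalone object,
# fit on hypothetical arrays X_train/y_train in which missing values are NaN and then
# used for prediction; the Imputer step learns column medians at fit time and reuses
# them at predict time.
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer

reg = Pipeline([
    ('imputer', Imputer(strategy='median')),
    ('regressor', RandomForestRegressor(n_estimators=500, max_features=0.5,
                                        min_samples_leaf=5))
])
reg.fit(X_train, y_train)
y_pred = reg.predict(X_new)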
Ejemplo n.º 56
0
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import Imputer, Normalizer
from tpot.builtins import StackingEstimator

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=42)

imputer = Imputer(strategy="median")
imputer.fit(training_features)
training_features = imputer.transform(training_features)
testing_features = imputer.transform(testing_features)

# Score on the training set was:0.9348076424295388
exported_pipeline = make_pipeline(
    StackingEstimator(
        estimator=GradientBoostingClassifier(learning_rate=0.01,
                                             max_depth=4,
                                             max_features=0.05,
                                             min_samples_leaf=10,
                                             min_samples_split=19,
                                             n_estimators=100,
                                             subsample=0.9000000000000001)),
    Normalizer(norm="max"),
Ejemplo n.º 57
0
"""
@author: Ashlin
"""

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
mydata = pd.read_csv('Data.csv')
print mydata.head()
print mydata.iloc[:, 0]

print mydata.head()
X = mydata.iloc[:, 0:3].values
y = mydata.iloc[:, 3].values
from sklearn.preprocessing import Imputer
imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])

print X

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder = LabelEncoder()
X[:, 0] = labelencoder.fit_transform(X[:, 0])
onehotencoder = OneHotEncoder(categorical_features=[0])
X = onehotencoder.fit_transform(X).toarray()
print X
y = labelencoder.fit_transform(y)
print y
print X[:, 2]
from sklearn.model_selection import train_test_split
Ejemplo n.º 58
0
def predict_catkit_demo(images):
    """Return a prediction of adsorption energies for structures generated with
    CatKitDemo.

    Parameters
    ----------
    images : list
        List of atoms objects representing adsorbate-surface structures.
    model : str
        Path and filename of Catlearn model pickle.
    """
    model_ref = {'H': 'H2',
                 'O': 'H2O, H2',
                 'C': 'CH4, H2'}

    # Make list of strings showing the references.
    display_ref = []
    for atoms in images:
        try:
            initial_state = [model_ref[s] for s in
                             ase.atoms.string2symbols(
                                 atoms.info['key_value_pairs']['species'])]
        except KeyError:
            return {}
        display_ref.append(
                '*, ' + ', '.join(list(np.unique(initial_state))))

    images = autogen_info(images)

    gen = FeatureGenerator(nprocs=1)
    train_fpv = default_fingerprinters(gen, 'adsorbates')
    train_fpv = [gen.mean_chemisorbed_atoms,
                 gen.count_chemisorbed_fragment,
                 gen.count_ads_atoms,
                 gen.count_ads_bonds,
                 gen.ads_av,
                 gen.ads_sum,
                 gen.bulk,
                 gen.term,
                 gen.strain,
                 gen.mean_surf_ligands,
                 gen.mean_site,
                 gen.median_site,
                 gen.max_site,
                 gen.min_site,
                 gen.sum_site,
                 gen.generalized_cn,
                 gen.en_difference_ads,
                 gen.en_difference_chemi,
                 gen.en_difference_active,
                 gen.db_size,
                 gen.delta_energy]
    matrix = gen.return_vec(images, train_fpv)

    feature_index = np.load(clean_index_name)
    clean_feature_mean = np.load(clean_mean)

    impute = Imputer(missing_values="NaN", strategy='mean')
    impute.statistics_ = clean_feature_mean
    new_data = impute.transform(matrix[:, feature_index])

    prediction = gp.predict(new_data,
                            get_validation_error=False,
                            get_training_error=False,
                            uncertainty=True)

    output = {'mean': list(prediction['prediction']),
              'uncertainty': list(prediction['uncertainty']),
              'references': display_ref}
    return output
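
# Toy sketch of the pattern above: imputing missing entries from a stored vector of
# per-column means without refitting an imputer. The arrays here are made up; the
# real code loads the means from clean_mean on disk.
import numpy as np
matrix_demo = np.array([[1.0, np.nan],
                        [np.nan, 4.0]])
stored_means = np.array([2.5, 3.0])
imputed = np.where(np.isnan(matrix_demo), stored_means, matrix_demo)
# imputed -> [[1.0, 3.0], [2.5, 4.0]]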
Ejemplo n.º 59
0
File: CV.py Project: bodowd/ZK
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import Imputer
import xgboost as xgb

from ZK import ZKTools

# --------------------
print('Loading data...')
path = '/Users/Bing/Documents/DS/Zillow_Kaggle/'
df_train = pd.read_csv('train_features.csv')
df_target = pd.read_csv('train_target.csv').values.ravel()

imp = Imputer()
df_train_imp = pd.DataFrame(imp.fit_transform(df_train),
                            columns=df_train.columns)

n_estimators = 1
random_state = 0
# RF
rf_params = {
    'n_jobs': -1,
    'n_estimators': n_estimators,
    'warm_start': True,
    'max_depth': 6,
    'min_samples_leaf': 2,
    'max_features': 'sqrt',
    'verbose': 0
}
rf = RandomForestRegressor(random_state=random_state, **rf_params)
rf_CV = ZKTools.CV(df_train=df_train_imp,
Ejemplo n.º 60
0
class NetworkClassifer():
    def __init__(self, features, labels, validation_features,
                 validation_labels):
        self.features = features
        self.feature_labels = [
            'min', 'max', 'mean', 'skew', 'std', 'kurtosis',
            'sum of absolute difference', 'baseline_n', 'baseline_diff',
            'baseline_diff_skew', 'n_pks', 'n_vals', 'av_pk', 'av_val',
            'av pk val range', '1 hz', '5 hz', '10 hz', '15 hz', '20 hz',
            '30 hz', '60 hz', '90 hz'
        ]

        self.labels = np.ravel(labels)
        self.validation_features = validation_features
        self.validation_labels = np.ravel(validation_labels)
        self.impute_and_scale()

    def impute_and_scale(self):
        print('Scaling and imputing training dataset...')
        self.imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
        self.imputer.fit(self.features)
        imputed_features = self.imputer.transform(self.features)
        self.std_scaler = StandardScaler()
        self.std_scaler.fit(imputed_features)
        self.iss_features = self.std_scaler.transform(imputed_features)
        print('Done')

        print(
            'Scaling and imputing validation features using training dataset...'
        )
        imputed_validation_features = self.imputer.transform(
            self.validation_features)
        self.iss_validation_features = self.std_scaler.transform(
            imputed_validation_features)
        print('Done')

    def _cross_validation(self, clf, k_folds=5):
        self.scores = cross_validation.cross_val_score(clf,
                                                       self.iss_features,
                                                       self.labels,
                                                       cv=k_folds,
                                                       n_jobs=5,
                                                       scoring='roc_auc')

    def randomforest_info(self, max_trees=1000, step=40, k_folds=5):
        print('Characterising R_forest. Looping through trees: ')
        self.treedata = np.zeros((max_trees // step, 10))  # integer division so the shape is an int
        for i, n_trees in enumerate(np.arange(0, max_trees, step)):
            if n_trees == 0:
                n_trees = 1

            r_forest = RandomForestClassifier(
                n_estimators=n_trees,
                n_jobs=5,
                max_depth=None,
                min_samples_split=1,
                random_state=0,
            )
            scores = cross_validation.cross_val_score(r_forest,
                                                      self.iss_features,
                                                      self.labels,
                                                      cv=k_folds,
                                                      n_jobs=5)
            r_forest_full = RandomForestClassifier(n_estimators=n_trees,
                                                   n_jobs=5,
                                                   max_depth=None,
                                                   min_samples_split=1,
                                                   random_state=0)
            r_forest_full.fit(self.iss_features, self.labels)
            self.treedata[i, 0] = n_trees
            self.treedata[i, 1] = scores.mean()
            self.treedata[i, 2] = scores.std()
            # now add the test dataset - score
            self.treedata[i, 3] = r_forest_full.score(
                self.iss_validation_features, self.validation_labels)

            r_forest_lda = RandomForestClassifier(n_estimators=n_trees,
                                                  n_jobs=5,
                                                  max_depth=None,
                                                  min_samples_split=1,
                                                  random_state=0)
            r_forest_lda_full = RandomForestClassifier(n_estimators=n_trees,
                                                       n_jobs=5,
                                                       max_depth=None,
                                                       min_samples_split=1,
                                                       random_state=0)
            r_forest_lda_full.fit(self.lda_iss_features, self.labels)
            lda_scores = cross_validation.cross_val_score(
                r_forest_lda,
                self.lda_iss_features,
                self.labels,
                cv=k_folds,
                n_jobs=5)
            self.treedata[i, 4] = lda_scores.mean()
            self.treedata[i, 5] = lda_scores.std()
            self.treedata[i, 6] = r_forest_lda_full.score(
                self.lda_iss_validation_features, self.validation_labels)

            r_forest_pca = RandomForestClassifier(n_estimators=n_trees,
                                                  n_jobs=5,
                                                  max_depth=None,
                                                  min_samples_split=1,
                                                  random_state=0)
            r_forest_pca_full = RandomForestClassifier(n_estimators=n_trees,
                                                       n_jobs=5,
                                                       max_depth=None,
                                                       min_samples_split=1,
                                                       random_state=0)
            r_forest_pca_full.fit(self.pca_iss_features, self.labels)
            pca_scores = cross_validation.cross_val_score(
                r_forest_pca,
                self.pca_iss_features,
                self.labels,
                cv=k_folds,
                n_jobs=5)
            self.treedata[i, 7] = pca_scores.mean()
            self.treedata[i, 8] = pca_scores.std()
            self.treedata[i, 9] = r_forest_pca_full.score(
                self.pca_iss_validation_features, self.validation_labels)

    def pca(self, n_components=6):
        self.pca = PCA(n_components)
        self.pca_iss_features = self.pca.fit_transform(self.iss_features)
        self.pca_iss_validation_features = self.pca.transform(
            self.iss_validation_features)

    def lda(self, n_components=2, pca_reg=True, reg_dimensions=10):
        self.lda = LinearDiscriminantAnalysis(n_components=n_components,
                                              solver='eigen',
                                              shrinkage='auto')
        #self.lda = LDA(n_components)
        if pca_reg:
            self.pca_reg = PCA(reg_dimensions)
            pca_reg_features = self.pca_reg.fit_transform(self.iss_features)
            self.lda_iss_features = self.lda.fit_transform(
                pca_reg_features, self.labels)
            pca_reg_validation_features = self.pca_reg.transform(
                self.iss_validation_features)
            self.lda_iss_validation_features = self.lda.transform(
                pca_reg_validation_features)
        else:
            self.lda_iss_features = self.lda.fit_transform(
                self.iss_features, self.labels)
            self.lda_iss_validation_features = self.lda.transform(
                self.iss_validation_features)

    def lda_run(self, k_folds=5):
        self.r_forest_lda = RandomForestClassifier(n_estimators=2000,
                                                   n_jobs=5,
                                                   max_depth=None,
                                                   min_samples_split=2,
                                                   random_state=7,
                                                   max_leaf_nodes=None,
                                                   min_samples_leaf=2,
                                                   criterion='gini',
                                                   max_features='sqrt',
                                                   class_weight='balanced')

        self.lda_scores = cross_validation.cross_val_score(
            self.r_forest_lda,
            self.lda_iss_features,
            self.labels,
            cv=k_folds,
            n_jobs=5)
        print(
            "Cross validation Random Forest performance LDA: Accuracy: %0.2f (std %0.2f)"
            % (self.lda_scores.mean() * 100, self.lda_scores.std() * 100))
        self.r_forest_lda.fit(self.lda_iss_features, self.labels)
        print(
            str(
                self.r_forest_lda.score(self.lda_iss_validation_features,
                                        self.validation_labels) * 100) +
            'LDA test-set performance')

        y_true = self.validation_labels
        y_pred = self.r_forest_lda.predict(self.lda_iss_validation_features)
        target_names = ['S1', 'S2', 'S3', 'S4']
        report = classification_report(y_true,
                                       y_pred,
                                       target_names=target_names)
        print('Random forest report lda')
        print(report)

        ##### Hacky way to export features, so can optimise RF etc ######
        train_X = pd.DataFrame(self.lda_iss_features)
        train_y = pd.DataFrame(self.labels)
        training = pd.concat([train_X, train_y], axis=1)
        training.to_csv(
            '/Users/Jonathan/Dropbox/Data_sharing_VMJC/training_lda.csv',
            index=False)

        test_X = pd.DataFrame(self.lda_iss_validation_features)
        test_y = pd.DataFrame(self.validation_labels)
        test = pd.concat([test_X, test_y], axis=1)
        test.to_csv('/Users/Jonathan/Dropbox/Data_sharing_VMJC/test_lda.csv',
                    index=False)

        train_X = pd.DataFrame(self.iss_features)
        train_y = pd.DataFrame(self.labels)
        training = pd.concat([train_X, train_y], axis=1)
        training.to_csv(
            '/Users/Jonathan/Dropbox/Data_sharing_VMJC/training.csv',
            index=False)

        test_X = pd.DataFrame(self.iss_validation_features)
        test_y = pd.DataFrame(self.validation_labels)
        test = pd.concat([test_X, test_y], axis=1)
        test.to_csv('/Users/Jonathan/Dropbox/Data_sharing_VMJC/test.csv',
                    index=False)

    def pca_run(self, k_folds=5):
        self.r_forest_pca = RandomForestClassifier(n_estimators=2000,
                                                   n_jobs=5,
                                                   max_depth=None,
                                                   min_samples_split=1,
                                                   random_state=0)
        self.pca_scores = cross_validation.cross_val_score(
            self.r_forest_pca,
            self.pca_iss_features,
            self.labels,
            cv=k_folds,
            n_jobs=5)
        print(
            "Cross validation RF performance PCA: Accuracy: %0.2f (std %0.2f)"
            % (self.pca_scores.mean() * 100, self.pca_scores.std() * 100))

        self.r_forest_pca.fit(self.pca_iss_features, self.labels)
        print(
            str(
                self.r_forest_pca.score(self.pca_iss_validation_features,
                                        self.validation_labels)) +
            'PCA test-set performance ')

    def run(self):

        r_forest = RandomForestClassifier(n_estimators=2000,
                                          n_jobs=5,
                                          max_depth=None,
                                          min_samples_split=1,
                                          random_state=0,
                                          class_weight='balanced')
        self._cross_validation(r_forest)
        print("Cross validation RF performance: Accuracy: %0.2f (std %0.2f)" %
              (self.scores.mean() * 100, self.scores.std() * 100))

        self.r_forest = RandomForestClassifier(n_estimators=2000,
                                               n_jobs=5,
                                               max_depth=None,
                                               min_samples_split=1,
                                               random_state=0,
                                               class_weight='balanced')
        self.r_forest.fit(self.iss_features, self.labels)

        print(
            str(
                self.r_forest.score(self.iss_validation_features,
                                    self.validation_labels)) +
            'randomforest test-set performance')

        y_true = self.validation_labels
        y_pred = self.r_forest.predict(self.iss_validation_features)
        target_names = ['inter-ictal', 'ictal']
        target_names = ['S1', 'S2', 'S3', 'S4']
        t = classification_report(y_true, y_pred, target_names=target_names)
        print('Random forest report:')
        print(t)

        return None
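
# Hedged alternative sketch: the impute-then-scale steps in impute_and_scale() can be
# expressed as one Pipeline; fitting on the training features and calling transform on
# the validation features reuses the training statistics, exactly as the class above
# does (features and validation_features are the constructor arguments).
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer, StandardScaler

impute_scale = Pipeline([
    ('imputer', Imputer(missing_values='NaN', strategy='mean', axis=0)),
    ('scaler', StandardScaler()),
])
iss_features = impute_scale.fit_transform(features)
iss_validation_features = impute_scale.transform(validation_features)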