Example 1
 def make_preprocessing_pandas(self, _df_csv_read_ori, _preprocessing_type , _label):
     """ SKLearn을 사용해서 Pandas를 Proprocessing
         label은 Preprocessing 하면 안됨
     Args:
       params:
         * _preprocessing_type: ['scale', 'minmax_scale', 'robust_scale', 'normalize', 'maxabs_scale']
         * _df_csv_read_ori : pandas dataframe
         * _label
     Returns:
       Preprocessing DataFrame
     """
     if _preprocessing_type is None or _preprocessing_type == 'null':
         logging.info("No Preprocessing")
         result_df = _df_csv_read_ori
     else:
         logging.info("Preprocessing type : {0}".format(_preprocessing_type))
         numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
         for i, v in _df_csv_read_ori.dtypes.iteritems():
             if v in numerics:
                 if i not in _label:
                     #preprocessing_types = ['scale', 'minmax_scale', 'robust_scale', 'normalize', 'maxabs_scale']
                     #_preprocessing_type = ['maxabs_scale']
                     if 'scale' in _preprocessing_type:
                         _df_csv_read_ori[i] = preprocessing.scale(_df_csv_read_ori[i].fillna(0.0))
                     if 'minmax_scale' in _preprocessing_type:
                         _df_csv_read_ori[i] = preprocessing.minmax_scale(_df_csv_read_ori[i].fillna(0.0))
                     if 'robust_scale' in _preprocessing_type:
                         _df_csv_read_ori[i] = preprocessing.robust_scale(_df_csv_read_ori[i].fillna(0.0))
                     if 'normalize' in _preprocessing_type:
                         _df_csv_read_ori[i] = preprocessing.normalize(_df_csv_read_ori[i].fillna(0.0))
                     if 'maxabs_scale' in _preprocessing_type:
                         _df_csv_read_ori[i] = preprocessing.maxabs_scale(_df_csv_read_ori[i].fillna(0.0))
         result_df = _df_csv_read_ori
     return result_df
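A minimal standalone sketch of the same idea (the DataFrame and its column names 'age', 'income', 'label' are hypothetical, not from the original): scale every numeric, non-label column with one of the scikit-learn helpers used above.

import pandas as pd
from sklearn import preprocessing

df = pd.DataFrame({'age': [21, 35, 58], 'income': [30000.0, None, 52000.0], 'label': [0, 1, 0]})
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
for col, dtype in df.dtypes.items():
    if dtype in numerics and col != 'label':
        # fill missing values first, then scale the column to [0, 1]
        df[col] = preprocessing.minmax_scale(df[col].fillna(0.0))
print(df)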
Example 2
def display_network(W, n_col=None, n_row=None, transpose=False, padding=1, image_shape=None):
    """visualizing

    :param W:
    :param transpose:
    :return:
    """
    # scale each one to [-1, 1]
    assert W.ndim == 2
    # TODO: add other normalization behaviour
    W = maxabs_scale(W, axis=1)
    n_basis, n_pixel = W.shape
    if image_shape is None:
        image_shape = int(np.sqrt(n_pixel)), int(np.sqrt(n_pixel))
    assert image_shape[0] * image_shape[1] == n_pixel
    if n_col is None and n_row is None:
        n_col = int(np.ceil(np.sqrt(n_basis)))
        n_row = int(np.ceil(float(n_basis) / n_col))
    cell_height = image_shape[0] + 2 * padding
    cell_width = image_shape[1] + 2 * padding
    total_image = np.ones(shape=(n_row * cell_height, n_col * cell_width),
                          dtype=np.float64)

    for idx, (row_idx, col_idx) in enumerate(product(range(n_row), range(n_col))):
        if idx >= n_basis:
            break

        position_to_plot = (slice(row_idx * cell_height + padding, row_idx * cell_height + padding + image_shape[0]),
                            slice(col_idx * cell_width + padding, col_idx * cell_width + padding + image_shape[1]))
        cell_this = W[idx].reshape(image_shape)
        if transpose:
            cell_this = cell_this.T
        total_image[position_to_plot] = cell_this

    return total_image
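A small usage sketch for the function above (the random data and figure styling are assumptions, not from the original): tile 16 random 8x8 "basis images" and display the result. It assumes numpy, itertools.product and maxabs_scale are available at module level, as the function body expects.

from itertools import product

import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import maxabs_scale

rng = np.random.RandomState(0)
W = rng.randn(16, 64)              # 16 basis vectors, each a flattened 8x8 image
tiled = display_network(W)         # uses the display_network defined above
plt.imshow(tiled, cmap='gray', vmin=-1, vmax=1)
plt.axis('off')
plt.show()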
Example 3
 def sparse(self, X):
     """
     This function performs preprocessing on the features in a way that
     retains their sparsity. A lot of the data is 0, which probably
     means it is missing. Experimental use only.
     """
     return preprocessing.maxabs_scale(X)
Example 4
    def query(self, query, k=None, indices=None, sort=True, return_scores=False):
        models = self.retrieval_models
        weights = maxabs_scale(self.weights)  # scale so the largest weight magnitude is 1
        agg_fn = self.aggregation_fn
        # It's important that all retrieval models return the same number of documents.
        all_scores = [m.query(query, k=k, indices=indices, sort=False, return_scores=True)[1] for m in models]

        if weights is not None:
            all_scores = [weight * scores for weight, scores in zip(all_scores, weights)]

        scores = np.vstack(all_scores)
        if callable(agg_fn):
            aggregated_scores = agg_fn(scores)
        else:
            numpy_fn = getattr(np, agg_fn)
            aggregated_scores = numpy_fn(scores, axis=0)

        # combined = aggregate_dicts(combined, agg_fn=agg_fn, sort=True)

        # only cut-off at k if this is the final (sorted) output
        ind = argtopk(aggregated_scores, k) if sort else np.arange(aggregated_scores.shape[0])
        if return_scores:
            return ind, aggregated_scores[ind]
        else:
            return ind
Example 5
    def predict(self, X, do_scale=False):
        if do_scale:
            X = preprocessing.maxabs_scale(X)
        pred = self.active(X) 

        if self.__output_01:
            for x in numpy.nditer(pred, op_flags=['readwrite']):
                x[...] = 1. if x[...]>=.5 else 0.
        return pred
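The nditer loop above writes the 0/1 threshold element by element; a vectorized sketch of the same step (a suggestion, not the author's code) could be:

# equivalent thresholding without the explicit element loop
pred = numpy.where(pred >= .5, 1., 0.)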
Example 6
 def __init__(self, filePath="/home/francisco/voz",ldis=0.15,dthres=60):
     '''
     Constructor
     '''
     self.ldis=ldis  # length discrimination rate; 15% works fine
     self.distThreshold=dthres #distance threshold 
     basepath=filePath
     self.mfccs={}
     mfccAll=[]
     labels=os.listdir(basepath)
     for i,nn in enumerate(labels):
         n=nn.split(".")[0]#take away extension .wav
         (rate1,sig1)=wav.read(basepath+'/{}.wav'.format(n))
         sig1=pp.maxabs_scale(sig1) #Normalize amplitude to +-1
         mfcc_feat1 = mfcc(sig1,rate1)
         self.mfccs[n]=mfcc_feat1
Example 7
    def fit(self, X, y, do_scale=False, batch_size=512):
        if do_scale:
            X = preprocessing.maxabs_scale(X)
        batch_num = (len(X) + batch_size - 1) / batch_size

        last_loss = None
        epoch = 0
        while 1:
            st = time.time()

            epoch += 1
            epoch_cost = 0
            for batch in range(batch_num):
                beg = batch * batch_size
                end = beg + batch_size
                # try to cast label.
                label = numpy.array(y[beg:end])#.astype(numpy.float32)
                if len(label.shape) == 1:
                    label.shape = (label.shape[0], 1)
                epoch_cost += self.train(X[beg:end], label)

            dt = time.time() - st

            loss = epoch_cost / batch_num
            diff_loss = 0
            if last_loss is not None:
                diff_loss = last_loss - loss
            print >> sys.stderr, 'Epoch[%d] loss : %f (diff_loss:%f, diff_time:%.3f(s))' % (
                    epoch, loss, diff_loss, dt)
            if last_loss is not None:
                if last_loss - loss < 1e-5:
                    print >> sys.stderr, 'Early stop'
                    break
                '''
                if self.__learning_rate>=1e-3 and last_loss - loss < 1e-3:
                    self.__learning_rate = self.__learning_rate * 0.5
                    print >> sys.stderr, 'Change learning rate : %f (%f)' % (self.__learning_rate,
                            last_loss - loss)
                '''
            last_loss = loss
Example 8
def extract_features(signal,signal_label,epoch,fs,is_emg=False):
    
    
    fr,p = calculate_psd_and_f(signal,fs,epoch)
    epoch = epoch*fs
    
    #Data from EEG
    
    if (is_emg == False):
        max_fr = 50
        
        ## Calculate the total power, total power per epoch, and extract the relevant frequencies.
        ## IMPORTANT NOTE: These are not the ACTUAL power values, they are standardized to account
        ## for individual variability, and are thus relative.
        freq = fr[(fr>=0.5) & (fr <=max_fr)]
        sum_power = p[:,(fr>=0.5) & (fr <=max_fr)]
        max_power = np.max(sum_power,axis=1)
        min_power = np.min(sum_power,axis=1)
        range_power = max_power - min_power
        std_power = ((sum_power.T-min_power)/range_power).T
           
        ## Calculate the relative power at the different brain waves:
        delta = np.sum(std_power[:,(freq>=0.5) & (freq <=4)],axis=1)
        
         
        thetacon = np.sum(std_power[:,(freq>=4) & (freq <=12)],axis=1)
        theta1 = np.sum(std_power[:,(freq>=6) & (freq <=9)],axis=1)
        theta2 = np.sum(std_power[:,(freq>=5.5) & (freq <=8.5)],axis=1)
        theta3 = np.sum(std_power[:,(freq>=7) & (freq <=10)],axis=1)
        
        beta = np.sum(std_power[:,(freq>=20) & (freq <=40)],axis=1)
                
        alpha = np.sum(std_power[:,(freq>=8) & (freq <=13)],axis=1)
        sigma = np.sum(std_power[:,(freq>=11) & (freq <=15)],axis=1)
        spindle = np.sum(std_power[:,(freq>=12) & (freq <=14)],axis=1)
        gamma= np.sum(std_power[:,(freq>=35) & (freq <=45)],axis=1)
        
        temp1= np.sum(std_power[:,(freq>=0.5) & (freq <=20)],axis=1)
        temp2= np.sum(std_power[:,(freq>=0.5) & (freq <=50)],axis=1)
        
        temp3= np.sum(std_power[:,(freq>=0.5) & (freq <=40)],axis=1)
        temp4= np.sum(std_power[:,(freq>=11) & (freq <=16)],axis=1)
        
        
        EEGrel1 = thetacon/delta
        EEGrel2 = temp1/temp2
        EEGrel3 = temp4/temp3
        
        hann = np.hanning(12)
        
        spindelhan1 = np.convolve(hann, EEGrel3, 'same')
        
        spindelhan = np.transpose(spindelhan1)
        
        ## Calculate the 90% spectral edge:
        spectral90 = 0.9*(np.sum(sum_power,axis=1))
        s_edge = np.cumsum(sum_power,axis=1)
        l = [[n for n,j in enumerate(s_edge[row_ind,:]) if j>=spectral90[row_ind]][0] for row_ind in range(s_edge.shape[0])]
        spectral_edge = np.take(fr,l) # spectral edge 90%, the frequency below which power sums to 90% of the total power
        
         ## Calculate the 50% spectral mean:
        spectral50 = 0.5*(np.sum(sum_power,axis=1))
        s_mean = np.cumsum(sum_power,axis=1)
        l = [[n for n,j in enumerate(s_mean[row_ind,:]) if j>=spectral50[row_ind]][0] for row_ind in range(s_mean.shape[0])]
        spectral_mean50 = np.take(fr,l) 
                
    else:
        #for EMG
        max_fr = 100
        
        ## Calculate the total power, total power per epoch, and extract the relevant frequencies: 
        freq = fr[(fr>=0.5) & (fr <=max_fr)]
        sum_power = p[:,(fr>=0.5) & (fr <=max_fr)]
        max_power = np.max(sum_power,axis=1)
        min_power = np.min(sum_power,axis=1)
        range_power = max_power - min_power
        std_power = ((sum_power.T-min_power)/range_power).T
    
    
    ## Calculate the Root Mean Square of the signal
    signal = signal[0:p.shape[0]*epoch]
    s = np.reshape(signal,(p.shape[0],epoch))
    rms = np.sqrt(np.mean((s)**2,axis=1)) #root mean square
    ## Calculate amplitude and spectral variation:
    amplitude = np.mean(np.abs(s),axis=1)
    amplitude_m=np.median(np.abs(s),axis=1)
    signal_var = (np.sum((np.abs(s).T - np.mean(np.abs(s),axis=1)).T**2,axis=1)/(len(s[0,:])-1)) # The variation
    ## Calculate skewness and kurtosis
    m3 = np.mean((s-np.mean(s))**3,axis=1) #3rd moment
    m2 = np.mean((s-np.mean(s))**2,axis=1) #2nd moment
    m4 = np.mean((s-np.mean(s))**4,axis=1) #4th moment
    skew = m3/(m2**(3/2)) # skewness of the signal, which is a measure of symmetry
    kurt = m4/(m2**2) #kurtosis of the signal, which is a measure of tail magnitude
    
    ## Calculate more time features
    
    signalzero=preprocessing.maxabs_scale(s,axis=1)
    zerocross = (np.diff(np.sign(signalzero)) != 0).sum(axis=1)
        
    maxs = np.amax(s,axis=1)
    mins = np.amin(s,axis=1)
    
    peaktopeak= maxs - mins
    
    arv1 = ((np.abs(s)))

    arv = np.sum(arv1,axis=1)

    arv = arv / len(s)
                 
    # Energy and amplitude
    
            
    deltacomp = butter_bandpass_filter(s, 0.5, 4, fs, 5)
    #calculate energy like this
    deltaenergy = sum([x*2 for x in np.matrix.transpose(deltacomp)])
    deltaamp = np.mean(np.abs(deltacomp),axis=1)
         
        
    thetacomp = butter_bandpass_filter(s, 4, 12, fs, 5)
    #calculate energy like this
    thetaenergy = sum([x*2 for x in np.matrix.transpose(thetacomp)])
    thetaamp = np.mean(np.abs(thetacomp),axis=1)
                 
       
    theta1comp = butter_bandpass_filter(s, 6, 9, fs, 5)
    #calculate energy like this
    theta1energy = sum([x*2 for x in np.matrix.transpose(theta1comp)])
    theta1amp = np.mean(np.abs(theta1comp),axis=1)  
    
    theta2comp = butter_bandpass_filter(s, 5.5, 8.5, fs, 5)
    #calculate energy like this
    theta2energy = sum([x*2 for x in np.matrix.transpose(theta2comp)])
    theta2amp = np.mean(np.abs(theta2comp),axis=1)
                 
    theta3comp = butter_bandpass_filter(s, 7, 10, fs, 5)
    #calculate energy like this
    theta3energy = sum([x*2 for x in np.matrix.transpose(theta3comp)])
    theta3amp = np.mean(np.abs(theta3comp),axis=1)
                 
    betacomp = butter_bandpass_filter(s, 20, 40, fs, 5)
    #calculate energy like this
    betaenergy = sum([x*2 for x in np.matrix.transpose(betacomp)])
    betaamp = np.mean(np.abs(betacomp),axis=1)
    
    alfacomp = butter_bandpass_filter(s, 8, 13, fs, 5)
    #calculate energy like this
    
    alfaenergy = sum([x*2 for x in np.matrix.transpose(alfacomp)])
    
    alfaamp = np.mean(np.abs(alfacomp),axis=1)
                 
    sigmacomp = butter_bandpass_filter(s, 11, 15, fs, 5)
    #calculate energy like this
    sigmaenergy = sum([x*2 for x in np.matrix.transpose(sigmacomp)])
    sigmaamp = np.mean(np.abs(sigmacomp),axis=1)
                 
    spindlecomp = butter_bandpass_filter(s, 12, 14, fs, 5)
    #calculate energy like this
    spindleenergy = sum([x*2 for x in np.matrix.transpose(spindlecomp)])
    spindleamp = np.mean(np.abs(spindlecomp),axis=1)
    
    gammacomp = butter_bandpass_filter(s, 35, 45, fs, 5)
    #calculate energy like this
    gammaenergy = sum([x*2 for x in np.matrix.transpose(gammacomp)])
    gammaamp = np.mean(np.abs(gammacomp),axis=1)
       
    ## Calculate the spectral mean and the spectral entropy (essentially the spectral power distribution):
    spectral_mean = np.mean(std_power,axis=1)
    spectral_entropy = -(np.sum((std_power+0.01)*np.log(std_power+0.01),axis=1))/(np.log(len(std_power[0,:])))
    
     
    ## Create a matrix of all of the features per each epoch of the signal
    corr_signal = signal[:len(signal)-(len(signal)%epoch)]
    epochs = np.arange(len(corr_signal)/epoch)+1
    
    if (is_emg == False):
        feature_matrix = np.column_stack((epochs,delta,deltaenergy,deltaamp, thetacon, thetaenergy, thetaamp, theta1, theta1energy,
                                          theta1amp, theta2, theta2energy, theta2amp, theta3, theta3energy, theta3amp, beta, 
                                          betaenergy, betaamp, alpha, alfaenergy, alfaamp, sigma, sigmaenergy, sigmaamp,
                                          spindle, spindleenergy, spindleamp, gamma, gammaenergy, gammaamp, EEGrel1, EEGrel2, 
                                          spindelhan, spectral_edge, spectral_mean50, zerocross, maxs, peaktopeak, arv,
                                          rms, amplitude, amplitude_m, signal_var, skew, kurt, spectral_mean, spectral_entropy))
                 
        features = (['epochs','delta','deltaenergy','deltaamp','thetacon','thetaenergy','thetaamp', 'theta1','theta1energy',
                     'theta1amp','theta2', 'theta2energy','theta2amp', 'theta3', 'theta3energy','theta3amp', 'beta', 
                     'betaenergy','betaamp','alpha', 'alfaenergy', 'alfaamp', 'sigma', 'sigmaenergy', 'sigmaamp', 
                     'spindle', 'spindlenergy', 'spindleamp', 'gamma', 'gammaenergy', 'gammaamp', 'EEGrel1', 'EEGrel2', 
                     'spindelhan', 'spectral_edge', 'spectral_mean50', 'zerocross', 'maxs' , 'peaktopeak', 'arv',
                     'rms', 'amplitude', 'amplitude_m', 'signal_var', 'skew', 'kurt', 'spectral_mean', 'spectral_entropy'])
    else:
        feature_matrix = np.column_stack((epochs,amplitude,signal_var,skew,kurt,rms,
                                     spectral_mean,spectral_entropy,amplitude_m))
        
        features = (['epochs','amplitude','signal_var','skew',
                          'kurt','rms','spectral_mean','spectral_entropy','amplitude_m'])
    feature_labels = []
    
    for i in range(len(features)):
        feature_labels.append('%s_%s' % (signal_label,features[i]))
    return feature_matrix,feature_labels
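The EEG branch above standardizes each epoch's power spectrum relative to its own minimum and maximum, so the band powers are relative rather than absolute. A standalone sketch of just that step, using a hypothetical 2-epoch power matrix:

import numpy as np

p = np.array([[1.0, 4.0, 9.0],
              [2.0, 2.0, 10.0]])   # rows = epochs, columns = frequency bins (toy values)
max_power = np.max(p, axis=1)
min_power = np.min(p, axis=1)
std_power = ((p.T - min_power) / (max_power - min_power)).T
print(std_power)                   # each row now spans exactly [0, 1]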
Example 9
from sklearn.preprocessing import scale
import sklearn.preprocessing as pp
from sklearn.cluster import KMeans
import cv2
import path
import os

basepath="/home/francisco/voz"
kMns=KMeans(n_clusters=25)
mfccs={}
mfccAll=[]
labels=os.listdir(basepath)
for i,nn in enumerate(labels):
    n=nn.split(".")[0]
    (rate1,sig1)=wav.read(basepath+'/{}.wav'.format(n))
    sig1=pp.maxabs_scale(sig1)
    mfcc_feat1 = mfcc(sig1,rate1)
    #mfcc_feat1=scale(mfcc_feat1)  # Standardize?
    #for f in mfcc_feat1:
    #    mfccAll.append(f)
    mfccs[n]=mfcc_feat1
#mfccAll=np.array(mfccAll)
#print "mfccAll",mfccAll.shape
#mfccAll=scale(mfccAll)  # whitening / standardize
#kMns.fit(mfccAll)
#print kMns.predict(mfccs["paco_no_001"])
#print kMns.predict(mfccs["paco_uno_001"])

os.system("sox -r 16000 -t alsa default recording.wav silence 1 0.1 1% 1 1.5 1%")
(rate2,sig2) = wav.read("recording.wav")
#sig2=pp.maxabs_scale(sig2)
Example 10
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.neighbors import KNeighborsClassifier as knn
from sklearn import preprocessing
import numpy as np

n = 2  # number of training samples
m = 5  # sample dimensionality
X = np.random.randn(n, m) * 10  # generate random independent variables
# w  = np.array([1,2,3,4,5,0.01,0.1,0.05,0.0001,6])
# # w = np.arange(1,11,1).T                     # set the linear coefficients
# Y  = np.dot(X,w) + np.random.randn(n)/10       # weight by the linear coefficients and add noise
# Y[Y>0] = 1
# Y[Y<=0] = 0
# X1 = preprocessing.normalize(X,norm='l2')
X2 = preprocessing.maxabs_scale(X, axis=1)
# print(X,'\n',X2)
X3 = preprocessing.normalize(X, axis=0)
print(X, '\n', X3)

# n = 200           # number of training samples
# m = 10            # sample dimensionality
# X = np.random.randn(n,m)*10                   # generate random independent variables
# w = np.array([1,2,3,4,5,0.01,0.1,0.05,0.0001,6])
# # w = np.arange(1,11,1).T                     # set the linear coefficients
# Y = np.dot(X,w) + np.random.randn(n)/10       # weight by the linear coefficients and add noise
# Y[Y>0] = 1
# Y[Y<=0] = 0
# Y1 = np.dot(X,w) + np.random.randn(n)/10

# model = linear_model.LinearRegression()
Example 11
        pass  # return something or terminate this function here
    else:
        return df[st_index:st_index + win_size]


# ================================= Main Function ==========================
if __name__ == '__main__':
    file_path = r'D:\Project files\input_ml.csv'
    df = pd.read_csv(file_path, index_col='Date')
    df = df.fillna(0)  # Replace NaN value with 0
    df['Signal'] = df['Signal'].astype(int)  # convert to int object

    # test gen ml model with python
    y = df['Signal']
    X = df.drop('Signal', axis=1)
    X = preprocessing.maxabs_scale(X)  #Scale each feature to the [-1, 1] range
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=0,
                                                        shuffle=False)

    clf = LogisticRegression()
    clf.fit(X_train, y_train)
    confidence = clf.score(X_test, y_test)  #Test model with test data

    # set role and normalize
    #pipeline = make_pipeline(preprocessing.Normalizer(),
    #                         LogisticRegression())

    #df = df.set_index(pd.to_datetime(df['Date'], format='%d-%m-%Y'))
Example 12
    def read(self, filename):
        self.__X = []
        self.__Y = []
        self.__info = []
        first_row = True

        fd = file(filename)
        progress = pydev.FileProgress(fd, filename)
        raw_X = []
        for row in pydev.foreach_row(fd, seperator=self.__seperator):
            progress.check_progress()

            # whether to ignore first row.
            if first_row and self.__ignore_first_row:
                first_row = False
                continue

            # check column count.
            if self.__expect_column_count < 0:
                self.__expect_column_count = len(row)
                if self.__target_column < 0:
                    self.__target_column = self.__expect_column_count - 1
                print >> sys.stderr, 'columns set to %d, target:%d' % (self.__expect_column_count, self.__target_column)
            elif len(row) != self.__expect_column_count:
                    continue

            # strip each columns.
            row = map(lambda x:x.strip(), row)

            # get x dict.
            id_value = []
            v_size = 0
            ignored_info = []
            for rid, value in enumerate(row):
                # continue if target columns.
                if rid == self.__target_column:
                    continue
                # continue if filter columns.
                if rid in self.__ignore_columns:
                    ignored_info.append(value)
                    continue

                # dense and id-value-sparse
                if self.__row_mode == DataReader.DenseValue:
                    cid = rid
                elif self.__row_mode == DataReader.IVSparse:
                    cid, value = value.split(':')
                    cid = int(cid)

                if cid in self.__concrete_ids:
                    # one-hot representation for key.
                    # feature = id-value : 1
                    fid, value = self.__feature_trans.allocate_id('#%03d:%s' % (cid, value)), 1
                else:
                    # feature = id : value
                    fid, value = self.__feature_trans.allocate_id('#%03d' % (cid)), float(value)

                id_value.append( (fid, value) )
                if v_size < fid+1:
                    v_size = fid+1 

            x = numpy.ndarray(shape=(v_size,))
            x.fill(0)
            for fid, value in id_value:
                x[fid] = float(value)

            raw_X.append(x)

            # get Y
            if self.__concrete_target:
                row[self.__target_column] = self.__target_trans.allocate_id( row[self.__target_column] )
            y = row[self.__target_column]
            self.__Y.append(y)

            self.__info.append( self.__seperator.join(ignored_info) )

        progress.end_progress()
        
        # resize for each X.
        x_size = self.__feature_trans.size()
        for x in raw_X:
            new_x = numpy.ndarray(shape=(x_size,), dtype=numpy.float32)
            new_x.fill(0)
            new_x[:x.shape[0]] = x
            self.__X.append( new_x )

        # resize Y if concrete label.
        if self.__target_one_hot:
            raw_Y = self.__Y
            self.__Y = []
            y_size = self.__target_trans.size()
            for y in raw_Y:
                new_y = numpy.ndarray(shape=(y_size,), dtype=numpy.float32)
                new_y.fill(0)
                new_y[ int(y) ] = 1.
                self.__Y.append( new_y )

        # transform X to numpy.ndarray
        self.__X = numpy.array(self.__X)

        # preprocessing.
        if self.__maxabs_scale:
            print >> sys.stderr, 'Do maxabs_scale'
            self.__X = preprocessing.maxabs_scale(self.__X)

        # make Y as ndarray
        self.__Y = numpy.array(self.__Y).astype(numpy.float32)

        #self.__feature_trans.debug()
        #self.__target_trans.debug()

        print >> sys.stderr, 'Data load [ %d(records) x %d(features) ]' % (len(self.__X), len(self.__X[0]))
Example 13
def normalize_features(features):
    from sklearn.preprocessing import maxabs_scale
    return maxabs_scale(features).reshape(1, -1)[0]
Example 14
 def recognize(self, fileName):
     (rate2, sig2) = wav.read(fileName)
     sig2 = pp.maxabs_scale(sig2)  #Normalize amplitude to +-1
     mfcc_feat2 = mfcc(sig2, rate2)
     return self.get1NN(mfcc_feat2)
Example 15
df = pd.DataFrame(data = temp, columns = ['Time', 'Data']) #data frame (using pandas)

########## preprocess data 

print "Preprocessing data..."
if value_type in {"accelerometer", "magnetometer"}:
    df2 = df[df['Data']!=0]

elif value_type == "temperature":
    df2 = df[df['Data']<70]

elif value_type == "humidity":
    c = Counter(df.Data)
    value_to_remove = c.most_common(1)[0][0] #value that appears the most frequently
    df2 = df[df['Data']!= value_to_remove]
    maxabs_scale(df2.Data, copy = False)
else:
    print "invalid value type"

print "Data Preprocessed"

########## initialize for featurizing

time_window_start = time_reference 
feature_list = []
# number of data points
n = df2.shape[0]
# counter, to determine how many data points from the file have been processed (so that we know if we should stop or not)
count = 0

######### create feature for each time window
Example 16
def generateDataset(target, restrictions, input_labels, labels_to_categorize,
                    table, testset_size, label):
    global X_train, X_test, y_train, y_test, input_vectors, input_pieces, targets, scalename, currentName, currentPrediction, currentSQL, currentAUC, currentSensitivity, currentSpecificity, currentDatasetSize, currentTrainingsetSize, currentTestingsetSize, currentScaling, currentCrossvalidation, lastid, currentAUCData, pickle
    currentPrediction = label

    #This part is somewhat research-specific and needs to be altered as different models are trained.

    #Construct the SQL query:
    SQL = 'SELECT ' + target[0] + ', ' + ",".join(map(
        str, input_labels)) + ' FROM ' + table + ' WHERE ' + target[
            1] + ' IS NOT NULL AND ' + ' IS NOT NULL AND '.join(
                map(str, input_labels)) + " IS NOT NULL AND " + restrictions

    #Set SQL string as global current and fetch the data
    currentSQL = SQL
    cur.execute(SQL)
    dataset = cur.fetchall()

    #Create dictionary for categorical arrays:
    categorical_input_arrays = {}
    for label in categorical_inputs:
        categorical_input_arrays[label] = []

    #Use the first item of each SQL row as the target; append the rest either to an array to be processed or to be used directly
    for i in dataset:

        # add first column to targets
        targets.append(i[0])

        #add categorical columns to dictionary
        for j in xrange(0, len(categorical_inputs)):
            categorical_input_arrays[categorical_inputs[j]].append(i[j + 1])

        #add rest i.e. numericals and booleans to input_vectors
        input_vectors.append(i[len(categorical_inputs) + 1:])

    #Construct label arrays for human-readable decision trees:
    input_pieces.extend(input_labels[len(categorical_inputs):])

    #loop through categoricals, get distinct labels from SQL, fit and use the label encoder and add the result to input_vectors
    for label in list(categorical_input_arrays):
        cur.execute('SELECT DISTINCT ' + label + ' FROM ' + table + ' ')
        label_set = []
        for l in cur.fetchall():
            label_set.append(l[0])

        #fit labelencoder, construct input_pieces, append encoded vector to input vectors:
        le.fit(label_set)
        for cl in le.classes_:
            input_pieces.extend([label + ': ' + cl])
        to_append = le.transform(categorical_input_arrays[label])
        for k in range(0, len(to_append)):
            input_vectors[k] = np.hstack(
                (np.array(input_vectors[k]), np.array(to_append[k])))

    #Constructing the main sets: training features and targets, testing features and targets.
    input_vectors = np.array(input_vectors)

    #Scale the input vectors:
    if scalename == 'scale':
        input_vectors = preprocessing.scale(input_vectors)
    elif scalename == 'maxabs':
        input_vectors = preprocessing.maxabs_scale(input_vectors)

    targets = np.array(targets)
    X_train, X_test, y_train, y_test = train_test_split(input_vectors,
                                                        targets,
                                                        test_size=testset_size,
                                                        random_state=42,
                                                        stratify=targets)

    dataset_json = {}
    dataset_json['training_input'] = X_train.tolist()
    dataset_json['training_output'] = y_train.tolist()
    dataset_json['testing_input'] = X_test.tolist()
    dataset_json['testing_output'] = y_test.tolist()

    json_dump = json.dumps(dataset_json)
    json_hash = hashlib.sha224(json_dump).hexdigest()

    #check if identical dataset is already stored:
    cur.execute("SELECT id FROM datasets WHERE hash like '" + json_hash + "'")
    res = cur.fetchall()
    store_dataset = False

    try:
        dataset_id = res[0][0]
    except:
        store_dataset = True
        #if no identical dataset, select max id and add one to that:
        q = 'SELECT max(id) FROM datasets'
        cur.execute(q)
        res2 = cur.fetchall()
        try:
            dataset_id = res2[0][0] + 1
        except:
            dataset_id = 1

    if (store_dataset):
        q = 'INSERT INTO datasets (id, hash, dataset) VALUES (%s, %s, %s)'
        cur.execute(q, (dataset_id, json_hash, json_dump))
        conn.commit()

    if (printing):
        f.printDatasetInfo(label, len(dataset), 0.33, len(y_train),
                           len(y_test), scalename, SQL)

    #update the current dataset sizes
    currentTrainingsetSize, currentTestingsetSize = len(y_train), len(y_test)
Example 17
import matplotlib.pylab as plt
import pandas as pd
import numpy as np

# Package that provides preprocessing helpers
# In R, scale = z-score
from sklearn.preprocessing import scale, robust_scale, minmax_scale, maxabs_scale

print((np.arange(10, dtype=np.float) - 3))
x = (np.arange(10, dtype=np.float) - 3).reshape(-1, 1)  # rows and columns
print(x)
df = pd.DataFrame(np.hstack(
    [x, scale(x),
     robust_scale(x),
     minmax_scale(x),
     maxabs_scale(x)]),
                  columns=[
                      'x', 'scale(x)', 'robust_scale(x)', 'minmax_scale(x)',
                      'maxabs_scale(x)'
                  ])

df

# The shape of the distribution is unchanged
import seaborn as sns
from sklearn.datasets import load_iris

iris = load_iris()

data1 = iris.data
data2 = scale(iris.data)
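A short sketch of the point made in the comment above (the plotting choices are an assumption): scaling changes the range of a feature but not the shape of its distribution.

import matplotlib.pyplot as plt
from sklearn.preprocessing import maxabs_scale

plt.hist(data1[:, 0], bins=20, alpha=0.5, label='sepal length (raw)')
plt.hist(maxabs_scale(data1)[:, 0], bins=20, alpha=0.5, label='sepal length (maxabs_scale)')
plt.legend()
plt.show()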
Example 18
    def forest(self, filename):
        self.logger.info('Starting random forest analysis.')
        df = RandomForest.get_matrix(self)

        self.logger.info('Setting up variables.')
        x = np.array(df.drop([self.variable], axis=1))
        x = preprocessing.maxabs_scale(x)
        y = np.array(df[self.variable])
        c = Counter(y)
        c = dict(c)

        sample_count = 0
        for _, value in c.items():
            sample_count += value

        print(f"There appear to be a total of {sample_count} samples.")
        for category, value in c.items():
            print(" %s samples labeled at %s, or %2.1f%s of the total"
                  % (value, category, value/sample_count*100, "%"))

        # Set up the different classifiers. A decision tree and a 'forest' of trees are obvious choices.
        # Also included are two types of boosting.
        dt = DecisionTreeClassifier()
        rf = RandomForestClassifier(n_estimators=100, max_features="auto", random_state=33)
        ab = AdaBoostClassifier(n_estimators=100, random_state=33)
        gb = GradientBoostingClassifier(n_estimators=100, random_state=33)

        # We will also test the two main types of Cross Validation
        sf = StratifiedKFold(n_splits=10, random_state=33, shuffle=True)
        tts = train_test_split(x, y, test_size=.25, shuffle=True)

        # Decision Trees, TTS and then combined results of samples folds
        self.logger.info("Running Decision Tree Analysis.")
        x_train, x_test, y_train, y_test = tts
        dt.fit(x_train, y_train)
        y_predict_tts = dt.predict(x_test)
        print("Decision Tree:\n\tTrain/Test Split Accuracy: %2.1f%s"
              % (accuracy_score(y_test, y_predict_tts) * 100, "%"))

        i = 0
        score = 0
        for train, test in sf.split(x, y):
            dt.fit(x[train], y[train])
            y_predict = dt.predict(x[test])
            score += accuracy_score(y[test], y_predict)
            i += 1
        print("\tSample Folds Accuracy: %2.1f%s" % (score / i * 100, "%"))

        self.logger.info("Running Random Forest Analysis")
        # Random Forest, TTS and then combined results of samples folds
        # We don't need to generate the train/test data again.
        rf.fit(x_train, y_train)
        y_predict_tts = rf.predict(x_test)
        print("Random Forest:\n\tTrain/Test Split Accuracy: %2.1f%s"
              % (accuracy_score(y_test, y_predict_tts) * 100, "%"))

        scores = cross_val_score(rf, x, y, cv=10)
        print("\t10X (Stratified)KFold Accuracy: %0.2f%s (+/- %0.2f)" % (scores.mean() * 100, "%", scores.std() * 200))

        # AdaBoost, TTS and then combined results of samples folds
        self.logger.info("Running AdaBoost Analysis")
        ab.fit(x_train, y_train)
        y_predict_tts = ab.predict(x_test)
        print("AdaBoost:\n\tTrain/Test Split Accuracy: %2.1f%s" % (accuracy_score(y_test, y_predict_tts) * 100, "%"))

        i = 0
        score = 0
        for train, test in sf.split(x, y):
            ab.fit(x[train], y[train])
            y_predict = ab.predict(x[test])
            score += accuracy_score(y[test], y_predict)
            i += 1
        print("\tSample Folds Accuracy: %2.1f%s" % (score / i * 100, "%"))

        # Gradient Boost, TTS and then combined results of samples folds
        self.logger.info("Running Gradient Boost Analysis")
        ab.fit(x_train, y_train)
        y_predict_tts = ab.predict(x_test)
        print("Gradient Boost:\n\tTrain/Test Split Accuracy: %2.1f%s"
              % (accuracy_score(y_test, y_predict_tts) * 100, "%"))
        i = 0
        score = 0
        for train, test in sf.split(x, y):
            gb.fit(x[train], y[train])
            y_predict = gb.predict(x[test])
            score += accuracy_score(y[test], y_predict)
            i += 1
        print("\tSample Folds Accuracy: %2.1f%s" % (score / i * 100, "%\n"))

        # Here is the output if features in the RF classifier. Just for vanilla RF.
        # Screen output is limited to the top 100 features.
        importance = rf.feature_importances_
        indices = np.argsort(importance)[::-1]
        features = df.drop([self.variable], axis=1).columns[indices]
        standard_deviations = np.std([tree.feature_importances_ for tree in rf.estimators_], axis=0)
        print("%s informative features out of %s total..." % (np.count_nonzero(importance), importance.shape[0]))

        df_complete_list = pd.DataFrame(zip(features, importance[indices], standard_deviations[indices]),
                                        columns=('features', 'importance', 'std'))
        output_file = self.outdir+'/'+filename
        df_complete_list.to_csv(path_or_buf=output_file, sep=',')

        if np.count_nonzero(importance) > 100:
            features_to_print = 100
            print("Showing the top 100! Full list can be found in csv file.")
        else:
            features_to_print = np.count_nonzero(importance)
            print("Showing them all. Also writing to csv file.")
        for f in range(features_to_print):
            print("%d. feature %s (%f)" % (
                f + 1, features[f], importance[indices[f]]))

        plt.figure()
        plt.title(f"Random Forest Feature Importance: {self.variable}", fontsize=12)
        plt.bar(range(features_to_print), importance[indices][:features_to_print], color="r",
                yerr=standard_deviations[indices][:features_to_print])
        plt.xticks(range(features_to_print), features, rotation=90, fontsize=3)
        plt.xlim([-1, features_to_print])
        # plt.show()
        plt.savefig(self.outdir+"/"+self.variable+'_rfFeatures.pdf', format='pdf', dpi=150)
Example 19
def maxabsScale(data):
    return maxabs_scale(data)
Example 20
from pyriemann.estimation import Covariances

from mne import Epochs, pick_types, find_events
from mne.channels import read_layout
from mne.io import concatenate_raws, read_raw_edf
from mne.datasets import eegbci
from mne.decoding import CSP
from datetime import datetime

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import scale, robust_scale, minmax_scale, maxabs_scale

import pickle

with open('C:/Users/dk/PycharmProjects/giga_cnn/convert/smt100_ica_2.pkl', 'rb') as f:
    x_data = pickle.load(f)


scaler = StandardScaler()
i=0

raw_data_train = np.zeros_like(x_data)
for i in range(x_data.shape[0]):
    raw_fit = maxabs_scale(x_data[i, :, :])
    raw_data_train[i,:,:] = raw_fit[:,:]
    print(i)


with open('C:/Users/dk/PycharmProjects/giga_cnn/convert/smt100_ica_maxabs_2.pkl', 'wb') as f:
    pickle.dump(raw_data_train, f)
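The loop above applies maxabs_scale to each trial slice of the 3-D array independently. A more compact sketch of the same per-trial scaling (a suggestion, not the author's code; note it returns float64 rather than reusing x_data's dtype):

raw_data_train = np.stack([maxabs_scale(trial) for trial in x_data])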
Example 21
    os.makedirs(training_dataset)

data = pd.read_csv('bd2019-weather-prediction-training-20190608.csv')

print(data.isnull().sum())
data.fillna("00,", inplace=True)

data['date'] = pd.to_datetime(data['date'])
data = data[~data['rain20'].isin([999990])]
data = data[~data['rain08'].isin([999990])]
data = data[~data['wind_speed'].isin([999999])]
data = data[~data['wind_direction'].isin([999999])]
data = data[~data['visibility'].isin([999999])]
data = data[~data['temperature'].isin([999999])]
data = data[~data['humidity'].isin([999999])]
data['cloud'] = preprocessing.maxabs_scale(data['cloud'])
data['wind_direction'].replace(999001, 0, inplace=True)
data['wind_direction'].replace(999002, 22.5, inplace=True)
data['wind_direction'].replace(999003, 45, inplace=True)
data['wind_direction'].replace(999004, 67.5, inplace=True)
data['wind_direction'].replace(999005, 90, inplace=True)
data['wind_direction'].replace(999006, 112.5, inplace=True)
data['wind_direction'].replace(999007, 135, inplace=True)
data['wind_direction'].replace(999008, 157.5, inplace=True)
data['wind_direction'].replace(999009, 180, inplace=True)
data['wind_direction'].replace(999010, 202.5, inplace=True)
data['wind_direction'].replace(999011, 225, inplace=True)
data['wind_direction'].replace(999012, 247.5, inplace=True)
data['wind_direction'].replace(999013, 270, inplace=True)
data['wind_direction'].replace(999014, 292.5, inplace=True)
data['wind_direction'].replace(999015, 315, inplace=True)
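The sentinel wind-direction codes above step through compass directions in 22.5-degree increments. A hedged rewrite of that run of replace calls using a single mapping, which should be equivalent for these codes:

direction_map = {999001 + i: 22.5 * i for i in range(15)}   # 999001 -> 0.0 ... 999015 -> 315.0
data['wind_direction'] = data['wind_direction'].replace(direction_map)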
Example 22
def run_pca(expression, annotation, powerlaw):
    tissue_data, description_data = read_annotation(annotation)

    # Load Expression data
    df = pd.read_table(expression, header=0, index_col=0)
    run_ids = list(df.columns.values)
    dataMatrix = np.transpose(np.array(df))

    # Run PCA
    sklearn_pca = sklearnPCA(n_components=2)
    sklearn_transf = sklearn_pca.fit_transform(
        preprocessing.maxabs_scale(dataMatrix, axis=0))

    # Tissues and color table
    tissues = [tissue_data[r.replace('.htseq', '')] for r in run_ids]
    colors = {
        'leaf': 'green',
        'root': 'brown',
        'shoot': 'blue',
        'plant': 'black',
        'seed': 'red',
        'flower': 'cyan',
        'stem': 'yellow',
        'seedling': 'white',
        'pollen': 'violet'
    }

    found_tissues = {}

    plt.figure(1)

    with sns.axes_style("whitegrid", {"grid.linestyle": None}):
        plt.subplot(121)
        for run, tissue, pca_data in zip(run_ids, tissues, sklearn_transf):
            label = tissue if tissue in colors.keys() else 'other'

            plt.plot(
                pca_data[0],
                pca_data[1],
                'o',
                markersize=7,
                color=colors[tissue] if tissue in colors.keys() else 'gray',
                alpha=0.5,
                label=label
                if label not in found_tissues.keys() else "_nolegend_")

            found_tissues[label] = True

        plt.xlabel('PC 1 (%0.2f %%)' %
                   (sklearn_pca.explained_variance_ratio_[0] * 100))
        plt.ylabel('PC 2 (%0.2f %%)' %
                   (sklearn_pca.explained_variance_ratio_[1] * 100))

        plt.legend()
        plt.draw()

    with sns.axes_style("whitegrid"):
        plt.subplot(122)

        df = pd.read_table(powerlaw, names=['Node degree', 'Gene count'])

        ax = sns.regplot(x='Node degree',
                         y='Gene count',
                         data=df,
                         fit_reg=False)
        ax.set(xlim=(1, 10000), ylim=(1, 10000), xscale='log', yscale='log')

    plt.show()
Example 23
# Split the dataset into train and test and **organize** it as necessary to work with our model.

# In[3]:

# digits.data stores flattened ndarray size 64 from 8x8 images.
X, Y = digits.data, digits.target

# Split dataset into 80% train images and 20% test images
X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.2,
                                                    shuffle=True)

# normalize the input values by scaling each feature by its maximum absolute value
X_train = preprocessing.maxabs_scale(X_train)
X_test = preprocessing.maxabs_scale(X_test)

# ## Training and Saving the Model

# Let's **train our model** and **save the training model to a file**:

# In[4]:

# Create a classifier: a support vector classifier
model = svm.SVC(gamma=0.001, C=100)

# Learn the digits on the train subset
model.fit(X_train, Y_train)

# Save the model to a file
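The snippet stops right after the "Save the model to a file" comment; the original save code is not shown. One plausible completion (an assumption, using a hypothetical filename) is joblib, a common choice for persisting scikit-learn estimators:

# possible continuation (assumption -- not part of the original snippet)
from joblib import dump

dump(model, 'svc_digits.joblib')   # hypothetical filename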
Example 24
if __name__ == '__main__':
    '''
    import visdom
    from Data_generator_normalize import data_generate
    generator = data_generate()
    
    vis = visdom.Visdom(env='yancy_env')

    test_set, label = generator.SQ_data_generator(train=False, examples=20, normalize=True)
    data = np.squeeze(test_set, axis=-1).reshape([-1, test_set.shape[-2]])  # (n, 2048)
    label = label.reshape([-1])  # (n,)

    cls = 3
    # try vis_tSNE
    vis_tSNE(data, label, cls, vis, name='test')
    # try t_sne
    fig = t_sne(data, label, classes=cls, name='ta')
    plt.show()
    '''
    a0 = [[1, 2, 1], [1, 3, 4], [5, 8, -10]]
    a = np.array(a0)
    b = maxabs_scale(a.astype(np.float), axis=1)
    c = my_normalization1(a)
    d = my_normalization2(a)
    e = my_normalization3(a)
    print(b)
    print(c)
    print(d)
    print(e)
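For the toy matrix a above, row-wise max-abs scaling divides each row by its largest absolute value, so print(b) should show (up to float formatting):

# [[ 0.5   1.    0.5 ]
#  [ 0.25  0.75  1.  ]
#  [ 0.5   0.8  -1.  ]]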
Example 25
def handle_fare(data):
    new_data = data
    new_data["Fare"] = fill_na_with_mean(new_data, "Fare")
    new_data["Fare"] = new_data["Fare"]/ 20
    new_data["Fare"] = preprocessing.maxabs_scale(data["Fare"])
    return new_data
Example 26
def normalize_features(features):
    from sklearn.preprocessing import maxabs_scale
    return maxabs_scale(features).reshape(1, -1)[0]
Example 27
		modelFeatures.append({ "active": 1, "feature": "f_pos", "args": [], "kwargs": { "table": "models/table" } })
		
		clf = OneVsRestClassifier(svm.LinearSVC(random_state=0)) #svm.OneClassSVM(nu=0.1, kernel="rbf", gamma=0.1)

    # load data from all active features
	examples = loadData(args.dataref)

	# print counts of candidate sets
	# grouped by length
	if False:
		counts = getLengthCounts(examples)
		print counts
		sys.exit(0)

	# get permuted features if requested
	if args.permute:
		totalX, totalY = permuteFeatures(examples, modelFeatures, length=args.permute)
	# otherwise get features in a regular format
	else:
		totalX, totalY = combineFeatures(examples, modelFeatures, verbose=True)

	# scaling data
	print "Scaling data..."
	totalX = preprocessing.maxabs_scale(totalX)

	# run kfold validation 
	print "----------------------------------"
	print "Running kfold validation on all features"
	print "----------------------------------"
	kfoldValidation(10, np.array(totalX), np.array(totalY), classifier=clf, verbose=True)
Example 28
        b = binarizer(allValues)
        # print(f"working with classes {b.ref}")
        for i in range(len(allData)):
            # encoded = lb.transform([allValues[i]])
            # normData[i] = np.concatenate((normData[i],encoded[0]))
            encoded = b.transform(allValues[i])
            normData[i] = np.concatenate((normData[i], encoded))
        # for i in range(len(allData)):
        #     normData[i].append()
    elif value == "number":
        # Minor workaround - if inf is found, replace it with the largest finite value so scaling tops out at 1.
        maxVal = np.where(np.isinf(allValues), -np.inf, allValues).max()
        for i in range(len(allValues)):
            if (allValues[i] == float("inf")):
                allValues[i] = maxVal
        newValues = preprocessing.maxabs_scale(allValues)
        for i in range(len(allData)):
            normData[i] = np.concatenate(
                (normData[i], np.array([newValues[i]])))
    else:
        raise Exception("what did you just hand me?")
    newcollen = len(normData[0])
    totalCols += newcollen - oldcollen
    print(
        f"Normalized {key} considered a {value} by adding {newcollen-oldcollen} cols"
    )
print(f"Total cols added per item is {totalCols}")
print(len(normData))
print(normData[0].shape)
# Gotta Tensor-ify everything, if possible.
Example 29
 def recognize(self,fileName):
     (rate2,sig2) = wav.read(fileName)
     sig2=pp.maxabs_scale(sig2) #Normalize amplitude to +-1
     mfcc_feat2 = mfcc(sig2,rate2)
     return self.get1NN(mfcc_feat2)
Example 30
def maxAbsScale(y, axis=0):
    """normalises array dividing by the max abs value"""
    return maxabs_scale(y, axis=axis)
Example 31
#%% loading data
folder='C:/Users/Dan/Documents/MATLAB'
os.chdir(folder)

data=scipy.io.loadmat('S09_fft.mat')
data=data['S09_fft']
data=np.transpose(data.reshape(64*40,80*16,order='F'))

#data=scipy.io.loadmat('S09_fft_2D.mat')
#data=data['data']
#dataM=np.transpose(data)

labels=np.tile(np.arange(1,81),16)
#%% scaling
from sklearn.preprocessing import maxabs_scale
data = maxabs_scale(data)

#%% SVM
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split,  cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, roc_auc_score, accuracy_score, mean_squared_error
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.svm import SVC
import itertools
start = time.time()
chi_k = 'all'
scores_all=dict()
scores=list()
for i in itertools.combinations(np.arange(1,81), 2):
     print(i)
Example 32
def init_disp_conv(cp, dataset):
    # A make_disp function could be used to create the symbolic variable for the
    # input, yet we had issues in past experiments with stacking layers
    # and initializing their parameters in separate functions, so we decided to
    # initialize every aspect of each network in the same function.

    # Retrieve all dispersions from "test" dataset
    dataset = dataset[:, 3]
    # Get parameters
    featurex = int(cp.get('Dispersion', 'feature_x'))
    featurey = int(cp.get('Dispersion', 'feature_y'))
    channels = int(cp.get('Dispersion', 'channels'))
    pool_size = int(cp.get('D1', 'pool'))
    def_filters = int(cp.get('Default', 'filters'))
    def_filter_size = int(cp.get('Default', 'filter_size'))
    def_stride = int(cp.get('Default', 'stride'))
    def_padding = int(cp.get('Default', 'padding'))
    # Resize each dispersion, the dispersion are treated as image
    dataset = [scipy.misc.imresize(x, (featurex, featurey)) for x in dataset]
    dataset = np.array(dataset)
    # Reshape to 2D
    dataset = dataset.reshape(dataset.shape[0], featurex * featurey)
    # Scale
    dataset = maxabs_scale(dataset)
    # Reshape to conv compatible shape
    dataset = dataset.reshape(dataset.shape[0], channels, featurex, featurey)
    input_var = theano.shared(name='input_var',
                              value=np.asarray(dataset,
                                               dtype=theano.config.floatX),
                              borrow=True)
    # Layer stacking (Dispersion subnetwork #1)
    # ------------------------------------------------------------------------
    # Input layer
    input_layer = network = lasagne.layers.InputLayer(shape=(None, channels,
                                                             featurex,
                                                             featurey),
                                                      input_var=input_var)
    # Introduce padding
    pad = network = lasagne.layers.PadLayer(incoming=network,
                                            width=((0, 1), (1, 0)))
    # Convolutional layer #1
    network = lasagne.layers.Conv2DLayer(incoming=network,
                                         num_filters=def_filters,
                                         filter_size=(def_filter_size,
                                                      def_filter_size),
                                         stride=def_stride,
                                         pad=def_padding)
    # Convolutional layer #2 (by default)
    network = lasagne.layers.Conv2DLayer(
        incoming=network,
        num_filters=int(cp.get('D1', 'convfilters')),
        filter_size=(int(cp.get('D1',
                                'filtersize')), int(cp.get('D1',
                                                           'filtersize'))),
        stride=int(cp.get('D1', 'stride')),
        pad=int(cp.get('D1', 'pad')))
    # Maxpool
    network = lasagne.layers.MaxPool2DLayer(incoming=network,
                                            pool_size=(pool_size, pool_size))
    # End of layer stacking for subnetwork #1
    # ########################################################################

    # Layer stacking (Dispersion subnetwork #2)
    # ------------------------------------------------------------------------
    # Subnetwork #1 till padding
    network2 = pad
    # Convolutional layer #1 (different params)
    network2 = lasagne.layers.Conv2DLayer(incoming=network2,
                                          num_filters=def_filters,
                                          filter_size=(def_filter_size,
                                                       def_filter_size),
                                          stride=def_stride,
                                          pad=def_padding)
    # Convolutional layer #2 (different params)
    network2 = lasagne.layers.Conv2DLayer(
        incoming=network2,
        num_filters=int(cp.get('D2', 'convfilters')),
        filter_size=(int(cp.get('D2',
                                'filtersize')), int(cp.get('D2',
                                                           'filtersize'))),
        stride=int(cp.get('D2', 'stride')),
        pad=int(cp.get('D2', 'pad')))
    # Maxpool
    network2 = lasagne.layers.MaxPool2DLayer(incoming=network2,
                                             pool_size=(pool_size, pool_size))
    # Flatten both subnetworks
    network = lasagne.layers.FlattenLayer(network)
    network2 = lasagne.layers.FlattenLayer(network2)
    # Connect subnetworks
    out = lasagne.layers.ConcatLayer(incomings=(network, network2), axis=1)
    # log('Printing Dispersion Net Structure.......')
    # log(lasagne.layers.get_output_shape(lasagne.layers.get_all_layers(network)))
    return [input_layer, input_var, out]
Example 33
from oml.models.regularizers import L1, L2Sq
from oml.optimizers.sgd import Fobos
from oml.optimizers.adagrad import AdaGrad, AdaRDA
from oml.optimizers.rda import Rda, AcceleratedRDA
from oml.optimizers.vr import Svrg
from oml.optimizers.freerex import FreeRex
from oml.optimizers.adam import Adam, AdMax
from oml.optimizers.rms_prop import RMSProp
from oml.optimizers.nesterov import AccSGD
from oml.datasouces.iterator import NumpyIterator

from matplotlib import pyplot as plt

data = load_boston()

x = maxabs_scale(data['data'])
t = data['target']

feature = x.shape[1]
target = 1

data = np.hstack((x, t.reshape(-1, 1)))

np.random.shuffle(data)

train_data = data[:data.shape[0] // 2, :]
test_data = data[data.shape[0] // 2:, :]

train_iter = NumpyIterator(train_data, batch_size=10)
test_iter = NumpyIterator(test_data)

Example 34
groups = test_data.groupby(['Pclass']).size()
groups.plot.bar()


# Models expect the data to be normalized in order to perform better,
# so we will try to normalize each feature.
# 

# In[ ]:


data = train_data.copy()
data["Pclass"] = data["Pclass"] - 1
data["Pclass"] =  preprocessing.maxabs_scale(data["Pclass"])
print(data["Pclass"].value_counts())


# In[ ]:


def handle_pclass(data):
    new_data = data
    new_data["Pclass"] = new_data["Pclass"] -1
    new_data["Pclass"] = preprocessing.maxabs_scale(data["Pclass"])
    return new_data

data = train_data.copy()
data = drop_survived(data)
data = drop_passenger_id(data)
Example 35
    # print counts of candidate sets
    # grouped by length
    if False:
        counts = getLengthCounts(examples)
        print counts
        sys.exit(0)

    # get permuted features if requested
    if args.permute:
        totalX, totalY = permuteFeatures(examples,
                                         modelFeatures,
                                         length=args.permute)
    # otherwise get features in a regular format
    else:
        totalX, totalY = combineFeatures(examples, modelFeatures, verbose=True)

    # scaling data
    print "Scaling data..."
    totalX = preprocessing.maxabs_scale(totalX)

    # run kfold validation
    print "----------------------------------"
    print "Running kfold validation on all features"
    print "----------------------------------"
    kfoldValidation(10,
                    np.array(totalX),
                    np.array(totalY),
                    classifier=clf,
                    verbose=True)
Example 36
def handle_pclass(data):
    new_data = data
    new_data["Pclass"] = new_data["Pclass"] -1
    new_data["Pclass"] = preprocessing.maxabs_scale(data["Pclass"])
    return new_data
Example 37
def investigate(
	FILE,
	scale_features = 'maxabs',
	start = 1,
	end = None,
	show = False,
	images_dir = cfg.finvestig_images_dir,
	data_dir = cfg.finvestig_data_dir,
	results_dir = cfg.finvestig_results_dir):

	'''This produces a heatmap of the correlations between features.  The correlation
	strength is computed using Pearson Correlation Coefficients.

	:param FILE: Features file.  This should be in CSV format, with column 0 being the instance name
				 and row zero being the names of the features.
	:param scale_features: There are various ways to scale the features data.
						   The scaling is done column-wise (i.e. on each feature individually).  default='maxabs'.
						   - maxabs = Scale to [-1,1]
						   - scale = Zero mean and unit stdev
						   - minmax = Translate and scale to [0,1]
						   - normalize = Normalize each feature to unit norm
						   - robust = Shift outliers in according to interquartile range
	:param start: Starting feature to include in the correlations heatmap.  default=1
	:param end: Last feature to include in the correlations heatmap. default=None
	:param show: Whether to show images as they are being produced.  default=False
	:param images_dir: Directory to dump images.  default='../images'
	:param data_dir: Directory to dump data.  default='../data'
	:param results_dir: Directory to dump results.  default='../results'
	:return: Heatmap in PDF format.  Plot is automatically saved. The filename of
			 every saved output automatically has the input file names used to produce it.
	'''

	###################################################################
	# Section 1: Grabs Feature Data
	###################################################################

	stamp = '%s' %(os.path.basename(FILE).split('.')[0])

	with open("%s" %(FILE)) as f:
		reader = csv.reader(f, delimiter=",")
		data = list(reader)

	instances_mip = [os.path.basename(line[0]).split('.')[0] for line in data[1:]]
	features_mip = [line[start:end] for line in data[1:]]

	###################################################################
	# Section 1B: Scale the feature/performance data
	###################################################################
	# normalize = scale to unit norm
	# maxabs_scale = scale to [-1,1]
	# scale = zero mean scaled to std one

	if scale_features == 'scale':
		features_mip = preprocessing.scale(features_mip)
	elif scale_features == 'maxabs':
		features_mip = preprocessing.maxabs_scale(features_mip)
	elif scale_features == 'minmax':
		features_mip = preprocessing.minmax_scale(features_mip)
	elif scale_features == 'normalize':
		features_mip = preprocessing.normalize(features_mip)
	elif scale_features == 'robust':
		features_mip = preprocessing.robust_scale(features_mip)

	###################################################################
	# Section 2A: Pearson Correlation Heatmap
	###################################################################

	corr=np.corrcoef(features_mip,rowvar=False)
	mask = np.zeros_like(corr, dtype=bool)
	mask[np.triu_indices_from(mask)] = True
	f, ax = plt.subplots(figsize=(11, 9))
	cmap = sns.diverging_palette(220, 10, as_cmap=True)
	sns.heatmap(corr, mask=mask, cmap=cmap, center=0,
	            square=True, linewidths=.5, cbar_kws={"shrink": .5}, xticklabels=data[0][start:end], yticklabels=data[0][start:end])
	plt.xticks(rotation=90)
	plt.tick_params(labelsize=6)
	plt.yticks(rotation=0)
	plt.title("Feature Pearson Correlation Heatmap")
	plt.savefig('%s/Correlation Heatmap_%s.pdf' %(images_dir,stamp), bbox_inches='tight', pad_inches=0)
	if show == True:
		plt.show()
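The if/elif dispatch over scale_features also appears in Esempio n. 41 below; a small lookup table is one way to keep the option names in a single place. A minimal sketch of that assumed refactoring, not part of the original module:

from sklearn import preprocessing

# Assumed helper: map the scale_features option names to the sklearn
# functions they already dispatch to above.
SCALERS = {
    'scale': preprocessing.scale,            # zero mean, unit standard deviation (column-wise)
    'maxabs': preprocessing.maxabs_scale,    # scale each column to [-1, 1]
    'minmax': preprocessing.minmax_scale,    # translate/scale each column to [0, 1]
    'normalize': preprocessing.normalize,    # unit norm (row-wise by default)
    'robust': preprocessing.robust_scale,    # median/IQR based, robust to outliers
}

def scale_feature_matrix(features, scale_features='maxabs'):
    """Apply the named scaler; unknown names leave the data unchanged."""
    scaler = SCALERS.get(scale_features)
    return scaler(features) if scaler is not None else features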
Esempio n. 38
0
    def read(self, filename):
        self.__X = []
        self.__Y = []
        self.__info = []
        first_row = True

        fd = file(filename)
        progress = pydev.FileProgress(fd, filename)
        raw_X = []
        for row in pydev.foreach_row(fd, seperator=self.__seperator):
            progress.check_progress()

            # whether to ignore first row.
            if first_row and self.__ignore_first_row:
                first_row = False
                continue

            # check column count.
            if self.__expect_column_count < 0:
                self.__expect_column_count = len(row)
                if self.__target_column < 0:
                    self.__target_column = self.__expect_column_count - 1
                print >> sys.stderr, 'columns set to %d, target:%d' % (
                    self.__expect_column_count, self.__target_column)
            elif len(row) != self.__expect_column_count:
                continue

            # strip whitespace from each column.
            row = map(lambda x: x.strip(), row)

            # get x dict.
            id_value = []
            v_size = 0
            ignored_info = []
            for rid, value in enumerate(row):
                # skip the target column.
                if rid == self.__target_column:
                    continue
                # skip ignored columns.
                if rid in self.__ignore_columns:
                    ignored_info.append(value)
                    continue

                # dense and id-value-sparse
                if self.__row_mode == DataReader.DenseValue:
                    cid = rid
                elif self.__row_mode == DataReader.IVSparse:
                    cid, value = value.split(':')
                    cid = int(cid)

                if cid in self.__concrete_ids:
                    # one-hot representation for key.
                    # feature = id-value : 1
                    fid, value = self.__feature_trans.allocate_id(
                        '#%03d:%s' % (cid, value)), 1
                else:
                    # feature = id : value
                    fid, value = self.__feature_trans.allocate_id(
                        '#%03d' % (cid)), float(value)

                id_value.append((fid, value))
                if v_size < fid + 1:
                    v_size = fid + 1

            x = numpy.ndarray(shape=(v_size, ))
            x.fill(0)
            for fid, value in id_value:
                x[fid] = float(value)

            raw_X.append(x)

            # get Y
            if self.__concrete_target:
                row[self.__target_column] = self.__target_trans.allocate_id(
                    row[self.__target_column])
            y = row[self.__target_column]
            self.__Y.append(y)

            self.__info.append(self.__seperator.join(ignored_info))

        progress.end_progress()

        # resize for each X.
        x_size = self.__feature_trans.size()
        for x in raw_X:
            new_x = numpy.ndarray(shape=(x_size, ), dtype=numpy.float32)
            new_x.fill(0)
            new_x[:x.shape[0]] = x
            self.__X.append(new_x)

        # resize Y if concrete label.
        if self.__target_one_hot:
            raw_Y = self.__Y
            self.__Y = []
            y_size = self.__target_trans.size()
            for y in raw_Y:
                new_y = numpy.ndarray(shape=(y_size, ), dtype=numpy.float32)
                new_y.fill(0)
                new_y[int(y)] = 1.
                self.__Y.append(new_y)

        # transform X to numpy.ndarray
        self.__X = numpy.array(self.__X)

        # preprocessing.
        if self.__maxabs_scale:
            print >> sys.stderr, 'Do maxabs_scale'
            self.__X = preprocessing.maxabs_scale(self.__X)

        # make Y as ndarray
        self.__Y = numpy.array(self.__Y).astype(numpy.float32)

        #self.__feature_trans.debug()
        #self.__target_trans.debug()

        print >> sys.stderr, 'Data load [ %d(records) x %d(features) ]' % (len(
            self.__X), len(self.__X[0]))
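As a hedged sketch, the hand-rolled id allocation and one-hot expansion in read() can also be expressed with scikit-learn's DictVectorizer, assuming each row has already been parsed into a column-name to value dict with categorical columns holding strings:

from sklearn import preprocessing
from sklearn.feature_extraction import DictVectorizer

# Assumed pre-parsed rows: string values become one-hot columns, numeric
# values become id:value columns, much like the allocate_id logic above.
rows = [
    {'f000': 1.5, 'f001': 'red',  'f002': 10.0},
    {'f000': 0.3, 'f001': 'blue', 'f002': -2.0},
    {'f000': 2.0, 'f001': 'red',  'f002': 0.0},
]

vec = DictVectorizer()                       # sparse CSR output by default
X = vec.fit_transform(rows)
X = preprocessing.maxabs_scale(X)            # accepts sparse input and keeps the zeros
print(vec.get_feature_names_out())           # sklearn >= 1.0
print(X.toarray())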
#extract data from file
docBuff = []
data = []

with open("data.txt") as dataFile:
    for row in dataFile:
        docBuff.append(row.rstrip('\n'))

for i in docBuff:
    data.append(i.split(","))

#clustering phase
k = 3

km = cluster.KMeans(n_clusters=k)
km.fit(preprocessing.maxabs_scale(data))

labels = km.labels_
centroids = km.cluster_centers_
print("k: " + str(k))
print(labels)

cluster_centers = np.sort(km.cluster_centers_, axis=0)
print(cluster_centers)

#plot clustered data
dataMod = np.array(data)

for i in range(k):
    data = dataMod[np.where(labels == i)]
    plt.plot(data[:, 0], data[:, 3], 'o')
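A hedged variation on the clustering step above: chaining MaxAbsScaler and KMeans in a Pipeline keeps the scaling reproducible for new points; the synthetic blobs below are a placeholder for the contents of data.txt.

import numpy as np
from sklearn.cluster import KMeans
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MaxAbsScaler

rng = np.random.RandomState(0)
toy_data = np.vstack([rng.randn(50, 4) + offset for offset in (0, 5, 10)])  # placeholder

k = 3
pipe = make_pipeline(MaxAbsScaler(), KMeans(n_clusters=k, n_init=10))
pipe.fit(toy_data)

labels = pipe.named_steps['kmeans'].labels_
centroids = pipe.named_steps['kmeans'].cluster_centers_
print("k: " + str(k))
print(labels)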
Esempio n. 40
0
##############logistic regression
### clf = LogisticRegression(penalty='l2', dual=False, solver='liblinear', C=1.1)
### labels =  ['Sex', 'Age', 'Fare', 'SibSp', 'Parch', 'Has_Family', 'Embarked_S', 'Embarked_C', 'Embarked_Q'] #['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
### cv_score = cross_val_score(clf, 
###         train_data[labels], 
###         train_data['Survived'], cv=3)
### print 'logistic regression cv_score=', cv_score

### clf.fit(train_data[labels], train_data['Survived'])
### lr_predict = clf.predict(test_data[labels])
### test_data['Survived'] = lr_predict
### test_data.to_csv('lr_class.csv', columns=['PassengerId',  'Survived'], index=False)

##############SVM
labels =  ['Sex', 'Age', 'Fare', 'SibSp', 'Parch', 'Has_Family', 'Embarked_S', 'Embarked_C', 'Embarked_Q'] #['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
normalized_train_data = maxabs_scale(train_data[labels].values)
normalized_test_data = maxabs_scale(test_data[labels].values)

####### exhaustively CV grid search
### parameters_grid = [{'kernel': ['rbf'], 'gamma': [1e-4, 1e-3, 1e-2, 1e-1, 1, 5, 10, 50, 100], 'C': [0.1, 0.5, 1, 5, 10, 50, 100]},
###         {'kernel': ['sigmoid'], 'gamma': [1e-4, 1e-3, 1e-2, 1e-1, 1, 5, 10, 50, 100], 'C': [0.1, 0.5, 1, 5, 10, 50, 100], 'coef0':[1e-4, 1e-3, 1e-2, 1e-1, 1, 5, 10, 50, 100]},
###     {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]
### clf = SVC()
### grid_search = GridSearchCV(clf, parameters_grid, cv=3, n_jobs=2)
### grid_search.fit(normalized_train_data, train_data['Survived'])
### print grid_search.best_params_, grid_search.best_score_
#### 
#### clf = SVC(C=5, kernel='rbf', gamma=1)
#### cv_score = cross_val_score(clf, 
####         normalized_train_data,
####         train_data['Survived'], cv=3)
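A hedged sketch of the commented-out grid search above: wrapping MaxAbsScaler and SVC in a Pipeline lets GridSearchCV refit the scaler inside each fold instead of scaling the full training set up front. The fit call is left commented because it assumes the train_data frame and labels list from this snippet.

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MaxAbsScaler
from sklearn.svm import SVC

pipe = Pipeline([('scale', MaxAbsScaler()), ('svc', SVC())])
parameters_grid = {
    'svc__kernel': ['rbf'],
    'svc__gamma': [1e-3, 1e-2, 1e-1, 1],
    'svc__C': [0.1, 1, 10],
}
grid_search = GridSearchCV(pipe, parameters_grid, cv=3, n_jobs=2)
### grid_search.fit(train_data[labels], train_data['Survived'])
### print(grid_search.best_params_, grid_search.best_score_)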
Esempio n. 41
0
def investigate(FILE,
                scale_features='maxabs',
                show=False,
                images_dir=cfg.finvestig_images_dir,
                data_dir=cfg.finvestig_data_dir,
                results_dir=cfg.finvestig_results_dir):
    '''Multidimensional scaling is a technique to map high-dimensional data to
	lower dimensions. The heuristic used is to preserve pairwise distances as well
	as possible in this mapping.  In this function, it is used to:

	  1) Understand the dimensionality of the feature space according to MDS
	  2) Visualize how the data maps to 2D and 3D

	Further functionality can be added to more closely match the goals of the
	Principal Component Analysis function.

	:param FILE: Features file. This should be in CSV format, with column 0 being
	       		 the instance name and row zero being the names of the features.
	:param scale_features: There are various ways to scale the features data.
	             The scaling is done column-wise (i.e. on each feature individually).
	             default='maxabs'.

	             - maxabs = Scale to [-1,1]
	             - scale = Zero mean and unit stdev
	             - minmax = Translate and scale to [0,1]
	             - normalize = Scale each sample (row) to unit norm; note this option operates row-wise
	             - robust = Scale using statistics robust to outliers (based on the interquartile range)

	:param show: Whether to show images as they are being produced.  default=False
	:param images_dir: Directory to dump images.  default='../images'
	:param data_dir: Directory to dump data.  default='../data'
	:param results_dir: Directory to dump results.  default='../results'
	:return: Plots all returned in PDF format.  Plots are automatically saved:

	     - Plot 1: Information retained as a function of the dimension being mapped to.
	       The x-axis is scaled heuristically; adjust it if the plot is hard to read.
	     - Plot 2: MDS in d=2
	     - Plot 3: MDS in d=2, with names of instances attached to points
	     - Plot 4: MDS in d=3.  Plot is not automatically saved.

	'''

    ###################################################################
    # Section 1: Grabs Feature Data
    ###################################################################

    stamp = '%s' % (os.path.basename(FILE).split('.')[0])
    print(stamp)
    with open("%s" % (FILE)) as f:
        reader = csv.reader(f, delimiter=",")
        data = list(reader)

    instances_mip = [
        os.path.basename(line[0]).split('.')[0] for line in data[1:]
    ]
    features_mip = [line[1:] for line in data[1:]]

    ###################################################################
    # Section 1B: Scale the feature/performance data
    ###################################################################
    # normalize = scale each row to unit norm
    # maxabs_scale = scale to [-1,1]
    # scale = zero mean, unit standard deviation

    if scale_features == 'scale':
        features_mip = preprocessing.scale(features_mip)
    elif scale_features == 'maxabs':
        features_mip = preprocessing.maxabs_scale(features_mip)
    elif scale_features == 'minmax':
        features_mip = preprocessing.minmax_scale(features_mip)
    elif scale_features == 'normalize':
        features_mip = preprocessing.normalize(features_mip)
    elif scale_features == 'robust':
        features_mip = preprocessing.robust_scale(features_mip)

    ###################################################################
    # Section 2A: MDS, Find the number of dimensions to map to
    # This can take a few minutes, so feel free to grab a coffee at this point
    ###################################################################
    # Below, "information" is defined as (1-stress/scale)
    # The scale here is stress[dimension(2)]
    # Information is a scalar in [0,1]
    ### Stress is defined as the sum of squared difference between
    ### distances in the embedded space and distances in the original space

    max_dim = int(ceil(0.1 * len(features_mip[0])))

    print()
    print("Max dimension projecting to is %s" % (max_dim))
    print()

    stress, dimension = [], []
    fig, ax = plt.subplots()
    for i in range(2, max_dim + 1):  # choose the range of dimensions to map to
        print('Projecting to dimension %s' % i)
        mds = manifold.MDS(i)  # number of dimensions to map to
        proj = mds.fit_transform(features_mip).T
        stress.append(mds.stress_)
        dimension.append(i)
    information = [1 - i / stress[0] for i in stress]

    print()

    # report the first dimension at which 70%, 90%, and 95% of the information is kept
    for threshold in (0.7, 0.9, 0.95):
        for dim, info in zip(dimension, information):
            if info >= threshold:
                print('%.3f information is kept at dimension %s' % (info, dim))
                break
    extraticks = [0.7, 0.9, 0.95]
    plt.axhline(y=0.7, color='r', linestyle='-')
    plt.axhline(y=0.9, color='r', linestyle='-')
    plt.axhline(y=0.95, color='r', linestyle='-')
    plt.plot(dimension, information, 'bo')
    plt.yticks(list(plt.yticks()[0]) + extraticks)
    plt.ylim((-0.1, 1.1))
    plt.xlabel('Dimension')
    plt.ylabel('Information Retained')
    plt.title("MDS Normalized Retained Information")
    plt.savefig('%s/MDS_information_%s.pdf' % (images_dir, stamp),
                bbox_inches='tight',
                pad_inches=0)
    if show == True:
        plt.show()
    plt.close()

    ###################################################################
    # Section 2B: MDS, Draw 2D and 3D MDS plots
    ###################################################################

    print()
    print('Currently producing some more images...')

    # Draw 2D MDS
    mds = manifold.MDS(2)  # number of dimensions to map to
    proj = mds.fit_transform(features_mip).T
    fig, ax = plt.subplots()
    plt.plot(proj[0], proj[1], 'ro')
    plt.title("Feature MDS")
    plt.xscale("symlog")
    plt.yscale("symlog")
    plt.axis('tight')
    plt.savefig('%s/MDS_%s.pdf' % (images_dir, stamp),
                bbox_inches='tight',
                pad_inches=0)
    if show == True:
        plt.show()
    plt.close()

    # Do the same as the above, just with names of instances attached to points
    proj = mds.fit_transform(features_mip).T
    fig, ax = plt.subplots()
    for i, txt in enumerate(instances_mip):
        ax.annotate(txt, (proj[0][i], proj[1][i]))
    plt.plot(proj[0], proj[1], 'ro')
    plt.title("Feature MDS")
    plt.xscale("symlog")
    plt.yscale("symlog")
    plt.axis('tight')
    plt.savefig('%s/MDS (names)_%s.pdf' % (images_dir, stamp),
                bbox_inches='tight',
                pad_inches=0)
    if show == True:
        plt.show()
    plt.close()

    # Draw 3D MDS.  3D Images will not save automatically.
    mds = manifold.MDS(3)  # number of dimensions to map to
    proj = mds.fit_transform(features_mip).T
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(proj[0], proj[1], proj[2])
    ax.set_xscale("symlog")
    ax.set_yscale("symlog")
    ax.set_zscale("symlog")
    if show == True:
        plt.show()
    plt.close()
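A compact, hedged sketch of the Section 2A information curve, information(d) = 1 - stress(d)/stress(2), with random data standing in for the feature file:

import numpy as np
from sklearn import manifold

rng = np.random.RandomState(0)
features = rng.randn(40, 10)                     # placeholder feature matrix

dims = list(range(2, 6))
stress = []
for d in dims:
    mds = manifold.MDS(n_components=d, random_state=0)
    mds.fit_transform(features)                  # fit to populate stress_
    stress.append(mds.stress_)

information = [1.0 - s / stress[0] for s in stress]
for d, info in zip(dims, information):
    print("dimension %d: %.3f information retained" % (d, info))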
Esempio n. 42
0
    obj = json.loads(line)
    team = int(obj['team'])
    vec = np.zeros(num_features)
    for tuple_key, count in obj['tuples'].iteritems():
        val = count
        val = min(max(val - WEIGHT_GATE, 0), WEIGHT_CEIL)
        val = 0 if val > WEIGHT_IGNORE else val
        if BINARY_WEIGHT_PRE:
            val = 1 if val > 0 else 0
        vec[get_tuple_key(tuple_key)] = val
    team_features[team_index[team]] = vec
print team_features.shape
print team_features

print "Normalizing matrix..." # TODO norm
maxabs_scale(team_features, axis=0, copy=False) # scale each feature
maxabs_scale(team_features, axis=1, copy=False) # scale each team
print team_features
csc = csc_matrix(team_features)

if DO_SVD:
    print "Computing SVD..."
    ut, s, vt = sparsesvd(csc, KEEP_SV)
    #team_feat_dense = np.dot(np.transpose(ut), np.square(np.diag(s)))
    #team_feat_dense = np.transpose(ut)
    team_feat_dense = np.dot(np.transpose(ut), np.diag(s))
    print "Dense team feature vectors:", team_feat_dense.shape
    print team_feat_dense
    actual_features = len(s)
    print "%s singular values, min: %s, max: %s" % (actual_features, min(s), max(s))
else:
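If sparsesvd is not available, a hedged alternative is scikit-learn's TruncatedSVD, whose fit_transform already returns the U * Sigma embedding computed above; the matrix and KEEP_SV value below are placeholders.

import numpy as np
from scipy.sparse import csc_matrix
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import maxabs_scale

rng = np.random.RandomState(0)
team_features = rng.poisson(0.3, size=(30, 200)).astype(float)  # placeholder matrix
team_features = maxabs_scale(team_features, axis=0)  # scale each feature
team_features = maxabs_scale(team_features, axis=1)  # scale each team

KEEP_SV = 10                                          # placeholder component count
svd = TruncatedSVD(n_components=KEEP_SV, random_state=0)
team_feat_dense = svd.fit_transform(csc_matrix(team_features))  # U * Sigma
print("Dense team feature vectors:", team_feat_dense.shape)
print("%d singular values, min: %s, max: %s" % (
    len(svd.singular_values_), svd.singular_values_.min(), svd.singular_values_.max()))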
Esempio n. 43
0
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
dd = fetch_20newsgroups(subset='all')
print(dd.description)
print(dd.keys())
np.unique(dd.target, return_inverse=True)
from pprint import pprint
pprint(list(dd.target_names))
dd.data[1]
dd.target_names[dd.target[1]]

from sklearn.preprocessing import scale, robust_scale, minmax_scale, maxabs_scale

x = (np.arange(10, dtype=float) - 3).reshape(-1, 1)
df = pd.DataFrame(np.hstack([x, scale(x), robust_scale(x), minmax_scale(x), maxabs_scale(x)]), 
                  columns=["x", "scale(x)", "robust_scale(x)", "minmax_scale(x)", "maxabs_scale(x)"])
df


from sklearn.preprocessing import normalize

x = np.vstack([np.arange(5, dtype=float) - 20, np.arange(5, dtype=float) - 2]).T
y1 = scale(x)
y2 = normalize(x)

print("original x:\n", x)
print("scale:\n", y1)
print("norms (scale)\n", np.linalg.norm(y1, axis=1))
print("normlize:\n", y2)
print("norms (normalize)\n", np.linalg.norm(y2, axis=1))