Example #1
            ################### FEATURE SELECTION
            if feat_sel:
                print('FEATURE SELECTION ...')

                data_frame = pd.concat([X_init, y_init], axis=1)
                data_frame = feature_importance(data_frame, coeff_threshold,
                                                'class')

                X_test_then_train = X_test_then_train[data_frame.columns[:-1]]
                X_init = X_init[data_frame.columns[:-1]]

                features = features[data_frame.columns[:-1]]

            ################### DATA PREPARATION FOR SCIKIT-MULTIFLOW
            stream.X = features.values
            stream.y = labels.values

            ################### HYPERPARAMETER TUNING

            if hyperparameter_tuning:
                print('HYPERPARAMETER TUNING ...')

                for reg in range(len(regressors)):

                    reg_name = regressors[reg].__class__.__name__

                    if reg_name == 'PassiveAggressiveRegressor':
                        print(reg_name, ' tuning ...')

                        PAR_timer = timer()
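                        # NOTE: the tuning body of the original example is truncated here.
                        # What follows is a minimal, assumed sketch (not the author's code)
                        # of how the PassiveAggressiveRegressor could be tuned with
                        # sklearn's GridSearchCV; the parameter grid is illustrative only.
                        from sklearn.linear_model import PassiveAggressiveRegressor
                        from sklearn.model_selection import GridSearchCV

                        par_grid = {'C': [0.01, 0.1, 1.0], 'max_iter': [1, 10, 100]}
                        par_search = GridSearchCV(PassiveAggressiveRegressor(), par_grid, cv=3)
                        par_search.fit(X_init, y_init.values.ravel())
                        regressors[reg] = par_search.best_estimator_
                        print(reg_name, 'best params:', par_search.best_params_)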
def cargaDatos(datasets, data, severity, speed, lim_data):
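    """Load the dataset selected by `data` and return it as a scikit-multiflow stream.

    datasets : not used in this snippet.
    data     : integer selecting the dataset (0=weather, 1=elec, 2=covtype,
               3=moving_squares, 4=sea_stream, 5=usenet2, 6=gmsc, 7=airlines,
               8-11=synthetic, 12-19=extended synthetic with stable concepts).
    severity : drift severity used to pick the synthetic file.
    speed    : drift speed used to pick the synthetic file.
    lim_data : maximum number of samples kept for the larger datasets.
    """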
    
    if data == 0:  # weather

        stream = FileStream('your_path')
        stream.prepare_for_use()

        # Min-max scale the features to [0, 1]
        df = pd.DataFrame(stream.X)
        x = df.values
        min_max_scaler = preprocessing.MinMaxScaler()
        x_scaled = min_max_scaler.fit_transform(x)
        df = pd.DataFrame(x_scaled)

        stream.X = df.values
            
    elif data == 1:  # elec

        stream = FileStream('your_path')
        stream.prepare_for_use()
        
    elif data == 2:  # covtype

        stream = FileStream('your_path')
        stream.prepare_for_use()

        # Min-max scale the features to [0, 1]
        df = pd.DataFrame(stream.X)
        x = df.values
        min_max_scaler = preprocessing.MinMaxScaler()
        x_scaled = min_max_scaler.fit_transform(x)
        df = pd.DataFrame(x_scaled)

        df = df[0:5000]  # Limit to 5000 samples because the dataset has many features
        stream.X = df.values

        # Shift the labels to the 0-6 range so that the size of the OnlineGRF
        # repository matches the number of classes
        stream.y = stream.y - 1
        stream.target_values = list(np.unique(stream.y))
        
    elif data == 3:  # moving_squares

        stream = FileStream('your_path')
        stream.prepare_for_use()

        df = pd.DataFrame(stream.X)
        df = df[0:lim_data]  # Limit the data to lim_data samples (50k)
        stream.X = df.values

    elif data == 4:  # sea_stream

        stream = FileStream('your_path')
        stream.prepare_for_use()

    elif data == 5:  # usenet2

        stream = FileStream('your_path')
        stream.prepare_for_use()
        
    elif data == 6:  # gmsc

        df = pd.read_csv('your_path', sep=',', header=0)
        df = df.drop('Unnamed: 0', axis=1)  # Drop the first (index) column
        df = df.dropna(how='any')  # Drop rows containing NaN
        df = df[0:lim_data]  # Limit the data to lim_data samples (50k)

        feats=df[['RevolvingUtilizationOfUnsecuredLines', 'age',
       'NumberOfTime30-59DaysPastDueNotWorse', 'DebtRatio', 'MonthlyIncome',
       'NumberOfOpenCreditLinesAndLoans', 'NumberOfTimes90DaysLate',
       'NumberRealEstateLoansOrLines', 'NumberOfTime60-89DaysPastDueNotWorse',
       'NumberOfDependents']]

#        x = feats.values
#        min_max_scaler = preprocessing.MinMaxScaler()
#        x_scaled = min_max_scaler.fit_transform(x)
#        feats = pd.DataFrame(x_scaled)
        
        clas = df[['SeriousDlqin2yrs']]

        df_result = pd.concat([feats, clas], axis=1, sort=False)

        df_result.to_csv('your_path')

        stream = FileStream('your_path')
        stream.prepare_for_use()

        stream.X = feats.values
        stream.y = clas.values
        
    elif data == 7:  # airlines

        df = pd.read_csv('your_path', sep=',', header=None)

        # Handle the nominal features in columns 0, 2 and 3
        # 1. One-hot encoding produces so many features that GRF with its gamma parameter becomes too slow:
#        df=pd.get_dummies(df, columns=[0,2,3], prefix=["airline", "airport_from", "airport_to"])    
#        df.to_csv("//home//txuslopez//Dropbox//jlopezlobo//Data sets//Non stationary environments//Airlines//airlines2.csv")
        # 2. So use label encoding instead
        df.iloc[:, 0] = df.iloc[:, 0].astype('category')
        df.iloc[:, 0] = df.iloc[:, 0].cat.codes

        df.iloc[:, 2] = df.iloc[:, 2].astype('category')
        df.iloc[:, 2] = df.iloc[:, 2].cat.codes

        df.iloc[:, 3] = df.iloc[:, 3].astype('category')
        df.iloc[:, 3] = df.iloc[:, 3].cat.codes

        # Drop the first column
        df = df.drop([0], axis=1)

#        df=pd.DataFrame(stream.X)
#                
#        x = df.values
#        min_max_scaler = preprocessing.MinMaxScaler()
#        x_scaled = min_max_scaler.fit_transform(x)
#        df = pd.DataFrame(x_scaled)
        df.to_csv('your_path')

        df = df[0:lim_data]  # Limit the data to lim_data samples (50k)

        stream = FileStream('your_path')
        stream.prepare_for_use()

        stream.X = df.values
        
    elif data in (8, 9, 10, 11):  # synthetic datasets

        synt_name = ''
        synt_name2 = ''

        if data == 8:
            synt_name = 'circleG'
            synt_name2 = 'CircleG'
        elif data == 9:
            synt_name = 'line'
            synt_name2 = 'Line'
        elif data == 10:
            synt_name = 'sineH'
            synt_name2 = 'SineH'
        elif data == 11:
            synt_name = 'sine'
            synt_name2 = 'Sine'

        path = 'your_path'
        fil = synt_name + '//data' + synt_name2 + 'Sev' + str(severity) + 'Sp' + str(speed) + 'Train.csv'

        raw_data = pd.read_csv(path + fil, sep=',', header=None)
        caso = raw_data[raw_data.columns[0:3]]  # Drop the last (useless) column
        caso.iloc[:, 2] = caso.iloc[:, 2].astype(int)  # Convert the class column to int

        new_fil = synt_name + '_' + 'Sev' + str(severity) + '_Sp' + str(speed) + 'Train.csv'
        caso.to_csv(path + synt_name + '//' + new_fil)

        stream = FileStream(path + synt_name + '//' + new_fil)
        stream.prepare_for_use()

        if synt_name == 'sine':  # The data must be scaled
            df = pd.DataFrame(stream.X)
            x = df.values
            min_max_scaler = preprocessing.MinMaxScaler()
            x_scaled = min_max_scaler.fit_transform(x)
            df = pd.DataFrame(x_scaled)
            caso = df

        stream.X = caso.iloc[:, 0:2].values
        
    elif data in (12, 13, 14, 15, 16, 17, 18, 19):  # extended synthetic datasets

        synt_name = ''
        synt_name2 = ''

        if data in (12, 13):
            synt_name = 'circleG'
            synt_name2 = 'CircleG'
        elif data in (14, 15):
            synt_name = 'line'
            synt_name2 = 'Line'
        elif data in (16, 17):
            synt_name = 'sineH'
            synt_name2 = 'SineH'
        elif data in (18, 19):
            synt_name = 'sine'
            synt_name2 = 'Sine'

        path = 'your_path'
        fil = synt_name + '//data' + synt_name2 + 'Sev' + str(severity) + 'Sp' + str(speed) + 'Train.csv'

        raw_data = pd.read_csv(path + fil, sep=',', header=None)
        caso = raw_data[raw_data.columns[0:3]]  # Drop the last (useless) column
        caso.iloc[:, 2] = caso.iloc[:, 2].astype(int)  # Convert the class column to int

        # Stretch the stable concepts by repeating their samples
        caso2 = pd.DataFrame()
        if data in (12, 14, 16, 18):  # stable concept 1
            caso = caso[0:999]
            caso2 = caso.iloc[np.tile(np.arange(len(caso)), 50)]
            new_fil = synt_name + '_' + 'concept1.csv'
        elif data in (13, 15, 17, 19):  # stable concept 2
            caso = caso[1000:1999]
            caso2 = caso.iloc[np.tile(np.arange(len(caso)), 50)]
            new_fil = synt_name + '_' + 'concept2.csv'

        caso2.to_csv(path + synt_name + '//' + new_fil)

        stream = FileStream(path + synt_name + '//' + new_fil)
        stream.prepare_for_use()

        if synt_name == 'sine':  # Scale the data so it is non-negative; otherwise some algorithms break
            df = pd.DataFrame(stream.X)
            x = df.values
            min_max_scaler = preprocessing.MinMaxScaler()
            x_scaled = min_max_scaler.fit_transform(x)
            df = pd.DataFrame(x_scaled)
            caso2 = df

        stream.X = caso2.iloc[:, 0:2].values
                
    return stream
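
# Usage sketch (assumed, not part of the original example): load the 'elec'
# dataset (data=1) and read from the resulting stream. The skmultiflow calls
# below (has_more_samples, next_sample) follow the scikit-multiflow Stream API;
# 'your_path' inside cargaDatos must point to a real CSV for this to run.
if __name__ == '__main__':
    stream = cargaDatos(None, 1, severity=1, speed=1, lim_data=50000)

    n_seen = 0
    while stream.has_more_samples() and n_seen < 1000:
        X, y = stream.next_sample()  # one sample at a time
        # ... partial_fit / predict with an incremental learner would go here ...
        n_seen += 1

    print('Samples read from the stream:', n_seen)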