Example No. 1
    def _get_y(self):
        if self.COLUMN_NAME_TARGET not in self.df_data.columns:
            # Check the None case first: calling len() on None would raise a TypeError.
            if (self.list_df_data_file is None) \
            or (0 == len(self.list_df_data_file)) :
                print("\n*** WARNING : No target!!")
                return None
            else:
                #---------------------------------------------------------------
                # Data is stored in files on the hard disk.
                #---------------------------------------------------------------
                y = np.zeros(self.total_row)
                start_row = 0
                for df_data_file in self.list_df_data_file:
                    df_data = p5_util.object_load(df_data_file)
                    end_row = start_row + len(df_data)
                    y[start_row:end_row] = p9_util.convert_ser2arr(
                        df_data[self.COLUMN_NAME_TARGET])
                    start_row = end_row
                return y
        else:
            return p9_util.convert_ser2arr(
                self.df_data[self.COLUMN_NAME_TARGET])
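For context, below is a minimal, self-contained sketch of the same chunk-assembly pattern, using plain NumPy and pickle in place of the p5_util/p9_util helpers; the file list, the 'target' column name and the loader are illustrative assumptions, not the project's actual API.

import pickle

import numpy as np

def assemble_target_from_chunks(list_chunk_file, total_row, target_column="target"):
    """Rebuild a single target vector from dataframes dumped chunk by chunk."""
    y = np.zeros(total_row)
    start_row = 0
    for chunk_file in list_chunk_file:
        # Illustrative loader: the original code uses p5_util.object_load() instead.
        with open(chunk_file, "rb") as f:
            df_chunk = pickle.load(f)
        end_row = start_row + len(df_chunk)
        y[start_row:end_row] = df_chunk[target_column].values
        start_row = end_row
    return y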
Example No. 2
def build_model(dict_param_benchmark, dict_param_keras_cnn):
    '''Build either the submission model or the benchmark model, based on the
    configuration given as function parameters.

    Input : 
        *   dict_param_benchmark : parameters for both the submission and the
        benchmark models.
        *   dict_param_keras_cnn : CNN hyper-parameters used to build either the
        benchmark or the submission model.

    Output : 
        *   the compiled CNN model 
        *   the list of callbacks activated by the Keras package in order to
        process additional operations.
        
    '''

    dict_param_keras = dict_param_keras_cnn['dict_param_keras']

    input_shape = dict_param_keras['input_dim']
    nbClasses = dict_param_keras['nbClasses']
    dropout_rate = dict_param_keras['dropout_rate']
    lr = dict_param_keras['lr']

    #----------------------------------------------------
    # Batch-normalization flag (read from the Keras parameters)
    #----------------------------------------------------
    is_batch_norm = dict_param_keras['is_batch_normalized']

    if dict_param_benchmark is None:
        model_type = 'submission'
    else:
        model_type = dict_param_benchmark['model_type']

    if (dict_param_benchmark is not None) and dict_param_benchmark['is_embedding_layer']:
        filename_tokenizer = p9_util_benchmark.build_filename_tokenizer(
            dict_param_benchmark=dict_param_benchmark)
        tokenizer = p5_util.object_load(filename_tokenizer)

        sequence_input = Input(shape=(input_shape[0], ), dtype='int32')
        embeddings_dimension = input_shape[1]

        embedding_matrix = build_embeddings_matrix(
            tokenizer, dict_param_benchmark_=dict_param_benchmark)

        embedding_layer = Embedding(len(tokenizer.word_index) + 1,
                                    embeddings_dimension,
                                    weights=[embedding_matrix],
                                    input_length=input_shape[0],
                                    trainable=False)
        x = embedding_layer(sequence_input)
    else:
        x = Input(shape=input_shape)
        sequence_input = x

    if 'submission' == model_type:
        if is_batch_norm:
            x = BatchNormalization()(x)
        x = Conv1D(256, 2, activation='relu', padding='same')(x)
        print(type(x))
        x = MaxPooling1D(5, padding='same')(x)

        if is_batch_norm:
            x = BatchNormalization()(x)
        x = Conv1D(256, 3, activation='relu', padding='same')(x)
        x = MaxPooling1D(5, padding='same')(x)

        if is_batch_norm:
            x = BatchNormalization()(x)
        x = Conv1D(256, 4, activation='relu', padding='same')(x)
        x = MaxPooling1D(2, padding='same')(x)
        if False:
            #-----------------------------------------------------------------
            # Disabled block: two extra convolution/pooling stages kept for
            # reference only.
            #-----------------------------------------------------------------
            if is_batch_norm:
                x = BatchNormalization()(x)
            x = Conv1D(256, 5, activation='relu', padding='same')(x)
            x = MaxPooling1D(2, padding='same')(x)

            if is_batch_norm:
                x = BatchNormalization()(x)
            x = Conv1D(256, 6, strides=1, activation='relu', padding='same')(x)
            x = MaxPooling1D(2, padding='same')(x)

        x = Flatten()(x)
        if is_batch_norm:
            x = BatchNormalization()(x)
        x = Dropout(dropout_rate)(x)
        x = Dense(128, activation='relu')(x)
        preds = Dense(2, activation='softmax')(x)
    elif 'benchmark' == model_type:
        x = Conv1D(128, 2, activation='relu', padding='same')(x)
        x = MaxPooling1D(5, padding='same')(x)

        x = Conv1D(128, 3, activation='relu', padding='same')(x)
        x = MaxPooling1D(5, padding='same')(x)

        x = Conv1D(128, 4, activation='relu', padding='same')(x)
        x = MaxPooling1D(40, padding='same')(x)

        x = Flatten()(x)
        x = Dropout(dropout_rate)(x)
        x = Dense(128, activation='relu')(x)
        preds = Dense(2, activation='softmax')(x)
    else:
        print("\n*** ERROR : Unknown model type = {}".format(model_type))
        sequence_input = None
        preds = None

    #---------------------------------------------------------------------------
    # Build a compiled model from input and output layers.
    #---------------------------------------------------------------------------
    list_callback = list()
    if (sequence_input is not None) and (preds is not None):
        model = Model(sequence_input, preds)
        model.compile(loss='binary_crossentropy',
                      optimizer=RMSprop(lr=lr),
                      metrics=['acc'])

        model.summary()
        if (dict_param_benchmark is not None) and dict_param_benchmark['val_score_max']:
            # checkpoint
            filepath = p9_util_benchmark.build_filename_model(
                dict_param_benchmark=dict_param_benchmark)
            checkpoint = ModelCheckpoint(filepath,
                                         monitor='val_acc',
                                         verbose=1,
                                         save_best_only=True,
                                         mode='max')
            list_callback = [checkpoint]
        else:
            list_callback = None
    else:
        model = None
    return model, list_callback
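A hedged usage sketch for build_model(): the dictionaries below only contain the keys this function reads, and every value is an illustrative assumption rather than the project's actual configuration.

# Illustrative configuration; keys mirror those read inside build_model().
dict_param_keras_cnn = {
    'dict_param_keras': {
        'input_dim': (250, 300),        # (number_of_tokens, embedding_dim)
        'nbClasses': 2,
        'dropout_rate': 0.5,
        'lr': 1e-3,
        'is_batch_normalized': True,
    }
}
dict_param_benchmark = {
    'model_type': 'benchmark',
    'is_embedding_layer': False,    # feed pre-digitalized matrices, no Embedding layer
    'val_score_max': False,         # no ModelCheckpoint callback
}

model, list_callback = build_model(dict_param_benchmark, dict_param_keras_cnn)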
Example No. 3
def build_embeddings_matrix(tokenizer, dict_param_benchmark_=None):
    '''Build an embedding matrix from a tokenizer and configuration parameters.
    
    The embedding matrix has the following structure : 
    --> Rows are words (tokens) issued from a corpus of texts.
    --> Columns are the coefficients of the embedding vectors.
    
    The embedding matrix is saved to the hard disk before its value is returned.
    The embedding matrix file name is built from the dictionary of configuration 
    parameters.
    
    Inputs : 
        * tokenizer : the operator that was used in the corpus tokenization
        process. It contains the vocabulary issued from corpus tokenization.
        The rows of the resulting embedding matrix are issued from the
        tokenizer vocabulary.

        *   dict_param_benchmark_ : parameters structured as a dictionary. When 
        this value is None, the dictionary defined in this file is used.

    Output :
        * the embedding matrix that represents a digitalized corpus.
    '''

    if dict_param_benchmark_ is None:
        embeddings_dimension = EMBEDDINGS_DIMENSION
    else:
        embeddings_dimension = dict_param_benchmark_['embeddings_dimension']

    #---------------------------------------------------------------------------
    # Load embeddings
    #---------------------------------------------------------------------------
    if dict_param_benchmark_ is None or not dict_param_benchmark_[
            'is_embedding_reloaded']:
        #-----------------------------------------------------------------------
        # When the configuration is undefined, or the flag to reload the 
        # embedding matrix is False, the matrix is built from the raw embedding file.
        #-----------------------------------------------------------------------
        print('Loading embeddings...')
        embeddings_index = {}
        with open(EMBEDDINGS_PATH) as f:
            for line in f:
                values = line.split()
                word = values[0]
                coefs = np.asarray(values[1:], dtype='float32')
                embeddings_index[word] = coefs

        #---------------------------------------------------------------------------
        # Build embeddings
        #---------------------------------------------------------------------------
        print('Building embeddings...')
        embedding_matrix = np.zeros(
            (len(tokenizer.word_index) + 1, embeddings_dimension))
        num_words_in_embedding = 0
        for word, i in tokenizer.word_index.items():
            embedding_vector = embeddings_index.get(word)
            if embedding_vector is not None:
                num_words_in_embedding += 1
                embedding_matrix[i] = embedding_vector
            else:
                #-------------------------------------------------------------------
                # words not found in embedding index will be all-zeros.
                #-------------------------------------------------------------------
                pass
        #-----------------------------------------------------------------------
        # Save embeddings matrix
        #-----------------------------------------------------------------------
        filename_embedding_matrix = p9_util_benchmark.build_filename_embedding_matrix(
            dict_param_benchmark=dict_param_benchmark_)
        p5_util.object_dump(embedding_matrix,
                            filename_embedding_matrix,
                            is_verbose=True)
    else:
        #-----------------------------------------------------------------------
        # Load embeddings matrix
        #-----------------------------------------------------------------------
        print('Loading embeddings matrix...')
        filename_embedding_matrix = p9_util_benchmark.build_filename_embedding_matrix(
            dict_param_benchmark=dict_param_benchmark_)
        embedding_matrix = p5_util.object_load(filename_embedding_matrix,
                                               is_verbose=True)

    return embedding_matrix
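For reference, here is a self-contained sketch of the same GloVe-style construction with a Keras Tokenizer; the corpus, the embedding file path and the embedding dimension are illustrative assumptions.

import numpy as np
from keras.preprocessing.text import Tokenizer

EMBEDDINGS_PATH = './data/glove.6B.100d.txt'   # illustrative path
EMBEDDINGS_DIMENSION = 100                     # illustrative dimension

corpus = ["the cat sat on the mat", "the dog barked"]
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)

# Word -> vector mapping read from the embedding text file (one word per line).
embeddings_index = {}
with open(EMBEDDINGS_PATH) as f:
    for line in f:
        values = line.split()
        embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

# Row i holds the vector of the word with index i; unknown words stay all-zeros.
embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, EMBEDDINGS_DIMENSION))
for word, i in tokenizer.word_index.items():
    vector = embeddings_index.get(word)
    if vector is not None:
        embedding_matrix[i] = vector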
Example No. 4
def build_cnn_datagenerator(filename_datapreparator=None, \
                        percent_var=1.0, dataPreparator_v2 = None,
                        dict_param_generator = None):
    '''Build a DataGenerator object for a Keras CNN model.
    
    The process includes : 
        * loading the DataPreparator_v2 object;
        * building the DataGenerator object used by Keras as a data source.

    The configuration file p9_util_config.py is used to read configuration and  
    hyper-parameters.
    
    Inputs :
        * filename_datapreparator : file name of the dumped DataPreparator object.
        Such an object is issued from the data-preparation process.
        
        * percent_var : percentage of explained variance expected. This is used to 
        build a CNN architecture with multiple channels; the number of channels 
        depends on the number of PCA dimensions retained for the dataset.
        
        * dataPreparator_v2 : DataPreparator_v2 object used for data preparation.
        
        * dict_param_generator : parameters for the CNN data generator. When None, 
        the module value 'p9_util_config.dict_param_generator' is used.
    Output :
        * a DataGenerator object.
    
    '''
    if dataPreparator_v2 is None :
        if filename_datapreparator is None :
            print("\n*** ERROR : DataPreparator file name undefined!")
            return None
        #-----------------------------------------------------------------------
        # Load of DataPreparator object
        #-----------------------------------------------------------------------
        dataPreparator_v2 = p5_util.object_load(filename_datapreparator)
    else :
        print("Using DataPreparator given as parameter...")
        pass
    
    print("\nDataPreparator Dataframe shape= {}".format(dataPreparator_v2.df_data.shape))
    

    #---------------------------------------------------------------------------
    # Set DataGenerator parameters
    #---------------------------------------------------------------------------
    if dict_param_generator is None :
        dict_param_generator = p9_util_config.dict_param_generator
    else : 
        pass
    
    #---------------------------------------------------------------------------
    # Update parameters depending on DataPreparator_v2
    #---------------------------------------------------------------------------
    dict_param_generator['binary_threshold'] = dataPreparator_v2.threshold

    #---------------------------------------------------------------------------
    # Extract PCA operator in order to extract the number of components 
    # given a percentage of variance rate.
    #---------------------------------------------------------------------------
    pca = dataPreparator_v2.xpca
    if pca is not None :
        nb_component = p3_util_plot.get_component_from_cum_variance(pca, percent_var) 
        print("\nComponents= {} for variance= {}%".format(nb_component, percent_var*100))
        dict_param_generator['keras_nb_channel'] = nb_component
    else :
        dict_param_generator['keras_nb_channel'] =0
    
    #---------------------------------------------------------------------------
    # The input dim for the CNN network is defined by the tuple :
    # (number_of_measures, number_of_features).
    # 
    # For text classification : (number_of_tokens, embedding_dim)
    #
    # In case of multiplexed dimensions, for a given dimension, each token 
    # is assigned a coefficient that is the value of this digitalized token over 
    # this given dimension.
    # Then, for each text, there are dataPreparator_v2.max_length tokens.
    # The input dim is then (dataPreparator_v2.max_length, 1).
    #
    # When embedding dimensions are not multiplexed :
    #  If dimensions are truncated using the PCA operator, then input dimensions
    #  are : (dataPreparator_v2.max_length, df_data['matrix_padded_truncated'].shape[1])
    #
    #  If embedding dimensions are not truncated, then input dimensions
    #  are : (dataPreparator_v2.max_length, df_data['matrix_padded'].shape[1])
    # 
    #---------------------------------------------------------------------------
    if dict_param_generator['is_dimension_mux'] :
        nb_feature = 1
        data_column_name = 'matrix_padded_truncated'
    else : 
        if 'matrix_padded_truncated' in dataPreparator_v2.df_data.columns :
            nb_feature = dataPreparator_v2.df_data['matrix_padded_truncated'].iloc[0].shape[1]
            data_column_name = 'matrix_padded_truncated'
        elif 'matrix_padded' in dataPreparator_v2.df_data.columns :
            nb_feature = dataPreparator_v2.df_data['matrix_padded'].iloc[0].shape[1]
            data_column_name = 'matrix_padded'
        else :
            if 0 < len(dataPreparator_v2.list_df_data_file) :
                #---------------------------------------------------------------
                # Data has been recorded on the hard disk. Check whether the 
                # column name exists in the dataframe.
                #---------------------------------------------------------------
                filename = dataPreparator_v2.list_df_data_file[-1]
                df_data = p5_util.object_load(filename)
                if 'matrix_padded_truncated' in df_data.columns :                
                    nb_feature = df_data['matrix_padded_truncated'].iloc[0].shape[1]
                    data_column_name = 'matrix_padded_truncated'
                elif 'matrix_padded' in df_data.columns :
                    nb_feature = df_data['matrix_padded'].iloc[0].shape[1]
                    data_column_name = 'matrix_padded'
                else : 
                    print("\n*** ERROR : build_cnn_datagenerator(): No column name \'matrix_padded_truncated\' nor \'matrix_padded\' CNN dimension is undefined! ")
                    return None

            else : 
                print("\n*** ERROR : build_cnn_datagenerator(): No column name \'matrix_padded_truncated\' nor \'matrix_padded\' CNN dimension is undefined! ")
                return None
    #print(dataPreparator_v2.max_length, nb_feature)
    keras_input_dim = (dataPreparator_v2.max_length, nb_feature)
    dict_param_generator['keras_input_dim'] = keras_input_dim
    
    print("")
    for key, value in dict_param_generator.items() :
        print("{} : {}".format(key,value))
        
    print("\nBuilding datagenerator...")
    generator = test_datapreparator.build_generator(dataPreparator_v2, \
                     dict_param_generator,\
                     data_column_name=data_column_name)
    
       
    return generator
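A hedged usage sketch for build_cnn_datagenerator(), assuming a DataPreparator_v2 object was previously dumped to the (illustrative) path below:

# Illustrative file name; the real path comes from the data-preparation steps.
filename_datapreparator = './data/dataPreparator_v2_train_step3.dill'

generator = build_cnn_datagenerator(
    filename_datapreparator=filename_datapreparator,
    percent_var=0.95,            # keep the PCA components explaining 95% of the variance
    dict_param_generator=None)   # fall back to p9_util_config.dict_param_generator

if generator is not None:
    print("DataGenerator ready to feed a Keras CNN model.")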
Example No. 5
def datapreparation_step_process(dict_param_sequence=None, is_debug=False) :
    '''Process one step of the data-preparation sequence described in
    dict_param_sequence, then return the name of the file in which the
    resulting DataPreparator_v2 object has been saved.
    '''
    if dict_param_sequence is None :
        print("\n*** ERROR : parameter not defined for function!")
        return None

    
    
    #--------------------------------------------------------------------------
    # Build backup / restore file name
    #--------------------------------------------------------------------------
    data_type = dict_param_sequence["data_type"]
    root_file_name = dict_param_sequence["root_file_name"]

    n_sample = get_n_sample(dict_param_sequence)
    if n_sample is None :
        return None
    file_format =  dict_param_sequence["file_format"]
    
    
    #---------------------------------------------------------------------------
    # Get step number then step parameters and process it.
    #---------------------------------------------------------------------------
    dict_param_step = dict_param_sequence['dict_param_step']
    step = dict_param_sequence['step']
    dict_step = dict_param_step[step]
    
    if dict_step is None :
        print("\n*** INFO : step= {} of automated process skiped!".format(step))
        return None
        
    dict_param_dataprep = dict_step['dict_param_dataprep']
    
    

    #---------------------------------------------------------------------------
    # Steps processing
    #---------------------------------------------------------------------------
    if 1 == step :
        root_dataset_filename = dict_step["dataset_filename"]
        if None is root_dataset_filename :
            print("\n*** ERROR : step= {}: dataset filename for this step not provided!".format(step))
            return None

        #---------------------------------------------------------------------------
        # Load dataset
        #---------------------------------------------------------------------------
        dataset_filename = root_dataset_filename+'_'+str(data_type)+str(file_format)
        if not is_debug :
            ser_X_data_type, ser_y_data_type = p5_util.object_load(dataset_filename)    
            
            X_data_type = ser_X_data_type.sample(n_sample)
            index = X_data_type.index
            y_data_type = ser_y_data_type[index]
            
            X_data_type = X_data_type.tolist()
            y_data_type = y_data_type.tolist()
                        
            #-------------------------------------------------------------------
            # Apply the operations of this step using the fit_transform_step() method.
            #-------------------------------------------------------------------
            dataPreparator_v2 = DataPreparator_v2.DataPreparator_v2(**dict_param_dataprep)
            X = dataPreparator_v2.fit_transform_step(X_data_type,y_data_type)

        filename = root_file_name+'_'+str(data_type)+'_'+str(n_sample)+'_step'+str(step)+str(file_format)            
        

        #---------------------------------------------------------------------------
        # Save step result
        #---------------------------------------------------------------------------
        if not is_debug :
            p5_util.object_dump(dataPreparator_v2,filename)
        print("\nStep : {} Save data-preparation into file {}".format(step,filename))
        return filename
    elif 2 == step :
        print("\nStep : {}".format(step))
        
        #-----------------------------------------------------------------------
        # Usually None in the configuration; in that case, the file issued 
        # from the previous step is used below.
        #-----------------------------------------------------------------------
        filename = dict_step['dataprep_step_filename']
        if None is filename :
            #-------------------------------------------------------------------
            # Select file issued from previous step.
            #-------------------------------------------------------------------
            filename = dict_param_sequence["previous_step_file_name"]
            if None is filename :        
                print("\n*** ERROR : step= {}: filename for this step not provided!".format(step))
                return None
        
        #---------------------------------------------------------------------------
        # Load DataPreparator_v2 object from step1
        #---------------------------------------------------------------------------
        if not is_debug :
        
            dataPreparator_v2 = p5_util.object_load(filename, is_verbose=True)
            
            #-------------------------------------------------------------------
            # Check whether this step already took place; if so, 
            # apply a sequence of sub-steps.
            #-------------------------------------------------------------------
            dict_param_subsequence = dict_step['dict_param_subsequence']
            start_substep = dict_param_subsequence['start_substep']

            #-----------------------------------------------------------------------
            # Build the file name in order to save the DataPreparator_v2 object
            #-----------------------------------------------------------------------
            filename = root_file_name+'_'+str(data_type)+'_'+str(n_sample)+'_step'+str(step)+str(file_format)

            if 0 < start_substep :
                #---------------------------------------------------------------
                # Step 2 already took place. Apply sub-step from step 2.
                # Sub-step sequence is described into dict_param_sequence 
                # dictionary.
                #---------------------------------------------------------------
                dataPreparator_v2 = \
                dataprepatator_subsequence_process(dict_param_sequence,\
                dataPreparator_v2)

                if not is_debug :
                    print("\nStep {} : Save data-preparation into file {} ...".format(step,filename))
                    p5_util.object_dump(dataPreparator_v2,filename)
                else : 
                    pass                
                
            else :
                #-------------------------------------------------------------------
                # If the step 2 transformation didn't take place or was not 
                # completed, then it is processed here.
                #
                # When bulk_row is > 0, the transformation proceeds bulk by 
                # bulk, saving the result of each bulk in a file.
                #
                # This bulk-by-bulk process may be interrupted. It may then be 
                # restarted from the value id_bulk_row.
                #
                # Otherwise, when id_bulk_row = 0, the transformation starts 
                # from the beginning.
                #-------------------------------------------------------------------
                bulk_row = dict_step['bulk_row']
                dict_restart_step = dict_step['dict_restart_step']
                id_bulk_row = dict_restart_step['id_bulk_row']
                
                dataPreparator_v2.build_padded_matrix(bulk_row = bulk_row,\
                root_filename='./data/df_'+str(data_type)+'_'+str(n_sample)+'_step'+str(step),\
                id_bulk_row = id_bulk_row)
        

                if 0 == bulk_row :
                    pass
                else : 
                    #-----------------------------------------------------------
                    # Data are saved into files.
                    # Clean dataframe in case of step by step process.
                    #-----------------------------------------------------------
                    dataPreparator_v2.total_row = dataPreparator_v2.df_data.shape[0]
                    dataPreparator_v2.df_data = pd.DataFrame()                    

                if not is_debug :
                    print("\nStep {} : Save data-preparation into file {} ...".format(step,filename))
                    p5_util.object_dump(dataPreparator_v2,filename)
                    print("\nStep {} : Save data-preparation into file {} Done!".format(step,filename))
                
                if 0 == bulk_row :
                    pass
                else : 
                    print("\nStep {} : Step by step data-preparation : Restart process with (step, substep, previous file) = (2,1,{}) in configuration file!".format(step,filename))
        return filename
    elif 3 == step :
        print("\nStep : {}".format(step))
        ipca_batch_size = dict_step['ipca_batch_size']
        percent_var = dict_step['percent_var']

        if (percent_var > 1.) or (percent_var <= 0.) :
            print("\n*** ERROR : step= {}: percent_var has to belong interval ]0.,1.]; current value= {}".\
                  format(step, percent_var))
            return None

        filename = dict_step['dataprep_step_filename']
        if None is filename :
            filename = dict_param_sequence["previous_step_file_name"]
            if None is filename :        
                print("\n*** ERROR : step= {}: filename for this step not provided!".format(step))
                return None
        if not is_debug :
            #-------------------------------------------------------------------
            # Load dataPreparator_v2 object from step2
            #-------------------------------------------------------------------
            dataPreparator_v2 = p5_util.object_load(filename)

            #-------------------------------------------------------------------
            # PCA operator for dimension reduction.
            #-------------------------------------------------------------------
            xpca = dict_step['xpca']
            if xpca is not None :
                dataPreparator_v2.xpca = xpca
            else :
                pass    

        #-------------------------------------------------------------------
        # Check whether this step already took place; if so, 
        # apply a sequence of sub-steps.
        #-------------------------------------------------------------------
        dict_param_subsequence = dict_step['dict_param_subsequence']
        start_substep = dict_param_subsequence['start_substep']
        end_substep   = dict_param_subsequence['end_substep']
        
        if 0 < start_substep :
            #---------------------------------------------------------------
            # Use two separate sub-steps in order to build the PCA operator, 
            # then to proceed to dimension reduction.
            #---------------------------------------------------------------
            if not is_debug :
                dataPreparator_v2 = \
                dataprepatator_subsequence_process(dict_param_sequence,\
                dataPreparator_v2)
            else :
                pass
        else : 
            #---------------------------------------------------------------
            # In the same step, build PCA operator and proceed to dimension 
            # reduction.
            #---------------------------------------------------------------
            if not is_debug :
                dataPreparator_v2.build_matrix_padded_truncated(ipca_batch_size, \
                percent_var)
            else :
                pass
        if len(dict_param_step) == step :
            if 0 < start_substep :
                if len(dict_param_subsequence['dict_param_step']) == end_substep :
                    #-----------------------------------------------------------
                    # Last step of the sequence and last sub-step of the sub-sequence.
                    # The file name will not have any step value extension.
                    #-----------------------------------------------------------
                    filename = root_file_name+'_'+str(data_type)+'_'+str(n_sample)+str(file_format)
                else : 
                    #-----------------------------------------------------------
                    # Last step of the sequence, but this is not the last sub-step 
                    # of the sub-sequence.
                    # The file name will have a step value extension.
                    #-----------------------------------------------------------
                    filename = root_file_name+'_'+str(data_type)+'_'\
                    +str(n_sample)+'_step'+str(step)+'_substep'+\
                    str(start_substep)+str(file_format)
            else : 
                #---------------------------------------------------------------
                # Last step and no sub-step: the file name is the final one.
                # The file name will not have any step value extension.
                #---------------------------------------------------------------
                filename = root_file_name+'_'+str(data_type)+'_'+str(n_sample)+str(file_format)
                
        else :            
            filename = root_file_name+'_'+str(data_type)+'_'+str(n_sample)+'_step'+str(step)+str(file_format)
        if not is_debug :
            print("\nStep {} : Save data-preparation into file {} ...".format(step,filename))
            p5_util.object_dump(dataPreparator_v2,filename)
        print("\nStep {} : Save data-preparation into file {} Done!".format(step,filename))
        return filename
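A hedged sketch of the dict_param_sequence layout, reconstructed only from the keys read above; the concrete values and nested parameter dictionaries are illustrative assumptions, and the sample size read by get_n_sample() comes from additional keys not shown here.

# Illustrative configuration skeleton; only keys accessed by the function are shown.
dict_param_sequence = {
    'data_type': 'train',
    'root_file_name': './data/dataPreparator_v2',
    'file_format': '.dill',
    'previous_step_file_name': None,
    'step': 1,
    'dict_param_step': {
        1: {
            'dataset_filename': './data/dataset',
            'dict_param_dataprep': {},   # DataPreparator_v2 constructor parameters
        },
        2: None,                         # a None step is skipped
        3: None,
    },
}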
Example No. 6
def dataprepatator_subsequence_process(dict_param_sequence, dataPreparator_v2) :
    '''Concatenate dataframes stored in multiple files.
    Data was processed bulk by bulk in order to save memory resources; all
    files are read and concatenated into a single dataframe.
    
    Input :
        * dict_param_sequence : parameters of the data-preparation sequence,
        which also contain the parameters used to process the sub-steps of a
        step. The current step in which these sub-steps have to be processed 
        is contained in these parameters.
        
        * dataPreparator_v2 : DataPreparator_v2 object whose df_data 
        attribute will be updated.

    Output :
        * the DataPreparator_v2 object whose df_data attribute contains the
        data that was split across the files.
        
    '''
    df_data = None

    #----------------------------------------------------------------------
    # Extract parameters for sub-sequences related to this step.
    #----------------------------------------------------------------------
    step = dict_param_sequence['step']
    dict_step = dict_param_sequence['dict_param_step'][step]
    dict_param_subsequence = dict_step['dict_param_subsequence']
    n_sample = get_n_sample(dict_param_sequence)

    #---------------------------------------------------------------------------
    # Extract the start and the end of the sub-steps.
    # They are in dict_param_subsequence dictionary.
    #---------------------------------------------------------------------------
    start_substep = dict_param_subsequence['start_substep']
    end_substep   = dict_param_subsequence['end_substep']

    if 2 == step :
        bulk_row = dict_step['bulk_row']
        #---------------------------------------------------------------------------
        # Extract parameters that configure this sequence of sub-steps
        #---------------------------------------------------------------------------
        dict_param_step = dict_param_subsequence['dict_param_step']
        
        for substep in range(start_substep,end_substep+1) :
            if 1 == substep :
                total_row = dataPreparator_v2.total_row     
                #-------------------------------------------------------------------
                # In this sub-step, files are read from the hard disk and 
                # aggregated into a dataframe.
                # 
                # Extract parameters that configure this sub-step.
                #-------------------------------------------------------------------
                fixed_count_file = dict_param_step[substep]['fixed_count_file']

                if 0 == total_row :
                    return None
                else :
                    pass

                if fixed_count_file > 0 :
                    count_file = fixed_count_file
                    tail = 0
                else :        
                    count_file = total_row//bulk_row
                    tail = total_row%bulk_row

                print("Step {} : sub-step: {} : dataframe concatenation of {} files".format(step, substep,count_file))
                root_filename = "./data/df_"+str(dict_param_sequence['data_type'])+"_"+str(n_sample)+"_step"+str(step)
                df_data = pd.DataFrame()
                is_intermediate = False
                for i in range(count_file) :
                    is_intermediate = True
                    filename = root_filename+"_"+str(i)+".dill"
                    df = p5_util.object_load(filename, is_verbose=False)
                    df_data = pd.concat([df_data,df])
                    print("Step {} : sub-step: {} : process status : {}/{}".\
                    format(step, substep, i+1,count_file), end='\r')

                if tail > 0 :
                    if is_intermediate :
                        i += 1
                    else :
                        i = 0
                    #-----------------------------------------------------------
                    # Load the tail file, then concatenate it.
                    #-----------------------------------------------------------
                    filename = root_filename+"_"+str(i)+".dill"
                    df = p5_util.object_load(filename, is_verbose=False)
                    df_data = pd.concat([df_data,df])
            else :
                print("\n*** ERROR : Step : {} / sub-step={} not yet supported".format(step, substep))
                return None
            #-----------------------------------------------------------------------
            # Drop unused columns from df_data in order to save memory
            #-----------------------------------------------------------------------
            if 'vector' in df_data.columns : 
                del(df_data['vector'])
            
            if 'tokens' in df_data.columns : 
                del(df_data['tokens'])
                
            if 'counting' in df_data.columns : 
                del(df_data['counting'])
        
            #-----------------------------------------------------------------------
            # Update df_data attribute with concatenated dataframe.
            #-----------------------------------------------------------------------
            dataPreparator_v2.df_data = df_data.copy()

    elif 3 == step:	
        for substep in range(start_substep,end_substep+1) :
            method = dict_param_subsequence['dict_param_step'][substep]['method']
            if 1 == substep :
                parameter = dict_step['ipca_batch_size']
            elif 2 == substep :
                parameter = dict_step['percent_var']
            else  :
                print("\n*** ERROR : dataprepatator_subsequence_process() : sub-step= {} not yet supported!".format(substep))
                return None
            method(dataPreparator_v2, parameter)
    else :
        print("\n*** ERROR : no sub-steps supported for step = {}".format(step))
        return None
    
    
        
    return dataPreparator_v2
Example No. 7
def build_generator(dataPreparator, \
                     dict_param_generator,\
                     data_column_name='vector') :
                     
    '''Build a DataGenerator object from a DataPreparator object.
    The DataGenerator object feeds Keras neural-network estimators by pumping
    data recorded in a set of files named partitions.

    This saves RAM thanks to the use of the hard disk.
    
    The DataGenerator object contains all the information required to access
    the data stored in the partition files.
    
    
    Input : 
        * dataPreparator : object containing the digitalized dataset along with 
        the operators used for digitalization.
        * dict_param_generator : dictionary of parameters used to construct the 
        DataGenerator.
        * data_column_name : name of the column of the dataPreparator dataframe 
        that contains the digitalized dataset.
    Output :
        * a DataGenerator object.
    '''
    
    data_type = dict_param_generator['data_type']

    #---------------------------------------------------------------------------
    # Check the consistency of the inputs
    #---------------------------------------------------------------------------
    if data_type is None :
        print("\n*** ERROR : build_generator() : unknown data type!")
        return None
    
    if (data_type != "train") and (data_type != "valid"):
        print("\n*** ERROR : build_generator() : Unknown data_type= {}! Supported data_type : train or valid".format(data_type))
        return None
        
    if (dataPreparator.list_df_data_file is None) or (0 == len(dataPreparator.list_df_data_file)) :

        #-----------------------------------------------------------------------
        # All data is stored into dataPreparator dataframe.
        # Retrieve data from DataPreparator dataframes.
        #-----------------------------------------------------------------------
        if data_column_name in dataPreparator.df_data.columns :
            X = np.array(dataPreparator.df_data[data_column_name].tolist()) 
            y = np.array(dataPreparator.df_data.target.tolist())
        else :
            print("\n***ERROR : column name \'{}\' out of train dataframe !".\
            format(data_column_name))
            return None, None
        
        #-----------------------------------------------------------------------
        # Make partitions
        #-----------------------------------------------------------------------
        print(y[:10])
        partition_size = dict_param_generator['partition_size']
        dict_partition, dict_label = p9_util.make_partition(X, \
                                                y, \
                                                partition_size,\
                                                data_type=data_type, \
                                                data_format='ndarray' )

        if (dict_partition is None) or (dict_label is None) :
            print("\n*** ERROR : build_generators() : building partitions for data_type= {} dataset failed!".format(data_type))
            return None

        #---------------------------------------------------------------------------
        # Total number of records
        #---------------------------------------------------------------------------
        len_dataset = X.shape[0]

    else : 
        #-----------------------------------------------------------------------
        # All data is stored in files on the hard disk.
        # dataPreparator handles the names of those files.
        # Files are read and partitions are built for each of these files.
        #-----------------------------------------------------------------------
        dict_partition = dict()
        dict_label = dict()
        partition_size = dict_param_generator['partition_size']
        print("\n*** Partition size = {}".format(partition_size))
        len_dataset = 0
        start_row = 0
        for df_data_file in dataPreparator.list_df_data_file :
            df_data = p5_util.object_load(df_data_file, is_verbose=True)
            if data_column_name in df_data.columns :
                X = np.array(df_data[data_column_name].tolist()) 
                end_row = start_row + X.shape[0]
                y = np.array(df_data.target.tolist())
                #y = np.array(df_data.target.tolist())[start_row:end_row]
                #y = np.array(dataPreparator.df_data.target.tolist())[start_row:end_row]

                start_row = end_row
                len_dataset += X.shape[0]
            else :
                print("\n***ERROR : file name= {} : column name \'{}\' out of train dataframe !".\
                format(df_data_file,data_column_name))
                return None, None

            #-------------------------------------------------------------------
            # Make partitions; dict_partition and dict_label are updated in each
            # function call for making partitions
            #-------------------------------------------------------------------
            dict_partition, dict_label = p9_util.make_partition(X, \
                                                    y, \
                                                    partition_size,\
                                                    data_type=data_type, \
                                                    data_format='ndarray',\
                                                    dict_partition = dict_partition,\
                                                    dict_label = dict_label,\
                                                    is_debug=False)
            
    #---------------------------------------------------------------------------
    # Build data generators
    #---------------------------------------------------------------------------
    dataGenerator = DataGenerator.DataGenerator(dict_partition, \
                                                dict_label, \
                                                len_dataset,\
                                                **dict_param_generator)
        
    #---------------------------------------------------------------------------
    # Save DataGenerator
    #---------------------------------------------------------------------------
    filename = "./data/"+str(data_type)+"_generator.dill"
    p5_util.object_dump(dataGenerator, filename)
    return dataGenerator
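A hedged usage sketch for build_generator(); the DataPreparator dump path is illustrative, and only the 'data_type' and 'partition_size' keys are read by this function itself, the remaining generator parameters being forwarded to DataGenerator (in practice p9_util_config.dict_param_generator provides the full set).

# Illustrative path of a dumped DataPreparator object.
dataPreparator = p5_util.object_load('./data/dataPreparator_v2_train.dill')

dict_param_generator = {
    'data_type': 'train',       # 'train' or 'valid'
    'partition_size': 1000,     # number of records per partition file
    # ...plus whatever keyword arguments DataGenerator.DataGenerator expects.
}

train_generator = build_generator(dataPreparator,
                                  dict_param_generator,
                                  data_column_name='matrix_padded')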
Example No. 8
def load_dataset(filename, dataset_type='P7', is_label_encoded=False) :
    '''Load a dataset from the file name given as a function parameter.
    Supported dataset types : 'P7', 'MNIST', 'JIGSAW'.
    '''
    # The is_label_encoded parameter is overridden by the global configuration.
    is_label_encoded = p8_util_config.IS_LABEL_ENCODED
    if dataset_type == 'P7' :
        (x_train,x_test, y_train, y_test) = p5_util.object_load(filename)
        
        number = x_train.shape[0]
        if p8_util_config.NN_TYPE == 'RNN' :
            x_train = batch_coloredimage_serial_reshape(x_train)
            x_test  = batch_coloredimage_serial_reshape(x_test)
        else :
            pass
        
        print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)
        
        if is_label_encoded :
            y_train=array_label_encode_from_index(y_train)
            y_test=array_label_encode_from_index(y_test)
            nClasses = max(len(np.unique(y_train)), len(np.unique(y_test)))
        else :
            nClasses = y_train.shape[1]
    elif dataset_type == 'MNIST' :
        x_train, x_valid, x_test, y_train, y_valid, y_test = load_data_mnist()
        nClasses = max(len(np.unique(y_train)), len(np.unique(y_test)))
        if False:
            y_train=array_label_encode_binary(y_train)
            y_test=array_label_encode_binary(y_test)
            y_valid=array_label_encode_binary(y_valid)
            nClasses = y_train.shape[1]
    elif dataset_type == 'JIGSAW' :
        nClasses = 0
        if False :
            sampling_ratio = p8_util_config.SAMPLING_RATIO
            X_train, y_train, X_test, y_test = \
            load_dataset_jigsaw(sampling_ratio=sampling_ratio)
            
            x_train, x_test = preprocess_dataset_jigsaw(X_train, X_test)    
        else :
            filename = './data/X_train_encoded.dump'
            x_train = p5_util.object_load(filename)

            filename = './data/X_test_encoded.dump'
            x_test = p5_util.object_load(filename)

            filename = './data/y_test.dump'
            y_test = p5_util.object_load(filename)
            if type(y_test) is list :
                y_test = np.array(y_test)

            filename = './data/y_train.dump'
            y_train = p5_util.object_load(filename)
            if type(y_train) is list :
                y_train = np.array(y_train)
    else :
        print("\n*** ERROR : Unknown dataset_type= {}!".format(dataset_type))
        return None
    
    #w_size = x_train.shape[1]
    #h_size = x_train.shape[2]        

    tuple_dimension = (x_train.shape,x_test.shape,y_train.shape,y_test.shape)
    
    #print("Dimensions= {}".format(tuple_dimension))
    #print("Number of classes= "+str(nClasses))
    if dataset_type == 'MNIST' :
        return x_train, x_test, y_train, y_test,  nClasses \
        ,tuple_dimension[0][1:]        

    return x_train, x_test, y_train, y_test, nClasses,tuple_dimension[0][1:]
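A hedged usage sketch for load_dataset(), with an illustrative dump file path:

# Illustrative path; for 'P7' the dump is expected to contain
# (x_train, x_test, y_train, y_test).
filename = './data/dataset_p7.dump'

x_train, x_test, y_train, y_test, nClasses, input_shape = \
    load_dataset(filename, dataset_type='P7')

print("Classes= {} / Input shape= {}".format(nClasses, input_shape))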