def _get_y(self):
    if self.COLUMN_NAME_TARGET not in self.df_data.columns:
        if (self.list_df_data_file is None) \
        or (0 == len(self.list_df_data_file)):
            print("\n*** WARNING : No target!!")
            return None
        else:
            #-------------------------------------------------------------------
            # Data is stored in files on hard disk: rebuild the target array
            # by reading each dataframe file in sequence.
            #-------------------------------------------------------------------
            y = np.zeros(self.total_row)
            start_row = 0
            for df_data_file in self.list_df_data_file:
                df_data = p5_util.object_load(df_data_file)
                end_row = start_row + len(df_data)
                y[start_row:end_row] = p9_util.convert_ser2arr(
                    df_data[self.COLUMN_NAME_TARGET])
                start_row = end_row
            return y
    else:
        return p9_util.convert_ser2arr(self.df_data[self.COLUMN_NAME_TARGET])
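#-------------------------------------------------------------------------------
# Illustration only : a minimal, self-contained sketch of the chunk-assembly
# pattern used by _get_y() above. It assumes a list of in-memory dataframes
# (standing in for the dataframes loaded from disk with p5_util.object_load)
# and a 'target' column; the function name and data below are hypothetical and
# not part of the project API.
#-------------------------------------------------------------------------------
def _example_assemble_target_from_chunks():
    import pandas as pd

    list_df_chunk = [pd.DataFrame({'target': [0, 1, 1]}),
                     pd.DataFrame({'target': [1, 0]})]
    total_row = sum(len(df) for df in list_df_chunk)

    y = np.zeros(total_row)
    start_row = 0
    for df_chunk in list_df_chunk:
        end_row = start_row + len(df_chunk)
        # Each chunk fills its own slice of the pre-allocated target array.
        y[start_row:end_row] = df_chunk['target'].values
        start_row = end_row
    return y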
def build_model(dict_param_benchmark, dict_param_keras_cnn):
    '''Build either the submission model or the benchmark model, depending on
    the configuration given as function parameters.

    Input :
        * dict_param_benchmark : parameters for both the submission and the
          benchmark models.
        * dict_param_keras_cnn : CNN hyper-parameters used to build either the
          benchmark or the submission model.
    Output :
        * the compiled CNN model
        * list of callbacks activated by the Keras package in order to process
          additional operations.
    '''
    dict_param_keras = dict_param_keras_cnn['dict_param_keras']

    input_shape  = dict_param_keras['input_dim']
    nbClasses    = dict_param_keras['nbClasses']
    dropout_rate = dict_param_keras['dropout_rate']
    lr           = dict_param_keras['lr']

    #---------------------------------------------------------------------------
    # Hard-coded parameters
    #---------------------------------------------------------------------------
    is_batch_norm = dict_param_keras['is_batch_normalized']

    if dict_param_benchmark is None:
        model_type = 'submission'
    else:
        model_type = dict_param_benchmark['model_type']

    if (dict_param_benchmark is not None) and dict_param_benchmark['is_embedding_layer']:
        filename_tokenizer = p9_util_benchmark.build_filename_tokenizer(
            dict_param_benchmark=dict_param_benchmark)
        tokenizer = p5_util.object_load(filename_tokenizer)

        sequence_input = Input(shape=(input_shape[0],), dtype='int32')
        embeddings_dimension = input_shape[1]
        embedding_matrix = build_embeddings_matrix(
            tokenizer, dict_param_benchmark_=dict_param_benchmark)
        embedding_layer = Embedding(len(tokenizer.word_index) + 1,
                                    embeddings_dimension,
                                    weights=[embedding_matrix],
                                    input_length=input_shape[0],
                                    trainable=False)
        x = embedding_layer(sequence_input)
    else:
        x = Input(shape=input_shape)
        sequence_input = x

    preds = None
    if 'submission' == model_type:
        if is_batch_norm:
            x = BatchNormalization()(x)
        x = Conv1D(256, 2, activation='relu', padding='same')(x)
        x = MaxPooling1D(5, padding='same')(x)

        if is_batch_norm:
            x = BatchNormalization()(x)
        x = Conv1D(256, 3, activation='relu', padding='same')(x)
        x = MaxPooling1D(5, padding='same')(x)

        if is_batch_norm:
            x = BatchNormalization()(x)
        x = Conv1D(256, 4, activation='relu', padding='same')(x)
        x = MaxPooling1D(2, padding='same')(x)

        if False:
            #-------------------------------------------------------------------
            # Disabled : additional convolution block (kernel size 5).
            #-------------------------------------------------------------------
            if is_batch_norm:
                x = BatchNormalization()(x)
            x = Conv1D(256, 5, activation='relu', padding='same')(x)
            x = MaxPooling1D(2, padding='same')(x)

        if is_batch_norm:
            x = BatchNormalization()(x)
        x = Conv1D(256, 6, strides=1, activation='relu', padding='same')(x)
        x = MaxPooling1D(2, padding='same')(x)

        x = Flatten()(x)
        if is_batch_norm:
            x = BatchNormalization()(x)
        x = Dropout(dropout_rate)(x)
        x = Dense(128, activation='relu')(x)
        preds = Dense(2, activation='softmax')(x)

    elif 'benchmark' == model_type:
        x = Conv1D(128, 2, activation='relu', padding='same')(x)
        x = MaxPooling1D(5, padding='same')(x)
        x = Conv1D(128, 3, activation='relu', padding='same')(x)
        x = MaxPooling1D(5, padding='same')(x)
        x = Conv1D(128, 4, activation='relu', padding='same')(x)
        x = MaxPooling1D(40, padding='same')(x)
        x = Flatten()(x)
        x = Dropout(dropout_rate)(x)
        x = Dense(128, activation='relu')(x)
        preds = Dense(2, activation='softmax')(x)
    else:
        print("\n*** ERROR : Unknown model type = {}".format(model_type))

    #---------------------------------------------------------------------------
    # Build a compiled model from input and output layers.
    #---------------------------------------------------------------------------
    list_callback = list()
    if (sequence_input is not None) and (preds is not None):
        model = Model(sequence_input, preds)
        model.compile(loss='binary_crossentropy',
                      optimizer=RMSprop(lr=lr),
                      metrics=['acc'])
        model.summary()
        if (dict_param_benchmark is not None) and dict_param_benchmark['val_score_max']:
            #-------------------------------------------------------------------
            # Checkpoint : keep the model with the best validation accuracy.
            #-------------------------------------------------------------------
            filepath = p9_util_benchmark.build_filename_model(
                dict_param_benchmark=dict_param_benchmark)
            checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1,
                                         save_best_only=True, mode='max')
            list_callback = [checkpoint]
        else:
            list_callback = None
    else:
        model = None

    return model, list_callback
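#-------------------------------------------------------------------------------
# Usage sketch (illustration only) : build and train the submission model with
# a hand-written hyper-parameter dictionary. The key names follow the ones read
# by build_model(); the values are assumptions, since real ones normally come
# from the project configuration. train_generator / valid_generator would be
# DataGenerator objects such as those returned by build_generator() below.
#-------------------------------------------------------------------------------
def _example_build_model_usage(train_generator, valid_generator):
    dict_param_keras_cnn = {
        'dict_param_keras': {
            'input_dim': (250, 32),        # (max_length, nb_feature), assumed
            'nbClasses': 2,
            'dropout_rate': 0.5,
            'lr': 1.e-3,
            'is_batch_normalized': True,
        }
    }
    # dict_param_benchmark=None selects the 'submission' architecture.
    model, list_callback = build_model(None, dict_param_keras_cnn)

    # Keras 2.x generator-based training; the number of epochs is an assumption.
    model.fit_generator(generator=train_generator,
                        validation_data=valid_generator,
                        epochs=10,
                        callbacks=list_callback)
    return model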
def build_embeddings_matrix(tokenizer, dict_param_benchmark_=None):
    '''Build an embedding matrix from a tokenizer and configuration parameters.

    The embedding matrix has the following structure :
        --> Rows are words (tokens) issued from a corpus of texts.
        --> Columns are the embedding vector coefficients.

    The embedding matrix is saved to hard disk before its value is returned.
    The embedding matrix file name is built from the dictionary of
    configuration parameters.

    Inputs :
        * tokenizer : operator that has been used in the corpus tokenization
          process. It contains the vocabulary issued from corpus tokenization.
          Rows of the resulting embedding matrix are issued from the tokenizer
          vocabulary.
        * dict_param_benchmark_ : parameters structured as a dictionary. When
          this value is None, the dictionary defined in this file is used.
    Output :
        * embedding matrix that represents a digitalized corpus.
    '''
    if dict_param_benchmark_ is None:
        embeddings_dimension = EMBEDDINGS_DIMENSION
    else:
        embeddings_dimension = dict_param_benchmark_['embeddings_dimension']

    #---------------------------------------------------------------------------
    # Load embeddings
    #---------------------------------------------------------------------------
    if dict_param_benchmark_ is None or not dict_param_benchmark_['is_embedding_reloaded']:
        #-----------------------------------------------------------------------
        # When configuration is undefined or the flag to reload the embedding
        # matrix is False, the matrix is built from the embeddings file.
        #-----------------------------------------------------------------------
        print('Loading embeddings...')
        embeddings_index = {}
        with open(EMBEDDINGS_PATH) as f:
            for line in f:
                values = line.split()
                word = values[0]
                coefs = np.asarray(values[1:], dtype='float32')
                embeddings_index[word] = coefs

        #-----------------------------------------------------------------------
        # Build embeddings
        #-----------------------------------------------------------------------
        print('Building embeddings...')
        embedding_matrix = np.zeros(
            (len(tokenizer.word_index) + 1, embeddings_dimension))
        num_words_in_embedding = 0
        for word, i in tokenizer.word_index.items():
            embedding_vector = embeddings_index.get(word)
            if embedding_vector is not None:
                num_words_in_embedding += 1
                embedding_matrix[i] = embedding_vector
            else:
                #---------------------------------------------------------------
                # Words not found in the embedding index remain all-zeros.
                #---------------------------------------------------------------
                pass

        #-----------------------------------------------------------------------
        # Save embedding matrix
        #-----------------------------------------------------------------------
        filename_embedding_matrix = p9_util_benchmark.build_filename_embedding_matrix(
            dict_param_benchmark=dict_param_benchmark_)
        p5_util.object_dump(embedding_matrix, filename_embedding_matrix,
                            is_verbose=True)
    else:
        #-----------------------------------------------------------------------
        # Load embedding matrix
        #-----------------------------------------------------------------------
        print('Loading embeddings matrix...')
        filename_embedding_matrix = p9_util_benchmark.build_filename_embedding_matrix(
            dict_param_benchmark=dict_param_benchmark_)
        embedding_matrix = p5_util.object_load(filename_embedding_matrix,
                                               is_verbose=True)

    return embedding_matrix
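#-------------------------------------------------------------------------------
# Illustration only : the core pattern implemented by build_embeddings_matrix(),
# reduced to a self-contained sketch. The embeddings file path and dimension
# are assumptions; a real call relies on EMBEDDINGS_PATH and the configuration
# dictionary handled above.
#-------------------------------------------------------------------------------
def _example_embedding_matrix(tokenizer,
                              embeddings_path='./data/glove.6B.100d.txt',
                              embeddings_dimension=100):
    embeddings_index = {}
    with open(embeddings_path) as f:
        for line in f:
            values = line.split()
            # First token is the word, remaining tokens are the coefficients.
            embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

    # Row 0 is reserved: Keras Tokenizer word indices start at 1.
    embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, embeddings_dimension))
    for word, i in tokenizer.word_index.items():
        vector = embeddings_index.get(word)
        if vector is not None:
            embedding_matrix[i] = vector
    return embedding_matrix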
def build_cnn_datagenerator(filename_datapreparator=None,
                            percent_var=1.0,
                            dataPreparator_v2=None,
                            dict_param_generator=None):
    '''Build a DataGenerator object for a Keras CNN model.

    The process includes :
        * the load of a DataPreparator_v2 object,
        * the build of the DataGenerator object used by Keras as a source
          of data.

    The configuration file p9_util_config.py is used to read configuration
    and hyper-parameters.

    Inputs :
        * filename_datapreparator : file name of the dumped DataPreparator
          object. Such an object is issued from the data-preparation process.
        * percent_var : percentage of variance expected. This is used to build
          a CNN architecture with multiple channels. The number of channels
          depends on the number of PCA dimensions fixed for the dataset.
        * dataPreparator_v2 : DataPreparator_v2 object used for data
          preparation.
        * dict_param_generator : parameters for the CNN data generator. When
          None, then p9_util_config.dict_param_generator is used.
    Output :
        * DataGenerator object.
    '''
    if dataPreparator_v2 is None:
        if filename_datapreparator is None:
            print("\n*** ERROR : DataPreparator file name undefined!")
            return None
        #-----------------------------------------------------------------------
        # Load DataPreparator object
        #-----------------------------------------------------------------------
        dataPreparator_v2 = p5_util.object_load(filename_datapreparator)
    else:
        print("Using DataPreparator given as parameter...")

    print("\nDataPreparator Dataframe shape= {}".format(dataPreparator_v2.df_data.shape))

    #---------------------------------------------------------------------------
    # Fix DataGenerator parameters
    #---------------------------------------------------------------------------
    if dict_param_generator is None:
        dict_param_generator = p9_util_config.dict_param_generator

    #---------------------------------------------------------------------------
    # Update parameters depending on DataPreparator_v2
    #---------------------------------------------------------------------------
    dict_param_generator['binary_threshold'] = dataPreparator_v2.threshold

    #---------------------------------------------------------------------------
    # Extract the PCA operator in order to get the number of components
    # matching a given percentage of variance.
    #---------------------------------------------------------------------------
    pca = dataPreparator_v2.xpca
    if pca is not None:
        nb_component = p3_util_plot.get_component_from_cum_variance(pca, percent_var)
        print("\nComponents= {} for variance= {}%".format(nb_component, percent_var * 100))
        dict_param_generator['keras_nb_channel'] = nb_component
    else:
        dict_param_generator['keras_nb_channel'] = 0

    #---------------------------------------------------------------------------
    # Input dim for the CNN network is defined by the tuple :
    # (number_of_measures, number_of_features).
    #
    # For text classification : (number_of_tokens, embedding_dim)
    #
    # In case of multiplexed dimensions, for a given dimension, each token is
    # assigned a coefficient that is the value of this digitalized token over
    # this given dimension. Then, for each text, there are
    # dataPreparator_v2.max_length tokens and the input dim is
    # (dataPreparator_v2.max_length, 1).
    #
    # When embedding dimensions are not multiplexed :
    #   If dimensions are truncated using the PCA operator, input dimensions
    #   are : (dataPreparator_v2.max_length,
    #          df_data['matrix_padded_truncated'].iloc[0].shape[1])
    #
    #   If embedding dimensions are not truncated, input dimensions are :
    #   (dataPreparator_v2.max_length, df_data['matrix_padded'].iloc[0].shape[1])
    #---------------------------------------------------------------------------
    if dict_param_generator['is_dimension_mux']:
        nb_feature = 1
        data_column_name = 'matrix_padded_truncated'
    else:
        if 'matrix_padded_truncated' in dataPreparator_v2.df_data.columns:
            nb_feature = dataPreparator_v2.df_data['matrix_padded_truncated'].iloc[0].shape[1]
            data_column_name = 'matrix_padded_truncated'
        elif 'matrix_padded' in dataPreparator_v2.df_data.columns:
            nb_feature = dataPreparator_v2.df_data['matrix_padded'].iloc[0].shape[1]
            data_column_name = 'matrix_padded'
        else:
            if 0 < len(dataPreparator_v2.list_df_data_file):
                #---------------------------------------------------------------
                # Data has been recorded on hard disk. Search whether the
                # column name exists in the dataframe read from file.
                #---------------------------------------------------------------
                filename = dataPreparator_v2.list_df_data_file[-1]
                df_data = p5_util.object_load(filename)
                if 'matrix_padded_truncated' in df_data.columns:
                    nb_feature = df_data['matrix_padded_truncated'].iloc[0].shape[1]
                    data_column_name = 'matrix_padded_truncated'
                elif 'matrix_padded' in df_data.columns:
                    nb_feature = df_data['matrix_padded'].iloc[0].shape[1]
                    data_column_name = 'matrix_padded'
                else:
                    print("\n*** ERROR : build_cnn_datagenerator(): no column named \'matrix_padded_truncated\' nor \'matrix_padded\'; CNN input dimension is undefined!")
                    return None
            else:
                print("\n*** ERROR : build_cnn_datagenerator(): no column named \'matrix_padded_truncated\' nor \'matrix_padded\'; CNN input dimension is undefined!")
                return None

    keras_input_dim = (dataPreparator_v2.max_length, nb_feature)
    dict_param_generator['keras_input_dim'] = keras_input_dim

    print("")
    for key, value in dict_param_generator.items():
        print("{} : {}".format(key, value))

    print("\nBuilding datagenerator...")
    generator = test_datapreparator.build_generator(dataPreparator_v2,
                                                    dict_param_generator,
                                                    data_column_name=data_column_name)
    return generator
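#-------------------------------------------------------------------------------
# Usage sketch (illustration only) : build train and validation generators from
# dumped DataPreparator_v2 files. The file names below are assumptions; the
# generator parameters come from p9_util_config.dict_param_generator, with the
# 'data_type' key overridden per split since build_generator() reads it.
#-------------------------------------------------------------------------------
def _example_build_generators():
    dict_param_train = dict(p9_util_config.dict_param_generator, data_type='train')
    dict_param_valid = dict(p9_util_config.dict_param_generator, data_type='valid')

    train_generator = build_cnn_datagenerator(
        filename_datapreparator='./data/DataPreparator_v2_train.dill',
        percent_var=0.95,
        dict_param_generator=dict_param_train)
    valid_generator = build_cnn_datagenerator(
        filename_datapreparator='./data/DataPreparator_v2_valid.dill',
        percent_var=0.95,
        dict_param_generator=dict_param_valid)
    return train_generator, valid_generator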
def datapreparation_step_process(dict_param_sequence=None, is_debug=False):
    '''Process one step of the automated data-preparation sequence described in
    dict_param_sequence, then save the resulting DataPreparator_v2 object into
    a file whose name is returned.
    '''
    if dict_param_sequence is None:
        print("\n*** ERROR : parameter not defined for function!")
        return None

    #---------------------------------------------------------------------------
    # Build backup / restore file name
    #---------------------------------------------------------------------------
    data_type = dict_param_sequence["data_type"]
    root_file_name = dict_param_sequence["root_file_name"]
    n_sample = get_n_sample(dict_param_sequence)
    if n_sample is None:
        return None
    file_format = dict_param_sequence["file_format"]

    #---------------------------------------------------------------------------
    # Get step number, then step parameters, and process it.
    #---------------------------------------------------------------------------
    dict_param_step = dict_param_sequence['dict_param_step']
    step = dict_param_sequence['step']
    dict_step = dict_param_step[step]
    if dict_step is None:
        print("\n*** INFO : step= {} of automated process skipped!".format(step))
        return None
    dict_param_dataprep = dict_step['dict_param_dataprep']

    #---------------------------------------------------------------------------
    # Steps processing
    #---------------------------------------------------------------------------
    if 1 == step:
        root_dataset_filename = dict_step["dataset_filename"]
        if root_dataset_filename is None:
            print("\n*** ERROR : step= {}: dataset filename for this step not provided!".format(step))
            return None

        #-----------------------------------------------------------------------
        # Load dataset
        #-----------------------------------------------------------------------
        dataset_filename = root_dataset_filename + '_' + str(data_type) + str(file_format)
        if not is_debug:
            ser_X_data_type, ser_y_data_type = p5_util.object_load(dataset_filename)

            X_data_type = ser_X_data_type.sample(n_sample)
            index = X_data_type.index
            y_data_type = ser_y_data_type[index]

            X_data_type = X_data_type.tolist()
            y_data_type = y_data_type.tolist()

            #-------------------------------------------------------------------
            # Apply the operations of this step using the fit_transform_step()
            # method.
            #-------------------------------------------------------------------
            dataPreparator_v2 = DataPreparator_v2.DataPreparator_v2(**dict_param_dataprep)
            X = dataPreparator_v2.fit_transform_step(X_data_type, y_data_type)

        filename = root_file_name + '_' + str(data_type) + '_' + str(n_sample) \
                   + '_step' + str(step) + str(file_format)

        #-----------------------------------------------------------------------
        # Save step result
        #-----------------------------------------------------------------------
        if not is_debug:
            p5_util.object_dump(dataPreparator_v2, filename)
            print("\nStep : {} Save data-preparation into file {}".format(step, filename))
        return filename

    elif 2 == step:
        print("\nStep : {}".format(step))
        #-----------------------------------------------------------------------
        # Not used. Already at None value.
        #-----------------------------------------------------------------------
        filename = dict_step['dataprep_step_filename']
        if filename is None:
            #-------------------------------------------------------------------
            # Select file issued from the previous step.
            #-------------------------------------------------------------------
            filename = dict_param_sequence["previous_step_file_name"]
            if filename is None:
                print("\n*** ERROR : step= {}: filename for this step not provided!".format(step))
                return None

        #-----------------------------------------------------------------------
        # Load DataPreparator_v2 object from step 1
        #-----------------------------------------------------------------------
        if not is_debug:
            dataPreparator_v2 = p5_util.object_load(filename, is_verbose=True)

        #-----------------------------------------------------------------------
        # Check whether this step already took place; in that case, apply a
        # sequence of sub-steps.
        #-----------------------------------------------------------------------
        dict_param_subsequence = dict_step['dict_param_subsequence']
        start_substep = dict_param_subsequence['start_substep']

        #-----------------------------------------------------------------------
        # Build the file name used to save the DataPreparator_v2 object
        #-----------------------------------------------------------------------
        filename = root_file_name + '_' + str(data_type) + '_' + str(n_sample) \
                   + '_step' + str(step) + str(file_format)

        if 0 < start_substep:
            #-------------------------------------------------------------------
            # Step 2 already took place. Apply sub-steps from step 2.
            # The sub-step sequence is described in the dict_param_sequence
            # dictionary.
            #-------------------------------------------------------------------
            dataPreparator_v2 = \
                dataprepatator_subsequence_process(dict_param_sequence,
                                                   dataPreparator_v2)
            if not is_debug:
                print("\nStep {} : Save data-preparation into file {} ...".format(step, filename))
                p5_util.object_dump(dataPreparator_v2, filename)
        else:
            #-------------------------------------------------------------------
            # If the step 2 transformation didn't take place or was not
            # completed, then it is processed here.
            #
            # When bulk_row is > 0, the transformation proceeds bulk by bulk,
            # saving the result of each bulk into a file.
            #
            # This bulk by bulk process may be interrupted. It may then be
            # restarted from the value id_bulk_row.
            #
            # Otherwise, when id_bulk_row = 0, the transformation starts from
            # the beginning.
            #-------------------------------------------------------------------
            bulk_row = dict_step['bulk_row']
            dict_restart_step = dict_step['dict_restart_step']
            id_bulk_row = dict_restart_step['id_bulk_row']

            dataPreparator_v2.build_padded_matrix(
                bulk_row=bulk_row,
                root_filename='./data/df_' + str(data_type) + '_' + str(n_sample) + '_step' + str(step),
                id_bulk_row=id_bulk_row)

            if 0 != bulk_row:
                #---------------------------------------------------------------
                # Data has been saved into files.
                # Clean the dataframe in case of bulk by bulk process.
                #---------------------------------------------------------------
                dataPreparator_v2.total_row = dataPreparator_v2.df_data.shape[0]
                dataPreparator_v2.df_data = pd.DataFrame()

            if not is_debug:
                print("\nStep {} : Save data-preparation into file {} ...".format(step, filename))
                p5_util.object_dump(dataPreparator_v2, filename)
                print("\nStep {} : Save data-preparation into file {} Done!".format(step, filename))

            if 0 != bulk_row:
                print("\nStep {} : Step by step data-preparation : restart process with (step, substep, previous file) = (2,1,{}) in configuration file!".format(step, filename))
        return filename

    elif 3 == step:
        print("\nStep : {}".format(step))
        ipca_batch_size = dict_step['ipca_batch_size']
        percent_var = dict_step['percent_var']
        if (percent_var > 1.) or (percent_var <= 0.):
            print("\n*** ERROR : step= {}: percent_var has to belong to interval ]0.,1.]; current value= {}".format(step, percent_var))
            return None

        filename = dict_step['dataprep_step_filename']
        if filename is None:
            filename = dict_param_sequence["previous_step_file_name"]
            if filename is None:
                print("\n*** ERROR : step= {}: filename for this step not provided!".format(step))
                return None

        if not is_debug:
            #-------------------------------------------------------------------
            # Load dataPreparator_v2 object from step 2
            #-------------------------------------------------------------------
            dataPreparator_v2 = p5_util.object_load(filename)

            #-------------------------------------------------------------------
            # PCA operator for dimension reduction.
            #-------------------------------------------------------------------
            xpca = dict_step['xpca']
            if xpca is not None:
                dataPreparator_v2.xpca = xpca

        #-----------------------------------------------------------------------
        # Check whether this step already took place; in that case, apply a
        # sequence of sub-steps.
        #-----------------------------------------------------------------------
        dict_param_subsequence = dict_step['dict_param_subsequence']
        start_substep = dict_param_subsequence['start_substep']
        end_substep = dict_param_subsequence['end_substep']
        if 0 < start_substep:
            #-------------------------------------------------------------------
            # Use 2 separated sub-steps in order to build the PCA operator,
            # then to proceed with dimension reduction.
            #-------------------------------------------------------------------
            if not is_debug:
                dataPreparator_v2 = \
                    dataprepatator_subsequence_process(dict_param_sequence,
                                                       dataPreparator_v2)
        else:
            #-------------------------------------------------------------------
            # In a single step, build the PCA operator and proceed with
            # dimension reduction.
            #-------------------------------------------------------------------
            if not is_debug:
                dataPreparator_v2.build_matrix_padded_truncated(ipca_batch_size,
                                                                percent_var)

        if len(dict_param_step) == step:
            if 0 < start_substep:
                if len(dict_param_subsequence['dict_param_step']) == end_substep:
                    #-----------------------------------------------------------
                    # Last step of the sequence and last sub-step of the
                    # sub-sequence: the file name has no step value extension.
                    #-----------------------------------------------------------
                    filename = root_file_name + '_' + str(data_type) + '_' + str(n_sample) + str(file_format)
                else:
                    #-----------------------------------------------------------
                    # Last step of the sequence but not the last sub-step of
                    # the sub-sequence: the file name has a step value
                    # extension.
                    #-----------------------------------------------------------
                    filename = root_file_name + '_' + str(data_type) + '_' \
                               + str(n_sample) + '_step' + str(step) + '_substep' \
                               + str(start_substep) + str(file_format)
            else:
                #---------------------------------------------------------------
                # Last step and no sub-step: the file name is the final one,
                # without any step value extension.
                #---------------------------------------------------------------
                filename = root_file_name + '_' + str(data_type) + '_' + str(n_sample) + str(file_format)
        else:
            filename = root_file_name + '_' + str(data_type) + '_' + str(n_sample) \
                       + '_step' + str(step) + str(file_format)

        if not is_debug:
            print("\nStep {} : Save data-preparation into file {} ...".format(step, filename))
            p5_util.object_dump(dataPreparator_v2, filename)
            print("\nStep {} : Save data-preparation into file {} Done!".format(step, filename))
        return filename
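#-------------------------------------------------------------------------------
# Usage sketch (illustration only) : drive the data-preparation steps in
# sequence, feeding each step with the file produced by the previous one. Only
# the keys read by datapreparation_step_process() are touched here; the full
# dict_param_sequence skeleton is assumed to come from the configuration module
# and is not reproduced.
#-------------------------------------------------------------------------------
def _example_run_datapreparation_sequence(dict_param_sequence):
    previous_filename = None
    for step in sorted(dict_param_sequence['dict_param_step'].keys()):
        dict_param_sequence['step'] = step
        dict_param_sequence['previous_step_file_name'] = previous_filename
        filename = datapreparation_step_process(dict_param_sequence)
        if filename is None:
            # Step skipped or failed : keep the previous file for the next step.
            continue
        previous_filename = filename
    return previous_filename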
def dataprepatator_subsequence_process(dict_param_sequence, dataPreparator_v2):
    '''This function concatenates dataframes stored in multiple files.
    Such files exist because data has been processed bulk by bulk in order to
    save memory resources. All files are read and concatenated into a single
    dataframe.

    Input :
        * dict_param_sequence : parameters of the data-preparation sequence,
          which also contain the parameters used to process the sub-steps of a
          step. The current step in which these sub-steps have to be processed
          is contained in these parameters.
        * dataPreparator_v2 : DataPreparator_v2 object whose df_data attribute
          will be updated.
    Output :
        * DataPreparator_v2 object whose dataframe contains all the data that
          was split into files.
    '''
    df_data = None

    #---------------------------------------------------------------------------
    # Extract parameters of the sub-sequence related to this step.
    #---------------------------------------------------------------------------
    step = dict_param_sequence['step']
    dict_step = dict_param_sequence['dict_param_step'][step]
    dict_param_subsequence = dict_step['dict_param_subsequence']
    n_sample = get_n_sample(dict_param_sequence)

    #---------------------------------------------------------------------------
    # Extract start and end of the sub-steps.
    # They are stored in the dict_param_subsequence dictionary.
    #---------------------------------------------------------------------------
    start_substep = dict_param_subsequence['start_substep']
    end_substep = dict_param_subsequence['end_substep']

    if 2 == step:
        bulk_row = dict_step['bulk_row']
        #-----------------------------------------------------------------------
        # Extract parameters that configure this sequence of sub-steps
        #-----------------------------------------------------------------------
        dict_param_step = dict_param_subsequence['dict_param_step']
        for substep in range(start_substep, end_substep + 1):
            if 1 == substep:
                total_row = dataPreparator_v2.total_row
                #---------------------------------------------------------------
                # In this sub-step, files are read from hard disk and
                # aggregated into a dataframe.
                #
                # Extract parameters that configure this sub-step.
                #---------------------------------------------------------------
                fixed_count_file = dict_param_step[substep]['fixed_count_file']
                if 0 == total_row:
                    return None

                if fixed_count_file > 0:
                    count_file = fixed_count_file
                    tail = 0
                else:
                    count_file = total_row // bulk_row
                    tail = total_row % bulk_row

                print("Step {} : sub-step: {} : dataframe concatenation of {} files".format(step, substep, count_file))
                root_filename = "./data/df_" + str(dict_param_sequence['data_type']) + "_" + str(n_sample) + "_step" + str(step)

                df_data = pd.DataFrame()
                is_intermediate = False
                for i in range(count_file):
                    is_intermediate = True
                    filename = root_filename + "_" + str(i) + ".dill"
                    df = p5_util.object_load(filename, is_verbose=False)
                    df_data = pd.concat([df_data, df])
                    print("Step {} : sub-step: {} : process status : {}/{}".format(step, substep, i + 1, count_file), end='\r')

                if tail > 0:
                    #-----------------------------------------------------------
                    # Load the last file holding the remaining rows and append
                    # it to the concatenated dataframe.
                    #-----------------------------------------------------------
                    if is_intermediate:
                        i += 1
                    else:
                        i = 0
                    filename = root_filename + "_" + str(i) + ".dill"
                    df = p5_util.object_load(filename, is_verbose=False)
                    df_data = pd.concat([df_data, df])
            else:
                print("\n*** ERROR : Step : {} / sub-step={} not yet supported".format(step, substep))
                df_data = None

        #-----------------------------------------------------------------------
        # Drop unused columns from df_data in order to save memory
        #-----------------------------------------------------------------------
        if 'vector' in df_data.columns:
            del(df_data['vector'])
        if 'tokens' in df_data.columns:
            del(df_data['tokens'])
        if 'counting' in df_data.columns:
            del(df_data['counting'])

        #-----------------------------------------------------------------------
        # Update the df_data attribute with the concatenated dataframe.
        #-----------------------------------------------------------------------
        dataPreparator_v2.df_data = df_data.copy()

    elif 3 == step:
        for substep in range(start_substep, end_substep + 1):
            method = dict_param_subsequence['dict_param_step'][substep]['method']
            if 1 == substep:
                parameter = dict_step['ipca_batch_size']
            elif 2 == substep:
                parameter = dict_step['percent_var']
            else:
                print("\n*** ERROR : dataprepatator_subsequence_process() : sub-step= {} not yet supported!".format(substep))
                return None
            method(dataPreparator_v2, parameter)
    else:
        print("\n*** ERROR : no sub-steps supported for step = {}".format(step))
        return None

    return dataPreparator_v2
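#-------------------------------------------------------------------------------
# Illustration only : the file-concatenation pattern used in sub-step 1 above,
# reduced to a stand-alone sketch. The root file name and the number of bulk
# files are assumptions.
#-------------------------------------------------------------------------------
def _example_concat_bulk_files(root_filename='./data/df_train_1000_step2',
                               count_file=4):
    df_data = pd.DataFrame()
    for i in range(count_file):
        filename = root_filename + "_" + str(i) + ".dill"
        # Each file holds one bulk of rows produced during the step 2 process.
        df = p5_util.object_load(filename, is_verbose=False)
        df_data = pd.concat([df_data, df])
    return df_data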
def build_generator(dataPreparator,
                    dict_param_generator,
                    data_column_name='vector'):
    '''Builds a DataGenerator object from a DataPreparator object.

    A DataGenerator object feeds Keras neural network estimators by pumping
    data recorded in a set of files named partitions. This saves RAM thanks to
    the use of the hard disk.

    The DataGenerator object contains all the information required to access
    data in the partition files.

    Input :
        * dataPreparator : object containing the digitalized dataset along
          with the operators used for digitalization.
        * dict_param_generator : dictionary of parameters used to construct
          the DataGenerator.
        * data_column_name : name of the dataPreparator dataframe column that
          contains the digitalized dataset.
    Output :
        * DataGenerator object
    '''
    data_type = dict_param_generator['data_type']

    #---------------------------------------------------------------------------
    # Check consistency of inputs
    #---------------------------------------------------------------------------
    if data_type is None:
        print("\n*** ERROR : build_generator() : unknown data type!")
        return None

    if (data_type != "train") and (data_type != "valid"):
        print("\n*** ERROR : build_generator() : Unknown data_type= {}! Supported data_type : train or valid".format(data_type))
        return None

    if (dataPreparator.list_df_data_file is None) or (0 == len(dataPreparator.list_df_data_file)):
        #-----------------------------------------------------------------------
        # All data is stored in the dataPreparator dataframe.
        # Retrieve data from the DataPreparator dataframe.
        #-----------------------------------------------------------------------
        if data_column_name in dataPreparator.df_data.columns:
            X = np.array(dataPreparator.df_data[data_column_name].tolist())
            y = np.array(dataPreparator.df_data.target.tolist())
        else:
            print("\n***ERROR : column name \'{}\' out of train dataframe !".format(data_column_name))
            return None

        #-----------------------------------------------------------------------
        # Make partitions
        #-----------------------------------------------------------------------
        partition_size = dict_param_generator['partition_size']
        dict_partition, dict_label = p9_util.make_partition(X,
                                                            y,
                                                            partition_size,
                                                            data_type=data_type,
                                                            data_format='ndarray')
        if (dict_partition is None) or (dict_label is None):
            print("\n*** ERROR : build_generator() : building partitions for data_type= {} dataset failed!".format(data_type))
            return None

        #-----------------------------------------------------------------------
        # Total number of records
        #-----------------------------------------------------------------------
        len_dataset = X.shape[0]
    else:
        #-----------------------------------------------------------------------
        # All data is stored in files on hard disk.
        # dataPreparator handles the names of those files.
        # Files are read and partitions are built for each of these files.
        #-----------------------------------------------------------------------
        dict_partition = dict()
        dict_label = dict()
        partition_size = dict_param_generator['partition_size']
        print("\n*** Partition size = {}".format(partition_size))

        len_dataset = 0
        start_row = 0
        for df_data_file in dataPreparator.list_df_data_file:
            df_data = p5_util.object_load(df_data_file, is_verbose=True)
            if data_column_name in df_data.columns:
                X = np.array(df_data[data_column_name].tolist())
                end_row = start_row + X.shape[0]
                y = np.array(df_data.target.tolist())
                start_row = end_row
                len_dataset += X.shape[0]
            else:
                print("\n***ERROR : file name= {} : column name \'{}\' out of train dataframe !".format(df_data_file, data_column_name))
                return None

            #-------------------------------------------------------------------
            # Make partitions; dict_partition and dict_label are updated at
            # each call that makes partitions.
            #-------------------------------------------------------------------
            dict_partition, dict_label = p9_util.make_partition(X,
                                                                y,
                                                                partition_size,
                                                                data_type=data_type,
                                                                data_format='ndarray',
                                                                dict_partition=dict_partition,
                                                                dict_label=dict_label,
                                                                is_debug=False)

    #---------------------------------------------------------------------------
    # Build data generator
    #---------------------------------------------------------------------------
    dataGenerator = DataGenerator.DataGenerator(dict_partition,
                                                dict_label,
                                                len_dataset,
                                                **dict_param_generator)

    #---------------------------------------------------------------------------
    # Save DataGenerator
    #---------------------------------------------------------------------------
    filename = "./data/" + str(data_type) + "_generator.dill"
    p5_util.object_dump(dataGenerator, filename)

    return dataGenerator
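#-------------------------------------------------------------------------------
# Usage sketch (illustration only) : the DataGenerator built above is dumped
# into ./data/<data_type>_generator.dill; it can be reloaded later and handed
# to Keras without rebuilding the partitions. The data_type value is an
# assumption.
#-------------------------------------------------------------------------------
def _example_reload_generator(data_type='train'):
    filename = "./data/" + str(data_type) + "_generator.dill"
    dataGenerator = p5_util.object_load(filename)
    return dataGenerator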
def load_dataset(filename, dataset_type='P7', is_label_encoded=False):
    '''Load a dataset from the file name given as function parameter.

    Note : the is_label_encoded parameter is overridden by the
    p8_util_config.IS_LABEL_ENCODED configuration value.
    '''
    is_label_encoded = p8_util_config.IS_LABEL_ENCODED

    if dataset_type == 'P7':
        (x_train, x_test, y_train, y_test) = p5_util.object_load(filename)
        number = x_train.shape[0]
        if p8_util_config.NN_TYPE == 'RNN':
            x_train = batch_coloredimage_serial_reshape(x_train)
            x_test = batch_coloredimage_serial_reshape(x_test)

        print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)
        if is_label_encoded:
            y_train = array_label_encode_from_index(y_train)
            y_test = array_label_encode_from_index(y_test)
            nClasses = max(len(np.unique(y_train)), len(np.unique(y_test)))
        else:
            nClasses = y_train.shape[1]

    elif dataset_type == 'MNIST':
        x_train, x_valid, x_test, y_train, y_valid, y_test = load_data_mnist()
        nClasses = max(len(np.unique(y_train)), len(np.unique(y_test)))
        if False:
            #-------------------------------------------------------------------
            # Disabled : binary encoding of labels.
            #-------------------------------------------------------------------
            y_train = array_label_encode_binary(y_train)
            y_test = array_label_encode_binary(y_test)
            y_valid = array_label_encode_binary(y_valid)
            nClasses = y_train.shape[1]

    elif dataset_type == 'JIGSAW':
        nClasses = 0
        if False:
            #-------------------------------------------------------------------
            # Disabled : rebuild the dataset from raw files.
            #-------------------------------------------------------------------
            sampling_ratio = p8_util_config.SAMPLING_RATIO
            X_train, y_train, X_test, y_test = \
                load_dataset_jigsaw(sampling_ratio=sampling_ratio)
            x_train, x_test = preprocess_dataset_jigsaw(X_train, X_test)
        else:
            filename = './data/X_train_encoded.dump'
            x_train = p5_util.object_load(filename)

            filename = './data/X_test_encoded.dump'
            x_test = p5_util.object_load(filename)

            filename = './data/y_test.dump'
            y_test = p5_util.object_load(filename)
            if type(y_test) is list:
                y_test = np.array(y_test)

            filename = './data/y_train.dump'
            y_train = p5_util.object_load(filename)
            if type(y_train) is list:
                y_train = np.array(y_train)
    else:
        pass

    tuple_dimension = (x_train.shape, x_test.shape, y_train.shape, y_test.shape)

    if dataset_type == 'MNIST':
        return x_train, x_test, y_train, y_test, nClasses, tuple_dimension[0][1:]
    return x_train, x_test, y_train, y_test, nClasses, tuple_dimension[0][1:]
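#-------------------------------------------------------------------------------
# Usage sketch (illustration only) : load the JIGSAW dataset from the encoded
# dump files referenced above. The returned tuple layout follows the function's
# final return statement; shapes are only printed, not checked.
#-------------------------------------------------------------------------------
def _example_load_jigsaw():
    x_train, x_test, y_train, y_test, nClasses, input_dim = \
        load_dataset(None, dataset_type='JIGSAW')
    print("Train/test shapes : {} / {} ; input_dim= {}".format(
        x_train.shape, x_test.shape, input_dim))
    return x_train, x_test, y_train, y_test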