def test_transformers1(self):
    '''
    Prepare labeled data for single sentence BERT classification problem

    COVERAGE: bert_prepare_data() in bert_utils.py
              class BertDMH() in bert_utils.py
    '''
    if self.data_dir is None:
        unittest.TestCase.skipTest(
            self, "DLPY_DATA_DIR is not set in the environment variables")

    if (self.data_dir_local is None) or (not os.path.isdir(
            self.data_dir_local)):
        unittest.TestCase.skipTest(
            self, "DLPY_DATA_DIR_LOCAL is not set in the environment "
            "variables or it does not exist.")

    if not self.necessary_packages_installed:
        unittest.TestCase.skipTest(self, "missing transformers package")

    if not os.path.isfile(
            os.path.join(self.data_dir_local, 'imdb_master.csv')):
        unittest.TestCase.skipTest(
            self, "cannot locate imdb_master.csv in DLPY_DATA_DIR_LOCAL")

    from transformers import BertTokenizer

    model_name = 'bert-base-uncased'

    # instantiate BERT tokenizer
    tokenizer = BertTokenizer.from_pretrained(
        model_name, cache_dir=self.data_dir_local)

    # read dataset for IMDB movie review sentiment classification
    reviews = pd.read_csv(os.path.join(self.data_dir_local,
                                       'imdb_master.csv'),
                          header=0,
                          names=['type', 'review', 'label', 'file'],
                          encoding='latin_1')

    input_label = 'review'   # input data is review text
    target_label = 'label'   # target data is sentiment label

    # extract "train" data
    t_idx1 = reviews['type'] == 'train'
    t_idx2 = reviews['label'] != 'unsup'
    inputs = reviews[t_idx1 & t_idx2][input_label].to_list()
    targets = reviews[t_idx1 & t_idx2][target_label].to_list()

    # limit the number of observations to 1000
    if len(inputs) > 1000:
        inputs = inputs[:1000]
        targets = targets[:1000]

    # strip residual HTML line breaks and create numeric target labels
    for ii, val in enumerate(targets):
        inputs[ii] = inputs[ii].replace("<br />", "")
        if val == 'neg':
            targets[ii] = 1
        elif val == 'pos':
            targets[ii] = 2

    # prepare data
    num_tgt_var, train = bert_prepare_data(self.s,
                                           tokenizer,
                                           128,
                                           input_a=list(inputs),
                                           target=list(targets),
                                           classification_problem=True)

    # check for the existence of the training table
    res = self.s.retrieve('table.tableexists',
                          _messagelevel='error',
                          name=train)
    self.assertTrue(res['exists'] != 0, "Training table not created.")

    # ensure table has the proper number of columns
    res = self.s.retrieve('table.columninfo',
                          _messagelevel='error',
                          table=train)
    self.assertTrue(
        len(res['ColumnInfo']['Column'].to_list()) == 5,
        "Training table has extra/missing columns.")

    # clean up data table if it exists
    try:
        model_tbl_opts = input_table_check(train)
        self.s.table.droptable(quiet=True, **model_tbl_opts)
    except TypeError:
        self.assertTrue(False, "BERT data preparation failed")

    # clean up tokenizer
    del tokenizer
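# The helper below is an illustrative sketch, not part of the test suite and
# never run by it. It condenses the single-sentence classification call
# pattern exercised by test_transformers1() into toy data. The helper name is
# hypothetical; it assumes an active CAS connection `s` and the same
# bert_prepare_data() signature used in the test above.
def _example_bert_prepare_data_classification(s, data_dir_local):
    '''Sketch: minimal bert_prepare_data() call for classification.'''
    from transformers import BertTokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                              cache_dir=data_dir_local)
    inputs = ['a great movie', 'a terrible movie']   # toy review text
    targets = [2, 1]                                 # numeric class labels
    # with a target but no train_fraction, a single table name is returned
    num_tgt_var, train = bert_prepare_data(s, tokenizer, 128,
                                           input_a=inputs,
                                           target=targets,
                                           classification_problem=True)
    return num_tgt_var, train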
def test_transformers3(self):
    '''
    Prepare test data (no labels) for two sentence BERT classification problem

    COVERAGE: bert_prepare_data() in bert_utils.py
              class BertDMH() in bert_utils.py
    '''
    if self.data_dir is None:
        unittest.TestCase.skipTest(
            self, "DLPY_DATA_DIR is not set in the environment variables")

    if (self.data_dir_local is None) or (not os.path.isdir(
            self.data_dir_local)):
        unittest.TestCase.skipTest(
            self, "DLPY_DATA_DIR_LOCAL is not set in the environment "
            "variables or it does not exist.")

    if not self.necessary_packages_installed:
        unittest.TestCase.skipTest(self, "missing transformers package")

    if not os.path.isfile(
            os.path.join(self.data_dir_local, 'qnli_train.tsv')):
        unittest.TestCase.skipTest(
            self, "cannot locate qnli_train.tsv in DLPY_DATA_DIR_LOCAL")

    from transformers import BertTokenizer

    model_name = 'bert-base-uncased'

    # instantiate BERT tokenizer
    tokenizer = BertTokenizer.from_pretrained(
        model_name, cache_dir=self.data_dir_local)

    # read QNLI dataset from the location verified above
    train_data = pd.read_csv(
        os.path.join(self.data_dir_local, 'qnli_train.tsv'),
        header=0,
        sep='\t',
        error_bad_lines=False,
        warn_bad_lines=False,
        names=['index', 'question', 'sentence', 'label'])

    input_a_label = 'question'
    input_b_label = 'sentence'

    input_a = train_data[input_a_label].to_list()
    input_b = train_data[input_b_label].to_list()

    # limit the number of observations to 1000
    if len(input_a) > 1000:
        input_a = input_a[:1000]
        input_b = input_b[:1000]

    # prepare data
    num_tgt_var, test = bert_prepare_data(self.s,
                                          tokenizer,
                                          128,
                                          input_a=input_a,
                                          input_b=input_b,
                                          classification_problem=True)

    # check for the existence of the test table
    res = self.s.retrieve('table.tableexists',
                          _messagelevel='error',
                          name=test)
    self.assertTrue(res['exists'] != 0, "Test table not created.")

    # ensure table has the proper number of columns
    res = self.s.retrieve('table.columninfo',
                          _messagelevel='error',
                          table=test)
    self.assertTrue(
        len(res['ColumnInfo']['Column'].to_list()) == 3,
        "Test table has extra/missing columns.")

    # clean up data table if it exists
    try:
        model_tbl_opts = input_table_check(test)
        self.s.table.droptable(quiet=True, **model_tbl_opts)
    except TypeError:
        self.assertTrue(False, "BERT data preparation failed")

    # clean up tokenizer
    del tokenizer
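# Illustrative sketch only (hypothetical helper name, not run by the test
# suite): the two-sentence, unlabeled call pattern exercised by
# test_transformers3(). With input_b supplied and no target, a single (test)
# table name is returned. Assumes an active CAS connection `s`.
def _example_bert_prepare_data_two_sentence(s, data_dir_local):
    '''Sketch: minimal two-sentence bert_prepare_data() call.'''
    from transformers import BertTokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                              cache_dir=data_dir_local)
    input_a = ['Is this a question?']            # toy questions
    input_b = ['This is a candidate answer.']    # toy sentences
    num_tgt_var, test = bert_prepare_data(s, tokenizer, 128,
                                          input_a=input_a,
                                          input_b=input_b,
                                          classification_problem=True)
    return num_tgt_var, test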
def test_transformers4(self):
    '''
    Load a base BERT model and add classification head.

    COVERAGE: BERT_Model() class in bert_model.py
              all private class functions (e.g. _XXX) in bert_model.py
              compile() in bert_model.py
              load_weights() in bert_model.py
              write_block_information() in bert_utils.py
              get_data_spec() in bert_model.py
              create_data_spec() in bert_utils.py
              generate_target_var_names() in bert_utils.py
              extract_pytorch_parms() in bert_utils.py
              find_pytorch_tensor() in bert_utils.py
    '''
    model_name = 'bert-base-uncased'
    cache_dir = self.data_dir_local

    # check whether h5py is installed, suppressing import noise on stderr
    stderr = sys.stderr
    try:
        sys.stderr = open(os.devnull, 'w')
        import h5py
        h5py_installed = True
    except ImportError:
        h5py_installed = False
    finally:
        sys.stderr = stderr

    if self.data_dir is None:
        unittest.TestCase.skipTest(
            self, "DLPY_DATA_DIR is not set in the environment variables")

    if (self.data_dir_local is None) or (not os.path.isdir(
            self.data_dir_local)):
        unittest.TestCase.skipTest(
            self, "DLPY_DATA_DIR_LOCAL is not set in the environment "
            "variables or it does not exist.")

    if (not self.necessary_packages_installed) or (not h5py_installed):
        unittest.TestCase.skipTest(self,
                                   "missing transformers or h5py package")

    # test case parameters
    n_classes = 2
    num_encoder_layers = 2
    num_tgt_var = 1

    # instantiate BERT model
    bert = BERT_Model(self.s,
                      cache_dir,
                      model_name,
                      n_classes,
                      num_hidden_layers=num_encoder_layers,
                      verbose=False)

    # compile model
    bert.compile(num_target_var=num_tgt_var)
    if not os.path.isfile(
            os.path.join(cache_dir, model_name + '.kerasmodel.h5')):
        self.assertTrue(False, "HDF5 file not written.")

    # check for the existence of the model table
    res = self.s.retrieve('table.tableexists',
                          _messagelevel='error',
                          name=bert.model_name)
    self.assertTrue(res['exists'] != 0, "Model table not created.")

    # attempt to create CASLIB to cache directory
    try:
        caslib, extra_path, newlib = caslibify(self.s,
                                               cache_dir,
                                               task='save')
        do_load_weights = True
    except DLPyError:
        do_load_weights = False

    # attach model weights - skip if server unable to "see" cache directory
    if do_load_weights:
        bert.load_weights(os.path.join(cache_dir,
                                       model_name + '.kerasmodel.h5'),
                          num_target_var=num_tgt_var,
                          freeze_base_model=False)

        # check for the existence of the weight table
        res = self.s.retrieve('table.tableexists',
                              _messagelevel='error',
                              name=bert.model_name + '_weights')
        self.assertTrue(res['exists'] != 0, "Weight table not created.")

    # create data spec for model
    data_spec = bert.get_data_spec(num_tgt_var)

    # drop table(s)
    try:
        model_tbl_opts = input_table_check(bert.model_name)
        self.s.table.droptable(quiet=True, **model_tbl_opts)
    except TypeError:
        self.assertTrue(False, "Unable to drop model table.")

    if do_load_weights:
        try:
            model_tbl_opts = input_table_check(bert.model_name +
                                               '_weights')
            self.s.table.droptable(quiet=True, **model_tbl_opts)
        except TypeError:
            self.assertTrue(False, "Unable to drop weight table.")

    # remove HDF5 file
    if os.path.isfile(
            os.path.join(cache_dir, model_name + '.kerasmodel.h5')):
        os.remove(os.path.join(cache_dir, model_name + '.kerasmodel.h5'))

    # clean up BERT model
    del bert
def test_transformers2(self):
    '''
    Prepare labeled data for single sentence BERT regression problem

    COVERAGE: bert_prepare_data() in bert_utils.py
              class BertDMH() in bert_utils.py
    '''
    if self.data_dir is None:
        unittest.TestCase.skipTest(
            self, "DLPY_DATA_DIR is not set in the environment variables")

    if (self.data_dir_local is None) or (not os.path.isdir(
            self.data_dir_local)):
        unittest.TestCase.skipTest(
            self, "DLPY_DATA_DIR_LOCAL is not set in the environment "
            "variables or it does not exist.")

    if not self.necessary_packages_installed:
        unittest.TestCase.skipTest(self, "missing transformers package")

    if not os.path.isfile(
            os.path.join(self.data_dir_local,
                         'task1_training_edited.csv')):
        unittest.TestCase.skipTest(
            self, "cannot locate task1_training_edited.csv in "
            "DLPY_DATA_DIR_LOCAL")

    from transformers import BertTokenizer

    model_name = 'bert-base-uncased'

    # instantiate BERT tokenizer
    tokenizer = BertTokenizer.from_pretrained(
        model_name, cache_dir=self.data_dir_local)

    # read regression data set
    reviews = pd.read_csv(
        os.path.join(self.data_dir_local, 'task1_training_edited.csv'),
        header=None,
        names=['id', 'original', 'edit', 'grades', 'meanGrade'])

    # skip the header row and round mean grades to integer values
    inputs = reviews['original'].tolist()[1:]
    reviews['meanGrade'] = pd.to_numeric(reviews['meanGrade'],
                                         errors='coerce').fillna(0)
    targets = reviews['meanGrade'].tolist()[1:]
    for ii, val in enumerate(targets):
        targets[ii] = round(val)

    # limit the number of observations to 1000
    if len(inputs) > 1000:
        inputs = inputs[:1000]
        targets = targets[:1000]

    # prepare data
    num_tgt_var, train, valid = bert_prepare_data(
        self.s,
        tokenizer,
        128,
        input_a=list(inputs),
        target=list(targets),
        train_fraction=0.8,
        classification_problem=False)

    # check for the existence of the training table
    res = self.s.retrieve('table.tableexists',
                          _messagelevel='error',
                          name=train)
    self.assertTrue(res['exists'] != 0, "Training table not created.")

    # ensure table has the proper number of columns
    res = self.s.retrieve('table.columninfo',
                          _messagelevel='error',
                          table=train)
    self.assertTrue(
        len(res['ColumnInfo']['Column'].to_list()) == 5,
        "Training table has extra/missing columns.")

    # check for the existence of the validation table
    res = self.s.retrieve('table.tableexists',
                          _messagelevel='error',
                          name=valid)
    self.assertTrue(res['exists'] != 0, "Validation table not created.")

    # ensure table has the proper number of columns
    res = self.s.retrieve('table.columninfo',
                          _messagelevel='error',
                          table=valid)
    self.assertTrue(
        len(res['ColumnInfo']['Column'].to_list()) == 5,
        "Validation table has extra/missing columns.")

    # clean up training table if it exists
    try:
        model_tbl_opts = input_table_check(train)
        self.s.table.droptable(quiet=True, **model_tbl_opts)
    except TypeError:
        self.assertTrue(False, "BERT data preparation failed")

    # clean up validation table if it exists
    try:
        model_tbl_opts = input_table_check(valid)
        self.s.table.droptable(quiet=True, **model_tbl_opts)
    except TypeError:
        self.assertTrue(False, "BERT data preparation failed")

    # clean up tokenizer
    del tokenizer
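# Illustrative sketch only (hypothetical helper name, not run by the test
# suite): with classification_problem=False and a train_fraction,
# bert_prepare_data() returns both a training and a validation table name,
# as exercised by test_transformers2(). Assumes an active CAS connection `s`.
def _example_bert_prepare_data_regression(s, data_dir_local):
    '''Sketch: bert_prepare_data() with a train/validation split.'''
    from transformers import BertTokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                              cache_dir=data_dir_local)
    inputs = ['headline one', 'headline two', 'headline three']
    targets = [0, 1, 2]    # rounded mean grades, as in the test above
    # train_fraction=0.8 yields roughly an 80/20 train/validation split
    num_tgt_var, train, valid = bert_prepare_data(
        s, tokenizer, 128, input_a=inputs, target=targets,
        train_fraction=0.8, classification_problem=False)
    return num_tgt_var, train, valid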
def test_model_conversion3(self):
    '''
    Import CNN image classification model and override attributes
    - instantiate a Keras LeNet model and translate to DLPy/Viya model
    - override CNN model attributes with RNN attributes - this would
      never be done in practice; it is done here only to verify that
      the new attributes are written

    NOTE: cannot attach weights unless both client and server share
          the same file system

    COVERAGE: from_keras_model(), load_weights() in network.py
              keras_to_sas() in sas_keras_parse.py
              write_keras_hdf5() in write_keras_model_parm.py
              all functions in model_conversion_utils.py
              CNN-related functions in write_sas_code.py
    '''
    if self.data_dir is None:
        unittest.TestCase.skipTest(
            self, "DLPY_DATA_DIR is not set in the environment variables")

    if (self.data_dir_local is None) or (not os.path.isfile(
            os.path.join(self.data_dir_local, 'lenet.h5'))):
        unittest.TestCase.skipTest(
            self, "DLPY_DATA_DIR_LOCAL is not set in the environment "
            "variables or lenet.h5 file is missing")

    if self.keras_installed:
        from keras.models import Sequential
        from keras.layers import Dense, Dropout, Conv2D, MaxPooling2D, \
            Flatten
    else:
        unittest.TestCase.skipTest(self, "keras is not installed")

    # build LeNet model and load pretrained weights
    model = Sequential()
    model.add(
        Conv2D(20,
               kernel_size=(5, 5),
               strides=(1, 1),
               activation='relu',
               input_shape=(28, 28, 1),
               padding="same"))
    model.add(
        MaxPooling2D(pool_size=(2, 2), strides=(2, 2), padding='same'))
    model.add(
        Conv2D(50,
               kernel_size=(5, 5),
               strides=(1, 1),
               activation='relu',
               padding='same'))
    model.add(
        MaxPooling2D(pool_size=(2, 2), strides=(2, 2), padding='same'))
    model.add(Flatten())
    model.add(Dense(500, activation='relu'))
    model.add(Dense(10, activation='softmax'))

    model.load_weights(os.path.join(self.data_dir_local, 'lenet.h5'))
    model.summary()

    # translate Keras model to DLPy/Viya model
    model_name = 'lenet'
    model1, use_gpu = Model.from_keras_model(
        conn=self.s,
        keras_model=model,
        output_model_table=model_name,
        include_weights=True,
        scale=1.0 / 255.0,
        input_weights_file=os.path.join(self.data_dir_local, 'lenet.h5'))

    if os.path.isdir(self.data_dir):
        try:
            copyfile(
                os.path.join(os.getcwd(), 'lenet_weights.kerasmodel.h5'),
                os.path.join(self.data_dir,
                             'lenet_weights.kerasmodel.h5'))
            copy_success = True
        except:
            print('Unable to copy weights file, skipping test of '
                  'overriding attributes')
            copy_success = False

        if copy_success:
            self.s.table.addcaslib(activeonadd=False,
                                   datasource={'srctype': 'path'},
                                   name='MODEL_CONVERT',
                                   path=self.data_dir,
                                   subdirectories=True)

            model1.load_weights(path=os.path.join(
                self.data_dir, 'lenet_weights.kerasmodel.h5'),
                                labels=False,
                                use_gpu=use_gpu)
            os.remove(
                os.path.join(self.data_dir,
                             'lenet_weights.kerasmodel.h5'))

    # parameters for (nonexistent) RNN layers
    rnn_size = 10
    feature_dim = 4

    # output classes
    output_dim = 29

    # maximum sequence length
    max_seq_len = 100

    # define data specs needed to import Keras model weights
    tokensize = feature_dim
    inputs = []
    for fi in range(max_seq_len):
        for vi in range(tokensize):
            inputs.append('_f%d_v%d_' % (fi, vi))

    targets = ['y%d' % i for i in range(0, max_seq_len)]

    data_spec = []
    data_spec.append(
        DataSpec(type_='NUMERICNOMINAL',
                 layer=model.layers[0].name + "_input",
                 data=inputs,
                 numeric_nominal_parms=DataSpecNumNomOpts(
                     length='_num_frames_', token_size=feature_dim)))
    data_spec.append(
        DataSpec(type_='NUMERICNOMINAL',
                 layer=model.layers[-1].name,
                 data=targets,
                 nominals=targets,
                 numeric_nominal_parms=DataSpecNumNomOpts(
                     length='ylen', token_size=1)))

    # override model attributes
    from dlpy.attribute_utils import create_extended_attributes
    create_extended_attributes(self.s, model_name, model1.layers,
                               data_spec)

    if os.path.isfile(
            os.path.join(os.getcwd(), 'lenet_weights.kerasmodel.h5')):
        os.remove(os.path.join(os.getcwd(),
                               'lenet_weights.kerasmodel.h5'))

    # clean up model table
    model_tbl_opts = input_table_check(model_name)
    self.s.table.droptable(quiet=True, **model_tbl_opts)

    # clean up models
    del model
    del model1
def test_model_conversion2(self):
    '''
    Import RNN sequence to sequence models
    - instantiate Keras RNN models and translate to DLPy/Viya models

    NOTE: cannot attach weights unless both client and server share
          the same file system

    COVERAGE: from_keras_model(), load_weights() in network.py
              keras_to_sas() in sas_keras_parse.py
              write_keras_hdf5() in write_keras_model_parm.py
              all functions in model_conversion_utils.py
              RNN-related functions in write_sas_code.py
    '''
    if self.data_dir is None:
        unittest.TestCase.skipTest(
            self, "DLPY_DATA_DIR is not set in the environment variables")

    if (self.data_dir_local is None) or (not os.path.isfile(
            os.path.join(self.data_dir_local, 'lenet.h5'))):
        unittest.TestCase.skipTest(
            self, "DLPY_DATA_DIR_LOCAL is not set in the environment "
            "variables or lenet.h5 file is missing")

    if not self.keras_installed:
        unittest.TestCase.skipTest(self, "keras is not installed")

    # parameters for RNN layers
    rnn_size = 10
    feature_dim = 4

    # output classes
    output_dim = 29

    # maximum sequence length
    max_seq_len = 100

    # define data specs needed to import Keras model weights
    tokensize = feature_dim
    inputs = []
    for fi in range(max_seq_len):
        for vi in range(tokensize):
            inputs.append('_f%d_v%d_' % (fi, vi))

    targets = ['y%d' % i for i in range(0, max_seq_len)]

    data_spec = []
    data_spec.append(
        DataSpec(type_='NUMERICNOMINAL',
                 layer='the_input',
                 data=inputs,
                 numeric_nominal_parms=DataSpecNumNomOpts(
                     length='_num_frames_', token_size=feature_dim)))
    data_spec.append(
        DataSpec(type_='NUMERICNOMINAL',
                 layer='out',
                 data=targets,
                 nominals=targets,
                 numeric_nominal_parms=DataSpecNumNomOpts(
                     length='ylen', token_size=1)))

    # try all RNN model types
    for layer_type in [
            'simplernn', 'lstm', 'gru', 'cudnnlstm', 'cudnngru'
    ]:
        for bidirectional in [True, False]:
            model = define_keras_rnn_model(layer_type, bidirectional,
                                           rnn_size, feature_dim,
                                           output_dim)

            model_name = 'dlpy_model'
            model1, use_gpu = Model.from_keras_model(
                conn=self.s,
                keras_model=model,
                max_num_frames=max_seq_len,
                include_weights=True,
                output_model_table=model_name)
            model1.print_summary()

            # try to load weights, but skip any GPU-based models
            # because worker/soloist may not have GPU
            if os.path.isdir(self.data_dir) and (not use_gpu):
                try:
                    copyfile(
                        os.path.join(
                            os.getcwd(),
                            'dlpy_model_weights.kerasmodel.h5'),
                        os.path.join(
                            self.data_dir,
                            'dlpy_model_weights.kerasmodel.h5'))
                    copy_success = True
                except:
                    print('Unable to copy weights file, skipping test '
                          'of attaching weights')
                    copy_success = False

                if copy_success:
                    model1.load_weights(path=os.path.join(
                        self.data_dir,
                        'dlpy_model_weights.kerasmodel.h5'),
                                        labels=False,
                                        use_gpu=use_gpu)
                    os.remove(
                        os.path.join(
                            self.data_dir,
                            'dlpy_model_weights.kerasmodel.h5'))
            else:
                print('GPU model, skipping test of attaching weights')

            if os.path.isfile(
                    os.path.join(os.getcwd(),
                                 'dlpy_model_weights.kerasmodel.h5')):
                os.remove(
                    os.path.join(os.getcwd(),
                                 'dlpy_model_weights.kerasmodel.h5'))

            # clean up models
            del model
            del model1

            # clean up model table
            model_tbl_opts = input_table_check(model_name)
            self.s.table.droptable(quiet=True, **model_tbl_opts)
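# The repo's define_keras_rnn_model() is defined elsewhere; the helper below
# is NOT that function. It is a minimal sketch of what such a factory could
# look like, assuming it returns a Keras model whose input layer is named
# 'the_input' and whose output layer is named 'out' (the names referenced by
# the data specs in test_model_conversion2()). The CuDNN layer classes
# require a GPU-enabled TensorFlow backend.
def _sketch_define_keras_rnn_model(layer_type, bidirectional, rnn_size,
                                   feature_dim, output_dim):
    '''Sketch: build a single-layer Keras RNN sequence-to-sequence model.'''
    from keras.models import Model as KerasModel
    from keras.layers import (Input, Dense, SimpleRNN, LSTM, GRU,
                              CuDNNLSTM, CuDNNGRU, Bidirectional,
                              TimeDistributed)
    rnn_classes = {'simplernn': SimpleRNN, 'lstm': LSTM, 'gru': GRU,
                   'cudnnlstm': CuDNNLSTM, 'cudnngru': CuDNNGRU}
    # variable-length input sequences of feature_dim-dimensional tokens
    inputs = Input(shape=(None, feature_dim), name='the_input')
    rnn = rnn_classes[layer_type](rnn_size, return_sequences=True)
    if bidirectional:
        rnn = Bidirectional(rnn)
    hidden = rnn(inputs)
    # per-timestep class probabilities over output_dim classes
    outputs = TimeDistributed(Dense(output_dim, activation='softmax'),
                              name='out')(hidden)
    return KerasModel(inputs=inputs, outputs=outputs)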
def test_model_conversion1(self):
    '''
    Import CNN image classification model
    - instantiate a Keras LeNet model and translate to DLPy/Viya model

    NOTE: cannot attach weights unless both client and server share
          the same file system

    COVERAGE: from_keras_model(), load_weights() in network.py
              keras_to_sas() in sas_keras_parse.py
              write_keras_hdf5_from_file() in write_keras_model_parm.py
              all functions in model_conversion_utils.py
              all functions in keras_utils.py
              CNN-related functions in write_sas_code.py
    '''
    if self.data_dir is None:
        unittest.TestCase.skipTest(
            self, "DLPY_DATA_DIR is not set in the environment variables")

    if (self.data_dir_local is None) or (not os.path.isfile(
            os.path.join(self.data_dir_local, 'lenet.h5'))):
        unittest.TestCase.skipTest(
            self, "DLPY_DATA_DIR_LOCAL is not set in the environment "
            "variables or lenet.h5 file is missing")

    if self.keras_installed:
        from keras.models import Sequential
        from keras.layers import Dense, Dropout, Conv2D, MaxPooling2D, \
            Flatten
    else:
        unittest.TestCase.skipTest(self, "keras is not installed")

    # build LeNet model and load pretrained weights
    model = Sequential()
    model.add(
        Conv2D(20,
               kernel_size=(5, 5),
               strides=(1, 1),
               activation='relu',
               input_shape=(28, 28, 1),
               padding="same"))
    model.add(
        MaxPooling2D(pool_size=(2, 2), strides=(2, 2), padding='same'))
    model.add(
        Conv2D(50,
               kernel_size=(5, 5),
               strides=(1, 1),
               activation='relu',
               padding='same'))
    model.add(
        MaxPooling2D(pool_size=(2, 2), strides=(2, 2), padding='same'))
    model.add(Flatten())
    model.add(Dense(500, activation='relu'))
    model.add(Dense(10, activation='softmax'))

    model.load_weights(os.path.join(self.data_dir_local, 'lenet.h5'))
    model.summary()

    # translate Keras model to DLPy/Viya model
    model_name = 'lenet'
    model1, use_gpu = Model.from_keras_model(
        conn=self.s,
        keras_model=model,
        output_model_table=model_name,
        include_weights=True,
        scale=1.0 / 255.0,
        input_weights_file=os.path.join(self.data_dir_local, 'lenet.h5'))

    if os.path.isdir(self.data_dir):
        try:
            copyfile(
                os.path.join(os.getcwd(), 'lenet_weights.kerasmodel.h5'),
                os.path.join(self.data_dir,
                             'lenet_weights.kerasmodel.h5'))
            copy_success = True
        except:
            print('Unable to copy weights file, skipping test of '
                  'attaching weights')
            copy_success = False

        if copy_success:
            model1.load_weights(path=os.path.join(
                self.data_dir, 'lenet_weights.kerasmodel.h5'),
                                labels=False,
                                use_gpu=use_gpu)
            os.remove(
                os.path.join(self.data_dir,
                             'lenet_weights.kerasmodel.h5'))
            model1.print_summary()

    if os.path.isfile(
            os.path.join(os.getcwd(), 'lenet_weights.kerasmodel.h5')):
        os.remove(os.path.join(os.getcwd(),
                               'lenet_weights.kerasmodel.h5'))

    # clean up model table
    model_tbl_opts = input_table_check(model_name)
    self.s.table.droptable(quiet=True, **model_tbl_opts)

    # clean up models
    del model
    del model1
def produce_object_detections(conn, table, coord_type, max_objects=9999,
                              num_plot=9999, fig_size=None):
    '''
    Extract images with drawn bounding boxes from an object detection
    table and produce a list of ImageRecord objects.

    Parameters
    ----------
    conn : CAS
        CAS connection object
    table : string or CASTable
        Specifies the object detection castable to be processed.
    coord_type : string
        Specifies the coordinate type of the input table.
    max_objects : int, optional
        Specifies the maximum number of bounding boxes to be drawn
        on an image.
        Default: 9999
    num_plot : int, optional
        Specifies the maximum number of images to process.
        Default: 9999
    fig_size : int, optional
        Specifies the size of the figure. Currently unused.

    Returns
    -------
    list of ImageRecord
    '''
    conn.retrieve('loadactionset',
                  _messagelevel='error',
                  actionset='image')

    input_tbl_opts = input_table_check(table)
    input_table = conn.CASTable(**input_tbl_opts)
    img_num = input_table.shape[0]
    num_plot = num_plot if num_plot < img_num else img_num
    input_table = input_table.sample(num_plot)

    det_label_image_table = random_name('detLabelImageTable')
    num_max_obj = input_table['_nObjects_'].max()
    max_objects = max_objects if num_max_obj > max_objects else num_max_obj

    with sw.option_context(print_messages=False):
        res = conn.image.extractdetectedobjects(
            casout={'name': det_label_image_table, 'replace': True},
            coordtype=coord_type,
            maxobjects=max_objects,
            table=input_table)
        if res.severity > 0:
            for msg in res.messages:
                print(msg)

    outtable = conn.CASTable(det_label_image_table)

    imageRecordList = list()
    in_df = input_table.fetch()['Fetch']
    out_df = outtable.fetch()['Fetch']
    if len(out_df) == len(in_df):
        print("Input and output tables both have " + str(len(out_df)) +
              " rows; equal table length assumption is met, producing "
              "message buffer")
        for i in range(len(out_df)):
            imageId = str(uuid4())
            timestamp = round(datetime.now().microsecond)
            nbrOfBoats = int(in_df['_nObjects_'][i])
            imgStr = out_df['_image_'][i]
            nparr = np.frombuffer(imgStr, np.uint8)
            base_img = str(base64.b64encode(nparr))
            occupancy_rate = 0
            if nbrOfBoats > 0:
                # sum the width x height products of the detected
                # bounding boxes; the box columns start at column 5 and
                # each detection occupies six columns
                surface_list = list()
                index = 5
                for ix in range(nbrOfBoats):
                    surface_list.append(in_df.iloc[i, index + 4] *
                                        in_df.iloc[i, index + 5])
                    index = index + 6
                occupancy_rate = sum(surface_list)
            imageRecordList.append(
                ImageRecord(imageId, timestamp, nbrOfBoats,
                            occupancy_rate, base_img))

    with sw.option_context(print_messages=False):
        conn.table.droptable(det_label_image_table)

    return imageRecordList
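# ImageRecord is assumed to be defined elsewhere in this module; the class
# below is a minimal sketch (with a hypothetical name, so it does not shadow
# the real definition) of a container consistent with the positional call in
# produce_object_detections() above.
from dataclasses import dataclass


@dataclass
class _ImageRecordSketch:
    '''Sketch: fields implied by ImageRecord(...) as called above.'''
    imageId: str          # UUID string identifying the image
    timestamp: int        # microsecond component of the record time
    nbrOfBoats: int       # number of detected objects in the image
    occupancy_rate: float # sum of bounding box width x height products
    base_img: str         # base64-encoded image bytes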