def main(checkpoint, data_count, data_cols, should_train, nb_epoch, null_pct,
         try_reuse_data, batch_size, execution_config):
    maxlen = 20
    max_cells = 500

    # set this boolean to True to print debug messages (also terminate on *any* exception)
    DEBUG = True

    checkpoint_dir = "checkpoints/"

    # basic preliminaries for database calls
    store_name = 'nktraining'
    adl = client.get_adl_client(store_name)
    files = adl.ls('training-data/CKAN')
    random.shuffle(files)
    cnxn = gc.getConnection()
    cursor = cnxn.cursor()

    # make required database calls
    out, out_array_header = FetchLabeledDataFromDatabase(max_cells, cursor, adl, False)
    cnxn.close()

    with open('Categories_base_stat_geo.txt', 'r') as f:
        Categories = f.read().splitlines()
    Categories = sorted(Categories)
    category_count = len(Categories)

    nx, ny = out.shape
    if DEBUG:  # orient the user a bit
        print("read data is: ")
        print(out)
        print("fixed categories are: ")
        print(Categories)
        print("The size of the read raw data is %d rows by %d columns" % (nx, ny))
        print("corresponding header length is: %d" % len(out_array_header))
        # print(out_array_header)

    # specify data for the transfer-learning experiment
    raw_data = out[0:max_cells, :]  # truncate the rows (max_cells)
    header = out_array_header

    ## ADD GEODATA
    import logging
    logging.basicConfig(level=logging.DEBUG)
    logger = logging.getLogger(__name__)

    # read in relevant postal data
    in_file_postal = 'data/geodata/allCountries_postal.txt'
    reader_postal = GeonamesCountriesTxtFileReader(in_file_postal)
    df_postal = reader_postal.read_csv_postal()

    # read in country codes
    in_file_cc = 'data/geodata/country_codes_csv.csv'
    df_countries = pd.read_csv(in_file_cc, dtype=str)

    # now, build up the geo-data training set
    N_SAMPLES = 500  # number of sample columns for each additional category
    start = time.time()
    # draw the first random column; start the header with the 'country' category
    data = np.asarray(df_countries['Name'].sample(n=max_cells, replace=True))[np.newaxis].T
    data_header = list([['country', 'text'], ])
    for i in np.arange(1, N_SAMPLES):  # draw the remaining random columns
        # print(data)
        data = np.concatenate((data, np.asarray(df_countries['Name'].sample(n=max_cells, replace=True))[np.newaxis].T), axis=1)
        data_header.append(list(['country', 'text']))
    # now, handle the remaining new geo categories
    NewCategories = list(['state', 'city', 'postal_code', 'latitude', 'longitude', 'country_code'])
    for category in NewCategories:
        for i in np.arange(N_SAMPLES):
            data = np.concatenate((data, np.asarray(df_postal[category].sample(n=max_cells, replace=True))[np.newaxis].T), axis=1)
            if (category == 'latitude') or (category == 'longitude'):
                data_header.append(list([category, 'float']))
            elif category == 'postal_code':
                # label as 'int' where appropriate (one may sometimes need to
                # label as 'text' as well, but going with this for now)
                data_header.append(list([category, 'int']))
            else:
                data_header.append(list([category, 'text']))
    if DEBUG:
        print("DEBUG::the shape of geo data is:")
        print(data.shape)
        print("DEBUG::the length of the corresponding geo data header is:")
        print(len(data_header))
        print("DEBUG::the time elapsed to build the geo data set is (sec):")
        print(time.time() - start)
        print("DEBUG::merging geo data with datalake data...")

    raw_data = np.column_stack((raw_data, data))
    header.extend(data_header)

    if DEBUG:
        print("DEBUG::done!!")
        print("DEBUG::the shape of final merged data is:")
        print(raw_data.shape)
        print("DEBUG::the length of final merged data header is:")
        print(len(header))
    ## FINISHED ADDING GEODATA

    ## LABEL COMBINED DATA AS CATEGORICAL/ORDINAL
    start_time_guess = time.time()
    guesses = []
    print("Beginning Guessing categorical/ordinal for geo+datalake data...")
    category_count = 0
    ordinal_count = 0
    for i in np.arange(raw_data.shape[1]):
        tmp = guess(raw_data[:, i], for_types='category')
        if tmp[0] == 'category':
            category_count += 1
            header[i].append('categorical')
            if ('int' in header[i]) or ('float' in header[i]) \
                    or ('datetime' in header[i]):
                ordinal_count += 1
                header[i].append('ordinal')
        guesses.append(tmp)
    if DEBUG:
        # print(guesses)
        # print(len(guesses))
        print("DEBUG::The number of categorical columns is %d" % category_count)
        print("DEBUG::The number of ordinal columns is %d" % ordinal_count)
        # print(header)
    elapsed_time = time.time() - start_time_guess
    print("Total guessing time is : %.2f sec" % elapsed_time)
    ## FINISHED LABELING COMBINED DATA AS CATEGORICAL/ORDINAL

    # transpose the data
    raw_data = np.char.lower(np.transpose(raw_data).astype('U'))

    # do other processing and encode the data
    if execution_config is None:
        raise TypeError
    Classifier = Simon(encoder={})  # dummy text classifier
    config = Classifier.load_config(execution_config, checkpoint_dir)
    encoder = config['encoder']
    checkpoint = config['checkpoint']
    encoder.categories = Categories
    category_count = len(Categories)  # reset; this variable was reused as a counter above

    # build classifier model
    Classifier = Simon(encoder=encoder)  # text classifier for unit test
    model = Classifier.generate_transfer_model(maxlen, max_cells, category_count - 9,
                                               category_count, checkpoint, checkpoint_dir)
    model_compile = lambda m: m.compile(loss='binary_crossentropy',
                                        optimizer='adam', metrics=['binary_accuracy'])
    model_compile(model)

    # encode the data and evaluate model
    X, y = encoder.encode_data(raw_data, header, maxlen)
    if DEBUG:
        print("DEBUG::y is:")
        print(y)
        print("DEBUG::The encoded labels (first row) are:")
        print(y[0, :])
    data = Classifier.setup_test_sets(X, y)

    ## Build p_threshold per class
    # p_threshold = np.sum(data.y_train, axis=0) * 1 / (data.y_train.shape[0])
    p_threshold = 0.5  # fixed scalar; the commented line above builds a per-class threshold
    if DEBUG:
        print("DEBUG::per class p_threshold is:")
        print(p_threshold)

    max_cells = encoder.cur_max_cells
    start = time.time()
    history = Classifier.train_model(batch_size, checkpoint_dir, model, nb_epoch, data)
    end = time.time()
    print("Time for training is %f sec" % (end - start))
    config = {'encoder': encoder,
              'checkpoint': Classifier.get_best_checkpoint(checkpoint_dir)}
    Classifier.save_config(config, checkpoint_dir)
    Classifier.plot_loss(history)  # comment out on docker images...
    Classifier.evaluate_model(max_cells, model, data, encoder, p_threshold)
def main(execution_config, DEBUG):
    maxlen = 20
    max_cells = 500
    p_threshold = 0.5

    DEBUG = True  # boolean specifying whether to print DEBUG information

    checkpoint_dir = "pretrained_models/"

    with open('Categories.txt', 'r') as f:
        Categories = f.read().splitlines()

    # orient the user a bit
    print("fixed categories are: ")
    Categories = sorted(Categories)
    print(Categories)
    category_count = len(Categories)

    # load specified execution configuration
    if execution_config is None:
        raise TypeError
    Classifier = Simon(encoder={})  # dummy text classifier
    config = Classifier.load_config(execution_config, checkpoint_dir)
    encoder = config['encoder']
    checkpoint = config['checkpoint']

    # read unit test data
    dataset_name = "replicate_eval_error"  # alternatively, o_38 or o_185
    if DEBUG:
        print("DEBUG::BEGINNING UNIT TEST...")
    frame = pd.read_csv('unit_test_data/' + dataset_name + '.csv',
                        dtype='str', header=None)

    X = encoder.encodeDataFrame(frame)

    # build classifier model
    model = Classifier.generate_model(maxlen, max_cells, category_count)
    Classifier.load_weights(checkpoint, None, model, checkpoint_dir)
    model_compile = lambda m: m.compile(loss='binary_crossentropy',
                                        optimizer='adam', metrics=['binary_accuracy'])
    model_compile(model)
    y = model.predict(X)
    y[np.all(frame.isnull(), axis=0)] = 0  # discard empty column edge case
    result = encoder.reverse_label_encode(y, p_threshold)

    ## LABEL COMBINED DATA AS CATEGORICAL/ORDINAL
    print("Beginning Guessing categorical/ordinal classifications...")
    start_time_guess = time.time()
    category_count = 0
    ordinal_count = 0
    raw_data = frame.values  # frame.as_matrix() was removed in pandas 1.0
    for i in np.arange(raw_data.shape[1]):
        tmp = guess(raw_data[:, i], for_types='category')
        if tmp[0] == 'category':
            category_count += 1
            tmp2 = list(result[0][i])
            tmp2.append('categorical')
            result[0][i] = tuple(tmp2)
            result[1][i].append(1)
            # check the semantic-type labels (result[0]), not the probabilities
            if ('int' in result[0][i]) or ('float' in result[0][i]) \
                    or ('datetime' in result[0][i]):
                ordinal_count += 1
                tmp2 = list(result[0][i])
                tmp2.append('ordinal')
                result[0][i] = tuple(tmp2)
                result[1][i].append(1)
    elapsed_time = time.time() - start_time_guess
    print("Total statistical variable guessing time is : %.2f sec" % elapsed_time)
    ## FINISHED LABELING COMBINED DATA AS CATEGORICAL/ORDINAL

    print("The predicted classes and probabilities are respectively:")
    print(result)
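# A toy illustration (the tuple/list structure is an assumption inferred from
# the loop above) of how the guessing step extends reverse_label_encode's
# output in place: a pair of per-column semantic-type tuples and per-column
# probability lists.
labels, probs = [('text',), ('int',)], [[0.91], [0.88]]
tmp2 = list(labels[1])            # ('int',) -> ['int']
tmp2.append('categorical')        # statistical guess says column 1 is categorical
labels[1] = tuple(tmp2)           # ('int', 'categorical')
probs[1].append(1)                # appended labels get probability 1 by convention
if ('int' in labels[1]) or ('float' in labels[1]) or ('datetime' in labels[1]):
    labels[1] += ('ordinal',)     # numeric categorical columns are also ordinal
    probs[1].append(1)
print(labels[1], probs[1])        # ('int', 'categorical', 'ordinal') [0.88, 1, 1]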
def main(checkpoint, data_count, data_cols, should_train, nb_epoch, null_pct,
         try_reuse_data, batch_size, execution_config):
    maxlen = 20
    max_cells = 500
    p_threshold = 0.5

    # set this boolean to True to print debug messages (also terminate on *any* exception)
    DEBUG = False

    checkpoint_dir = "checkpoints/"

    # basic preliminaries for database calls
    store_name = 'nktraining'
    adl = client.get_adl_client(store_name)
    files = adl.ls('training-data/CKAN')
    random.shuffle(files)
    cnxn = gc.getConnection()
    cursor = cnxn.cursor()

    # make required database calls
    out, out_array_header = FetchLabeledDataFromDatabase(max_cells, cursor, adl, DEBUG)
    cnxn.close()

    with open('Categories_base_stat.txt', 'r') as f:
        Categories = f.read().splitlines()
    Categories = sorted(Categories)
    category_count = len(Categories)

    nx, ny = out.shape
    if DEBUG:  # orient the user a bit
        print("read data is: ")
        print(out)
        print("fixed categories are: ")
        print(Categories)
        print("The size of the read raw data is %d rows by %d columns" % (nx, ny))
        print("corresponding header length is: %d" % len(out_array_header))
        # print(out_array_header)

    # specify data for the transfer-learning experiment
    raw_data = out[0:max_cells, :]  # truncate the rows (max_cells)

    # read the "post-processed" header from file; this includes categorical and
    # ordinal classifications...
    # with open('datalake_labels', 'r', newline='\n') as myfile:
    #     reader = csv.reader(myfile, delimiter=',')
    #     header = []
    #     for row in reader:
    #         header.append(row)
    # OR
    header = out_array_header

    ## LABEL COMBINED DATA AS CATEGORICAL/ORDINAL
    start_time_guess = time.time()
    guesses = []
    print("Beginning Guessing categorical/ordinal for datalake data...")
    category_count = 0
    ordinal_count = 0
    for i in np.arange(raw_data.shape[1]):
        tmp = guess(raw_data[:, i], for_types='category')
        if tmp[0] == 'category':
            category_count += 1
            header[i].append('categorical')
            if ('int' in header[i]) or ('float' in header[i]) \
                    or ('datetime' in header[i]):
                ordinal_count += 1
                header[i].append('ordinal')
        guesses.append(tmp)
    if DEBUG:
        # print(guesses)
        # print(len(guesses))
        print("DEBUG::The number of categorical columns is %d" % category_count)
        print("DEBUG::The number of ordinal columns is %d" % ordinal_count)
        # print(header)
    elapsed_time = time.time() - start_time_guess
    print("Total guessing time is : %.2f sec" % elapsed_time)
    ## FINISHED LABELING COMBINED DATA AS CATEGORICAL/ORDINAL

    # transpose the data
    raw_data = np.char.lower(np.transpose(raw_data).astype('U'))

    # do other processing and encode the data
    if execution_config is None:
        raise TypeError
    Classifier = Simon(encoder={})  # dummy text classifier
    config = Classifier.load_config(execution_config, checkpoint_dir)
    encoder = config['encoder']
    checkpoint = config['checkpoint']
    encoder.categories = Categories

    # build classifier model
    Classifier = Simon(encoder=encoder)  # text classifier for unit test
    model = Classifier.generate_transfer_model(maxlen, max_cells, category_count - 2,
                                               category_count, checkpoint, checkpoint_dir)
    model_compile = lambda m: m.compile(loss='categorical_crossentropy',
                                        optimizer='adam', metrics=['binary_accuracy'])
    model_compile(model)

    # encode the data and evaluate model
    X, y = encoder.encode_data(raw_data, header, maxlen)
    if DEBUG:
        print("DEBUG::y is:")
        print(y)
        print("DEBUG::The encoded labels (first row) are:")
        print(y[0, :])
    data = Classifier.setup_test_sets(X, y)

    max_cells = encoder.cur_max_cells
    start = time.time()
    history = Classifier.train_model(batch_size, checkpoint_dir, model, nb_epoch, data)
    end = time.time()
    print("Time for training is %f sec" % (end - start))
    config = {'encoder': encoder,
              'checkpoint': Classifier.get_best_checkpoint(checkpoint_dir)}
    Classifier.save_config(config, checkpoint_dir)
    Classifier.plot_loss(history)  # comment out on docker images...
    Classifier.evaluate_model(max_cells, model, data, encoder, p_threshold)
def _produce_annotations(self, *, inputs: Inputs) -> Outputs:
    """
    Parameters
    ----------
    inputs: Input pandas frame

    Returns
    -------
    Outputs
        The output is two lists of lists, each with length equal to the number
        of columns in the input pandas frame. Each entry of the first list is a
        list of strings corresponding to a column's multi-label classification;
        each entry of the second is a list of floats corresponding to
        prediction probabilities.
    """
    frame = inputs

    # set up the model as you typically would in a Simon main file
    maxlen = 20
    max_cells = 500
    p_threshold = 0.5

    DEBUG = True  # boolean specifying whether to print DEBUG information

    checkpoint_dir = self.volumes["simon_models_1"] + "/pretrained_models/"
    if 'statistical_classification' in self.hyperparams.keys() and self.hyperparams['statistical_classification']:
        execution_config = "Base.pkl"
        category_list = "/Categories.txt"
    else:
        execution_config = "Base_stat_geo.pkl"
        category_list = "/Categories_base_stat_geo.txt"
    with open(self.volumes["simon_models_1"] + category_list, 'r') as f:
        Categories = f.read().splitlines()

    # orient the user a bit
    print("fixed categories are: ")
    Categories = sorted(Categories)
    print(Categories)
    category_count = len(Categories)

    # load specified execution configuration
    if execution_config is None:
        raise TypeError
    Classifier = Simon(encoder={})  # dummy text classifier
    config = Classifier.load_config(execution_config, checkpoint_dir)
    encoder = config['encoder']
    checkpoint = config['checkpoint']

    X = encoder.encodeDataFrame(frame)

    # build classifier model
    model = Classifier.generate_model(maxlen, max_cells, category_count)
    Classifier.load_weights(checkpoint, None, model, checkpoint_dir)
    model_compile = lambda m: m.compile(loss='binary_crossentropy',
                                        optimizer='adam', metrics=['binary_accuracy'])
    model_compile(model)
    y = model.predict(X)
    y[np.all(frame.isnull(), axis=0)] = 0  # discard empty column edge case
    result = encoder.reverse_label_encode(y, p_threshold)

    ## LABEL COMBINED DATA AS CATEGORICAL/ORDINAL
    category_count = 0
    ordinal_count = 0
    raw_data = frame.values  # frame.as_matrix() was removed in pandas 1.0
    for i in np.arange(raw_data.shape[1]):
        if 'statistical_classification' in self.hyperparams.keys() and self.hyperparams['statistical_classification']:
            print("Beginning Guessing categorical/ordinal classifications...")
            tmp = guess(raw_data[:, i], for_types='category')
            if tmp[0] == 'category':
                category_count += 1
                tmp2 = list(result[0][i])
                tmp2.append('categorical')
                result[0][i] = tmp2
                result[1][i].append(1)
                # check the semantic-type labels (result[0]), not the probabilities
                if ('int' in result[0][i]) or ('float' in result[0][i]) \
                        or ('datetime' in result[0][i]):
                    ordinal_count += 1
                    tmp2 = list(result[0][i])
                    tmp2.append('ordinal')
                    result[0][i] = tmp2
                    result[1][i].append(1)
            print("Done with statistical variable guessing")
        ## FINISHED LABELING COMBINED DATA AS CATEGORICAL/ORDINAL
        result[0][i] = d3m_List(result[0][i])
        result[1][i] = d3m_List(result[1][i])

    Classifier.clear_session()
    out_df = pandas.DataFrame.from_records(list(result)).T
    out_df.columns = ['semantic types', 'probabilities']
    return out_df
def _produce_annotations(self, *, inputs: Inputs) -> Outputs:
    """ Generates a dataframe with semantic type classifications and
    classification probabilities for each column of the original dataframe.

    Arguments:
        inputs {Inputs} -- D3M dataframe

    Returns:
        Outputs -- dataframe with two columns: "semantic types" and
            "probabilities". Each row represents a column in the original
            dataframe. "semantic types" contains a list of all semantic type
            labels, and "probabilities" contains the model's confidence in
            assigning each respective semantic type label.
    """
    # load model checkpoint
    checkpoint_dir = (self.volumes["simon_models_1"] +
                      "/simon_models_1/pretrained_models/")
    if self.hyperparams["statistical_classification"]:
        execution_config = "Base.pkl"
        category_list = "/Categories.txt"
    else:
        execution_config = "Base_stat_geo.pkl"
        category_list = "/Categories_base_stat_geo.txt"
    with open(self.volumes["simon_models_1"] + "/simon_models_1" + category_list,
              "r") as f:
        Categories = f.read().splitlines()

    # create model object
    Classifier = Simon(encoder={})
    config = Classifier.load_config(execution_config, checkpoint_dir)
    encoder = config["encoder"]
    checkpoint = config["checkpoint"]
    model = Classifier.generate_model(self.hyperparams["max_chars"],
                                      self.hyperparams["max_rows"],
                                      len(Categories))
    Classifier.load_weights(checkpoint, None, model, checkpoint_dir)
    model.compile(loss="binary_crossentropy", optimizer="adam",
                  metrics=["binary_accuracy"])

    # prepare data and make predictions
    frame = inputs.copy()
    prepped_data = encoder.encodeDataFrame(frame)
    preds = model.predict_on_batch(tf.constant(prepped_data))
    decoded_preds = encoder.reverse_label_encode(preds,
                                                 self.hyperparams["p_threshold"])

    # apply statistical / ordinal classification if desired
    if self.hyperparams["statistical_classification"]:
        logger.debug("Beginning Guessing categorical/ordinal classifications...")
        raw_data = frame.values
        guesses = [
            guess(raw_data[:, i], for_types="category")
            for i in np.arange(raw_data.shape[1])
        ]
        for i, g in enumerate(guesses):
            if g[0] == "category":
                decoded_preds[0][i] += ("categorical", )
                decoded_preds[1][i].append(1)
                # check the semantic-type labels (index 0), not the probabilities
                if (("int" in decoded_preds[0][i])
                        or ("float" in decoded_preds[0][i])
                        or ("datetime" in decoded_preds[0][i])):
                    decoded_preds[0][i] += ("ordinal", )
                    decoded_preds[1][i].append(1)
        logger.debug("Done with statistical variable guessing")

    # clear tf session
    Classifier.clear_session()

    out_df = pd.DataFrame.from_records(list(decoded_preds)).T
    out_df.columns = ["semantic types", "probabilities"]
    return out_df
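# Hypothetical usage sketch (the `primitive` instance and column values are
# illustrative assumptions): feed a small string-typed frame through the
# primitive and read the per-column annotations back out.
import pandas as pd

frame = pd.DataFrame({'a': ['1', '2', '3'], 'b': ['x', 'y', 'z']}, dtype=str)
annotations = primitive._produce_annotations(inputs=frame)
for types, probs in zip(annotations['semantic types'],
                        annotations['probabilities']):
    print(types, probs)  # e.g. ('int', 'categorical', 'ordinal') [0.93, 1, 1]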
def runModel(self, frame, p_threshold):
    # set up the model as you typically would in a Simon main file
    maxlen = 20
    max_cells = 500

    DEBUG = True  # boolean specifying whether to print DEBUG information

    checkpoint_dir = "/clusterfiles/scripts/pretrained_models/"

    with open('/clusterfiles/scripts/Categories.txt', 'r') as f:
        Categories = f.read().splitlines()

    # orient the user a bit
    print("fixed categories are: ")
    Categories = sorted(Categories)
    print(Categories)
    category_count = len(Categories)

    execution_config = modelName  # modelName is expected to be defined at module scope

    # load specified execution configuration
    if execution_config is None:
        raise TypeError
    Classifier = Simon(encoder={})  # dummy text classifier
    config = Classifier.load_config(execution_config, checkpoint_dir)
    encoder = config['encoder']
    checkpoint = config['checkpoint']

    X = encoder.encodeDataFrame(frame)

    # build classifier model
    model = Classifier.generate_model(maxlen, max_cells, category_count)
    Classifier.load_weights(checkpoint, None, model, checkpoint_dir)
    model_compile = lambda m: m.compile(loss='binary_crossentropy',
                                        optimizer='adam', metrics=['binary_accuracy'])
    model_compile(model)
    y = model.predict(X)
    y[np.all(frame.isnull(), axis=0)] = 0  # discard empty column edge case
    result = encoder.reverse_label_encode(y, p_threshold)

    ## LABEL COMBINED DATA AS CATEGORICAL/ORDINAL
    print("Beginning Guessing categorical/ordinal classifications...")
    start_time_guess = time.time()
    category_count = 0
    ordinal_count = 0
    raw_data = frame.values  # frame.as_matrix() was removed in pandas 1.0
    for i in np.arange(raw_data.shape[1]):
        tmp = guess(raw_data[:, i], for_types='category')
        if tmp[0] == 'category':
            category_count += 1
            tmp2 = list(result[0][i])
            tmp2.append('categorical')
            result[0][i] = tuple(tmp2)
            result[1][i].append(1)
            # check the semantic-type labels (result[0]), not the probabilities
            if ('int' in result[0][i]) or ('float' in result[0][i]) \
                    or ('datetime' in result[0][i]):
                ordinal_count += 1
                tmp2 = list(result[0][i])
                tmp2.append('ordinal')
                result[0][i] = tuple(tmp2)
                result[1][i].append(1)
    elapsed_time = time.time() - start_time_guess
    print("Total statistical variable guessing time is : %.2f sec" % elapsed_time)
    ## FINISHED LABELING COMBINED DATA AS CATEGORICAL/ORDINAL

    Classifier.clear_session()
    return self.encoder.encode(result)
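# Hedged usage sketch: how a cluster worker might call runModel on a CSV read
# with the same conventions as the unit-test main above (str dtype, no header
# row). The `worker` instance and file path are illustrative assumptions.
import pandas as pd

frame = pd.read_csv('/clusterfiles/data/sample.csv', dtype=str, header=None)
encoded = worker.runModel(frame, p_threshold=0.5)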