def instruction_identifier(params):
    # Locate the column most similar to the instruction, expose it as the
    # target under params['y'], and drop it from the feature dataframe.
    remove = get_similar_column(
        get_value_instruction(params['instruction']), params['data'])
    params['y'] = params['data'][remove]
    del params['data'][remove]
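# Hedged usage sketch (not part of the library's public API): shows how
# instruction_identifier mutates the params dict in place. The file name
# 'housing.csv' and the instruction string below are illustrative assumptions.
def _example_instruction_identifier():
    import pandas as pd  # assumes pandas is available, as elsewhere in this module
    params = {
        'instruction': 'predict median house value',  # hypothetical query
        'data': pd.read_csv('housing.csv')            # hypothetical dataset
    }
    instruction_identifier(params)
    # After the call, params['y'] holds the matched target column and
    # params['data'] no longer contains it.
    return params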
def neural_network_query(self,
                         instruction,
                         callback=False,
                         text=[],
                         ca_threshold=None,
                         drop=None,
                         preprocess=True,
                         test_size=0.2,
                         random_state=49,
                         epochs=50,
                         generate_plots=True,
                         callback_mode='min',
                         maximizer="val_loss",
                         save_model=False,
                         save_path=os.getcwd()):
    '''
    Detects whether the task is a regression or a classification problem and
    then calls the corresponding query.
    :param hyperparameters: all of these are hyperparameters that are passed to the algorithm
    :return: a model, plots, and accuracy information, all stored in the self.models dictionary
    '''

    data = pd.read_csv(self.dataset)

    if preprocess:
        remove = get_similar_column(get_value_instruction(instruction), data)

        if len(data) < 50:
            raise Exception(
                "Only datasets larger than 50 rows are supported for neural networks")

        # A target with 50 or fewer unique values is treated as categorical.
        if len(data[remove].value_counts()) <= 50:
            callback_mode = 'max'
            maximizer = "val_accuracy"
            self.classification_query_ann(
                instruction,
                text=text,
                callback=callback,
                ca_threshold=ca_threshold,
                preprocess=preprocess,
                test_size=test_size,
                random_state=random_state,
                epochs=epochs,
                generate_plots=generate_plots,
                callback_mode=callback_mode,
                maximizer=maximizer,
                save_model=save_model,
                save_path=save_path)
        else:
            self.regression_query_ann(
                instruction,
                callback=callback,
                text=text,
                ca_threshold=ca_threshold,
                preprocess=preprocess,
                test_size=test_size,
                random_state=random_state,
                epochs=epochs,
                generate_plots=generate_plots,
                callback_mode=callback_mode,
                maximizer=maximizer,
                drop=drop,
                save_model=save_model,
                save_path=save_path)
    clearLog()
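# Hedged usage sketch: how neural_network_query might be invoked from a client
# object. The CSV path and instruction below are assumptions for illustration;
# the query itself decides between the classification and regression ANN paths
# based on the number of unique target values.
def _example_neural_network_query():
    from libra import client             # assumes the top-level client wrapper
    newClient = client('housing.csv')    # hypothetical dataset path
    newClient.neural_network_query('predict median house value',
                                   epochs=20,
                                   save_model=False)
    return newClient.models              # trained model info lands in .models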
def csv_preprocessing(csv_file,
                      data_path,
                      instruction,
                      image_column,
                      training_ratio,
                      height,
                      width):
    df = pd.read_csv(csv_file)
    if instruction is None:
        raise BaseException(
            "Instruction was not given for csv file to be processed.")
    label = get_similar_column(get_value_instruction(instruction), df)

    avoid_directories = ["proc_training_set", "proc_testing_set"]
    data_paths = [
        data_path + "/" + d for d in os.listdir(data_path)
        if os.path.isdir(data_path + "/" + d) and d not in avoid_directories
    ]

    file_extensions = ["jpg", "jpeg", "png", "gif"]
    need_file_extension = False
    path_included = False

    count = 0
    while image_column is None:
        if count > 20:
            raise BaseException(
                "Could not locate column containing image information.")
        count += 1
        random_row = df.sample()
        for column, value in random_row.iloc[0].items():
            if isinstance(value, str):
                if os.path.exists(
                        data_path + "/" +
                        (value if value[0] != "/" else value[1:])):
                    path_included = True
                    image_column = column
                    break

                # add file extension if not included
                if value.split(".")[-1] in file_extensions:
                    file = [value]
                else:
                    file = []
                    for extension in file_extensions:
                        file.append(value + "." + extension)

                # look through all data_paths for file
                for path in data_paths:
                    for file_option in file:
                        if os.path.exists(path + "/" + file_option):
                            if file_option.split(".")[-1] in file_extensions:
                                need_file_extension = True
                            image_column = column
                            break
                    if image_column is not None:
                        break
    else:
        # image_column is already known: check whether its values are full
        # paths and whether they already carry a file extension
        if os.path.exists(data_path + "/" + df.iloc[0][image_column]):
            path_included = True
        elif df.iloc[0][image_column].split(".")[-1]:
            need_file_extension = True

    df = df[[image_column, label]].dropna()

    heights = []
    widths = []
    classifications = df[label].value_counts()
    if len(classifications) < 2:
        raise BaseException(
            f"{csv_file} contains {len(classifications)} classes. Need at least two classification labels."
        )
    for key, value in classifications.items():
        if value < 2:
            raise BaseException(
                f"Class: {key} contains {value} images. Need at least two images in this class."
            )

    image_list = []
    # get the median heights and widths
    for index, row in df.iterrows():
        if path_included:
            p = data_path + "/" + \
                (row[image_column][1:] if row[image_column][0] == "/"
                 else row[image_column])
            img = cv2.imread(p)
        else:
            for path in data_paths:
                if need_file_extension:
                    for extension in file_extensions:
                        p = path + "/" + row[image_column] + "." + extension
                        img = cv2.imread(p)
                        if img is not None:
                            break
                else:
                    p = path + "/" + row[image_column]
                    img = cv2.imread(p)
                if img is not None:
                    break
        if img is None:
            raise BaseException(
                f"{row[image_column]} could not be found in any directories.")
        image_list.append(img)
        heights.append(img.shape[0])
        widths.append(img.shape[1])

    height1, width1 = calculate_medians(heights, widths)
    if height is None:
        height = height1
    if width is None:
        width = width1

    # create training and testing folders
    create_folder(data_path, "proc_training_set")
    create_folder(data_path, "proc_testing_set")

    # create classification folders
    for classification in classifications.keys():
        create_folder(data_path + "/proc_training_set", classification)
        create_folder(data_path + "/proc_testing_set", classification)

    data_size = [0, 0]
    class_count = dict.fromkeys(classifications.keys(), 1)
    is_rgb = []

    # save images into correct folder
    for index, row in df.iterrows():
        # resize images
        resized_info = process_color_channel(image_list[index], height, width)
        img = resized_info[0]
        is_rgb.append(resized_info[1])

        p = "proc_" + (os.path.basename(row[image_column])
                       if path_included else row[image_column])
        if need_file_extension:
            p += ".jpg"

        if class_count[row[label]] / classifications[row[label]] < training_ratio:
            data_size[0] += 1
            class_count[row[label]] += 1
            save_image(data_path + "/proc_training_set", img, p, row[label])
        else:
            data_size[1] += 1
            class_count[row[label]] += 1
            save_image(data_path + "/proc_testing_set", img, p, row[label])

    return {
        "num_categories": len(classifications),
        "height": height,
        "width": width,
        "train_size": data_size[0],
        "test_size": data_size[1],
        "gray_scale": not any(is_rgb)
    }
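# Hedged usage sketch for csv_preprocessing: the directory layout and CSV
# (a label column plus an image-path/file-name column) are assumptions made
# for illustration only.
def _example_csv_preprocessing():
    info = csv_preprocessing(
        csv_file='pets/annotations.csv',  # hypothetical CSV with labels
        data_path='pets',                 # hypothetical folder holding the images
        instruction='classify breed',     # used to locate the label column
        image_column=None,                # let the function auto-detect it
        training_ratio=0.8,
        height=None,                      # None -> use median image height
        width=None)                       # None -> use median image width
    # Returns num_categories, image dimensions, train/test sizes, and a
    # grayscale flag describing the processed folders.
    return info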
def image_caption_query(self,
                        instruction,
                        label_column=None,
                        drop=None,
                        epochs=10,
                        preprocess=True,
                        random_state=49,
                        test_size=0.2,
                        top_k=5000,
                        batch_size=32,
                        buffer_size=1000,
                        embedding_dim=256,
                        units=512,
                        gpu=False,
                        generate_plots=True,
                        save_model_decoder=False,
                        save_path_decoder=os.getcwd(),
                        save_model_encoder=False,
                        save_path_encoder=os.getcwd()):
    '''
    Applies the predictive algorithm for image caption generation.
    :param many params: used to hyperparameterize the function.
    :return: a dictionary object with all of the information for the algorithm.
    '''

    if test_size < 0:
        raise Exception("Test size must be a float between 0 and 1")

    if test_size >= 1:
        raise Exception(
            "Test size must be a float between 0 and 1 (a test size greater than or equal to 1 results in no training "
            "data)")

    if top_k < 1:
        raise Exception("Top_k value must be equal to or greater than 1")

    if batch_size < 1:
        raise Exception("Batch size must be equal to or greater than 1")

    if buffer_size < 1:
        raise Exception("Buffer size must be equal to or greater than 1")

    if embedding_dim < 1:
        raise Exception(
            "Embedding dimension must be equal to or greater than 1")

    if units < 1:
        raise Exception("Units must be equal to or greater than 1")

    if epochs < 1:
        raise Exception(
            "Epoch number is less than 1 (model will not be trained)")

    if save_model_decoder:
        if not os.path.exists(save_path_decoder):
            raise Exception("Decoder save path does not exist")

    if save_model_encoder:
        if not os.path.exists(save_path_encoder):
            raise Exception("Encoder save path does not exist")

    if test_size == 0:
        testing = False
    else:
        testing = True

    if gpu:
        if tf.test.gpu_device_name():
            print('Default GPU Device: {}'.format(tf.test.gpu_device_name()))
        else:
            raise Exception("Please install GPU version of Tensorflow")
        device = '/device:GPU:0'
    else:
        device = '/device:CPU:0'

    np.random.seed(random_state)
    tf.random.set_seed(random_state)

    data = DataReader(self.dataset)
    df = data.data_generator()

    if preprocess:
        df.fillna(0, inplace=True)
    if drop is not None:
        df.drop(drop, axis=1, inplace=True)

    logger("Preprocessing data")

    train_captions = []
    img_name_vector = []

    if label_column is None:
        label = instruction
    else:
        label = label_column

    x = get_path_column(df)
    y = get_similar_column(get_value_instruction(label), df)
    logger("->", "Target Column Found: {}".format(y))

    for row in df.iterrows():
        if preprocess:
            caption = '<start> ' + row[1][y] + ' <end>'
        image_id = row[1][x]
        image_path = image_id

        img_name_vector.append(image_path)
        train_captions.append(caption)

    image_model = tf.keras.applications.InceptionV3(include_top=False,
                                                    weights='imagenet')
    new_input = image_model.input
    hidden_layer = image_model.layers[-1].output

    logger("Extracting features from model")

    image_features_extract_model = tf.keras.Model(new_input, hidden_layer)
    image_dataset = tf.data.Dataset.from_tensor_slices(
        sorted(set(img_name_vector)))
    image_dataset = image_dataset.map(
        load_image,
        num_parallel_calls=tf.data.experimental.AUTOTUNE).batch(16)

    # cache InceptionV3 features for each image as .npy files on disk
    for img, path in image_dataset:
        batch_features = image_features_extract_model(img)
        batch_features = tf.reshape(
            batch_features,
            (batch_features.shape[0], -1, batch_features.shape[3]))

        for bf, p in zip(batch_features, path):
            path_of_feature = p.numpy().decode("utf-8")
            np.save(path_of_feature, bf.numpy())

    logger("->", "Tokenizing top {} words".format(top_k))

    tokenizer = tf.keras.preprocessing.text.Tokenizer(
        num_words=top_k,
        oov_token="<unk>",
        filters='!"#$%&()*+.,-/:;=?@[\]^_`{|}~ ')
    tokenizer.fit_on_texts(train_captions)
    tokenizer.word_index['<pad>'] = 0
    tokenizer.index_word[0] = '<pad>'

    train_seqs = tokenizer.texts_to_sequences(train_captions)
    cap_vector = tf.keras.preprocessing.sequence.pad_sequences(
        train_seqs, padding='post')

    vocab_size = top_k + 1
    # num_steps = len(img_name_vector) // batch_size

    if testing:
        img_name_train, img_name_val, cap_train, cap_val = train_test_split(
            img_name_vector, cap_vector, test_size=test_size, random_state=0)
    else:
        img_name_train = img_name_vector
        cap_train = cap_vector

    dataset = tf.data.Dataset.from_tensor_slices((img_name_train, cap_train))
    dataset = dataset.map(
        lambda item1, item2: tf.numpy_function(
            map_func, [item1, item2], [tf.float32, tf.int32]),
        num_parallel_calls=tf.data.experimental.AUTOTUNE)

    # Shuffle and batch
    logger("Shuffling dataset")
    dataset = dataset.shuffle(buffer_size).batch(batch_size)
    dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

    if testing:
        dataset_val = tf.data.Dataset.from_tensor_slices(
            (img_name_val, cap_val))
        dataset_val = dataset_val.map(
            lambda item1, item2: tf.numpy_function(
                map_func, [item1, item2], [tf.float32, tf.int32]),
            num_parallel_calls=tf.data.experimental.AUTOTUNE)

        # Shuffle and batch
        dataset_val = dataset_val.shuffle(buffer_size).batch(batch_size)
        dataset_val = dataset_val.prefetch(
            buffer_size=tf.data.experimental.AUTOTUNE)

    logger("Establishing encoder decoder framework")
    encoder = CNN_Encoder(embedding_dim)
    decoder = RNN_Decoder(embedding_dim, units, vocab_size)

    optimizer = tf.keras.optimizers.Adam()
    loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction='none')

    def loss_function(real, pred):
        mask = tf.math.logical_not(tf.math.equal(real, 0))
        loss_ = loss_object(real, pred)

        mask = tf.cast(mask, dtype=loss_.dtype)
        loss_ *= mask

        return tf.reduce_mean(loss_)

    @tf.function
    def train_step(img_tensor, target):
        with tf.device(device):
            loss = 0

            # initializing the hidden state for each batch
            # because the captions are not related from image to image
            hidden = decoder.reset_state(batch_size=target.shape[0])

            dec_input = tf.expand_dims(
                [tokenizer.word_index['<start>']] * target.shape[0], 1)

            with tf.GradientTape() as tape:
                features = encoder(img_tensor)

                for i in range(1, target.shape[1]):
                    # passing the features through the decoder
                    predictions, hidden, _ = decoder(dec_input, features, hidden)

                    loss += loss_function(target[:, i], predictions)

                    # using teacher forcing
                    dec_input = tf.expand_dims(target[:, i], 1)

            total_loss = (loss / int(target.shape[1]))

            trainable_variables = encoder.trainable_variables + decoder.trainable_variables
            gradients = tape.gradient(loss, trainable_variables)
            optimizer.apply_gradients(zip(gradients, trainable_variables))

            return loss, total_loss

    @tf.function
    def val_step(img_tensor, target):
        with tf.device(device):
            loss = 0

            # initializing the hidden state for each batch
            # because the captions are not related from image to image
            hidden = decoder.reset_state(batch_size=target.shape[0])

            dec_input = tf.expand_dims(
                [tokenizer.word_index['<start>']] * target.shape[0], 1)

            with tf.GradientTape() as tape:
                features = encoder(img_tensor)

                for i in range(1, target.shape[1]):
                    # passing the features through the decoder
                    predictions, hidden, _ = decoder(dec_input, features, hidden)

                    loss += loss_function(target[:, i], predictions)

                    # using teacher forcing
                    dec_input = tf.expand_dims(target[:, i], 1)

            total_loss = (loss / int(target.shape[1]))

            return total_loss

    logger("Training model...")
    with tf.device(device):
        loss_plot_train = []
        loss_plot_val = []
        for epoch in range(epochs):
            total_loss = 0
            total_loss_val = 0
            for (batch, (img_tensor, target)) in enumerate(dataset):
                batch_loss, t_loss = train_step(img_tensor, target)
                total_loss += t_loss
            loss_plot_train.append(total_loss.numpy())

            if testing:
                for (batch, (img_tensor, target)) in enumerate(dataset_val):
                    # evaluate on the validation set without updating weights
                    t_loss = val_step(img_tensor, target)
                    total_loss_val += t_loss
                loss_plot_val.append(total_loss_val.numpy())

    # remove the cached .npy feature files
    dir_name = os.path.dirname(img_name_vector[0])
    files = os.listdir(dir_name)
    for item in files:
        if item.endswith(".npy"):
            os.remove(os.path.join(dir_name, item))

    plots = {}
    if generate_plots:
        logger("Generating plots")
        plots.update({
            "loss": libra.plotting.nonkeras_generate_plots.plot_loss(
                loss_plot_train, loss_plot_val)
        })

    logger("->", "Final training loss: {}".format(str(total_loss.numpy())))
    total_loss = total_loss.numpy()
    if testing:
        total_loss_val = total_loss_val.numpy()
        total_loss_val_str = str(total_loss_val)
    else:
        total_loss_val = 0
        total_loss_val_str = str("0, No validation done")

    logger("->", "Final validation loss: {}".format(total_loss_val_str))

    if save_model_decoder:
        logger("Saving decoder checkpoint...")
        decoder.save_weights(save_path_decoder + "decoderImgCap.ckpt")

    if save_model_encoder:
        logger("Saving encoder checkpoint...")
        encoder.save_weights(save_path_encoder + "encoderImgCap.ckpt")

    logger("Storing information in client object under key 'image_caption'")

    self.models["image_caption"] = {
        "decoder": decoder,
        "encoder": encoder,
        "tokenizer": tokenizer,
        "feature_extraction": image_features_extract_model,
        "plots": plots,
        'losses': {
            'Training loss': total_loss,
            'Validation loss': total_loss_val
        }
    }
    clearLog()
    return self.models["image_caption"]
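# Hedged usage sketch: a possible call to image_caption_query through a client
# instance. The dataset path and column layout (an image-path column plus a
# caption column) are assumptions for illustration.
def _example_image_caption_query():
    from libra import client                  # assumes the top-level client wrapper
    newClient = client('captions.csv')         # hypothetical CSV of image paths + captions
    newClient.image_caption_query('generate captions',
                                  epochs=5,
                                  top_k=5000,
                                  batch_size=32)
    # decoder, encoder, tokenizer, feature extractor, plots, and losses
    return newClient.models['image_caption']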
def dimensionality_RF(instruction, dataset, target="", y="", n_features=10):
    global counter

    dataReader = DataReader(dataset)

    if target == "":
        data = dataReader.data_generator()
        data.fillna(0, inplace=True)
        remove = get_similar_column(get_value_instruction(instruction), data)

        data, y, target, full_pipeline = initial_preprocesser(
            data, instruction, True, 0.2, [], 0.2, random_state=49)

        le = preprocessing.LabelEncoder()

        X_train = data['train']
        y_train = y['train']
        X_test = data['test']
        y_test = y['test']

        y_train = le.fit_transform(y_train)
        y_test = le.fit_transform(y_test)

    # baseline accuracy on the full feature set
    first_classifier = tree.DecisionTreeClassifier()
    first_classifier.fit(X_train, y_train)
    first_classifier_acc = accuracy_score(
        first_classifier.predict(X_test), y_test)

    accuracy_scores = [first_classifier_acc]
    columns = []
    datas = []
    datas.append(dataset)
    columns.append([])

    # i controls the forest depth, x the number of top features to keep
    for i, x in product(range(3, 10), range(4, len(X_train.columns))):
        feature_model = RandomForestRegressor(random_state=1, max_depth=i)
        feature_model.fit(X_train, y_train)

        importances = feature_model.feature_importances_
        indices = np.argsort(importances)[-x:]
        columns.append(X_train.columns[indices])

        X_temp_train = X_train[X_train.columns[indices]]
        X_temp_test = X_test[X_train.columns[indices]]

        val = pd.DataFrame(np.r_[X_temp_train, X_temp_test])
        val[target] = np.r_[y_train, y_test]
        datas.append(val)

        vr = tree.DecisionTreeClassifier()
        vr.fit(X_temp_train, y_train)
        accuracy_scores.append(accuracy_score(vr.predict(X_temp_test), y_test))

    the_index = accuracy_scores.index(max(accuracy_scores))
    print(accuracy_scores)

    return datas[the_index], accuracy_scores[0], max(accuracy_scores), list(
        columns[the_index])
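# Hedged usage sketch for dimensionality_RF: when called with only an
# instruction and a dataset, it preprocesses the data itself. The path below
# is a hypothetical CSV.
def _example_dimensionality_RF():
    reduced_df, baseline_acc, best_acc, kept_columns = dimensionality_RF(
        'predict median house value',  # hypothetical instruction
        'housing.csv')                 # hypothetical dataset path
    # reduced_df keeps only the feature subset that scored best with a decision tree.
    return reduced_df, baseline_acc, best_acc, kept_columns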
def dimensionality_ICA(instruction, dataset, target="", y=""):
    global counter

    dataReader = DataReader(dataset)

    if target == "":
        data = dataReader.data_generator()
        data.fillna(0, inplace=True)
        remove = get_similar_column(get_value_instruction(instruction), data)

        data, y, target, full_pipeline = initial_preprocesser(
            data, instruction, True, 0.2, [], 0.2, random_state=49)

        X_train = data['train']
        X_test = data['test']

        y_train = y['train']
        y_test = y['test']

    pca = FastICA(n_components=len(X_train.columns))
    X_train_mod = pca.fit_transform(X_train)
    X_test_mod = pca.fit_transform(X_test)

    clf = tree.DecisionTreeClassifier()
    clf.fit(X_train, y_train)

    clf_mod = tree.DecisionTreeClassifier()
    clf_mod.fit(X_train_mod, y_train)

    acc = []
    sets = []
    acc.append(accuracy_score(clf_mod.predict(X_test_mod), y_test))

    frame = pd.DataFrame(
        pd.DataFrame(X_train_mod).append(pd.DataFrame(X_test_mod)))
    frame[target] = np.r_[y_train, y_test]
    sets.append(frame)

    for i in range(2, len(X_train.columns)):
        pca = FastICA(n_components=i)
        X_train_mod = pca.fit_transform(X_train)
        X_test_mod = pca.fit_transform(X_test)

        frame = pd.DataFrame(
            pd.DataFrame(X_train_mod).append(pd.DataFrame(X_test_mod)))
        frame[target] = np.r_[y_train, y_test]
        sets.append(frame)

        clf_mod = tree.DecisionTreeClassifier()
        clf_mod.fit(X_train_mod, y_train)
        acc.append(accuracy_score(clf_mod.predict(X_test_mod), y_test))

    del i
    data_modified = sets[acc.index(max(acc))]
    score = max(acc)

    return data_modified, score, (
        (len(X_train.columns) + 1) - len(data_modified.columns))
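# Hedged usage sketch for dimensionality_ICA, mirroring the call above; the
# dataset path and instruction are again illustrative assumptions.
def _example_dimensionality_ICA():
    reduced_df, best_acc, n_removed = dimensionality_ICA(
        'predict median house value',  # hypothetical instruction
        'housing.csv')                 # hypothetical dataset path
    # n_removed reports how many fewer columns the best ICA projection has
    # than the original feature set.
    return reduced_df, best_acc, n_removed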
def dimensionality_reduc(
        instruction,
        dataset,
        arr=["RF", "PCA", "KPCA", "ICA"],
        inplace=False):
    '''
    Performs dimensionality reduction on the dataset (retains only the features
    with the most relevance in the multidimensional space of the dataset).
    :param instruction: command sent to client instance in written query
    :param dataset: data instantiated in client instance passed to the algorithm
    :param arr: list of dimensionality-reduction techniques to choose from
    :param inplace: option to keep features that were deemed as not important intact in the dataset
    '''
    global counter

    dataReader = DataReader(dataset)

    logger("loading dataset...")
    data = dataReader.data_generator()
    data.fillna(0, inplace=True)

    logger("getting most similar column from instruction...")
    target = get_similar_column(get_value_instruction(instruction), data)

    y = data[target]
    del data[target]
    le = preprocessing.LabelEncoder()
    y = le.fit_transform(y)

    data = structured_preprocesser(data)

    perms = []
    overall_storage = []
    finals = []

    logger("generating dimensionality permutations...")
    for i in range(1, len(arr) + 1):
        for elem in list(permutations(arr, i)):
            perms.append(elem)

    logger("running each possible permutation...")
    logger("realigning tensors...")
    for path in perms:
        currSet = data
        for element in path:
            if element == "RF":
                data_mod, beg_acc, final_acc, col_removed = dimensionality_RF(
                    instruction, currSet, target, y)
            elif element == "PCA":
                data_mod, beg_acc, final_acc, col_removed = dimensionality_PCA(
                    instruction, currSet, target, y)
            elif element == "KPCA":
                data_mod, beg_acc, final_acc, col_removed = dimensionality_KPCA(
                    instruction, currSet, target, y)
            elif element == "ICA":
                data_mod, beg_acc, final_acc, col_removed = dimensionality_ICA(
                    instruction, currSet, target, y)
            overall_storage.append(
                list([data_mod, beg_acc, final_acc, col_removed]))
            currSet = data_mod
        finals.append(overall_storage[len(overall_storage) - 1])

    logger("Fetching Best Accuracies...")
    accs = []
    logger("->", "Baseline Accuracy: " + str(finals[0][1]))
    # print("----------------------------")
    col_name = [["Permutation ", "| Final Accuracy "]]
    col_width = max(len(word) for row in col_name for word in row) + 5
    printtable(col_name, col_width)
    for i, element in product(range(len(finals)), finals):
        values = []
        values.append(str(perms[i]))
        values.append("| " + str(element[2]))
        datax = []
        datax.append(values)
        printtable(datax, col_width)
        del values, datax
        if finals[0][1] < element[2]:
            accs.append(list([str(perms[i]), "| " + str(element[2])]))
    print("")

    logger("->", " Best Accuracies")
    # print("----------------------------")
    col_name = [["Permutation ", "| Final Accuracy "]]
    printtable(col_name, col_width)
    printtable(accs, col_width)

    if inplace:
        data.to_csv(dataset)
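# Hedged usage sketch for dimensionality_reduc: tries every permutation of the
# listed reducers and prints a table of final accuracies. The arguments below
# are illustrative assumptions.
def _example_dimensionality_reduc():
    dimensionality_reduc(
        'predict median house value',  # hypothetical instruction
        'housing.csv',                 # hypothetical dataset path
        arr=["RF", "ICA"],             # only chain these two reducers
        inplace=False)                 # leave the CSV on disk untouched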
def dimensionality_RF(instruction, dataset, target="", y="", n_features=10):
    '''
    function to reduce dimensionality in dataset via random forest method
    :param instruction: command sent to client instance in written query.
    :param dataset: data instantiated in client instance passed to the algorithm
    :param target: column name of response variable/feature
    :param y: dictionary of train/test data values associated with response variable/feature
    :param n_features: maximum number of features to choose to analyze/select
    '''
    global counter

    dataReader = DataReader("./data/" + get_last_file()[0])

    if target == "":
        data = dataReader.data_generator()
        data.fillna(0, inplace=True)
        remove = get_similar_column(get_value_instruction(instruction), data)
        data = structured_preprocesser(data)

        y = data[remove]
        del data[remove]
        le = preprocessing.LabelEncoder()
        y = le.fit_transform(y)

    X_train, X_test, y_train, y_test = train_test_split(
        dataset, y, test_size=0.2, random_state=49)

    first_classifier = tree.DecisionTreeClassifier()
    first_classifier.fit(X_train, y_train)
    first_classifier_acc = accuracy_score(
        first_classifier.predict(X_test), y_test)

    accuracy_scores = [first_classifier_acc]
    columns = []
    datas = []
    datas.append(dataset)
    columns.append([])

    for i, x in product(range(3, 10), range(4, len(dataset.columns))):
        feature_model = RandomForestRegressor(random_state=1, max_depth=i)
        feature_model.fit(X_train, y_train)

        importances = feature_model.feature_importances_
        indices = np.argsort(importances)[-x:]
        columns.append(dataset.columns[indices])

        X_temp_train = X_train[dataset.columns[indices]]
        X_temp_test = X_test[dataset.columns[indices]]

        val = pd.DataFrame(np.r_[X_temp_train, X_temp_test])
        val[target] = np.r_[y_train, y_test]
        datas.append(val)

        vr = tree.DecisionTreeClassifier()
        vr.fit(X_temp_train, y_train)
        accuracy_scores.append(accuracy_score(vr.predict(X_temp_test), y_test))

    the_index = accuracy_scores.index(max(accuracy_scores))

    return datas[the_index], accuracy_scores[0], max(
        accuracy_scores), list(columns[the_index])
def dimensionality_reduc(
        instruction,
        dataset,
        arr=["RF", "PCA", "KPCA", "ICA"],
        inplace=False):
    global currLog
    global counter

    dataReader = DataReader(dataset)

    logger("loading dataset...")
    data = dataReader.data_generator()
    data.fillna(0, inplace=True)

    logger("getting most similar column from instruction...")
    target = get_similar_column(get_value_instruction(instruction), data)

    y = data[target]
    del data[target]
    le = preprocessing.LabelEncoder()
    y = le.fit_transform(y)

    data = structured_preprocesser(data)

    perms = []
    overall_storage = []
    finals = []

    logger("generating dimensionality permutations...")
    for i in range(1, len(arr) + 1):
        for elem in list(permutations(arr, i)):
            perms.append(elem)

    logger("running each possible permutation...")
    logger("realigning tensors...")
    for path in perms:
        currSet = data
        for element in path:
            if element == "RF":
                data_mod, beg_acc, final_acc, col_removed = dimensionality_RF(
                    instruction, currSet, target, y)
            elif element == "PCA":
                data_mod, beg_acc, final_acc, col_removed = dimensionality_PCA(
                    instruction, currSet, target, y)
            elif element == "KPCA":
                data_mod, beg_acc, final_acc, col_removed = dimensionality_KPCA(
                    instruction, currSet, target, y)
            elif element == "ICA":
                data_mod, beg_acc, final_acc, col_removed = dimensionality_ICA(
                    instruction, currSet, target, y)
            overall_storage.append(
                list([data_mod, beg_acc, final_acc, col_removed]))
            currSet = data_mod
        finals.append(overall_storage[len(overall_storage) - 1])

    logger("Fetching Best Accuracies...")
    accs = []
    print("")
    print("Baseline Accuracy: " + str(finals[0][1]))
    print("----------------------------")
    for i, element in product(range(len(finals)), finals):
        print("Permutation --> " +
              str(perms[i]) +
              " | Final Accuracy --> " +
              str(element[2]))
        if finals[0][1] < element[2]:
            accs.append(list(["Permutation --> " +
                              str(perms[i]) +
                              " | Final Accuracy --> " +
                              str(element[2])]))
    print("")
    print("Best Accuracies")
    print("----------------------------")
    for element in accs:
        print(element)

    if inplace:
        data.to_csv(dataset)