def main(argv):
    """Preprocess a new file against the given package paths, then load the model.

    Args:
        argv: Command-line arguments without the program name. The first
            positional argument is the new file path; every remaining
            positional argument is forwarded as the list of package paths
            with class labels. The list is left unmodified.

    Raises:
        IndexError: If no positional argument is supplied.
        getopt.GetoptError: If ``argv`` starts with an option flag (no
            options are accepted).
    """
    _, positional = getopt.getopt(argv, "")
    if not positional:
        raise IndexError(
            "main() needs at least one positional argument: the new file path")

    # getopt hands back the caller's own list when it consumes no options, so
    # index/slice instead of pop() to avoid mutating the caller's argv.
    new_file_path = positional[0]
    packages_path_with_class_label = positional[1:]

    preprocess(new_file_path, packages_path_with_class_label)
    load_model()
def __init__(self, trainX, trainY, testX, testY, **kwargs):
    """Fit a decision tree on upsampled, ANOVA-reduced training data.

    Train and test samples are stacked and pushed through ``preprocess``
    together, split back at the original boundary, and only the training
    part is upsampled and reduced before the classifier is fitted.

    Args:
        trainX: Training samples.
        trainY: Training labels.
        testX: Test samples.
        testY: Test labels.
        **kwargs: Forwarded unchanged to ``DecisionTreeClassifier``.

    Sets ``trainX``, ``trainY``, ``testX``, ``testY``, ``reducedDf`` and
    ``dtModel`` on the instance.
    """
    # Rows before this index are training rows, the rest are test rows.
    split_at = len(trainX)
    stacked_samples = pd.DataFrame(data=np.concatenate([trainX, testX]))
    stacked_labels = pd.DataFrame(np.concatenate([trainY, testY]))

    # preprocess presumably appends the label column (per the original
    # comments) -- confirm in utils.preprocessor. The last column is dropped
    # and the labels are attached again by a second preprocess call.
    frame = preprocess(stacked_samples, stacked_labels, False)
    frame = frame.iloc[:, :-1]
    frame = preprocess(frame, stacked_labels, False)

    # Separate the sample columns from the trailing label column.
    n_columns = frame.shape[1]
    sample_matrix = frame.values[:, 0:n_columns - 1]
    label_vector = frame.values[:, [n_columns - 1]].flatten()

    # The test part is kept as a DataFrame slice / label array.
    self.testX = frame.iloc[split_at:, :-1]
    self.testY = label_vector[split_at:]

    # Upsample the training part only; the seed is fixed right before the call.
    np.random.seed(42)
    upsampled_samples, upsampled_labels = doUpsamling(
        sample_matrix[:split_at], label_vector[:split_at])

    # Wrap the upsampled training data in DataFrames with x1..xN column names.
    # NOTE(review): the names are passed as a nested list ([feature_names]),
    # exactly as before -- confirm that this column layout is intended.
    feature_names = [
        'x' + str(position + 1)
        for position in range(upsampled_samples.shape[1])
    ]
    self.trainX = pd.DataFrame(data=upsampled_samples[:],
                               columns=[feature_names])
    self.trainY = pd.DataFrame(data=upsampled_labels[:], columns=['cancer'])

    # Feature reduction (ANOVA, threshold 0.01) runs on the training data only.
    training_frame = preprocess(self.trainX, self.trainY, False)
    _, _, reduced = reduceDimentions(training_frame, 'ANOVA', 0.01,
                                     reduce=True)

    # Re-attach the last column of the combined training frame.
    self.reducedDf = reduced.join(training_frame.iloc[:, -1])

    # Build and fit the tree with the caller-supplied parameters.
    tree = DecisionTreeClassifier(**kwargs)
    self.dtModel = tree.fit(self.reducedDf.iloc[:, :-1], self.trainY)
def __init__(self, trainX, trainY, testX, testY, k=1):
    """Fit a k-nearest-neighbours classifier on reduced, upsampled, scaled data.

    Train and test samples are stacked, reduced with ANOVA, split back at
    the original boundary, and the training part is upsampled. A min-max
    scaler fitted on the upsampled training data is applied to both parts.

    Args:
        trainX: Training samples.
        trainY: Training labels.
        testX: Test samples.
        testY: Test labels.
        k: Number of neighbours passed to ``KNeighborsClassifier``.

    Sets ``testX`` (scaled), ``testY`` and ``knn`` on the instance.
    """
    # Rows before this index are training rows, the rest are test rows.
    split_at = len(trainX)
    stacked_samples = pd.DataFrame(data=np.concatenate([trainX, testX]))
    stacked_labels = pd.DataFrame(np.concatenate([trainY, testY]))
    labelled = preprocess(stacked_samples, stacked_labels, False)

    # ANOVA-based reduction (threshold 0.01) on the stacked data, after which
    # preprocess is called again with the labels (presumably re-attaching the
    # label column -- confirm in utils.preprocessor).
    _, _, reduced = reduceDimentions(labelled, 'ANOVA', 0.01, reduce=True)
    labelled = preprocess(reduced, stacked_labels, False)

    # Separate the sample columns from the trailing label column.
    n_columns = labelled.shape[1]
    sample_matrix = labelled.values[:, 0:n_columns - 1]
    label_vector = labelled.values[:, [n_columns - 1]].flatten()

    test_samples = sample_matrix[split_at:]
    self.testY = label_vector[split_at:]

    # Upsample the training part only; the seed is fixed right before the call.
    np.random.seed(42)
    train_samples, train_labels = doUpsamling(sample_matrix[:split_at],
                                              label_vector[:split_at])

    # The scaler is fitted on the training data and reused for the test data.
    min_max = preprocessing.MinMaxScaler().fit(train_samples)
    scaled_train_samples = min_max.transform(train_samples)
    self.testX = min_max.transform(test_samples)

    # Fit kNN with the requested number of neighbours.
    self.knn = KNeighborsClassifier(n_neighbors=k)
    self.knn.fit(scaled_train_samples, train_labels)
def _scale_network_output(network_output, loss_name):
    """Apply the activation matching the loss and pick the evaluated channel.

    Args:
        network_output: 4-D network output tensor (indexed as [:, :, :, c]).
        loss_name: Name of the configured loss, 'sigmoid' or 'softmax'.

    Returns:
        Channel 0 of the sigmoid output for 'sigmoid', or channel 1 of the
        softmax output for 'softmax'.

    Raises:
        ValueError: If ``loss_name`` is neither 'sigmoid' nor 'softmax'.
    """
    if loss_name == 'sigmoid':
        return tf.nn.sigmoid(network_output)[:, :, :, 0]
    if loss_name == 'softmax':
        assert (network_output.get_shape().as_list()[-1] == 2)
        return tf.nn.softmax(network_output)[:, :, :, 1]
    raise ValueError('Unsupported loss name: {}'.format(loss_name))


def _general_model_fn(features, pipeline_config, result_folder, dataset_info,
                      feature_extractor, mode, num_gpu,
                      visualization_file_names, eval_dir):
    """Build the ``tf.estimator.EstimatorSpec`` for TRAIN, EVAL or PREDICT.

    Args:
        features: Dict of input tensors keyed by
            ``standard_fields.InputDataFields`` entries.
        pipeline_config: Pipeline configuration; the ``dataset``, ``model``,
            ``train_config``, ``eval_config`` and ``metrics_tp_thresholds``
            fields are read.
        result_folder: Passed to the visualization and patient metric hooks.
        dataset_info: Mapping that provides
            ``standard_fields.PickledDatasetInfo.patient_ratio`` when the
            weighted loss is enabled.
        feature_extractor: Object whose ``build_network`` creates the network.
        mode: One of the ``tf.estimator.ModeKeys`` values.
        num_gpu: Not used by this function.
        visualization_file_names: Passed to the evaluation visualization hook.
        eval_dir: Passed to the visualization and patient metric hooks.

    Returns:
        The ``tf.estimator.EstimatorSpec`` for the requested mode.

    Raises:
        ValueError: If ``mode`` is not TRAIN, EVAL or PREDICT, or if the
            configured loss name is unsupported in EVAL/PREDICT.
        SystemExit: If the optimizer is configured to use moving averages.
    """
    loss_name = pipeline_config.train_config.loss.name
    is_training = mode == tf.estimator.ModeKeys.TRAIN
    is_predicting = mode == tf.estimator.ModeKeys.PREDICT

    num_classes = pipeline_config.dataset.num_classes
    # The softmax loss needs one extra output channel next to the single
    # configured class.
    add_background_class = loss_name == 'softmax'
    if add_background_class:
        assert (num_classes == 1)
        num_classes += 1

    image_batch = features[standard_fields.InputDataFields.image_decoded]

    # No annotation mask is read in prediction mode.
    if is_predicting:
        annotation_mask_batch = None
    else:
        annotation_mask_batch = features[
            standard_fields.InputDataFields.annotation_mask]

    if is_training:
        # Augment images
        image_batch, annotation_mask_batch = (
            preprocessor.apply_data_augmentation(
                pipeline_config.train_config.data_augmentation_options,
                images=image_batch,
                gt_masks=annotation_mask_batch,
                batch_size=pipeline_config.train_config.batch_size))

    # General preprocessing
    image_batch_preprocessed = preprocessor.preprocess(
        image_batch,
        pipeline_config.dataset.val_range,
        scale_input=pipeline_config.dataset.scale_input)

    network_output = feature_extractor.build_network(
        image_batch_preprocessed,
        is_training=is_training,
        num_classes=num_classes,
        use_batch_norm=pipeline_config.model.use_batch_norm,
        bn_momentum=pipeline_config.model.batch_norm_momentum,
        bn_epsilon=pipeline_config.model.batch_norm_epsilon,
        activation_fn=activation_fn_builder.build(pipeline_config.model))

    if is_training:
        # Record model variable summaries
        for var in tf.trainable_variables():
            tf.summary.histogram('ModelVars/' + var.op.name, var)

    network_output_shape = network_output.get_shape().as_list()
    if not is_predicting:
        # Crop the mask when its spatial size differs from the network output.
        if (network_output_shape[1:3]
                != annotation_mask_batch.get_shape().as_list()[1:3]):
            annotation_mask_batch = image_utils.central_crop(
                annotation_mask_batch,
                desired_size=network_output.get_shape().as_list()[1:3])

        # NOTE(review): clipping is applied to every non-PREDICT batch here,
        # not only to cropped ones -- confirm against the original nesting.
        annotation_mask_batch = tf.cast(
            tf.clip_by_value(annotation_mask_batch, 0, 1), dtype=tf.int64)

        assert (len(annotation_mask_batch.get_shape()) == 4)
        assert (annotation_mask_batch.get_shape().as_list()[:3]
                == network_output.get_shape().as_list()[:3])

    # The loss weight is only applied during training. Applying it during
    # evaluation would just make the loss minimal for the f2 score, while
    # optimizing for the f1 score reaches the same optimum.
    if pipeline_config.train_config.loss.use_weighted and is_training:
        patient_ratio = dataset_info[
            standard_fields.PickledDatasetInfo.patient_ratio]
        cancer_pixels = tf.reduce_sum(tf.to_float(annotation_mask_batch))
        healthy_pixels = tf.to_float(
            tf.size(annotation_mask_batch)) - cancer_pixels
        # +1.0 keeps the ratio finite for batches without cancer pixels.
        batch_pixel_ratio = tf.div(healthy_pixels, cancer_pixels + 1.0)
        loss_weight = (
            ((batch_pixel_ratio * patient_ratio)
             + pipeline_config.train_config.loss.weight_constant_add)
            * pipeline_config.train_config.loss.weight_constant_multiply)
    else:
        loss_weight = tf.constant(1.0)

    if is_predicting:
        loss = None
    else:
        loss = _loss(tf.reshape(annotation_mask_batch, [-1]),
                     tf.reshape(network_output, [-1, num_classes]),
                     loss_name=loss_name,
                     pos_weight=loss_weight)
        loss = tf.identity(loss, name='ModelLoss')
        tf.summary.scalar(loss.op.name, loss, family='Loss')

        total_loss = tf.identity(loss, name='TotalLoss')

    if is_training:
        if pipeline_config.train_config.add_regularization_loss:
            regularization_losses = tf.get_collection(
                tf.GraphKeys.REGULARIZATION_LOSSES)
            if regularization_losses:
                regularization_loss = tf.add_n(regularization_losses,
                                               name='RegularizationLoss')
                total_loss = tf.add_n([loss, regularization_loss],
                                      name='TotalLoss')
                tf.summary.scalar(regularization_loss.op.name,
                                  regularization_loss, family='Loss')

        # NOTE(review): summary and numerics check are placed at TRAIN level;
        # confirm against the original nesting.
        tf.summary.scalar(total_loss.op.name, total_loss, family='Loss')
        total_loss = tf.check_numerics(total_loss,
                                       'LossTensor is inf or nan.')

    scaffold = None
    update_ops = []
    if is_training:
        if pipeline_config.train_config.optimizer.use_moving_average:
            # EMAs are currently not supported with tf's DistributionStrategy.
            # Re-enable once the upstream bugs are fixed. Raising SystemExit
            # directly avoids the site-only `exit` helper.
            logging.warning(
                'EMA is currently not supported with tf DistributionStrategy.')
            raise SystemExit(1)

        # TODO: once EMAs work again, a swapping saver can swap the trained
        # variables with their moving averages before saving, removing the
        # need to care for moving averages during evaluation:
        # scaffold = tf.train.Scaffold(saver=optimizer.swapping_saver())
        optimizer, optimizer_summary_vars = optimizer_builder.build(
            pipeline_config.train_config.optimizer)
        for var in optimizer_summary_vars:
            tf.summary.scalar(var.op.name, var, family='LearningRate')

        grads_and_vars = optimizer.compute_gradients(total_loss)
        update_ops.append(
            optimizer.apply_gradients(
                grads_and_vars, global_step=tf.train.get_global_step()))

    # Add the collected graph update ops one by one so that tf.group receives
    # a flat argument list instead of a nested list.
    graph_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    update_ops.extend(graph_update_ops)
    update_op = tf.group(*update_ops, name='update_barrier')

    # Evaluating train_op forces all update ops to run first.
    with tf.control_dependencies([update_op]):
        if is_predicting:
            train_op = None
        else:
            train_op = tf.identity(total_loss)

    if is_training:
        logging.info("Total number of trainable parameters: {}".format(
            np.sum([
                np.prod(v.get_shape().as_list())
                for v in tf.trainable_variables()
            ])))

        # Training hooks are not working with MirroredStrategy (noted as
        # fixed in 1.13), so no PrintHook is attached here.
        return tf.estimator.EstimatorSpec(mode,
                                          loss=total_loss,
                                          train_op=train_op,
                                          scaffold=scaffold)

    if mode == tf.estimator.ModeKeys.EVAL:
        scaled_network_output = _scale_network_output(network_output,
                                                      loss_name)

        # Metrics
        metric_dict, statistics_dict = metric_utils.get_metrics(
            scaled_network_output,
            annotation_mask_batch,
            tp_thresholds=np.array(pipeline_config.metrics_tp_thresholds,
                                   dtype=np.float32),
            parallel_iterations=min(pipeline_config.eval_config.batch_size,
                                    util_ops.get_cpu_count()))

        vis_hook = session_hooks.VisualizationHook(
            result_folder=result_folder,
            visualization_file_names=visualization_file_names,
            file_name=features[standard_fields.InputDataFields.image_file],
            image_decoded=image_batch,
            annotation_decoded=features[
                standard_fields.InputDataFields.annotation_decoded],
            predicted_mask=scaled_network_output,
            eval_dir=eval_dir)
        patient_metric_hook = session_hooks.PatientMetricHook(
            statistics_dict=statistics_dict,
            patient_id=features[standard_fields.InputDataFields.patient_id],
            result_folder=result_folder,
            tp_thresholds=pipeline_config.metrics_tp_thresholds,
            eval_dir=eval_dir)

        return tf.estimator.EstimatorSpec(
            mode,
            loss=total_loss,
            train_op=train_op,
            evaluation_hooks=[vis_hook, patient_metric_hook],
            eval_metric_ops=metric_dict)

    if is_predicting:
        scaled_network_output = _scale_network_output(network_output,
                                                      loss_name)

        vis_hook = session_hooks.VisualizationHook(
            result_folder=result_folder,
            visualization_file_names=None,
            file_name=features[standard_fields.InputDataFields.image_file],
            image_decoded=image_batch,
            annotation_decoded=None,
            predicted_mask=scaled_network_output,
            eval_dir=eval_dir)

        # Put the prediction (scaled by 255) into the first of three channels
        # and blend it over the half-intensity input image.
        predicted_mask = tf.stack([
            scaled_network_output * 255,
            tf.zeros_like(scaled_network_output),
            tf.zeros_like(scaled_network_output)
        ], axis=3)
        predicted_mask_overlay = tf.clip_by_value(
            features[standard_fields.InputDataFields.image_decoded] * 0.5
            + predicted_mask, 0, 255)

        return tf.estimator.EstimatorSpec(
            mode,
            prediction_hooks=[vis_hook],
            predictions={
                'image_file':
                    features[standard_fields.InputDataFields.image_file],
                'prediction': predicted_mask_overlay
            })

    # An explicit error survives `python -O`, unlike `assert False`.
    raise ValueError('Unsupported mode: {}'.format(mode))
from utils.preprocessor import reduceDimentions
from utils.scaler import scale
from utils.dimentionallityReduction import doPCA
from utils.dimentionallityReduction import doTSNE
from utils.dimentionallityReduction import doANOVA

# Experiment 1: load the raw data, clean it with preprocess() and
# standardize it with scikit-learn's StandardScaler.
print("FIRST EXPERIMENT (1) WITH StandardScaler")
print('\n')
print("========================")
print("Start Get Clean Data")

# Read the samples and their labels; neither CSV file has a header row.
rawData = pd.read_csv("data/data.csv", header=None)
labels = pd.read_csv("data/labels.csv", header=None)

# Call the custom function; pass True when the dataset has to be sliced.
df = preprocess(rawData, labels, True)

# Uncomment to inspect the combined dataframe:
#print("The appended resulting dataframe is: \n", df)
print("========================")
print("End Get Clean Data")
print("========================")
print("Start scaling")

# Convert the dataframe to a plain array, which is what the scaler is fed.
# NOTE(review): df presumably still contains the label column appended by
# preprocess -- confirm whether the labels are meant to be scaled too.
df = df.values

# Fit a standard scaler on the whole array.
scaler = preprocessing.StandardScaler().fit(df)

# Apply the fitted scaler to the same array.
df_scaled = scaler.transform(df)
import pandas as pd
import numpy as np
import graphviz
import io
import pydotplus
import imageio as imgo

# Read the samples and their labels; neither CSV file has a header row.
rawData = pd.read_csv("data/data.csv", header=None)
labels = pd.read_csv("data/labels.csv", header=None)

# Keep the last 180 rows of the raw data aside as the real test set.
realTest = rawData.tail(n=180)

# Build the labelled working set; the True flag asks preprocess to slice the
# data (presumably to the first 179 labelled records -- confirm in
# utils.preprocessor).
df = preprocess(rawData, labels, True)

# Drop the last column before scaling.
df = df.iloc[:, :-1]

# Run preprocess again with the labels (no slicing) to re-attach the class
# label column.
prepDf = preprocess(df, labels, False)

# Candidate feature subset, currently unused:
#relevantFeatures = ['x39','x123','x130','x56','x157','x32'] #best dimensions??

# Split 70/30 into train and test with a fixed random state; features are all
# columns but the last, the training target is the 'cancer' column.
train,test =train_test_split(prepDf, test_size=0.30, random_state = 45)
x_train = train.iloc[:, :-1]
x_test = test.iloc[:, :-1]
y_train = train['cancer']
import seaborn as sns # Custom Imports from utils.preprocessor import preprocess from utils.preprocessor import reduceDimentions from utils.scaler import scale from utils.dimentionallityReduction import doPCA from utils.dimentionallityReduction import doTSNE from utils.dimentionallityReduction import doANOVA # Read the data rawData = pd.read_csv("data/data.csv", header=None) labels = pd.read_csv("data/labels.csv", header=None) # Call the custom function, True if you need to slice the dataset df = preprocess(rawData, labels, True) ###### Check-out our data ######## print("The appended resulting dataframe is: \n", df) # Perform scailing on the data scaledDf = scale(df) print(scaledDf) # Perform PCA with 2 var on the scaledDf # Count the number of columns rows, columns = scaledDf.shape print('The number of columns is: ', columns) # Remove the last column which contains the label results from the scaled df scaledDf = scaledDf.drop(['x' + str(columns)], axis=1) print("The scaled resulting dataframe is: \n", scaledDf)