Python preprocessの例、utils.preprocessor.preprocess Pythonの例

コード例 #1

0

ファイルを表示

def main(argv):
    """Run preprocessing for the paths given on the command line.

    ``argv`` is parsed with ``getopt`` using an empty short-option string.
    The first remaining positional argument is taken as the new file path;
    every argument after it is handed to ``preprocess`` as the list of
    package paths with class labels. ``load_model`` is called afterwards.

    Raises:
        IndexError: if no positional argument is left after option parsing.
    """
    parsed = getopt.getopt(argv, "")
    # getopt returns (options, remaining_args); only the remainder is used.
    remaining = parsed[1]
    # Removing the head in place leaves exactly the package paths behind.
    target_file = remaining.pop(0)
    preprocess(target_file, remaining)
    load_model()

コード例 #2

0

ファイルを表示

    def __init__(self, trainX, trainY, testX, testY, **kwargs):
        """Prepare the data and fit a decision tree on the training part.

        Train and test rows are stacked, run through ``preprocess`` together
        and split again at the original train/test boundary. Only the
        training part is passed to ``doUpsamling`` and to the ANOVA-based
        ``reduceDimentions`` call before the tree is fitted.

        Args:
            trainX: Training feature rows.
            trainY: Training labels.
            testX: Test feature rows.
            testY: Test labels.
            **kwargs: Forwarded unchanged to ``DecisionTreeClassifier``.
        """
        # Row index at which the test rows start once both sets are stacked.
        testIdx = len(trainX)
        stackedFeatures = pd.DataFrame(data=np.concatenate([trainX, testX]))
        stackedLabels = pd.DataFrame(np.concatenate([trainY, testY]))

        # Run preprocess, strip the trailing column, then run preprocess
        # again so the labels are attached once more.
        merged = preprocess(stackedFeatures, stackedLabels, False)
        merged = preprocess(merged.iloc[:, :-1], stackedLabels, False)

        # Separate the feature matrix from the trailing label column.
        _, columnCount = merged.shape
        labelPosition = columnCount - 1
        featureMatrix = merged.values[:, 0:labelPosition]
        labelVector = merged.values[:, [labelPosition]].flatten()

        # Recover the train and test partitions at the stacking boundary.
        self.trainX = featureMatrix[:testIdx]
        self.trainY = labelVector[:testIdx]
        self.testX = merged.iloc[testIdx:, :-1]
        self.testY = labelVector[testIdx:]

        # Fixed seed so the upsampling step is reproducible.
        np.random.seed(42)
        self.trainX, self.trainY = doUpsamling(self.trainX, self.trainY)

        # Wrap the upsampled training arrays in dataframes with x1..xN names.
        featureNames = [
            'x{}'.format(position)
            for position in range(1, self.trainX.shape[1] + 1)
        ]
        self.trainX = pd.DataFrame(data=self.trainX[:], columns=[featureNames])
        self.trainY = pd.DataFrame(data=self.trainY[:], columns=['cancer'])

        # Features and labels are combined before feature selection.
        trainingFrame = preprocess(self.trainX, self.trainY, False)

        # Feature reduction uses the training-only frame; only the third
        # element of the returned triple is needed here.
        _, _, selectedFeatures = reduceDimentions(
            trainingFrame, 'ANOVA', 0.01, reduce=True)

        # Re-attach the last column of the combined frame (the labels).
        self.reducedDf = selectedFeatures.join(trainingFrame.iloc[:, -1])

        # Fit the tree on every column except the trailing label column.
        self.dtModel = DecisionTreeClassifier(**kwargs).fit(
            self.reducedDf.iloc[:, :-1], self.trainY)

コード例 #3

0

ファイルを表示

    def __init__(self, trainX, trainY, testX, testY, k=1):
        """Prepare the data and fit a k-nearest-neighbours classifier.

        Train and test rows are stacked, reduced with the ANOVA-based
        ``reduceDimentions`` call and split again at the original boundary.
        The training part is passed to ``doUpsamling``; a min-max scaler is
        fitted on it and applied to both partitions before ``fit``.

        Args:
            trainX: Training feature rows.
            trainY: Training labels.
            testX: Test feature rows.
            testY: Test labels.
            k: Number of neighbours handed to ``KNeighborsClassifier``.
        """
        # Row index at which the test rows start once both sets are stacked.
        testIdx = len(trainX)
        stackedFeatures = pd.DataFrame(data=np.concatenate([trainX, testX]))
        stackedLabels = pd.DataFrame(np.concatenate([trainY, testY]))

        merged = preprocess(stackedFeatures, stackedLabels, False)

        # Dimension reduction runs on the stacked (train + test) frame; only
        # the third element of the returned triple is needed here.
        _, _, selectedFeatures = reduceDimentions(
            merged, 'ANOVA', 0.01, reduce=True)

        # Attach the labels to the reduced frame again.
        merged = preprocess(selectedFeatures, stackedLabels, False)

        # Separate the feature matrix from the trailing label column.
        _, columnCount = merged.shape
        labelPosition = columnCount - 1
        featureMatrix = merged.values[:, 0:labelPosition]
        labelVector = merged.values[:, [labelPosition]].flatten()

        # Recover the train and test partitions at the stacking boundary.
        fitX = featureMatrix[:testIdx]
        fitY = labelVector[:testIdx]
        self.testX = featureMatrix[testIdx:]
        self.testY = labelVector[testIdx:]

        # Fixed seed so the upsampling step is reproducible.
        np.random.seed(42)
        fitX, fitY = doUpsamling(fitX, fitY)

        # The scaler is fitted on the training rows only and then applied to
        # both partitions.
        minMax = preprocessing.MinMaxScaler().fit(fitX)
        scaledFitX = minMax.transform(fitX)
        self.testX = minMax.transform(self.testX)

        # Fit kNN with the requested neighbour count.
        self.knn = KNeighborsClassifier(n_neighbors=k)
        self.knn.fit(scaledFitX, fitY)

コード例 #4

0

ファイルを表示

def _general_model_fn(features, pipeline_config, result_folder, dataset_info,
                      feature_extractor, mode, num_gpu,
                      visualization_file_names, eval_dir):
    """Build the model graph and return an ``EstimatorSpec`` for ``mode``.

    The function preprocesses the input images, runs them through
    ``feature_extractor.build_network``, and then, depending on ``mode``,
    adds the loss and optimizer (TRAIN), the metrics and evaluation hooks
    (EVAL), or the prediction overlay (PREDICT).

    Args:
        features: Indexed with ``standard_fields.InputDataFields`` keys
            (``image_decoded``, ``annotation_mask``, ``image_file``,
            ``annotation_decoded``, ``patient_id``); which keys are read
            depends on ``mode``.
        pipeline_config: Configuration object; its ``dataset``, ``model``,
            ``train_config``, ``eval_config`` and ``metrics_tp_thresholds``
            attributes are read.
        result_folder: Passed through to the session hooks.
        dataset_info: Indexed with
            ``standard_fields.PickledDatasetInfo.patient_ratio`` when the
            weighted loss is enabled in TRAIN mode.
        feature_extractor: Object whose ``build_network`` produces the
            network output.
        mode: One of the ``tf.estimator.ModeKeys`` values.
        num_gpu: Not referenced in this function body.
        visualization_file_names: Passed to the visualization hook in EVAL
            mode.
        eval_dir: Passed through to the session hooks.

    Returns:
        A ``tf.estimator.EstimatorSpec`` for the given ``mode``.

    Raises:
        AssertionError: If the softmax loss is configured with
            ``num_classes != 1``, if the annotation mask and network output
            shapes disagree, or if ``mode`` is none of TRAIN/EVAL/PREDICT.
    """
    num_classes = pipeline_config.dataset.num_classes
    # With the softmax loss an extra class is added on top of the single
    # configured class, so the network emits two channels.
    add_background_class = pipeline_config.train_config.loss.name == 'softmax'
    if add_background_class:
        assert (num_classes == 1)
        num_classes += 1

    image_batch = features[standard_fields.InputDataFields.image_decoded]

    # No annotation mask is read in PREDICT mode.
    if mode == tf.estimator.ModeKeys.PREDICT:
        annotation_mask_batch = None
    else:
        annotation_mask_batch = features[
            standard_fields.InputDataFields.annotation_mask]

    if mode == tf.estimator.ModeKeys.TRAIN:
        # Augment images (and their masks) according to the train config
        image_batch, annotation_mask_batch = preprocessor.apply_data_augmentation(
            pipeline_config.train_config.data_augmentation_options,
            images=image_batch,
            gt_masks=annotation_mask_batch,
            batch_size=pipeline_config.train_config.batch_size)

    # General preprocessing, applied in every mode
    image_batch_preprocessed = preprocessor.preprocess(
        image_batch,
        pipeline_config.dataset.val_range,
        scale_input=pipeline_config.dataset.scale_input)

    network_output = feature_extractor.build_network(
        image_batch_preprocessed,
        is_training=mode == tf.estimator.ModeKeys.TRAIN,
        num_classes=num_classes,
        use_batch_norm=pipeline_config.model.use_batch_norm,
        bn_momentum=pipeline_config.model.batch_norm_momentum,
        bn_epsilon=pipeline_config.model.batch_norm_epsilon,
        activation_fn=activation_fn_builder.build(pipeline_config.model))

    if mode == tf.estimator.ModeKeys.TRAIN:
        # Record model variable summaries
        for var in tf.trainable_variables():
            tf.summary.histogram('ModelVars/' + var.op.name, var)

    network_output_shape = network_output.get_shape().as_list()
    if mode != tf.estimator.ModeKeys.PREDICT:
        # If dimensions 1 and 2 of the mask differ from the network output,
        # centrally crop the mask to the network output size.
        if (network_output_shape[1:3] !=
                annotation_mask_batch.get_shape().as_list()[1:3]):
            annotation_mask_batch = image_utils.central_crop(
                annotation_mask_batch,
                desired_size=network_output.get_shape().as_list()[1:3])

        # Clip the mask values to [0, 1] and cast to int64.
        annotation_mask_batch = tf.cast(tf.clip_by_value(
            annotation_mask_batch, 0, 1),
                                        dtype=tf.int64)

        # The mask must be rank 4 and agree with the network output on the
        # first three dimensions.
        assert (len(annotation_mask_batch.get_shape()) == 4)
        assert (annotation_mask_batch.get_shape().as_list()[:3] ==
                network_output.get_shape().as_list()[:3])

    # We should not apply the loss weighting to evaluation. This would just
    # cause our loss to be minimum for f2 score, but we also get the same
    # optimum if we just optimize for f1 score
    if (pipeline_config.train_config.loss.use_weighted
            and mode == tf.estimator.ModeKeys.TRAIN):
        patient_ratio = dataset_info[
            standard_fields.PickledDatasetInfo.patient_ratio]
        # The mask was clipped to {0, 1} above, so its sum counts the
        # positive pixels in the batch; the rest are counted as healthy.
        cancer_pixels = tf.reduce_sum(tf.to_float(annotation_mask_batch))
        healthy_pixels = tf.to_float(
            tf.size(annotation_mask_batch)) - cancer_pixels

        # The + 1.0 keeps the denominator non-zero for batches without any
        # positive pixel.
        batch_pixel_ratio = tf.div(healthy_pixels, cancer_pixels + 1.0)

        loss_weight = (
            ((batch_pixel_ratio * patient_ratio) +
             pipeline_config.train_config.loss.weight_constant_add) *
            pipeline_config.train_config.loss.weight_constant_multiply)
    else:
        loss_weight = tf.constant(1.0)

    if mode == tf.estimator.ModeKeys.PREDICT:
        loss = None
    else:
        # The mask is flattened to one dimension and the network output to
        # [-1, num_classes] before both are handed to the loss.
        loss = _loss(tf.reshape(annotation_mask_batch, [-1]),
                     tf.reshape(network_output, [-1, num_classes]),
                     loss_name=pipeline_config.train_config.loss.name,
                     pos_weight=loss_weight)
        loss = tf.identity(loss, name='ModelLoss')
        tf.summary.scalar(loss.op.name, loss, family='Loss')

        # Default total loss is the model loss; it is replaced below when
        # regularization losses exist and are enabled.
        total_loss = tf.identity(loss, name='TotalLoss')

        if mode == tf.estimator.ModeKeys.TRAIN:
            if pipeline_config.train_config.add_regularization_loss:
                regularization_losses = tf.get_collection(
                    tf.GraphKeys.REGULARIZATION_LOSSES)
                if regularization_losses:
                    regularization_loss = tf.add_n(regularization_losses,
                                                   name='RegularizationLoss')
                    total_loss = tf.add_n([loss, regularization_loss],
                                          name='TotalLoss')
                    tf.summary.scalar(regularization_loss.op.name,
                                      regularization_loss,
                                      family='Loss')

        tf.summary.scalar(total_loss.op.name, total_loss, family='Loss')
        total_loss = tf.check_numerics(total_loss, 'LossTensor is inf or nan.')

    scaffold = None
    update_ops = []
    if mode == tf.estimator.ModeKeys.TRAIN:
        if pipeline_config.train_config.optimizer.use_moving_average:
            # EMA's are currently not supported with tf's DistributionStrategy.
            # Reenable once they fixed the bugs
            logging.warn(
                'EMA is currently not supported with tf DistributionStrategy.')
            # NOTE(review): exit(1) terminates the process here, so the
            # assignment on the next line is never reached.
            exit(1)
            pipeline_config.train_config.optimizer.use_moving_average = False
            # The swapping saver will swap the trained variables with their moving
            # averages before saving, thus removing the need to care for moving
            # averages during evaluation
            # scaffold = tf.train.Scaffold(saver=optimizer.swapping_saver())

        optimizer, optimizer_summary_vars = optimizer_builder.build(
            pipeline_config.train_config.optimizer)
        for var in optimizer_summary_vars:
            tf.summary.scalar(var.op.name, var, family='LearningRate')

        grads_and_vars = optimizer.compute_gradients(total_loss)

        update_ops.append(
            optimizer.apply_gradients(grads_and_vars,
                                      global_step=tf.train.get_global_step()))

    graph_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    # NOTE(review): this appends the collection list as ONE nested element,
    # so tf.group below receives a list among its inputs - confirm that the
    # TensorFlow version in use flattens nested inputs.
    update_ops.append(graph_update_ops)
    update_op = tf.group(*update_ops, name='update_barrier')
    # train_op is the total loss behind a control dependency on update_op,
    # so evaluating it forces the grouped update ops to run first.
    with tf.control_dependencies([update_op]):
        if mode == tf.estimator.ModeKeys.PREDICT:
            train_op = None
        else:
            train_op = tf.identity(total_loss)

    if mode == tf.estimator.ModeKeys.TRAIN:
        logging.info("Total number of trainable parameters: {}".format(
            np.sum([
                np.prod(v.get_shape().as_list())
                for v in tf.trainable_variables()
            ])))

        # Training Hooks are not working with MirroredStrategy. Fixed in 1.13
        #print_hook = session_hooks.PrintHook(
        #  file_name=features[standard_fields.InputDataFields.image_file],
        #  batch_pixel_ratio=batch_pixel_ratio)
        return tf.estimator.EstimatorSpec(mode,
                                          loss=total_loss,
                                          train_op=train_op,
                                          scaffold=scaffold)
    elif mode == tf.estimator.ModeKeys.EVAL:
        # Take channel 0 of the sigmoid output, or channel 1 of the
        # two-channel softmax output.
        # NOTE(review): scaled_network_output stays unbound for any other
        # loss name, which would raise NameError below.
        if pipeline_config.train_config.loss.name == 'sigmoid':
            scaled_network_output = tf.nn.sigmoid(network_output)[:, :, :, 0]
        elif pipeline_config.train_config.loss.name == 'softmax':
            assert (network_output.get_shape().as_list()[-1] == 2)
            scaled_network_output = tf.nn.softmax(network_output)[:, :, :, 1]

        # Metrics
        metric_dict, statistics_dict = metric_utils.get_metrics(
            scaled_network_output,
            annotation_mask_batch,
            tp_thresholds=np.array(pipeline_config.metrics_tp_thresholds,
                                   dtype=np.float32),
            parallel_iterations=min(pipeline_config.eval_config.batch_size,
                                    util_ops.get_cpu_count()))

        vis_hook = session_hooks.VisualizationHook(
            result_folder=result_folder,
            visualization_file_names=visualization_file_names,
            file_name=features[standard_fields.InputDataFields.image_file],
            image_decoded=image_batch,
            annotation_decoded=features[
                standard_fields.InputDataFields.annotation_decoded],
            predicted_mask=scaled_network_output,
            eval_dir=eval_dir)
        patient_metric_hook = session_hooks.PatientMetricHook(
            statistics_dict=statistics_dict,
            patient_id=features[standard_fields.InputDataFields.patient_id],
            result_folder=result_folder,
            tp_thresholds=pipeline_config.metrics_tp_thresholds,
            eval_dir=eval_dir)

        return tf.estimator.EstimatorSpec(
            mode,
            loss=total_loss,
            train_op=train_op,
            evaluation_hooks=[vis_hook, patient_metric_hook],
            eval_metric_ops=metric_dict)
    elif mode == tf.estimator.ModeKeys.PREDICT:
        # Same channel selection as in EVAL.
        # NOTE(review): scaled_network_output stays unbound for any other
        # loss name, which would raise NameError below.
        if pipeline_config.train_config.loss.name == 'sigmoid':
            scaled_network_output = tf.nn.sigmoid(network_output)[:, :, :, 0]
        elif pipeline_config.train_config.loss.name == 'softmax':
            assert (network_output.get_shape().as_list()[-1] == 2)
            scaled_network_output = tf.nn.softmax(network_output)[:, :, :, 1]

        # No file-name filter and no annotation are supplied in PREDICT mode.
        vis_hook = session_hooks.VisualizationHook(
            result_folder=result_folder,
            visualization_file_names=None,
            file_name=features[standard_fields.InputDataFields.image_file],
            image_decoded=image_batch,
            annotation_decoded=None,
            predicted_mask=scaled_network_output,
            eval_dir=eval_dir)

        # Stack the prediction scaled by 255 with two all-zero planes along
        # axis 3, giving a three-channel mask with only channel 0 populated.
        predicted_mask = tf.stack([
            scaled_network_output * 255,
            tf.zeros_like(scaled_network_output),
            tf.zeros_like(scaled_network_output)
        ],
                                  axis=3)

        # Blend: half-intensity input image plus the mask, clipped to
        # [0, 255].
        predicted_mask_overlay = tf.clip_by_value(
            features[standard_fields.InputDataFields.image_decoded] * 0.5 +
            predicted_mask, 0, 255)

        return tf.estimator.EstimatorSpec(
            mode,
            prediction_hooks=[vis_hook],
            predictions={
                'image_file':
                features[standard_fields.InputDataFields.image_file],
                'prediction': predicted_mask_overlay
            })
    else:
        # Any mode other than TRAIN, EVAL or PREDICT is a programming error.
        assert (False)

コード例 #5

0

ファイルを表示

ファイル: preprocess_impact.py プロジェクト: charx7/rug-ids-2018

from utils.preprocessor import reduceDimentions
from utils.scaler import scale
from utils.dimentionallityReduction import doPCA
from utils.dimentionallityReduction import doTSNE
from utils.dimentionallityReduction import doANOVA

# Experiment 1: load the raw data and labels, combine them with the custom
# preprocess helper, then standardize the result with StandardScaler.
print("FIRST EXPERIMENT (1) WITH StandardScaler")
print('\n')
print("========================")
print("Start Get Clean Data")
# Read the data; neither CSV file has a header row
rawData = pd.read_csv("data/data.csv", header=None)
labels = pd.read_csv("data/labels.csv", header=None)

# Call the custom function, True if you need to slice the dataset
df = preprocess(rawData, labels, True)
###### Check-out our data ########
#print("The appended resulting dataframe is: \n", df)

print("========================")
print("End Get Clean Data")

print("========================")
print("Start scaling")

# Get the values in array form for the scaler to work
df = df.values
# Fit a standard scaler on every column of the array.
# NOTE(review): if preprocess appends the label column, the labels are
# standardized here as well - confirm this is intended.
scaler = preprocessing.StandardScaler().fit(df)
# Apply the fitted scaler to the same array
df_scaled = scaler.transform(df)

コード例 #6

0

ファイルを表示

import pandas as pd
import numpy as np
import graphviz
import io
import pydotplus
import imageio as imgo

# Load the data, rebuild the labelled frame with the custom preprocess helper
# and split it into train and test partitions.

# Read the data; neither CSV file has a header row
rawData = pd.read_csv("data/data.csv", header=None)
labels = pd.read_csv("data/labels.csv", header=None)

# Keep the last 180 rows of the raw data as a separate test set
realTest = rawData.tail(n=180)

# Combine data and labels; True asks preprocess to slice the dataset
# (presumably down to the labelled records - verify against preprocess)
df = preprocess(rawData, labels, True)

# Remove the last column before scaling
df = df.iloc[:, :-1]

# Append the class labels again
prepDf = preprocess(df, labels, False)

# Define relevant features (currently disabled)
#relevantFeatures = ['x39','x123','x130','x56','x157','x32'] #best dimensions??

# Split 70/30 with a fixed seed; every column except the last is a feature,
# and the 'cancer' column is the training target
train,test =train_test_split(prepDf, test_size=0.30, random_state = 45)
x_train = train.iloc[:, :-1]
x_test = test.iloc[:, :-1]
y_train = train['cancer']

コード例 #7

0

ファイルを表示

import seaborn as sns

# Custom Imports
from utils.preprocessor import preprocess
from utils.preprocessor import reduceDimentions
from utils.scaler import scale
from utils.dimentionallityReduction import doPCA
from utils.dimentionallityReduction import doTSNE
from utils.dimentionallityReduction import doANOVA

# Load the data, combine it with the labels, scale it and drop the label
# column from the scaled frame.

# Read the data; neither CSV file has a header row
rawData = pd.read_csv("data/data.csv", header=None)
labels = pd.read_csv("data/labels.csv", header=None)

# Call the custom function, True if you need to slice the dataset
df = preprocess(rawData, labels, True)
###### Check-out our data ########
print("The appended resulting dataframe is: \n", df)

# Perform scaling on the data
scaledDf = scale(df)
print(scaledDf)

# Count the number of columns of the scaled frame
rows, columns = scaledDf.shape
print('The number of columns is: ', columns)

# Remove the last column which contains the label results from the scaled df;
# this relies on the last column being named 'x<number of columns>'
scaledDf = scaledDf.drop(['x' + str(columns)], axis=1)
print("The scaled resulting dataframe is: \n", scaledDf)