Example no. 1
def project_to_latent_space(vggface_path, pca_path):
    """
        Part 2a of the paper where we project VGG-face extracted features
        to latent dimensional space using PCA       
    """

    X = np.load(os.path.join(vggface_path, 'X.npy'))

    nb_components = 40
    pca = PCA(nb_components).fit(X)
    X_transformed = pca.transform(X)
    np.save(os.path.join(pca_path, 'X_latent.npy'), X_transformed)

    # save the fitted PCA model and register it with the run
    run = Run.get_submitted_run()
    os.makedirs("outputs", exist_ok=True)  # ensure the outputs folder exists before writing
    model_filepath = os.path.join("outputs", 'pca.pkl')

    joblib.dump(pca, model_filepath)
    run.register_model(model_name="pca", model_path=model_filepath)

    # track explained variance per number of principal components in the run
    exp_variance = np.cumsum(
        np.round(pca.explained_variance_ratio_, decimals=4) * 100)
    run.log_list('Explained variance', exp_variance.tolist())
Example no. 2
    def _get_automl_settings(automl_settings, logger):
        automl_settings_obj = None
        current_run = Run.get_submitted_run()
        found_data_store = False
        data_store = None

        start = time.time()

        try:
            experiment = current_run.experiment

            parent_run_id = _get_parent_run_id(current_run._run_id)
            print("parent run id {}".format(parent_run_id))

            automl_settings_obj = _AutoMLSettings.from_string_or_dict(automl_settings)
            data_store = experiment.workspace.get_default_datastore()
            found_data_store = True
        except Exception as e:
            logger.warning("getting data store, fallback to default {}".format(e))
            print("failed to get default data store  {}".format(e))
            found_data_store = False

        end = time.time()
        print("Caching supported {}, time taken for get default DS {}".format(sdk_has_cache_capability and found_data_store, (end - start)))

        return automl_settings_obj, found_data_store, data_store
Example no. 3
def fit_classifier(vggface_path, pca_path, output_path):
    """
        Part 2b of the paper where we fit classifer on   
            latent matrix       
    """

    X = np.load(os.path.join(pca_path, 'X_latent.npy'))
    y = np.load(os.path.join(vggface_path, 'y.npy'))

    run = Run.get_submitted_run()

    knn = KNeighborsClassifier()
    cv_results = cross_validate(knn,
                                X,
                                y,
                                scoring=['accuracy'],
                                cv=5,
                                verbose=1,
                                return_train_score=True,
                                n_jobs=1)

    # track train accuracy in run

    train_accuracy = round(np.mean(cv_results['train_accuracy']), 2)
    run.log("mean training accuracy", train_accuracy)

    test_accuracy = round(np.mean(cv_results['test_accuracy']), 2)
    run.log("mean testing accuracy", test_accuracy)

    # fit on the full data set, then save and register the model to the run
    knn.fit(X, y)  # cross_validate clones the estimator, so fit it explicitly before saving
    model_filepath = os.path.join(output_path, 'classifier.pkl')
    joblib.dump(knn, model_filepath)

    run.register_model(model_name="classifier", model_path=model_filepath)
Example no. 4
def main():
    # get command-line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_dir',
                        type=str,
                        help='directory of training data')
    parser.add_argument('--test_dir',
                        type=str,
                        help='directory of test data',
                        default=None)
    parser.add_argument('--output_dir',
                        type=str,
                        help='output directory',
                        default="./outputs")
    parser.add_argument('--classifier', type=str, default='svm')
    parser.add_argument('--number_of_samples',
                        type=int,
                        help='number of training samples',
                        default=400)
    parser.add_argument('--color_insensitive',
                        type=int,
                        help='1 to exclude color as a feature, 0 otherwise',
                        default=0)
    parser.add_argument('--shape',
                        help='width and height of the (square) images, e.g. 64',
                        type=int,
                        default=64)
    parser.add_argument('--fbeta_beta', type=float, default=0.5)
    # argparse's type=bool treats any non-empty string as True, so use a flag instead
    parser.add_argument('--is_local', action='store_true', default=False)
    args = parser.parse_args()

    os.makedirs(args.output_dir, exist_ok=True)
    print(f'scikit-learn version: {sklearn.__version__}')
    print("data directory is: " + args.data_dir)
    shape = (args.shape, args.shape)
    run = None
    if not args.is_local:
        run = Run.get_submitted_run()
        run.log('fbeta_beta', args.fbeta_beta)
        run.log('classifier', args.classifier)
        run.log('shape', args.shape)
        run.log('number_of_samples', args.number_of_samples)
        run.log('shape_and_samples',
                int(str(args.number_of_samples) + str(args.shape)))
        run.tag('color_insensitive', str(args.color_insensitive))
        run.log('color_insensitive', args.color_insensitive)
        run.log('data_dir', args.data_dir)
        run.log('test_dir', args.test_dir)
    train(args.data_dir,
          args.test_dir,
          args.classifier,
          args.number_of_samples,
          shape=shape,
          output_directory=args.output_dir,
          beta=args.fbeta_beta,
          color_insensitive=args.color_insensitive,
          is_local=args.is_local,
          run=run)
Example no. 5
    def setup_wrapper(script_directory, dataprep_json, entry_point,
                      automl_settings, task_type, preprocess,
                      enable_subsampling, num_iterations, **kwargs):
        automl_settings_obj = _AutoMLSettings.from_string_or_dict(
            automl_settings)

        logger, sdk_has_custom_dimension_logger = _init_logger(
            automl_settings_obj)
        try:
            child_run_id = Run.get_submitted_run()._run_id
            parent_run_id = _get_parent_run_id(child_run_id)
            if sdk_has_custom_dimension_logger:
                logger.update_default_properties({
                    "parent_run_id": parent_run_id,
                    "child_run_id": child_run_id
                })
            logger.info("[ParentRunId:{}]: remote setup script begins.".format(
                parent_run_id))
            script_directory = _init_directory(directory=script_directory,
                                               logger=logger)

            logger.info("Preparing data for set problem info now.")

            fit_iteration_parameters_dict = _prepare_data(
                dataprep_json=dataprep_json,
                automl_settings_obj=automl_settings_obj,
                script_directory=script_directory,
                entry_point=entry_point,
                logger=logger)
            fit_iteration_parameters_dict = _get_auto_cv_dict(
                fit_iteration_parameters_dict, automl_settings_obj, logger)

            print("Setting Problem Info now.")
            _set_problem_info_for_setup(
                fit_iteration_parameters_dict=fit_iteration_parameters_dict,
                automl_settings_obj=automl_settings_obj,
                task_type=task_type,
                preprocess=preprocess,
                enable_subsampling=enable_subsampling,
                num_iterations=num_iterations,
                logger=logger)
        except Exception as e:
            logger.error("setup_wrapper raised an exception: {}".format(e))
            log_traceback(e, logger)
            raise

        _post_setup(logger=logger)
        logger.info("[ParentRunId:{}]: remote setup script finishes.".format(
            parent_run_id))
        return  # PLACEHOLDER for RemoteScript helper functions
Example no. 6
def main():
    # get command-line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_dir', type=str, help='directory of training data')
    parser.add_argument('--test_dir', type=str, help='directory of test data', default=None)
    parser.add_argument('--output_dir', type=str, help='output directory', default="./outputs")
    parser.add_argument('--classifier', type=str, default='svm')
    parser.add_argument('--number_of_samples', help='number of training samples', type=int, default=320)
    parser.add_argument('--fbeta_beta', type=float, default=0.5)
    # argparse's type=bool treats any non-empty string as True, so use a flag instead
    parser.add_argument('--is_local', action='store_true', default=False)
    args = parser.parse_args()
    # create the output directory before writing the results header
    os.makedirs(args.output_dir, exist_ok=True)
    with open(f'{args.output_dir}/results.csv', 'w') as csvfile:
        fieldnames = ['number_of_samples', 'shape', 'color_insensitive', 'accuracy', 'f_score']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

    run = None
    if not args.is_local:
        run = Run.get_submitted_run()
        run.log('data_dir', args.data_dir)
        run.log('test_dir', args.test_dir)
        run.log('fbeta_beta', args.fbeta_beta)

    print(f'scikit-learn version: {sklearn.__version__}')
    print("data directory is: " + args.data_dir)

    for shape in ['32', '64', '128', '256']:
        shape = (int(shape), int(shape))
        for color_insensitive in [0, 1]:
            start = time.time()
            X, y = get_data(args.data_dir, args.number_of_samples, shape, color_insensitive)
            loading_time = time.time() - start
            log(run, 'data_loading_time', loading_time)
            for number_of_samples in [args.number_of_samples/8, args.number_of_samples/4,
                                      args.number_of_samples/2, args.number_of_samples]:
                print(number_of_samples, shape, color_insensitive)
                X_sliced, y_sliced = shuffle(X, y, n_samples=int(number_of_samples))
                X_train, X_test, y_train, y_test = train_test_split(X_sliced, y_sliced, test_size=0.3, random_state=42)
                train(X_train, y_train, X_test, y_test, args.classifier, number_of_samples=int(number_of_samples),
                      shape=shape, output_directory=args.output_dir, beta=args.fbeta_beta,
                      color_insensitive=color_insensitive, is_local=args.is_local, run=run)
Example no. 7
    def _set_problem_info_for_setup(fit_iteration_parameters_dict,
                                    automl_settings_obj, task_type, preprocess,
                                    enable_subsampling, num_iterations,
                                    logger):
        current_run = Run.get_submitted_run()
        logger.info("Start to set problem info for the setup for run id {}.".format(current_run._run_id))
        logger.info("Setup experiment.")
        try:
            experiment = current_run.experiment
            parent_run_id = _get_parent_run_id(current_run._run_id)
            data_store = experiment.workspace.get_default_datastore()
            found_data_store = True
            logger.info("Using data store.")
        except Exception as e:
            logger.warning("Failed to get the data store, falling back to the default: {}".format(e))
            found_data_store = False

        logger.info("Caching supported {}.".format(sdk_has_cache_capability and found_data_store))
        print("caching supported {}".format(sdk_has_cache_capability and found_data_store))
        if sdk_has_validate_data_dict:
            # The newest version of validate_training_data_dict should contain check_x_y
            logger.info("Using validate_training_data_dict now.")
            validate_training_data_dict(data_dict=fit_iteration_parameters_dict, automl_settings=automl_settings_obj)
        else:
            logger.info("Using validate_training_data now.")
            validate_training_data(X=fit_iteration_parameters_dict.get('X'),
                                   y=fit_iteration_parameters_dict.get('y'),
                                   X_valid=fit_iteration_parameters_dict.get('X_valid'),
                                   y_valid=fit_iteration_parameters_dict.get('y_valid'),
                                   sample_weight=fit_iteration_parameters_dict.get('sample_weight'),
                                   sample_weight_valid=fit_iteration_parameters_dict.get('sample_weight_valid'),
                                   cv_splits_indices=fit_iteration_parameters_dict.get('cv_splits_indices'),
                                   automl_settings=automl_settings_obj)
            check_x_y(fit_iteration_parameters_dict.get('X'), fit_iteration_parameters_dict.get('y'), automl_settings_obj)
        if sdk_has_cache_capability and found_data_store:
            data_splits_validated = True
            try:
                start = time.time()
                transformed_data_context = _get_transformed_data_context(
                    X=fit_iteration_parameters_dict.get('X'),
                    y=fit_iteration_parameters_dict.get('y'),
                    X_valid=fit_iteration_parameters_dict.get('X_valid'),
                    y_valid=fit_iteration_parameters_dict.get('y_valid'),
                    sample_weight=fit_iteration_parameters_dict.get('sample_weight'),
                    sample_weight_valid=fit_iteration_parameters_dict.get('sample_weight_valid'),
                    x_raw_column_names=fit_iteration_parameters_dict.get('x_raw_column_names'),
                    cv_splits_indices=fit_iteration_parameters_dict.get('cv_splits_indices'),
                    automl_settings_obj=automl_settings_obj,
                    data_store=data_store,
                    run_target='remote',
                    parent_run_id=parent_run_id,
                    logger=logger
                )
                end = time.time()
                print("time taken for transform {}".format(end-start))
                logger.info("time taken for transform {}".format(end-start))
                if sdk_has_validate_data_splits:
                    try:
                        logger.info("Validating data splits now.")
                        _validate_data_splits(X=transformed_data_context.X,
                                              y=transformed_data_context.y,
                                              X_valid=transformed_data_context.X_valid,
                                              y_valid=transformed_data_context.y_valid,
                                              cv_splits=transformed_data_context.cv_splits,
                                              automl_settings=automl_settings_obj)
                        data_splits_validated = True
                    except Exception as data_split_exception:
                        data_splits_validated = False
                        logger.error("Meeting validation errors {}.".format(data_split_exception))
                        log_traceback(data_split_exception, logger)
                        raise data_split_exception
                logger.info("Start setting problem info.")
                automl.set_problem_info(transformed_data_context.X, transformed_data_context.y,
                                        automl_settings_obj.task_type,
                                        current_run=current_run,
                                        preprocess=automl_settings_obj.preprocess,
                                        lag_length=automl_settings_obj.lag_length,
                                        transformed_data_context=transformed_data_context,
                                        enable_cache=automl_settings_obj.enable_cache,
                                        subsampling=enable_subsampling)
            except Exception as e:
                if sdk_has_validate_data_splits and not data_splits_validated:
                    logger.error("sdk_has_validate_data_splits is True and data_splits_validated is False {}.".format(e))
                    log_traceback(e, logger)
                    raise e
                else:
                    logger.warning("Setup failed, fall back to old model {}".format(e))
                    print("Setup failed, fall back to old model {}".format(e))
                    automl.set_problem_info(
                        X=fit_iteration_parameters_dict.get('X'),
                        y=fit_iteration_parameters_dict.get('y'),
                        task_type=task_type, current_run=current_run,
                        preprocess=preprocess, subsampling=enable_subsampling
                    )
        else:
            logger.info("Start setting problem info using old model.")
            if sdk_has_validate_data_splits:
                _validate_data_splits(X=fit_iteration_parameters_dict.get('X'),
                                      y=fit_iteration_parameters_dict.get('y'),
                                      X_valid=fit_iteration_parameters_dict.get('X_valid'),
                                      y_valid=fit_iteration_parameters_dict.get('y_valid'),
                                      cv_splits=fit_iteration_parameters_dict.get('cv_splits_indices'),
                                      automl_settings=automl_settings_obj)
            automl.set_problem_info(
                X=fit_iteration_parameters_dict.get('X'),
                y=fit_iteration_parameters_dict.get('y'),
                task_type=task_type, current_run=current_run,
                preprocess=preprocess, subsampling=enable_subsampling
            )
Example no. 8
    def _get_data_from_dataprep(dataprep_json, automl_settings_obj, logger):
        current_run = Run.get_submitted_run()
        parent_run_id = _get_parent_run_id(current_run._run_id)
        print("[ParentRunId:{}]: Start getting data using dataprep.".format(parent_run_id))
        logger.info("[ParentRunId:{}]: Start getting data using dataprep.".format(parent_run_id))
        try:
            import azureml.train.automl._dataprep_utilities as dataprep_utilities
        except Exception as e:
            e.error_type = ErrorTypes.Unclassified
            log_traceback(e, logger)
            logger.error(e)
            raise e

        fit_iteration_parameters_dict = dict()

        class RetrieveNumpyArrayError(Exception):
            def __init__(self):
                super().__init__()

        try:
            print("Resolving Dataflows...")
            logger.info("Resolving Dataflows...")
            dataprep_json_obj = json.loads(dataprep_json)
            if 'activities' in dataprep_json_obj: # json is serialized dataflows
                dataflow_dict = dataprep_utilities.load_dataflows_from_json(
                    dataprep_json)
                for k in ['X', 'X_valid', 'sample_weight', 'sample_weight_valid']:
                    fit_iteration_parameters_dict[k] = dataprep_utilities.try_retrieve_pandas_dataframe(dataflow_dict.get(k))
                for k in ['y', 'y_valid']:
                    try:
                        fit_iteration_parameters_dict[k] = dataprep_utilities.try_retrieve_numpy_array(dataflow_dict.get(k))
                    except IndexError:
                        raise RetrieveNumpyArrayError()

                cv_splits_dataflows = []
                i = 0
                while 'cv_splits_indices_{0}'.format(i) in dataflow_dict:
                    cv_splits_dataflows.append(
                        dataflow_dict['cv_splits_indices_{0}'.format(i)])
                    i = i + 1
                fit_iteration_parameters_dict['cv_splits_indices'] = None if len(cv_splits_dataflows) == 0 \
                    else dataprep_utilities.try_resolve_cv_splits_indices(cv_splits_dataflows)
            else: # json is dataprep options
                print('Creating Dataflow from options...\r\nOptions:')
                logger.info('Creating Dataflow from options...')
                print(dataprep_json_obj)
                datastore_name = dataprep_json_obj['datastoreName'] # mandatory
                data_path = dataprep_json_obj['dataPath'] # mandatory
                label_column = dataprep_json_obj['label'] # mandatory
                separator = dataprep_json_obj.get('columnSeparator', ',')
                header = dataprep_json_obj.get('promoteHeader', True)
                encoding = dataprep_json_obj.get('encoding', None)
                quoting = dataprep_json_obj.get('ignoreNewlineInQuotes', False)
                skip_rows = dataprep_json_obj.get('skipRows', 0)
                feature_columns = dataprep_json_obj.get('features', [])

                from azureml.core import Datastore
                import azureml.dataprep as dprep
                if header:
                    header = dprep.PromoteHeadersMode.CONSTANTGROUPED
                else:
                    header = dprep.PromoteHeadersMode.NONE
                try:
                    encoding = dprep.FileEncoding[encoding]
                except Exception:
                    # unknown or missing encoding name, default to UTF-8
                    encoding = dprep.FileEncoding.UTF8

                ws = Run.get_context().experiment.workspace
                datastore = Datastore(ws, datastore_name)
                dflow = dprep.read_csv(path=datastore.path(data_path),
                                       separator=separator,
                                       header=header,
                                       encoding=encoding,
                                       quoting=quoting,
                                       skip_rows=skip_rows)

                if len(feature_columns) == 0:
                    X = dflow.drop_columns(label_column)
                else:
                    X = dflow.keep_columns(feature_columns)

                print('Inferring types for feature columns...')
                logger.info('Inferring types for feature columns...')
                sct = X.builders.set_column_types()
                sct.learn()
                sct.ambiguous_date_conversions_drop()
                X = sct.to_dataflow()

                y = dflow.keep_columns(label_column)
                if automl_settings_obj.task_type.lower() == 'regression':
                    y = y.to_number(label_column)

                print('X:')
                print(X)
                logger.info('X:')
                logger.info(X)

                print('y:')
                print(y)
                logger.info('y:')
                logger.info(y)

                try:
                    from azureml.train.automl._dataprep_utilities import try_retrieve_pandas_dataframe_adb
                    _X = try_retrieve_pandas_dataframe_adb(X)
                    fit_iteration_parameters_dict['X'] = _X.values
                    fit_iteration_parameters_dict['x_raw_column_names'] = _X.columns.values
                except ImportError:
                    logger.info("SDK version does not support column names extraction, fallback to old path")
                    fit_iteration_parameters_dict['X'] = dataprep_utilities.try_retrieve_pandas_dataframe(X)

                try:
                    fit_iteration_parameters_dict['y'] = dataprep_utilities.try_retrieve_numpy_array(y)
                except IndexError:
                    raise RetrieveNumpyArrayError()

            logger.info("Finish getting data using dataprep.")
            return fit_iteration_parameters_dict
        except Exception as e:
            print("[ParentRunId:{0}]: Error from resolving Dataflows: {1} {2}".format(parent_run_id, e.__class__, e))
            logger.error("[ParentRunId:{0}]: Error from resolving Dataflows: {1} {2}".format(parent_run_id, e.__class__, e))
            if isinstance(e, RetrieveNumpyArrayError):
                logger.debug("Label column (y) does not exist in user's data.")
                e.error_type = ErrorTypes.User
            elif "The provided path is not valid." in str(e):
                logger.debug("User's data is not accessible from remote run.")
                e.error_type = ErrorTypes.User
            elif "Required secrets are missing. Please call use_secrets to register the missing secrets." in str(e):
                logger.debug("User should use Datastore to data that requires secrets.")
                e.error_type = ErrorTypes.User
            else:
                e.error_type = ErrorTypes.Client
            log_traceback(e, logger)
            raise RuntimeError("Error while resolving Dataflows") from e
Example no. 9
    os.environ["AZUREML_SERVICE_ENDPOINT"] = parameters["SERVICE_ENDPOINT"]
    return setup_run()

def setup_run():
    global script_directory
    setup_wrapper(
        script_directory=script_directory,
        dataprep_json=dataprep_json,
        entry_point=entry_point,
        automl_settings=automl_settings,
        task_type=task_type,
        preprocess=preprocess,
        enable_subsampling=enable_subsampling,
        num_iterations=num_iterations
    )
    return "Setup run completed successfully!"

if __name__ == '__main__':
    try:
        result = setup_run()
    except Exception as e:
        errors = {'errors': {'exception': str(e),
                             'traceback': traceback.format_exc()}}
        try:
            current_run = Run.get_submitted_run()
            current_run.add_properties(errors)
        except Exception:
            pass
        raise
    print(result)
Example no. 10
from __future__ import print_function, division
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from torchvision import datasets, models, transforms
import numpy as np
import time
import os
import copy
import argparse

from azureml.core.run import Run
# get the Azure ML run object
run = Run.get_submitted_run()


def load_data(data_dir):
    """Load the train/val data."""

    # Data augmentation and normalization for training
    # Just normalization for validation
    data_transforms = {
        'train':
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ]),
Example no. 11
def get_logger():
    try:
        return Run.get_submitted_run()
    except Exception:
        # not running inside a submitted Azure ML run, fall back to a local stand-in
        return LocalLogger()
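
LocalLogger is referenced but not defined in this snippet; below is a minimal sketch of what such a stand-in might look like, assuming only the log and log_list calls used elsewhere in these examples are needed. The class name and behavior here are assumptions, not part of the original code.

class LocalLogger:
    """Hypothetical local stand-in that mimics the parts of the Run logging API used here."""

    def log(self, name, value):
        # mirror run.log(name, value) by writing to stdout
        print('[local] {}: {}'.format(name, value))

    def log_list(self, name, values):
        # mirror run.log_list(name, values)
        print('[local] {}: {}'.format(name, values))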
Example no. 12
def train():
    # Import data
    mnist = input_data.read_data_sets(FLAGS.data_dir,
                                      fake_data=FLAGS.fake_data)

    sess = tf.InteractiveSession()
    # Create a multilayer model.

    # Input placeholders
    with tf.name_scope('input'):
        x = tf.placeholder(tf.float32, [None, 784], name='x-input')
        y_ = tf.placeholder(tf.int64, [None], name='y-input')

    with tf.name_scope('input_reshape'):
        image_shaped_input = tf.reshape(x, [-1, 28, 28, 1])
        tf.summary.image('input', image_shaped_input, 10)

    # We can't initialize these variables to 0 - the network will get stuck.
    def weight_variable(shape):
        """Create a weight variable with appropriate initialization."""
        initial = tf.truncated_normal(shape, stddev=0.1)
        return tf.Variable(initial)

    def bias_variable(shape):
        """Create a bias variable with appropriate initialization."""
        initial = tf.constant(0.1, shape=shape)
        return tf.Variable(initial)

    def variable_summaries(var):
        """Attach a lot of summaries to a Tensor (for TensorBoard visualization)."""
        with tf.name_scope('summaries'):
            mean = tf.reduce_mean(var)
            tf.summary.scalar('mean', mean)
            with tf.name_scope('stddev'):
                stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean)))
            tf.summary.scalar('stddev', stddev)
            tf.summary.scalar('max', tf.reduce_max(var))
            tf.summary.scalar('min', tf.reduce_min(var))
            tf.summary.histogram('histogram', var)

    def nn_layer(input_tensor,
                 input_dim,
                 output_dim,
                 layer_name,
                 act=tf.nn.relu):
        """Reusable code for making a simple neural net layer.

    It does a matrix multiply, bias add, and then uses ReLU to nonlinearize.
    It also sets up name scoping so that the resultant graph is easy to read,
    and adds a number of summary ops.
    """
        # Adding a name scope ensures logical grouping of the layers in the graph.
        with tf.name_scope(layer_name):
            # This Variable will hold the state of the weights for the layer
            with tf.name_scope('weights'):
                weights = weight_variable([input_dim, output_dim])
                variable_summaries(weights)
            with tf.name_scope('biases'):
                biases = bias_variable([output_dim])
                variable_summaries(biases)
            with tf.name_scope('Wx_plus_b'):
                preactivate = tf.matmul(input_tensor, weights) + biases
                tf.summary.histogram('pre_activations', preactivate)
            activations = act(preactivate, name='activation')
            tf.summary.histogram('activations', activations)
            return activations

    hidden1 = nn_layer(x, 784, 500, 'layer1')

    with tf.name_scope('dropout'):
        keep_prob = tf.placeholder(tf.float32)
        tf.summary.scalar('dropout_keep_probability', keep_prob)
        dropped = tf.nn.dropout(hidden1, keep_prob)

    # Do not apply softmax activation yet, see below.
    y = nn_layer(dropped, 500, 10, 'layer2', act=tf.identity)

    with tf.name_scope('cross_entropy'):
        # The raw formulation of cross-entropy,
        #
        # tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(tf.softmax(y)),
        #                               reduction_indices=[1]))
        #
        # can be numerically unstable.
        #
        # So here we use tf.losses.sparse_softmax_cross_entropy on the
        # raw logit outputs of the nn_layer above, and then average across
        # the batch.
        with tf.name_scope('total'):
            cross_entropy = tf.losses.sparse_softmax_cross_entropy(labels=y_,
                                                                   logits=y)
    tf.summary.scalar('cross_entropy', cross_entropy)

    with tf.name_scope('train'):
        train_step = tf.train.AdamOptimizer(
            FLAGS.learning_rate).minimize(cross_entropy)

    with tf.name_scope('accuracy'):
        with tf.name_scope('correct_prediction'):
            correct_prediction = tf.equal(tf.argmax(y, 1), y_)
        with tf.name_scope('accuracy'):
            accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    tf.summary.scalar('accuracy', accuracy)

    # Merge all the summaries and write them out to
    # /tmp/tensorflow/mnist/logs/mnist_with_summaries (by default)
    merged = tf.summary.merge_all()
    train_writer = tf.summary.FileWriter(FLAGS.log_dir + '/train', sess.graph)
    test_writer = tf.summary.FileWriter(FLAGS.log_dir + '/test')
    tf.global_variables_initializer().run()
    saver = tf.train.Saver()

    # Train the model, and also write summaries.
    # Every 10th step, measure test-set accuracy, and write test summaries
    # All other steps, run train_step on training data, & add training summaries

    def feed_dict(train):
        """Make a TensorFlow feed_dict: maps data onto Tensor placeholders."""
        if train or FLAGS.fake_data:
            xs, ys = mnist.train.next_batch(100, fake_data=FLAGS.fake_data)
            k = FLAGS.dropout
        else:
            xs, ys = mnist.test.images, mnist.test.labels
            k = 1.0
        return {x: xs, y_: ys, keep_prob: k}

    for i in range(FLAGS.max_steps):
        if i % 10 == 0:  # Record summaries and test-set accuracy
            summary, acc = sess.run([merged, accuracy],
                                    feed_dict=feed_dict(False))
            test_writer.add_summary(summary, i)
            print('Accuracy at step %s: %s' % (i, acc))
            if i % 50 == 0:
                Run.get_submitted_run().log('Accuracy_test', acc)
        else:  # Record train set summaries, and train
            if i % 100 == 99:  # Record execution stats
                run_options = tf.RunOptions(
                    trace_level=tf.RunOptions.FULL_TRACE)
                run_metadata = tf.RunMetadata()
                summary, _ = sess.run([merged, train_step],
                                      feed_dict=feed_dict(True),
                                      options=run_options,
                                      run_metadata=run_metadata)
                train_writer.add_run_metadata(run_metadata, 'step%03d' % i)
                train_writer.add_summary(summary, i)
                print('Adding run metadata for', i)
            else:  # Record a summary
                summary, _ = sess.run([merged, train_step],
                                      feed_dict=feed_dict(True))
                train_writer.add_summary(summary, i)
    train_writer.close()
    test_writer.close()
    Run.get_submitted_run().log('Accuracy', acc)
    os.makedirs('./outputs/model', exist_ok=True)
    saver.save(sess, './outputs/model/mnist.model')
Example no. 13
                        type=bool,
                        default=False,
                        help='If true, uses fake data for unit testing.')
    parser.add_argument('--max_steps',
                        type=int,
                        default=1000,
                        help='Number of steps to run trainer.')
    parser.add_argument('--learning_rate',
                        type=float,
                        default=0.001,
                        help='Initial learning rate')
    parser.add_argument('--dropout',
                        type=float,
                        default=0.9,
                        help='Keep probability for training dropout.')
    parser.add_argument('--data_dir',
                        type=str,
                        default=os.path.join(os.getenv('TEST_TMPDIR', '/tmp'),
                                             'tensorflow/mnist/input_data'),
                        help='Directory for storing input data')
    parser.add_argument('--log_dir',
                        type=str,
                        default=os.path.join(
                            os.getenv('TEST_TMPDIR', '/tmp'),
                            'tensorflow/mnist/logs/mnist_with_summaries'),
                        help='Summaries log directory')
    FLAGS, unparsed = parser.parse_known_args()
    Run.get_submitted_run().log('learning_rate', FLAGS.learning_rate)
    Run.get_submitted_run().log('dropout', FLAGS.dropout)
    tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
Example no. 14
def main(unused_argv):
    data_root = os.path.join("outputs", "MNIST")
    mnist = None
    tf_config = os.environ.get("TF_CONFIG")
    if not tf_config or tf_config == "":
        raise ValueError("TF_CONFIG not found.")
    tf_config_json = json.loads(tf_config)
    cluster = tf_config_json.get('cluster')
    job_name = tf_config_json.get('task', {}).get('type')
    task_index = tf_config_json.get('task', {}).get('index')
    job_name = "worker" if job_name == "master" else job_name
    sentinel_path = os.path.join(data_root, "complete.txt")
    if job_name == "worker" and task_index == 0:
        mnist = input_data.read_data_sets(data_root, one_hot=True)
        with open(sentinel_path, 'w+') as f:
            f.write("download complete")
    else:
        while not os.path.exists(sentinel_path):
            time.sleep(0.01)
        mnist = input_data.read_data_sets(data_root, one_hot=True)

    if FLAGS.download_only:
        sys.exit(0)

    print("job name = %s" % job_name)
    print("task index = %d" % task_index)
    print("number of GPUs = %d" % FLAGS.num_gpus)

    # Construct the cluster and start the server
    cluster_spec = tf.train.ClusterSpec(cluster)

    # Get the number of workers.
    num_workers = len(cluster_spec.task_indices("worker"))

    if not FLAGS.existing_servers:
        # Not using existing servers. Create an in-process server.
        server = tf.train.Server(cluster_spec,
                                 job_name=job_name,
                                 task_index=task_index)
        if job_name == "ps":
            server.join()

    is_chief = (task_index == 0)
    if FLAGS.num_gpus > 0:
        # Avoid gpu allocation conflict: now allocate task_num -> #gpu
        # for each worker in the corresponding machine
        gpu = (task_index % FLAGS.num_gpus)
        worker_device = "/job:worker/task:%d/gpu:%d" % (task_index, gpu)
    elif FLAGS.num_gpus == 0:
        # Just allocate the CPU to worker server
        cpu = 0
        worker_device = "/job:worker/task:%d/cpu:%d" % (task_index, cpu)
    # The device setter will automatically place Variables ops on separate
    # parameter servers (ps). The non-Variable ops will be placed on the workers.
    # The ps use CPU and workers use corresponding GPU
    with tf.device(
            tf.train.replica_device_setter(worker_device=worker_device,
                                           ps_device="/job:ps/cpu:0",
                                           cluster=cluster)):
        global_step = tf.Variable(0, name="global_step", trainable=False)

        # Variables of the hidden layer
        hid_w = tf.Variable(tf.truncated_normal(
            [IMAGE_PIXELS * IMAGE_PIXELS, FLAGS.hidden_units],
            stddev=1.0 / IMAGE_PIXELS),
                            name="hid_w")
        hid_b = tf.Variable(tf.zeros([FLAGS.hidden_units]), name="hid_b")

        # Variables of the softmax layer
        sm_w = tf.Variable(tf.truncated_normal([FLAGS.hidden_units, 10],
                                               stddev=1.0 /
                                               math.sqrt(FLAGS.hidden_units)),
                           name="sm_w")
        sm_b = tf.Variable(tf.zeros([10]), name="sm_b")

        # Ops: located on the worker specified with task_index
        x = tf.placeholder(tf.float32, [None, IMAGE_PIXELS * IMAGE_PIXELS])
        y_ = tf.placeholder(tf.float32, [None, 10])

        hid_lin = tf.nn.xw_plus_b(x, hid_w, hid_b)
        hid = tf.nn.relu(hid_lin)

        y = tf.nn.softmax(tf.nn.xw_plus_b(hid, sm_w, sm_b))
        cross_entropy = -tf.reduce_sum(
            y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0)))

        opt = tf.train.AdamOptimizer(FLAGS.learning_rate)

        if FLAGS.sync_replicas:
            if FLAGS.replicas_to_aggregate is None:
                replicas_to_aggregate = num_workers
            else:
                replicas_to_aggregate = FLAGS.replicas_to_aggregate

            opt = tf.train.SyncReplicasOptimizer(
                opt,
                replicas_to_aggregate=replicas_to_aggregate,
                total_num_replicas=num_workers,
                name="mnist_sync_replicas")

        train_step = opt.minimize(cross_entropy, global_step=global_step)

        if FLAGS.sync_replicas:
            local_init_op = opt.local_step_init_op
            if is_chief:
                local_init_op = opt.chief_init_op

            ready_for_local_init_op = opt.ready_for_local_init_op

            # Initial token and chief queue runners required by the sync_replicas mode
            chief_queue_runner = opt.get_chief_queue_runner()
            sync_init_op = opt.get_init_tokens_op()

        init_op = tf.global_variables_initializer()
        train_dir = tempfile.mkdtemp()

        if FLAGS.sync_replicas:
            sv = tf.train.Supervisor(
                is_chief=is_chief,
                logdir=train_dir,
                init_op=init_op,
                local_init_op=local_init_op,
                ready_for_local_init_op=ready_for_local_init_op,
                recovery_wait_secs=1,
                global_step=global_step)
        else:
            sv = tf.train.Supervisor(is_chief=is_chief,
                                     logdir=train_dir,
                                     init_op=init_op,
                                     recovery_wait_secs=1,
                                     global_step=global_step)

        sess_config = tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=False,
            device_filters=["/job:ps",
                            "/job:worker/task:%d" % task_index])

        # The chief worker (task_index==0) session will prepare the session,
        # while the remaining workers will wait for the preparation to complete.
        if is_chief:
            print("Worker %d: Initializing session..." % task_index)
        else:
            print("Worker %d: Waiting for session to be initialized..." %
                  task_index)

        if FLAGS.existing_servers:
            # task_index is an int, so look up this worker's address in the cluster dict
            server_grpc_url = "grpc://" + cluster["worker"][task_index]
            print("Using existing server at: %s" % server_grpc_url)

            sess = sv.prepare_or_wait_for_session(server_grpc_url,
                                                  config=sess_config)
        else:
            sess = sv.prepare_or_wait_for_session(server.target,
                                                  config=sess_config)

        print("Worker %d: Session initialization complete." % task_index)

        if FLAGS.sync_replicas and is_chief:
            # Chief worker will start the chief queue runner and call the init op.
            sess.run(sync_init_op)
            sv.start_queue_runners(sess, [chief_queue_runner])

        # Perform training
        time_begin = time.time()
        print("Training begins @ %f" % time_begin)

        local_step = 0
        while True:
            # Training feed
            batch_xs, batch_ys = mnist.train.next_batch(FLAGS.batch_size)
            train_feed = {x: batch_xs, y_: batch_ys}

            _, step = sess.run([train_step, global_step], feed_dict=train_feed)
            local_step += 1

            now = time.time()
            print("%f: Worker %d: training step %d done (global step: %d)" %
                  (now, task_index, local_step, step))

            if step >= FLAGS.train_steps:
                break

        time_end = time.time()
        print("Training ends @ %f" % time_end)
        training_time = time_end - time_begin
        print("Training elapsed time: %f s" % training_time)

        # Validation feed
        val_feed = {x: mnist.validation.images, y_: mnist.validation.labels}
        val_xent = sess.run(cross_entropy, feed_dict=val_feed)
        print("After %d training step(s), validation cross entropy = %g" %
              (FLAGS.train_steps, val_xent))
        if job_name == "worker" and task_index == 0:
            run = Run.get_submitted_run()
            run.log("CrossEntropy", val_xent)