Example #1
0
 def server_utilization_updater(self):
     from digits.webapp import scheduler, socketio
     from digits import device_query
     devices = []
     gpus = len(self.resources['gpus'])
     if gpus:
         for index in range(0, gpus):
             device = device_query.get_device(index)
             if device:
                 devices.append((index, device))
             else:
                 raise RuntimeError(
                     'Failed to load gpu information for GPU #"%s"' % index)
     while True:
         data_gpu = []
         for index, device in devices:
             update = {'name': device.name, 'index': index}
             nvml_info = device_query.get_nvml_info(index)
             if nvml_info is not None:
                 update.update(nvml_info)
             data_gpu.append(update)
         socketio.emit('server update', {
             'update': 'gpus_utilization',
             'data_gpu': data_gpu,
         },
                       namespace='/jobs',
                       room='job_management')
         gevent.sleep(1)
Example #2
0
    def hw_socketio_updater(self, gpus):
        """
        This thread sends SocketIO messages about hardware utilization
        to connected clients

        Arguments:
        gpus -- a list of identifiers for the GPUs currently being used
        """
        from digits.webapp import app, socketio

        devices = []
        if gpus is not None:
            for index in gpus:
                device = device_query.get_device(index)
                if device:
                    devices.append((index, device))
                else:
                    raise RuntimeError('Failed to load gpu information for GPU #"%s"' % index)

        # this thread continues until killed in after_run()
        while True:
            # CPU (Non-GPU) Info
            data_cpu = {}
            if hasattr(self, "p") and self.p is not None:
                data_cpu["pid"] = self.p.pid
                try:
                    ps = psutil.Process(self.p.pid)  # 'self.p' is the system call object
                    if ps.is_running():
                        if psutil.version_info[0] >= 2:
                            data_cpu["cpu_pct"] = ps.cpu_percent(interval=1)
                            data_cpu["mem_pct"] = ps.memory_percent()
                            data_cpu["mem_used"] = ps.memory_info().rss
                        else:
                            data_cpu["cpu_pct"] = ps.get_cpu_percent(interval=1)
                            data_cpu["mem_pct"] = ps.get_memory_percent()
                            data_cpu["mem_used"] = ps.get_memory_info().rss
                except psutil.NoSuchProcess:
                    # In rare case of instant process crash or PID went zombie (report nothing)
                    pass

            data_gpu = []
            for index, device in devices:
                update = {"name": device.name, "index": index}
                nvml_info = device_query.get_nvml_info(index)
                if nvml_info is not None:
                    update.update(nvml_info)
                data_gpu.append(update)

            with app.app_context():
                html = flask.render_template("models/gpu_utilization.html", data_gpu=data_gpu, data_cpu=data_cpu)

                socketio.emit(
                    "task update",
                    {"task": self.html_id(), "update": "gpu_utilization", "html": html},
                    namespace="/jobs",
                    room=self.job_id,
                )
            gevent.sleep(1)
Example #3
0
    def gpu_socketio_updater(self, gpus):
        """
        This thread sends SocketIO messages about GPU utilization
        to connected clients

        Arguments:
        gpus -- a list of identifiers for the GPUs currently being used
        """
        from digits.webapp import app, socketio

        devices = []
        for index in gpus:
            device = device_query.get_device(index)
            if device:
                devices.append((index, device))
        if not devices:
            raise RuntimeError('Failed to load gpu information for "%s"' %
                               gpus)

        # this thread continues until killed in after_run()
        while True:
            data = []

            for index, device in devices:
                update = {'name': device.name, 'index': index}
                nvml_info = device_query.get_nvml_info(index)
                if nvml_info is not None:
                    update.update(nvml_info)
                data.append(update)

            with app.app_context():
                html = flask.render_template('models/gpu_utilization.html',
                                             data=data)

                socketio.emit(
                    'task update',
                    {
                        'task': self.html_id(),
                        'update': 'gpu_utilization',
                        'html': html,
                    },
                    namespace='/jobs',
                    room=self.job_id,
                )
            gevent.sleep(1)
Example #4
0
    def gpu_socketio_updater(self, gpus):
        """
        This thread sends SocketIO messages about GPU utilization
        to connected clients

        Arguments:
        gpus -- a list of identifiers for the GPUs currently being used
        """
        from digits.webapp import app, socketio

        devices = []
        for index in gpus:
            device = device_query.get_device(index)
            if device:
                devices.append((index, device))
        if not devices:
            raise RuntimeError('Failed to load gpu information for "%s"' % gpus)

        # this thread continues until killed in after_run()
        while True:
            data = []

            for index, device in devices:
                update = {'name': device.name, 'index': index}
                nvml_info = device_query.get_nvml_info(index)
                if nvml_info is not None:
                    update.update(nvml_info)
                data.append(update)

            with app.app_context():
                html = flask.render_template('models/gpu_utilization.html',
                        data = data)

                socketio.emit('task update',
                        {
                            'task': self.html_id(),
                            'update': 'gpu_utilization',
                            'html': html,
                            },
                        namespace='/jobs',
                        room=self.job_id,
                        )
            gevent.sleep(1)
Example #5
0
    def get_gpu_info(self):
        data_gpu = []
        devices = []
        gpus = None
        if 'gpus' in self.current_resources:
            gpus = [identifier for (identifier, value) in self.current_resources['gpus']]
        if gpus is None:
            return None

        for index in gpus:
            device = device_query.get_device(index)
            if device:
                devices.append((index, device))
            else:
                raise RuntimeError('Failed to load gpu information for GPU #"%s"' % index)

        for index, device in devices:
            update = {'name': device.name, 'index': index}
            nvml_info = device_query.get_nvml_info(index)
            if nvml_info is not None:
                update.update(nvml_info)
            data_gpu.append(update)
        return data_gpu
Example #6
0
    def hw_socketio_updater(self, gpus):
        """
        This thread sends SocketIO messages about hardware utilization
        to connected clients

        Arguments:
        gpus -- a list of identifiers for the GPUs currently being used
        """
        from digits.webapp import app, socketio

        devices = []
        if gpus is not None:
            for index in gpus:
                device = device_query.get_device(index)
                if device:
                    devices.append((index, device))
                else:
                    raise RuntimeError(
                        'Failed to load gpu information for GPU #"%s"' % index)

        # this thread continues until killed in after_run()
        while True:
            # CPU (Non-GPU) Info
            data_cpu = {}
            if hasattr(self, 'p') and self.p is not None:
                data_cpu['pid'] = self.p.pid
                try:
                    ps = psutil.Process(
                        self.p.pid)  # 'self.p' is the system call object
                    if ps.is_running():
                        if psutil.version_info[0] >= 2:
                            data_cpu['cpu_pct'] = ps.cpu_percent(interval=1)
                            data_cpu['mem_pct'] = ps.memory_percent()
                            data_cpu['mem_used'] = ps.memory_info().rss
                        else:
                            data_cpu['cpu_pct'] = ps.get_cpu_percent(
                                interval=1)
                            data_cpu['mem_pct'] = ps.get_memory_percent()
                            data_cpu['mem_used'] = ps.get_memory_info().rss
                except psutil.NoSuchProcess:
                    # In rare case of instant process crash or PID went zombie (report nothing)
                    pass

            data_gpu = []
            for index, device in devices:
                update = {'name': device.name, 'index': index}
                nvml_info = device_query.get_nvml_info(index)
                if nvml_info is not None:
                    update.update(nvml_info)
                data_gpu.append(update)

            with app.app_context():
                html = flask.render_template('models/gpu_utilization.html',
                                             data_gpu=data_gpu,
                                             data_cpu=data_cpu)

                socketio.emit(
                    'task update',
                    {
                        'task': self.html_id(),
                        'update': 'gpu_utilization',
                        'html': html,
                    },
                    namespace='/jobs',
                    room=self.job_id,
                )
            gevent.sleep(1)
Example #7
0
class ModelForm(Form):

    ### Methods

    def selection_exists_in_choices(form, field):
        found = False
        for choice in field.choices:
            if choice[0] == field.data:
                found = True
        if not found:
            raise validators.ValidationError(
                "Selected job doesn't exist. Maybe it was deleted by another user."
            )

    def validate_NetParameter(form, field):
        fw = frameworks.get_framework_by_id(form['framework'].data)
        try:
            # below function raises a BadNetworkException in case of validation error
            fw.validate_network(field.data)
        except frameworks.errors.BadNetworkError as e:
            raise validators.ValidationError('Bad network: %s' % e.message)

    def validate_file_exists(form, field):
        from_client = bool(form.python_layer_from_client.data)

        filename = ''
        if not from_client and field.type == 'StringField':
            filename = field.data

        if filename == '': return

        if not os.path.isfile(filename):
            raise validators.ValidationError(
                'Server side file, %s, does not exist.' % filename)

    def validate_py_ext(form, field):
        from_client = bool(form.python_layer_from_client.data)

        filename = ''
        if from_client and field.type == 'FileField':
            filename = flask.request.files[field.name].filename
        elif not from_client and field.type == 'StringField':
            filename = field.data

        if filename == '': return

        (root, ext) = os.path.splitext(filename)
        if ext != '.py' and ext != '.pyc':
            raise validators.ValidationError(
                'Python file, %s, needs .py or .pyc extension.' % filename)

    ### Fields

    # The options for this get set in the view (since they are dynamic)
    dataset = utils.forms.SelectField(
        'Select Dataset',
        choices=[],
        tooltip="Choose the dataset to use for this model.")

    python_layer_from_client = utils.forms.BooleanField(
        u'Use client-side file', default=False)

    python_layer_client_file = utils.forms.FileField(
        u'Client-side file',
        validators=[validate_py_ext],
        tooltip=
        "Choose a Python file on the client containing layer definitions.")
    python_layer_server_file = utils.forms.StringField(
        u'Server-side file',
        validators=[validate_file_exists, validate_py_ext],
        tooltip=
        "Choose a Python file on the server containing layer definitions.")

    train_epochs = utils.forms.IntegerField(
        'Training epochs',
        validators=[validators.NumberRange(min=1)],
        default=30,
        tooltip="How many passes through the training data?")

    snapshot_interval = utils.forms.FloatField(
        'Snapshot interval (in epochs)',
        default=1,
        validators=[
            validators.NumberRange(min=0),
        ],
        tooltip="How many epochs of training between taking a snapshot?")

    val_interval = utils.forms.FloatField(
        'Validation interval (in epochs)',
        default=1,
        validators=[validators.NumberRange(min=0)],
        tooltip=
        "How many epochs of training between running through one pass of the validation data?"
    )

    random_seed = utils.forms.IntegerField(
        'Random seed',
        validators=[
            validators.NumberRange(min=0),
            validators.Optional(),
        ],
        tooltip=
        "If you provide a random seed, then back-to-back runs with the same model and dataset should give identical results."
    )

    batch_size = utils.forms.MultiIntegerField(
        'Batch size',
        validators=[
            utils.forms.MultiNumberRange(min=1),
            utils.forms.MultiOptional(),
        ],
        tooltip=
        "How many images to process at once. If blank, values are used from the network definition."
    )

    batch_accumulation = utils.forms.IntegerField(
        'Batch Accumulation',
        validators=[
            validators.NumberRange(min=1),
            validators.Optional(),
        ],
        tooltip=
        "Accumulate gradients over multiple batches (useful when you need a bigger batch size for training but it doesn't fit in memory)."
    )

    ### Solver types

    solver_type = utils.forms.SelectField(
        'Solver type',
        choices=[
            ('SGD', 'Stochastic gradient descent (SGD)'),
            ('NESTEROV', "Nesterov's accelerated gradient (NAG)"),
            ('ADAGRAD', 'Adaptive gradient (AdaGrad)'),
            ('RMSPROP', 'RMSprop'),
            ('ADADELTA', 'AdaDelta'),
            ('ADAM', 'Adam'),
        ],
        default='SGD',
        tooltip="What type of solver will be used?",
    )

    def validate_solver_type(form, field):
        fw = frameworks.get_framework_by_id(form.framework)
        if fw is not None:
            if not fw.supports_solver_type(field.data):
                raise validators.ValidationError(
                    'Solver type not supported by this framework')

    ### Learning rate

    learning_rate = utils.forms.MultiFloatField(
        'Base Learning Rate',
        default=0.01,
        validators=[
            utils.forms.MultiNumberRange(min=0),
        ],
        tooltip=
        "Affects how quickly the network learns. If you are getting NaN for your loss, you probably need to lower this value."
    )

    lr_policy = wtforms.SelectField('Policy',
                                    choices=[
                                        ('fixed', 'Fixed'),
                                        ('step', 'Step Down'),
                                        ('multistep',
                                         'Step Down (arbitrary steps)'),
                                        ('exp', 'Exponential Decay'),
                                        ('inv', 'Inverse Decay'),
                                        ('poly', 'Polynomial Decay'),
                                        ('sigmoid', 'Sigmoid Decay'),
                                    ],
                                    default='step')

    lr_step_size = wtforms.FloatField('Step Size', default=33)
    lr_step_gamma = wtforms.FloatField('Gamma', default=0.1)
    lr_multistep_values = wtforms.StringField('Step Values', default="50,85")

    def validate_lr_multistep_values(form, field):
        if form.lr_policy.data == 'multistep':
            for value in field.data.split(','):
                try:
                    float(value)
                except ValueError:
                    raise validators.ValidationError('invalid value')

    lr_multistep_gamma = wtforms.FloatField('Gamma', default=0.5)
    lr_exp_gamma = wtforms.FloatField('Gamma', default=0.95)
    lr_inv_gamma = wtforms.FloatField('Gamma', default=0.1)
    lr_inv_power = wtforms.FloatField('Power', default=0.5)
    lr_poly_power = wtforms.FloatField('Power', default=3)
    lr_sigmoid_step = wtforms.FloatField('Step', default=50)
    lr_sigmoid_gamma = wtforms.FloatField('Gamma', default=0.1)

    ### Network

    # Use a SelectField instead of a HiddenField so that the default value
    # is used when nothing is provided (through the REST API)
    method = wtforms.SelectField(
        u'Network type',
        choices=[
            ('standard', 'Standard network'),
            ('previous', 'Previous network'),
            ('pretrained', 'Pretrained network'),
            ('custom', 'Custom network'),
        ],
        default='standard',
    )

    ## framework - hidden field, set by Javascript to the selected framework ID
    framework = wtforms.HiddenField(
        'framework',
        validators=[
            validators.AnyOf(
                [fw.get_id() for fw in frameworks.get_frameworks()],
                message='The framework you choose is not currently supported.')
        ],
        default=frameworks.get_frameworks()[0].get_id())

    # The options for this get set in the view (since they are dependent on the data type)
    standard_networks = wtforms.RadioField(
        'Standard Networks',
        validators=[
            validate_required_iff(method='standard'),
        ],
    )

    previous_networks = wtforms.RadioField(
        'Previous Networks',
        choices=[],
        validators=[
            validate_required_iff(method='previous'),
            selection_exists_in_choices,
        ],
    )

    pretrained_networks = wtforms.RadioField(
        'Pretrained Networks',
        choices=[],
        validators=[
            validate_required_iff(method='pretrained'),
            selection_exists_in_choices,
        ],
    )

    custom_network = utils.forms.TextAreaField(
        'Custom Network',
        validators=[
            validate_required_iff(method='custom'),
            validate_NetParameter,
        ],
    )

    custom_network_snapshot = utils.forms.TextField(
        'Pretrained model(s)',
        tooltip=
        "Paths to pretrained model files, separated by '%s'. Only edit this field if you understand how fine-tuning works in caffe or torch."
        % os.path.pathsep)

    def validate_custom_network_snapshot(form, field):
        if form.method.data == 'custom':
            for filename in field.data.strip().split(os.path.pathsep):
                if filename and not os.path.exists(filename):
                    raise validators.ValidationError(
                        'File "%s" does not exist' % filename)

    # Select one of several GPUs
    select_gpu = wtforms.RadioField(
        'Select which GPU you would like to use',
        choices=[('next', 'Next available')] + [(
            index,
            '#%s - %s (%s memory)' %
            (index, get_device(index).name,
             sizeof_fmt(
                 get_nvml_info(index)['memory']['total']
                 if get_nvml_info(index) and 'memory' in get_nvml_info(index)
                 else get_device(index).totalGlobalMem)),
        ) for index in config_value('gpu_list').split(',') if index],
        default='next',
    )

    # Select N of several GPUs
    select_gpus = utils.forms.SelectMultipleField(
        'Select which GPU[s] you would like to use',
        choices=[(
            index,
            '#%s - %s (%s memory)' %
            (index, get_device(index).name,
             sizeof_fmt(
                 get_nvml_info(index)['memory']['total']
                 if get_nvml_info(index) and 'memory' in get_nvml_info(index)
                 else get_device(index).totalGlobalMem)),
        ) for index in config_value('gpu_list').split(',') if index],
        tooltip=
        "The job won't start until all of the chosen GPUs are available.")

    # XXX For testing
    # The Flask test framework can't handle SelectMultipleFields correctly
    select_gpus_list = wtforms.StringField(
        'Select which GPU[s] you would like to use (comma separated)')

    def validate_select_gpus(form, field):
        if form.select_gpus_list.data:
            field.data = form.select_gpus_list.data.split(',')

    # Use next available N GPUs
    select_gpu_count = wtforms.IntegerField(
        'Use this many GPUs (next available)',
        validators=[
            validators.NumberRange(min=1,
                                   max=len(
                                       config_value('gpu_list').split(',')))
        ],
        default=1,
    )

    def validate_select_gpu_count(form, field):
        if field.data is None:
            if form.select_gpus.data:
                # Make this field optional
                field.errors[:] = []
                raise validators.StopValidation()

    model_name = utils.forms.StringField(
        'Model Name',
        validators=[validators.DataRequired()],
        tooltip=
        "An identifier, later used to refer to this model in the Application.")

    # allows shuffling data during training (for frameworks that support this, as indicated by
    # their Framework.can_shuffle_data() method)
    shuffle = utils.forms.BooleanField(
        'Shuffle Train Data',
        default=True,
        tooltip='For every epoch, shuffle the data before training.')
Example #8
0
class ModelForm(Form):

    ### Methods

    def selection_exists_in_choices(form, field):
        found = False
        for choice in field.choices:
            if choice[0] == field.data:
                found = True
        if not found:
            raise validators.ValidationError(
                "Selected job doesn't exist. Maybe it was deleted by another user."
            )

    def validate_NetParameter(form, field):
        pb = caffe_pb2.NetParameter()
        try:
            text_format.Merge(field.data, pb)
        except text_format.ParseError as e:
            raise validators.ValidationError('Not a valid NetParameter: %s' %
                                             e)

    ### Fields

    # The options for this get set in the view (since they are dynamic)
    dataset = wtforms.SelectField('Select Dataset', choices=[])

    train_epochs = wtforms.IntegerField(
        'Training epochs',
        validators=[validators.NumberRange(min=1)],
        default=30,
    )

    snapshot_interval = wtforms.FloatField(
        'Snapshot interval (in epochs)',
        default=1,
        validators=[
            validators.NumberRange(min=0),
        ],
    )

    val_interval = wtforms.FloatField(
        'Validation interval (in epochs)',
        default=1,
        validators=[validators.NumberRange(min=0)],
    )

    random_seed = wtforms.IntegerField(
        'Random seed',
        validators=[
            validators.NumberRange(min=0),
            validators.Optional(),
        ],
    )

    batch_size = wtforms.IntegerField(
        'Batch size',
        validators=[
            validators.NumberRange(min=1),
            validators.Optional(),
        ],
    )

    ### Solver types

    solver_type = wtforms.SelectField(
        'Solver type',
        choices=[
            ('SGD', 'Stochastic gradient descent (SGD)'),
            ('ADAGRAD', 'Adaptive gradient (AdaGrad)'),
            ('NESTEROV', "Nesterov's accelerated gradient (NAG)"),
        ],
        default='SGD')

    ### Learning rate

    learning_rate = wtforms.FloatField('Base Learning Rate',
                                       default=0.01,
                                       validators=[
                                           validators.NumberRange(min=0),
                                       ])

    lr_policy = wtforms.SelectField('Policy',
                                    choices=[
                                        ('fixed', 'Fixed'),
                                        ('step', 'Step Down'),
                                        ('multistep',
                                         'Step Down (arbitrary steps)'),
                                        ('exp', 'Exponential Decay'),
                                        ('inv', 'Inverse Decay'),
                                        ('poly', 'Polynomial Decay'),
                                        ('sigmoid', 'Sigmoid Decay'),
                                    ],
                                    default='step')

    lr_step_size = wtforms.FloatField('Step Size', default=33)
    lr_step_gamma = wtforms.FloatField('Gamma', default=0.1)
    lr_multistep_values = wtforms.StringField('Step Values', default="50,85")

    def validate_lr_multistep_values(form, field):
        if form.lr_policy.data == 'multistep':
            for value in field.data.split(','):
                try:
                    float(value)
                except ValueError:
                    raise validators.ValidationError('invalid value')

    lr_multistep_gamma = wtforms.FloatField('Gamma', default=0.5)
    lr_exp_gamma = wtforms.FloatField('Gamma', default=0.95)
    lr_inv_gamma = wtforms.FloatField('Gamma', default=0.1)
    lr_inv_power = wtforms.FloatField('Power', default=0.5)
    lr_poly_power = wtforms.FloatField('Power', default=3)
    lr_sigmoid_step = wtforms.FloatField('Step', default=50)
    lr_sigmoid_gamma = wtforms.FloatField('Gamma', default=0.1)

    ### Network

    # Use a SelectField instead of a HiddenField so that the default value
    # is used when nothing is provided (through the REST API)
    method = wtforms.SelectField(
        u'Network type',
        choices=[
            ('standard', 'Standard network'),
            ('previous', 'Previous network'),
            ('custom', 'Custom network'),
        ],
        default='standard',
    )

    # The options for this get set in the view (since they are dependent on the data type)
    standard_networks = wtforms.RadioField(
        'Standard Networks',
        validators=[
            validate_required_iff(method='standard'),
        ],
    )

    previous_networks = wtforms.RadioField(
        'Previous Networks',
        choices=[],
        validators=[
            validate_required_iff(method='previous'),
            selection_exists_in_choices,
        ],
    )

    custom_network = wtforms.TextAreaField(
        'Custom Network',
        validators=[
            validate_required_iff(method='custom'),
            validate_NetParameter,
        ])

    custom_network_snapshot = wtforms.TextField('Pretrained model')

    def validate_custom_network_snapshot(form, field):
        if form.method.data == 'custom':
            snapshot = field.data.strip()
            if snapshot:
                if not os.path.exists(snapshot):
                    raise validators.ValidationError('File does not exist')

    # Select one of several GPUs
    select_gpu = wtforms.RadioField(
        'Select which GPU you would like to use',
        choices=[('next', 'Next available')] + [(
            index,
            '#%s - %s%s' % (
                index,
                get_device(index).name,
                ' (%s memory)' %
                sizeof_fmt(get_nvml_info(index)['memory']['total'])
                if get_nvml_info(index) and 'memory' in get_nvml_info(index)
                else '',
            ),
        ) for index in config_value('gpu_list').split(',') if index],
        default='next',
    )

    # Select N of several GPUs
    select_gpus = wtforms.SelectMultipleField(
        'Select which GPU[s] you would like to use',
        choices=[(
            index,
            '#%s - %s%s' % (
                index,
                get_device(index).name,
                ' (%s memory)' %
                sizeof_fmt(get_nvml_info(index)['memory']['total'])
                if get_nvml_info(index) and 'memory' in get_nvml_info(index)
                else '',
            ),
        ) for index in config_value('gpu_list').split(',') if index])

    # XXX For testing
    # The Flask test framework can't handle SelectMultipleFields correctly
    select_gpus_list = wtforms.StringField(
        'Select which GPU[s] you would like to use (comma separated)')

    def validate_select_gpus(form, field):
        if form.select_gpus_list.data:
            field.data = form.select_gpus_list.data.split(',')

    # Use next available N GPUs
    select_gpu_count = wtforms.IntegerField(
        'Use this many GPUs (next available)',
        validators=[
            validators.NumberRange(min=1,
                                   max=len(
                                       config_value('gpu_list').split(',')))
        ],
        default=1,
    )

    def validate_select_gpu_count(form, field):
        if field.data is None:
            if form.select_gpus.data:
                # Make this field optional
                field.errors[:] = []
                raise validators.StopValidation()

    model_name = wtforms.StringField('Model Name',
                                     validators=[validators.DataRequired()])