Example 1
    def output_experiment_detail(self, res_path):
        exp = nni.get_experiment_id()
        trial = nni.get_trial_id()
        trial_path = os.path.join(res_path, exp, trial)
        # Create <res_path>/<experiment_id>/<trial_id> if it does not exist yet
        os.makedirs(trial_path, exist_ok=True)

        # TODO whatever you want
        p_loss = self.plot_line(LOSS_PLOT, show_plot=False)
        p_acc = self.plot_line(ACCURACY_PLOT, show_plot=False)
        p_auc = self.plot_line(AUC_PLOT, show_plot=False)

        measures_table = [
            ["train_loss_vec"] + [str(x) for x in self.loss_train_vec],
            ["train_acc_vec"] + [str(x) for x in self.accuracy_train_vec],
            ["train_auc_vec"] + [str(x) for x in self.auc_train_vec],
            ["dev_loss_vec"] + [str(x) for x in self.loss_dev_vec],
            ["dev_acc_vec"] + [str(x) for x in self.accuracy_dev_vec],
            ["dev_auc_vec"] + [str(x) for x in self.auc_dev_vec],
            ["test_loss_vec"] + [str(x) for x in self.loss_test_vec],
            ["test_acc_vec"] + [str(x) for x in self.accuracy_test_vec],
            ["test_auc_vec"] + [str(x) for x in self.auc_test_vec]
        ]
        with open(os.path.join(trial_path, "measures_by_epochs.csv"), "wt", newline="") as f:
            writer = csv.writer(f)
            writer.writerows(measures_table)
Example 2
    def __init__(self, model_key, run_id, run_dir):
        self.model_key = model_key
        self.run_id = run_id
        self.run_dir = run_dir
        self.trial_id = nni.get_trial_id()
        self.exp_id = nni.get_experiment_id()
        self.scoring = accuracy_score
Example 3
def get_nni_or_mlflow_experiment_and_trial() -> Tuple[Optional[str], Optional[str]]:
    """ Helper function which returns NNI experiment name and trial ID if NNI isn't in Standalone mode or, otherwise, returns MLFlow experiment name and run ID if there is an active MLFlow run. 
    Returns (None, None) if NNI is in standalone mode and there is no active MLFLow run.
    """
    if is_nni_run_standalone():
        exp, run = deepcv.utils.mlflow_get_experiment_run_info()
        return (None, None) if exp is None else (exp.name, str(run.run_id))
    return (nni.get_experiment_id(), nni.get_trial_id())
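
A minimal usage sketch for the helper above; the results-directory layout is a hypothetical example, not part of the original code:

import os

exp_name, run_id = get_nni_or_mlflow_experiment_and_trial()
if exp_name is not None:
    # e.g. results/<experiment>/<trial or run id> (hypothetical layout)
    output_dir = os.path.join('results', exp_name, run_id or 'no-run-id')
    os.makedirs(output_dir, exist_ok=True)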
Example 4
def main(args, params):
    """
    Main program:
      - Prepare dataset
      - Build network
      - Train the model
      - Report accuracy to tuner
      - Save best current metrics
      - Save best current model
    """

    (x_train, y_train), (x_test, y_test) = load_dataset()
    _logger.info('Dataset loaded')

    model = tf.keras.Sequential([
        tf.keras.layers.Conv2D(filters=32,
                               kernel_size=params['conv_size'],
                               activation='relu'),
        tf.keras.layers.MaxPool2D(pool_size=2),
        tf.keras.layers.Conv2D(filters=64,
                               kernel_size=params['conv_size'],
                               activation='relu'),
        tf.keras.layers.MaxPool2D(pool_size=2),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(units=params['hidden_size'], activation='relu'),
        tf.keras.layers.Dropout(rate=params['dropout_rate']),
        tf.keras.layers.Dense(units=10, activation='softmax')
    ])
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=params['learning_rate']),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'])
    _logger.info('Model built')

    # Setup TensorBoard
    log_dir = os.path.join(args.output, 'tensorboard', nni.get_trial_id())
    tensorboard = tf.keras.callbacks.TensorBoard(log_dir=log_dir,
                                                 histogram_freq=1)

    model.fit(x_train,
              y_train,
              batch_size=params['batch_size'],
              epochs=params['epochs'],
              callbacks=[ReportIntermediates(), tensorboard],
              validation_data=(x_test, y_test))
    _logger.info('Training completed')

    loss, accuracy = model.evaluate(x_test, y_test, verbose=0)
    # send final accuracy to NNI tuner and web UI
    nni.report_final_result(accuracy)
    # save the best metrics so they are displayed in the Workflow Task
    is_best_accuracy = save_best_metrics(loss, accuracy)
    _logger.info('Final accuracy reported: %s', accuracy)

    # save the model if accuracy is better than previous model
    if is_best_accuracy:
        save_data(args, params, model)
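
The ReportIntermediates callback passed to model.fit above is not shown in this excerpt. A minimal sketch of what such a callback typically looks like (a hypothetical reconstruction, not the original project's code):

import nni
import tensorflow as tf

class ReportIntermediates(tf.keras.callbacks.Callback):
    """Report per-epoch accuracy to the NNI tuner as an intermediate result."""

    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        # Prefer validation accuracy; fall back to training accuracy.
        metric = logs.get('val_accuracy', logs.get('accuracy'))
        if metric is not None:
            nni.report_intermediate_result(float(metric))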
Example 5
def main(args):
    #### basic torch setup
    use_cuda = not args['no_cuda'] and torch.cuda.is_available()  # use cuda
    device = torch.device("cuda" if use_cuda else "cpu")
    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}

    torch.manual_seed(args['seed'])  # seed

    #### data pipeline
    data_dir = os.path.join(args['data_dir'], nni.get_trial_id())

    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST(data_dir, train=True, download=True,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=args['batch_size'], shuffle=True, **kwargs)
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST(data_dir, train=False,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=1000, shuffle=True, **kwargs)

    #### define model
    hidden_size = args['hidden_size']

    model = Net(hidden_size=hidden_size).to(device)
    optimizer = optim.SGD(model.parameters(),
                          lr=args['lr'],
                          momentum=args['momentum'])

    #### train
    for epoch in range(1, args['epochs'] + 1):
        train(args, model, device, train_loader, optimizer, epoch)
        test_acc = test(args, model, device, test_loader)

        if epoch < args['epochs']:
            # report intermediate result
            nni.report_intermediate_result(test_acc)
            logger.debug('test accuracy %g', test_acc)
            logger.debug('Pipe send intermediate result done.')
        else:
            # report final result
            nni.report_final_result(test_acc)
            logger.debug('Final result is %g', test_acc)
            logger.debug('Send final result done.')
Example 6
def train_eval(esargs, RCV_CONFIG, seqid):
    """ train and eval the model
    """
    global net
    global best_acc
    global bs_explore
    global gpus
    global hp_path

    best_acc = 0
    parse_rev_args(RCV_CONFIG, esargs)
    # train procedure
    trial_id = nni.get_trial_id()
    available_devices = os.environ["CUDA_VISIBLE_DEVICES"]
    gpus = len(available_devices.split(","))

    is_training = True
    filenames = ds.get_filenames(args.train_data_dir)
    dataset = tf.data.Dataset.from_tensor_slices(filenames)
    dataset = dataset.flat_map(tf.data.TFRecordDataset)
    ds_train = ds.process_record_dataset(
        dataset=dataset,
        is_training=is_training,
        batch_size=bs_explore,
        shuffle_buffer=shuffle_buffer,
        parse_record_fn=ds.parse_record,
        num_epochs=args.epochs,
        npc=args.num_parallel_calls,
        num_gpus=gpus,
        examples_per_epoch=examples_per_epoch if is_training else None,
        dtype=tf.float32)

    is_training = False
    filenames = ds.get_filenames(args.val_data_dir)
    dataset = tf.data.Dataset.from_tensor_slices(filenames)
    dataset = dataset.flat_map(tf.data.TFRecordDataset)
    ds_val = ds.process_record_dataset(dataset=dataset,
                                       is_training=is_training,
                                       batch_size=bs_explore,
                                       shuffle_buffer=shuffle_buffer,
                                       parse_record_fn=ds.parse_record,
                                       num_epochs=args.epochs,
                                       npc=args.num_parallel_calls,
                                       num_gpus=gpus,
                                       examples_per_epoch=None,
                                       dtype=tf.float32)

    # run epochs and patience
    loopnum = seqid // args.slave
    patience = min(int(6 + (2 * loopnum)), 20)
    if loopnum == 0:
        run_epochs = int(args.warmup_1)
    elif loopnum == 1:
        run_epochs = int(args.warmup_2)
    elif loopnum == 2:
        run_epochs = int(args.warmup_3)
    else:
        run_epochs = int(args.epochs)

    # if loopnum < 4:
    #     patience = int(8 + (2 * loopnum))
    #     run_epochs = int(10 + (20 * loopnum))
    # else:
    #     patience = 16
    #     run_epochs = args.epochs

    # lr strategy

    def scheduler2(epoch):
        lr_max = args.initial_lr
        total_epochs = args.epochs
        lr_each_epoch = lr_max - lr_max * epoch / total_epochs
        return lr_each_epoch

    callback = tf.keras.callbacks.LearningRateScheduler(scheduler2)

    # save weights
    checkpoint_dir = os.environ["HOME"] + "/nni/experiments/" + str(
        nni.get_experiment_id()) + "/checkpoint/" + str(nni.get_trial_id())
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    checkpoint_filepath = checkpoint_dir + "/weights." + "epoch." + str(
        run_epochs) + ".hdf5"
    model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_filepath,
        monitor='val_accuracy',
        mode='max',
        save_best_only=True,
        save_freq='epoch',
        save_weights_only=True,
    )

    history = net.fit(ds_train,
                      epochs=run_epochs,
                      steps_per_epoch=Ntrain // bs_explore // gpus,
                      validation_data=ds_val,
                      validation_steps=Nvalidation // bs_explore // gpus,
                      verbose=1,
                      shuffle=False,
                      callbacks=[
                          SendMetrics(hp_path), callback,
                          EarlyStopping(min_delta=0.001, patience=patience),
                          model_checkpoint_callback
                      ])

    # trial reports its best validation accuracy to the tuner
    acc_list = history.history['val_accuracy']
    acc = max((float(acc_n) for acc_n in acc_list), default=0.0)
    try:
        # predict acc
        if run_epochs >= 10 and run_epochs < 80:
            epoch_x = range(1, len(acc_list) + 1)
            pacc = utils.predict_acc(trial_id, epoch_x, acc_list, 90, True)
            best_acc = float(pacc)
    except Exception as e:
        logger.warning("Accuracy prediction failed: %s", e)
    if acc > best_acc:
        best_acc = acc
    logger.debug("Final result is: %.3f", acc)
    return best_acc, history.epoch[-1]
Example 7
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--save_dir', help='Location of checkpoint files')
    parser.add_argument('--vocab_file', help='Vocabulary file')
    parser.add_argument('--train_prefix', help='Prefix for train files')
    args = parser.parse_args()

    ### NNI modification ###
    params = {
        'epoch': 1,
        'batch_size': 8,
        'optimizer': 'Adam',
        'inter_op_parallelism_threads': 1,
        'intra_op_parallelism_threads': 2,
        'infer_shapes': 0,
        'place_pruned_graph': 0,
        'enable_bfloat16_sendrecv': 0,
        'do_common_subexpression_elimination': 0,
        'max_folded_constant': 2,
        'do_function_inlining': 0,
        'global_jit_level': 1,
        'tf_gpu_thread_mode': "global"
    }
    tuned_params = nni.get_next_parameter()
    params.update(tuned_params)
    t_id = nni.get_trial_id()
    ### NNI modification ###

    main(args)
Example 8
    def test_get_trial_id(self):
        self.assertEqual(nni.get_trial_id(), 'fakeidtr')
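
The fake ID implies a test fixture. A minimal sketch of how it can be set up, assuming NNI reads the trial ID from the NNI_TRIAL_JOB_ID environment variable (true for recent NNI versions, but verify against the version under test):

import os

# Set before importing nni, since some versions read the environment at import time.
os.environ['NNI_TRIAL_JOB_ID'] = 'fakeidtr'

import nni
assert nni.get_trial_id() == 'fakeidtr'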
Example 9

if __name__ == "__main__":
    example_start_time = time.time()
    net = None
    args = get_args()
    try:
        experiment_path = os.path.join(os.environ["HOME"], "mountdir", "nni",
                                       "experiments",
                                       str(nni.get_experiment_id()))
        lock = multiprocessing.Lock()
        context = zmq.Context()
        socket = context.socket(zmq.REQ)
        tmpstr = 'tcp://' + args.ip + ':800081'
        socket.connect(tmpstr)
        os.makedirs(os.path.join(experiment_path, "trials",
                                 str(nni.get_trial_id())))

        get_next_parameter_start = time.time()
        nni.get_next_parameter(socket)
        get_next_parameter_end = time.time()

        while True:
            lock.acquire()
            with open(experiment_path + "/graph.txt", "a+") as f:
                f.seek(0)
                lines = f.readlines()
            lock.release()
            if lines:
                break

        if len(lines) > args.slave:
Example 10
class ClassifyParam:
    local_model_path = os.path.join(
        'data', 'cache', 'classify_{}_{}.model'.format(nni.get_experiment_id(),
                                                       nni.get_trial_id()))
    top_n_list = list(range(1, 11)) + [15, 20]
Example 11
    parser.add_argument('--report_metric', '-rm', type=str, default=None)
    parser.add_argument('--nni', '-n', action='store_true')

    # Add data and model specific args
    parser = LearningDataSet.add_model_specific_args(parser)
    parser = NeuralNetwork.add_model_specific_args(parser)
    # add all the available trainer options to argparse
    parser = pl.Trainer.add_argparse_args(parser)

    args = parser.parse_args()
    if args.nni:
        trial_id = nni.get_trial_id()
        results_folder = os.path.join(args.results_path, args.project_name, trial_id)
        tuned_params = nni.get_next_parameter()
    else:
        results_folder = os.path.join(args.results_path, args.project_name)
        tuned_params = None

    if args.seed is not None:
        seed = args.seed
        torch.manual_seed(seed)
        np.random.seed(seed)
        random.seed(seed)

    arg_groups = {}

    for group in parser._action_groups:
Example 12
class NextLocParam:
    local_model_path = os.path.join(
        'data', 'cache', 'next_loc_{}_{}.model'.format(nni.get_experiment_id(),
                                                       nni.get_trial_id()))
    local_result_path = os.path.join('data', 'cache', 'next_loc_result.h5')
    top_n_list = [1, 2, 3, 4, 5, 10, 20]
Example 13
    def generate_parameters(self, parameter_id, **kwargs):
        """
        Returns a set of trial neural architecture, as a serializable object.

        Parameters
        ----------
        parameter_id : int
        """
        # If there is no history, the slave node will use the fake model.
        if not self.history:
            print("If there is no history, generate_parameters should not be called!")
            exit(1)
        total_start = time.time()
        rate = 1

        exp_dir = os.path.join(os.environ["HOME"], "mountdir", "nni",
                               "experiments", str(nni.get_experiment_id()))
        generate_time_path = os.path.join(exp_dir, "generate_time")
        train_time_path = os.path.join(exp_dir, "train_time")
        if os.path.exists(generate_time_path) and os.path.exists(train_time_path):
            with open(generate_time_path, "r") as f:
                generate_time = float(f.read())
            with open(train_time_path, "r") as f:
                train_time = float(f.read())
            if generate_time != 0 and train_time != 0:
                realrate = int(train_time / generate_time)
                if 1 < realrate < 5:
                    rate = realrate

        for i in range(rate):
            start = time.time()
            new_father_id = None
            generated_graph = None
            if not self.training_queue:
                new_father_id, generated_graph = self.generate()
                father_id, json_out, new_model_id = self.total_data[parameter_id]
                self.training_queue.append((generated_graph, new_father_id, new_model_id))
                # self.descriptors.append(generated_graph.extract_descriptor())
            else:
                print("training_queue should be an empty list.")
                exit(1)

            graph, father_id, model_id = self.training_queue.pop(0)
            # from graph to json
            json_model_path = os.path.join(self.path, str(model_id) + ".json")
            json_out = graph_to_json(graph, json_model_path)
            end = time.time()
            # self.total_data[parameter_id] = (json_out, father_id, model_id)
            json_and_id = ("json_out=" + str(json_out) + "+father_id=" + str(father_id) +
                           "+parameter_id=" + str(parameter_id) + "+history=True")
            lock.acquire()
            trial_dir = os.path.join(exp_dir, "trials", str(nni.get_trial_id()))
            with open(os.path.join(trial_dir, "output.log"), "a+") as f:
                f.write("single_generate=" + str(end - start) + "\n")

            with open(os.path.join(exp_dir, "graph.txt"), "a+") as f:
                f.write(json_and_id + "\n")
            lock.release()
        total_end = time.time()
        lock.acquire()
        with open(os.path.join(exp_dir, "trials", str(nni.get_trial_id()),
                               "output.log"), "a+") as f:
            f.write("total_generate=" + str(total_end - total_start) + "\n")
        lock.release()

        totime = abs(total_end - total_start)

        with open(os.path.join(exp_dir, "generate_time"), "w+") as f:
            gt = totime / rate
            f.write(str(gt))
Example 14
    def _start_mlflow_run(self, run_params: Dict[str, Any],
                          pipeline: Pipeline):
        """ Log basic informations to MLFlow about pipeline if this pipeline is tagged with 'train' (creates a new MLFLow experiment and/or run named after training pipeline if it doesn't exists yet)
        NOTE: If NNI is in dry run mode (mode used to generate NNI Classic NAS search space JSON file from a model which contains NNI NAS Mutables `LayerChoice` and/or `InputChoice`) we avoid creating any new MLFlow experiment/run nor logging anything else to mlflow during this dry run
        """
        node_tags = functools.reduce(set.union,
                                     [n.tags for n in pipeline.nodes])
        if not deepcv.meta.nni_tools.is_nni_gen_search_space_mode() and (
                'train' in run_params['tags'] or 'train' in node_tags):
            if mlflow.active_run() is None:
                # Create an MLFlow run in an experiment named after the pipeline involved in training, and log
                # various pipeline/dataset information to MLFlow. If we are running an NNI HP/NAS search, the
                # MLFlow experiment and run are named after the NNI experiment and trial IDs for better consistency.
                # TODO: find another way to name the experiment, as the pipeline name is only available when running
                # `kedro run --pipeline=<pipeline_name>` (e.g. a special tag on the node after which the experiment is named)

                if not deepcv.meta.nni_tools.is_nni_run_standalone():
                    # 'STANDALONE' is NNI's default experiment ID when the Python process wasn't started by NNI
                    nni_experiment = nni.get_experiment_id()
                    mlflow.set_experiment(nni_experiment)
                    mlflow.start_run(run_name=nni.get_trial_id())
                    # Flag indicating whether we are using NNI HP or Classic NAS API (Hyperparameter and/or Classic Neural Architecture search using NNI)
                    mlflow.set_tag('nni_standalone_mode', False)
                    mlflow.set_tag('nni_experiment_id', nni_experiment)
                    mlflow.set_tag('nni_trial_id', nni.get_trial_id())
                    mlflow.set_tag('nni_sequence_id', nni.get_sequence_id())
                else:
                    pipeline_name = run_params['pipeline_name'].lower() if run_params['pipeline_name'] else 'default'
                    mlflow.set_experiment(f'{self.project_ctx.project_name.lower()}_{pipeline_name}')
                    mlflow.start_run(run_name=f'{pipeline_name}_run_{run_params["run_id"]}')
                    mlflow.set_tag('nni_standalone_mode', True)

            # Log basic information about the Kedro training pipeline to MLFlow
            mlflow.set_tags({
                f'kedro_node_tag_{i}': tag
                for i, tag in enumerate(node_tags)
            })
            mlflow.log_params({n: v for n, v in run_params.items() if v})
            mlflow.log_param('pipeline.json', pipeline.to_json())
            mlflow.log_param('pipeline.describe', pipeline.describe())
            mlflow.log_param('pipeline.pipeline_datasets',
                             pipeline.data_sets())
            """ The following code creates special mlflow tags about current repository infos, which is not done by mlflow when starting an MLFlow run from code instead of from `mlflow run` command
            Code inspired from [`mlflow.projects._create_run`](https://www.mlflow.org/docs/latest/_modules/mlflow/projects.html) which doesn't seems to be called by `mlflow.start_run`
            """
            tags = {
                mlflow.utils.mlflow_tags.MLFLOW_SOURCE_NAME:
                self.project_ctx.package_name,
                mlflow.utils.mlflow_tags.MLFLOW_SOURCE_TYPE:
                mlflow.entities.SourceType.to_string(
                    mlflow.entities.SourceType.PROJECT),
                mlflow.utils.mlflow_tags.MLFLOW_PROJECT_ENTRY_POINT:
                inspect.getsourcefile(type(self.project_ctx))
            }
            try:
                repo = git.Repo(self.project_ctx.project_path,
                                search_parent_directories=True)
                git_repo_url = repo.remote().url if 'origin' in repo.remotes else (
                    repo.remotes[0].url if len(repo.remotes) > 0 else '')
                # Convert SSH git URL to HTTPS URL and drop any '.git' suffix
                # (str.rstrip('.git') would strip characters, not the suffix)
                git_repo_url = re.sub(r'git@([.\w]+):', r'https://\1/', git_repo_url)
                git_repo_url = re.sub(r'\.git$', '', git_repo_url)
                mlflow.log_param(
                    'commit_url',
                    git_repo_url + f'/commit/{repo.head.commit.hexsha}/')

                # We also set MLFLOW_SOURCE_NAME to the repo URL so that the MLFlow web UI can parse it and render commit and source hyperlinks (MLFlow only supports GitHub URLs for now)
                tags.update({
                    mlflow.utils.mlflow_tags.MLFLOW_SOURCE_NAME:
                    git_repo_url
                    if git_repo_url else self.project_ctx.project_name,
                    mlflow.utils.mlflow_tags.MLFLOW_GIT_BRANCH:
                    repo.active_branch.name,
                    mlflow.utils.mlflow_tags.MLFLOW_GIT_REPO_URL:
                    git_repo_url,
                    mlflow.utils.mlflow_tags.MLFLOW_GIT_COMMIT:
                    repo.head.commit.hexsha
                })

                # Change mlflow user to be git repository user instead of system user (if any git user is specified)
                git_config_reader = repo.config_reader()
                git_config_reader.read()
                user = git_config_reader.get_value('user',
                                                   'name',
                                                   default=None)
                email = git_config_reader.get_value('user',
                                                    'email',
                                                    default=None)
                if user or email:
                    tags[mlflow.utils.mlflow_tags.MLFLOW_USER] = (
                        str(user) + (f' <{email}>' if email else '')
                    ) if user else str(email)
            except (ImportError, OSError, ValueError, IOError, KeyError,
                    git.GitError, configparser.Error) as e:
                logging.warning(
                    f'Failed to import Git or to get repository information. Error: {e}')

            mlflow.set_tags(tags)
Example 15
def get_next_parameter(socket):
    """
    Get the hyper paremeters generated by tuner. For a multiphase experiment, it returns a new group of hyper
    parameters at each call of get_next_parameter. For a non-multiphase (multiPhase is not configured or set to False)
    experiment, it returns hyper parameters only on the first call for each trial job, it returns None since second call.
    This API should be called only once in each trial job of an experiment which is not specified as multiphase.

    Returns
    -------
    dict
        A dict object contains the hyper parameters generated by tuner, the keys of the dict are defined in
        search space. Returns None if no more hyper parameters can be generated by tuner.
    """
    global _params
    #_params = platform.get_next_parameter()
    # v1.1
    father_id = -1
    start = time.time()
    _params = platform.get_next_parameter()
    end = time.time()

    if _params is None:
        return None
    socket.send_pyobj({"type": "get_next_parameter"})
    message = socket.recv_pyobj()

    tuner = message["tuner"]

    if tuner.history:
        p0 = multiprocessing.Process(target=tuner.generate_parameters,
                                     args=(int(get_sequence_id()), ))
        p0.start()
        trial_concurrency = os.popen(
            'cat /etc/slurm-llnl/slurm.conf|grep NodeName|wc -l')
        trial_concurrency = int(trial_concurrency.read().strip())
        if get_sequence_id() < trial_concurrency:
            lock.acquire()
            with open(
                    os.environ["HOME"] + "/mountdir/nni/experiments/" +
                    str(nni.get_experiment_id()) + "/graph.txt", "a+") as f:
                json_and_id = 'json_out=' + str(
                    _params['parameters']) + '+history' + "=False or True?"
                f.write(json_and_id + "\n")
            lock.release()
    else:
        socket.send_pyobj({"type": "generated_parameter"})
        message = socket.recv_pyobj()
        lock.acquire()
        with open(
                os.environ["HOME"] + "/mountdir/nni/experiments/" +
                str(nni.get_experiment_id()) + "/trials/" +
                str(nni.get_trial_id()) + "/output.log", "a+") as f:
            f.write(" generate=" + str(end - start) + "\n")

        with open(
                os.environ["HOME"] + "/mountdir/nni/experiments/" +
                str(nni.get_experiment_id()) + "/graph.txt", "a+") as f:
            json_and_id = 'json_out=' + str(
                _params['parameters']) + '+history' + "=False"
            f.write(json_and_id + "\n")
        lock.release()
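
For contrast with the modified version above, a minimal sketch of the standard nni.get_next_parameter() usage inside a trial script; train_and_eval is a hypothetical training routine, not part of the original code:

import nni

def run_trial():
    # Defaults, overridden by whatever the tuner samples from the search space.
    params = {'lr': 0.01, 'batch_size': 32}
    params.update(nni.get_next_parameter() or {})  # call once per (non-multiphase) trial
    accuracy = train_and_eval(params)  # hypothetical training routine
    nni.report_final_result(accuracy)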
Example 16
def main(args):
    use_cuda = not args['no_cuda'] and torch.cuda.is_available()

    torch.manual_seed(args['seed'])

    device = torch.device("cuda" if use_cuda else "cpu")

    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}

    data_dir = os.path.join(args['data_dir'], nni.get_trial_id())

    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST(data_dir, train=True, download=True,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=args['batch_size'], shuffle=True, **kwargs)
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST(data_dir, train=False, transform=transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.1307,), (0.3081,))
        ])),
        batch_size=1000, shuffle=True, **kwargs)

    hidden_size = args['hidden_size']

    model = Net(hidden_size=hidden_size).to(device)

    save_checkpoint_dir = args['save_checkpoint_dir']
    save_checkpoint_path = os.path.join(save_checkpoint_dir, 'model.pth')
    load_checkpoint_path = os.path.join(args['load_checkpoint_dir'], 'model.pth')

    if os.path.isfile(load_checkpoint_path):
        model_state_dict = load_checkpoint(load_checkpoint_path)
        logger.info("test : " + load_checkpoint_path)
        logger.info(type(model_state_dict))
        model.load_state_dict(model_state_dict)

    optimizer = optim.SGD(model.parameters(), lr=args['lr'],
                          momentum=args['momentum'])

    # epoch is the perturbation interval
    for epoch in range(1, args['epochs'] + 1):
        train(args, model, device, train_loader, optimizer, epoch)
        test_acc = test(args, model, device, test_loader)

        if epoch < args['epochs']:
            # report intermediate result
            nni.report_intermediate_result(test_acc)
            logger.debug('test accuracy %g', test_acc)
            logger.debug('Pipe send intermediate result done.')
        else:
            # report final result
            nni.report_final_result(test_acc)
            logger.debug('Final result is %g', test_acc)
            logger.debug('Send final result done.')

    if not os.path.exists(save_checkpoint_dir):
        os.makedirs(save_checkpoint_dir)
    save_checkpoint(model, save_checkpoint_path)
Example 17
def estimate(esargs):
    global best_acc
    global trainloader
    global testloader
    global net
    global criterion
    global optimizer
    global rank
    # Reset the early-stopping object
    early_stop = utils.EarlyStopping(mode="max")
    best_acc = 0
    lr_explore = esargs['learning_rate']
    bs_explore = int(esargs['batch_size'])
    transform_train, transform_test = utils.data_transforms_cifar10(args)
    trainset = torchvision.datasets.CIFAR10(root="/root/mountdir/data/",
                                            train=True,
                                            download=True,
                                            transform=transform_train)
    trainsampler = DistributedSampler(trainset)
    trainloader = torch.utils.data.DataLoader(trainset,
                                              batch_size=bs_explore,
                                              shuffle=False,
                                              num_workers=args.workers,
                                              pin_memory=False,
                                              sampler=trainsampler)

    op = optim.SGD(net.parameters(),
                   lr=lr_explore,
                   momentum=0.9,
                   weight_decay=5e-4)

    for ep in range(args.epochs):
        current_ep = ep + 1
        if rank == 0:
            if os.popen("grep epoch " + experiment_path + "/trials/" +
                        str(nni.get_trial_id()) + "/output.log").read():
                os.system("sed -i '/^epoch/cepoch=" + str(ep + 1) + "' " +
                          experiment_path + "/trials/" +
                          str(nni.get_trial_id()) + "/output.log")
            else:
                os.system("sed -i '$a\\epoch=" + str(ep + 1) + "' " +
                          experiment_path + "/trials/" +
                          str(nni.get_trial_id()) + "/output.log")
        try:
            train_acc = train(ep, op)
        except Exception as exception:
            f11 = open('/root/log', 'a+')
            f11.write('###### training is error \n')
            f11.write(str(exception) + "\n")
            f11.close()
            acclist.append(0)
            return 0, current_ep
        test_acc, best_acc = test(ep)
        logger.debug(test_acc)
        if early_stop.step(test_acc):
            break
    res = [best_acc, bs_explore, str(lr_explore)[0:7]]
    reslist.append(res)
    acclist.append(best_acc)
    return best_acc, current_ep
Example 18
def train_search(config,
                 params=None,
                 warm_start_NN=None,
                 restore_old_checkpoint=False,
                 workers=1,
                 verbosity=0):
    """
    train_search is practically the same as the train function from training_torch, just made for NNI experiments

    :param config:
    :param params:
    :param warm_start_NN:
    :param restore_old_checkpoint:
    :param workers:
    :param verbosity:
    :return:
    """
    if verbosity == 0:
        logger.setLevel(logging.INFO)
    if verbosity >= 1:
        logger.setLevel(logging.DEBUG)
    start = time.time()

    logger.info('Preparing Datasets')

    train_dataset, validation_dataset = prepare_dataset_torch(config)
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=params['batch_size'],
                                               shuffle=True)
    test_loader = torch.utils.data.DataLoader(validation_dataset,
                                              batch_size=params['batch_size'],
                                              shuffle=True)

    logger.info('Initializing Torch Network')

    net = map_model(config, params)

    logger.info('Optimizer Initialize')
    optimizer = map_optimizer(params['optimizer'], net.parameters(),
                              params['learning_rate'])
    loss_func = map_loss_func(params['loss'])
    criterion = torch.nn.MSELoss()

    if config['scheduler']:
        scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer, milestones=config['scheduler_milestones'], gamma=0.1)
    else:
        scheduler = None

    epochs = config['epochs']

    # Track the losses to determine early stopping
    avg_train_loss = []
    avg_valid_loss = []

    # initialize the early_stopping object
    early_stopping = EarlyStopping(verbose=True, trace_func=logger.info)

    logger.info('Start Training!')
    for epoch in range(epochs):

        train_loss, validation_loss, RMSE = train_epoch(
            net,
            optimizer,
            loss_func,
            train_loader=train_loader,
            test_loader=test_loader,
            scheduler=scheduler,
            criterion=criterion)

        nni.report_intermediate_result(-math.log10(RMSE))
        if early_stopping is not None:
            early_stopping(validation_loss, net, RMSE)
            RMSE = early_stopping.RMSE

        avg_train_loss.append(train_loss)
        avg_valid_loss.append(validation_loss)

        logger.info(
            'Epoch {}; Train Loss: {:.5}; Valid Loss: {:.5}; Best Validation RMSE: {:.5}'
            .format(epoch, train_loss, validation_loss, RMSE))
        print(
            'Epoch {}; Train Loss: {:.5}; Valid Loss: {:.5}; Validation RMSE: {:.5}'
            .format(epoch, train_loss, validation_loss, RMSE))
        if early_stopping.early_stop:
            logger.info('Early Stopping')
            RMSE = early_stopping.RMSE
            break

    nni.report_final_result(-math.log10(RMSE))
    end = time.time()
    logger.info(
        'Training Completed: Time elapsed: {:.2f} seconds'.format(end - start))
    plot_against_scaling(net,
                         validation_dataset,
                         criterion,
                         trial_id=str(nni.get_trial_id()),
                         exp_id=str(nni.get_experiment_id()))
Example 19
def prepare_hyper_search(cfg_kwargs: dict,
                         reporthook=None, final_reporthook=None,
                         primary_key=None, max_key=True, reporter_cls=None, with_keys: (list, str, None) = None,
                         final_keys: (list, str, None) = None,
                         dump=False, disable=False):
    """
    Updated in v1.3.18

    从 nni package 中获取超参,更新配置文件参数。当 nni 不可用或不是 nni 搜索模式时,参数将不会改变。

    .. code-block :: python

        cfg_kwargs, reporthook, final_reporthook, tag = prepare_hyper_search(
            cfg_kwargs, Configuration, reporthook, final_reporthook, primary_key="macro_avg:f1"
        )

        _cfg = Configuration(**cfg_kwargs)
        model = Model(_cfg)
        ...

        for epoch in range(_cfg.begin_epoch, _cfg.end_epoch):
            for batch_data in dataset:
                train_model(batch_data)

            data = evaluate_model()
            reporthook(data)

        final_reporthook()

    Parameters
    ----------
    cfg_kwargs: dict
        待传入cfg的参数
    reporthook
    final_reporthook
    primary_key:
        评估模型用的主键,
        ``nni.report_intermediate_result`` 和 ``nni.report_final_result`` 中  ``metric`` 的 ``default``
    max_key: bool
        主键是越大越好
    reporter_cls
    with_keys: list or str
        其它要存储的 metric,final report时默认为 primary_key 最优时指标
    final_keys: list or str
        with_keys 中使用最后一个 report result 而不是 primary_key 最优时指标
    dump: bool
        为 True 时,会修改 配置文件 中 workspace 参数为 ``workspace/nni.get_experiment_id()/nni.get_trial_id()``
        使得 nni 的中间结果会被存储下来。
    disable

    Returns
    -------
    cfg_kwargs: dict
        插入了nni超参后的配置文件参数
    reporthook: function
        每个iteration结束后的回调函数,用来报告中间结果。
        默认 ``nni.report_intermediate_result``。
    final_reporthook:
        所有iteration结束后的回调函数,用来报告最终结果。
        默认 ``nni.report_final_result``
    dump: bool
        和传入参数保持一致

    Examples
    --------
    .. code-block :: python

        class CFG(Configuration):
            hyper_params = {"hidden_num": 100}
            learning_rate = 0.001
            workspace = ""

        cfg_kwargs, reporthook, final_reporthook, dump = prepare_hyper_search(
            {"learning_rate": 0.1}, CFG, primary_key="macro_avg:f1", with_keys="accuracy"
        )
        # cfg_kwargs: {'learning_rate': 0.1}

    when nni start (e.g., using ``nni create --config _config.yml``),
    suppose in ``_config.yml``:

    .. code-block: yml

        searchSpacePath: _search_space.json

    and in ``_search_space.json``

    .. code-block :: json

        {
            "hidden_num": {"_type": "choice", "_value": [500, 600, 700, 835, 900]},
        }

    one of the return cfg_kwargs is ``{'hyper_params': {'hidden_num': 50}, 'learning_rate': 0.1}``
    """
    if disable:
        return cfg_kwargs, None, None, None
    try:
        import nni
        from nni import get_next_parameter, report_intermediate_result, report_final_result

        assert primary_key is not None

        def _as_key_list(_keys: (list, str, None)):
            if isinstance(_keys, str):
                if ";" in _keys:
                    _keys = _keys.split(";")
                else:
                    _keys = [_keys]
            elif isinstance(_keys, list):
                pass
            elif _keys is None:
                _keys = []
            return _keys

        with_keys = _as_key_list(with_keys)
        final_keys = _as_key_list(final_keys)

        class Reporter(BaseReporter):
            def __init__(self):
                self.datas = []

            def intermediate(self, data):
                feed_dict = {
                    'default': float(get_by_key(data, key_parser(primary_key))),
                    primary_key: get_by_key(data, key_parser(primary_key))
                }
                for key in with_keys:
                    feed_dict[key] = get_by_key(data, key_parser(key))
                report_intermediate_result(feed_dict)
                self.datas.append(data)

            def final(self):
                best_fn = get_min if max_key is False else get_max
                _with_keys = (with_keys if with_keys else []) + [primary_key]
                _final_keys = set(final_keys if final_keys else [])
                final_result = best_fn(
                    self.datas, primary_key, with_keys=";".join(_with_keys), merge=False
                )
                feed_dict = {
                    'default': float(final_result[0][primary_key])
                }
                appendix_dict = dict(final_result[1][primary_key])
                for key in _with_keys:
                    if key in _final_keys:
                        feed_dict[key] = get_by_key(self.datas[-1], key_parser(key))
                    else:
                        feed_dict[key] = appendix_dict[key]
                report_final_result(feed_dict)

        rc = Reporter() if reporter_cls is None else reporter_cls
        reporthook = reporthook if reporthook is not None else rc.intermediate
        final_reporthook = final_reporthook if final_reporthook is not None else rc.final
        cfg_cls_params = get_params(get_next_parameter())
        using_nni_tag = bool(cfg_cls_params)
        nested_update(cfg_kwargs, cfg_cls_params)
        if using_nni_tag and dump:  # pragma: no cover
            cfg_kwargs["workspace"] = cfg_kwargs.get("workspace", "") + path_append(
                nni.get_experiment_id(), nni.get_trial_id(), to_str=True
            )
        return cfg_kwargs, reporthook, final_reporthook, dump

    except ModuleNotFoundError:  # pragma: no cover
        warnings.warn("nni package not found, skip")
        return cfg_kwargs, reporthook, final_reporthook, dump
Example 20
def train_eval(esargs):
    """ train and eval the model
    """

    global trainloader
    global testloader
    global net
    global best_acc
    global rank

    best_acc = 0
    lr_explore = esargs['learning_rate']
    bs_explore = int(esargs['batch_size'])
    if args.optimizer == "SGD":
        optimizer = SGD(lr=lr_explore, momentum=0, decay=args.weight_decay)
    elif args.optimizer == "Adadelta":
        optimizer = Adadelta(lr=lr_explore, decay=args.weight_decay)
    elif args.optimizer == "Adagrad":
        optimizer = Adagrad(lr=lr_explore, decay=args.weight_decay)
    elif args.optimizer == "Adam":
        optimizer = Adam(lr=lr_explore, decay=args.weight_decay)
    elif args.optimizer == "Adamax":
        optimizer = Adamax(lr=lr_explore, decay=args.weight_decay)
    elif args.optimizer == "RMSprop":
        optimizer = RMSprop(lr=lr_explore, decay=args.weight_decay)
    else:
        logger.debug("Unknown optimizer: %s", args.optimizer)

    # Compile the model
    net.compile(loss="categorical_crossentropy",
                optimizer=optimizer,
                metrics=["accuracy"])

    (x_train, y_train) = trainloader
    (x_test, y_test) = testloader

    # train procedure
    # Use a callback to record per-epoch information
    trial_id = nni.get_trial_id()
    f11 = open("/root/keras_trace" + str(rank), "a+")
    f11.write("rank-" + str(rank) + str(trial_id) + "\n")
    f11.close()
    available_devices = os.environ["CUDA_VISIBLE_DEVICES"]
    gpus = len(available_devices.split(","))
    # NOTE: should print the GPU count to verify it
    history = net.fit(
        x=x_train,
        y=y_train,
        batch_size=bs_explore * gpus,
        validation_data=(x_test, y_test),
        epochs=args.epochs,
        shuffle=True,
        callbacks=[
            SendMetrics(),
            Epoch_num_record(experiment_path, trial_id),
            EarlyStopping(min_delta=0.001, patience=10),
            TensorBoard(log_dir=TENSORBOARD_DIR),
        ],
    )

    # trial report final acc to tuner
    if rank == 0:
        _, acc = net.evaluate(x_test, y_test)
        # Record the best accuracy produced during the hyperparameter search
        f11 = open("/root/log", "a+")
        f11.write("######acc:" + str(acc) + "\n")
        f11.close()
        if acc > best_acc:
            best_acc = acc

        logger.debug("Final result is: %.3f", acc)
        res = [best_acc, bs_explore, str(lr_explore)[0:7]]
        reslist.append(res)
        acclist.append(best_acc)
    return best_acc, history.epoch[-1]
Example 21
    # gpuid=get_gpuid()
    # client_send(gpuid, 0)
    # print('gpuid:',gpuid)
    # os.environ["CUDA_VISIBLE_DEVICES"]="{}".format(gpuid)
    # print(os.environ["CUDA_VISIBLE_DEVICES"])
    # print(torch.cuda.device_count())
    
    args = get_args()
    tuner_params = nni.get_next_parameter()
    config = args.__dict__
    config.update(tuner_params)
    SetSeed(args.seed)
    # config['device']='cuda:{}'.format(gpuid)

    time_stamp = datetime.datetime.now()+datetime.timedelta(hours=8)
    config['ex_name'] = time_stamp.strftime('%Y.%m.%d-%H:%M:%S')+nni.get_trial_id()


    for handler in logging.root.handlers[:]:
        logging.root.removeHandler(handler)
    
    res= Path('/usr/data/gzy/GraphPool/ex_cmp/results')/args.dataset/'{}'.format(config['ex_name'])
    print(res)
    res.mkdir(parents=True, exist_ok=True)
    writer = SummaryWriter(res/'log')

    sv_param = os.path.join(res, 'model_param.json')
    with open(sv_param, 'w') as file_obj:
        json.dump(args.__dict__, file_obj)

    logging.basicConfig(level=logging.INFO,  # log level for console output
Example 22
def is_nni_run_standalone() -> bool:
    """ Simple helper function which returns whether NNI is in standalone trial run mode """
    return nni.get_experiment_id() == r'STANDALONE' and nni.get_trial_id() == r'STANDALONE' and nni.get_sequence_id() == 0
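
A minimal sketch of how this helper can be used to keep a trial script runnable outside of an NNI experiment (an assumed pattern, consistent with the guarded reporting in the other examples):

import nni

def maybe_report_final(metric: float) -> None:
    # Only report to the tuner when the script was actually launched by NNI,
    # so the same script can also be run and debugged as plain Python.
    if not is_nni_run_standalone():
        nni.report_final_result(metric)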
Example 23
        # params["gs_research_workflow.time_series.gs_steps.model_steps:FitStep > steps_per_epoch "] = 1
        # params["gs_research_workflow.time_series.gs_steps.model_steps:FitStep > validation_steps "] = 1
        # params["gs_research_workflow.time_series.models.inception_time:InceptionTime.HP > depth"] = 5
        # params["gs_research_workflow.time_series.models.inception_time:InceptionTime.HP > use_residual"] = True
        if cfg_alias_cls:
            params = {
                cfg_alias_cls.get_cfg_loc(k): v
                for k, v in params.items()
            }
        trial_uuid = generate_uuid()
        experiment_id = generate_uuid()
    else:
        os.environ[ENV_KEY_TRIAL_IN_NNI] = "1"
        params = nni.get_next_parameter()
        experiment_id = nni.get_experiment_id()
        trial_uuid = nni.get_trial_id()
        if cfg_alias_cls:
            params = {
                cfg_alias_cls.get_cfg_loc(k): v
                for k, v in params.items()
            }
    # Unescape each item
    params = {k: unescape_nni_choice_item(v) for k, v in params.items()}

    yml_path = os.path.join(os.path.dirname(__file__), "../../..", args.cfg)
    if not os.path.isfile(yml_path):
        logger.error(f"Default cfg file {yml_path} does not exist!")
        sys.exit(1)

    trial_task = HPOTrialPodSideEnv(args.name, yml_path, params, trial_uuid,
                                    experiment_id)
Example 24
    # client_send(gpuid, 0)
    # print('gpuid:',gpuid)
    # os.environ["CUDA_VISIBLE_DEVICES"]="{}".format(gpuid)
    # print(os.environ["CUDA_VISIBLE_DEVICES"])
    # print(torch.cuda.device_count())

    args = get_args()
    tuner_params = nni.get_next_parameter()
    config = args.__dict__
    config.update(tuner_params)
    SetSeed(args.seed)
    # config['device']='cuda:{}'.format(gpuid)

    time_stamp = datetime.datetime.now() + datetime.timedelta(hours=8)
    config['ex_name'] = time_stamp.strftime(
        '%Y.%m.%d-%H:%M:%S') + nni.get_trial_id()

    for handler in logging.root.handlers[:]:
        logging.root.removeHandler(handler)

    res = Path('/usr/data/gzy/GraphPool/ex_cmp/results'
               ) / args.dataset / '{}'.format(config['ex_name'])
    print(res)
    res.mkdir(parents=True, exist_ok=True)
    writer = SummaryWriter(res / 'log')

    sv_param = os.path.join(res, 'model_param.json')
    with open(sv_param, 'w') as file_obj:
        json.dump(args.__dict__, file_obj)

    logging.basicConfig(