Beispiel #1
0
def run_experiment(dataset,
                   INPUT_MEAN,
                   INPUT_STD,
                   SOURCE_TYPES,
                   VALIDATION_SOURCE_TYPES,
                   DOWNSAMPLE_FACTOR,
                   SEQ_LENGTH,
                   TARGET_SEQ_LENGTH,
                   MAX_TARGET_POWER,
                   TARGET_APPLIANCE,
                   TRAINING_SEED,
                   VERBOSE_TRAINING,
                   LEARNING_RATE,
                   NUM_SEQ_PER_BATCH,
                   EPOCHS,
                   STEPS_PER_EPOCH,
                   USE_CUDA,
                   CHECKPOINT_BEST_MSE,
                   CHECKPOINTING_EVERY_N_EPOCHS,
                   TEST_DISAGGREGATE_EVERY_N_EPOCHS,
                   _run):
    """Train ``_Net`` on batches from a background data process and log the results.

    Each epoch runs ``STEPS_PER_EPOCH`` optimisation steps (MSE on the network
    output plus BCE-with-logits on the on/off logit), evaluates every validation
    fold, writes scalars to a ``SummaryWriter``, and optionally saves checkpoints
    and runs the test disaggregator. A final checkpoint and ``history.csv`` are
    written to the output folder at the end.

    Args:
        dataset: mapping providing ``"TRAIN_BUILDINGS"`` and ``"ON_POWER_THRESHOLD"``.
        INPUT_MEAN: value passed as ``Add(-INPUT_MEAN)`` to input and target processing.
        INPUT_STD: value passed as ``DivideBy(INPUT_STD)`` to input and target processing.
        SOURCE_TYPES: training source names handed to ``get_sources``.
        VALIDATION_SOURCE_TYPES: validation source names handed to ``get_sources``.
        DOWNSAMPLE_FACTOR: when > 1 a ``DownSample`` step is prepended to the input
            processing of both pipelines; also forwarded to the ``Disaggregator``.
        SEQ_LENGTH: input sequence length of the network.
        TARGET_SEQ_LENGTH: output sequence length of the network; the target is
            cropped by ``(SEQ_LENGTH - TARGET_SEQ_LENGTH) // 2`` on both sides.
        MAX_TARGET_POWER: forwarded to ``MetricsAccumulator`` and ``Disaggregator``.
        TARGET_APPLIANCE: appliance name used for logging and the ``Disaggregator``.
        TRAINING_SEED: seed for torch, the sources and the down-sampling RNG.
        VERBOSE_TRAINING: truthy enables tqdm progress bars; forced to 0 when a
            ``FileStorageObserver`` is attached to ``_run``.
        LEARNING_RATE: learning rate of the Adam optimizer.
        NUM_SEQ_PER_BATCH: number of sequences per batch.
        EPOCHS: number of epochs to train.
        STEPS_PER_EPOCH: number of optimisation steps per epoch.
        USE_CUDA: move network, criterion and tensors to the GPU when truthy.
        CHECKPOINT_BEST_MSE: when truthy, save ``net_best_mse.pth.tar`` whenever the
            validation loss drops below the best value seen so far.
        CHECKPOINTING_EVERY_N_EPOCHS: save the state dict every N epochs; ``None`` disables.
        TEST_DISAGGREGATE_EVERY_N_EPOCHS: run the test disaggregator every N epochs;
            ``None`` disables.
        _run: run object exposing ``observers`` and ``_id``
            (presumably a Sacred run -- confirm against the caller).

    Returns:
        ``-1`` if ``history.csv`` already exists in the output folder (nothing is
        trained), otherwise the mean training loss of the last epoch.
    """

    torch.manual_seed(TRAINING_SEED)

    OUTPUT_FOLDER = os.path.join(ex.get_experiment_info()['name'],"output")
    for observer in _run.observers:
        if isinstance(observer, FileStorageObserver):
            # Write next to the observer's own files and silence the progress bars.
            OUTPUT_FOLDER = os.path.join(observer.basedir, str(_run._id))
            VERBOSE_TRAINING = 0
    os.makedirs(OUTPUT_FOLDER, exist_ok=True)

    writer = SummaryWriter(log_dir=OUTPUT_FOLDER)

    # From dataset Ingredient
    TRAIN_BUILDINGS = dataset["TRAIN_BUILDINGS"]  # not used below
    ON_POWER_THRESHOLD = dataset["ON_POWER_THRESHOLD"]

    # Power level above which a ground-truth sample counts as "on" for the
    # state-classification loss.
    # NOTE(review): hard-coded and independent of ON_POWER_THRESHOLD -- confirm intended.
    ON_STATE_CLASS_THRESHOLD = 15.0

    ##############################################################################################
    #PREPARE DATASET (DATALOADERs)
    ##############################################################################################
    running_data_processes = [] # stop these at the end
    sources, validation_sources = get_sources(
        training_source_names=SOURCE_TYPES,
        validation_source_names=VALIDATION_SOURCE_TYPES,
        seq_length=SEQ_LENGTH,
        sources_seed=TRAINING_SEED,
        validation_stride=128 )

    # Crop the target symmetrically so it matches the (shorter) network output.
    # NOTE(review): offset is 0 when SEQ_LENGTH == TARGET_SEQ_LENGTH, giving
    # SubSequence(0, 0) -- confirm SubSequence handles that case.
    offset = (SEQ_LENGTH - TARGET_SEQ_LENGTH) // 2
    groundtruth_processing = SubSequence(offset,-offset)
    input_processing_steps = [Add(-INPUT_MEAN), DivideBy(INPUT_STD), Transpose((0, 2, 1))]
    target_processing_steps = [groundtruth_processing, Add(-INPUT_MEAN), DivideBy(INPUT_STD), Transpose((0, 2, 1))]

    if DOWNSAMPLE_FACTOR > 1:
        downsample_rng = np.random.RandomState(TRAINING_SEED)
        input_processing_steps_training = [DownSample(DOWNSAMPLE_FACTOR, downsample_rng)] + input_processing_steps
    else:
        input_processing_steps_training = input_processing_steps

    validation_pipeline = DataPipeline(
        sources=validation_sources,
        num_seq_per_batch=NUM_SEQ_PER_BATCH,
        input_processing=input_processing_steps_training,
        target_processing=target_processing_steps
    )
    validation_batches = get_validation_batches(validation_pipeline)
    print("appliance {} has {} validation batches".format(
        TARGET_APPLIANCE,
        sum([len(v) for k, v in validation_batches.items()]) ))

    data_pipeline = DataPipeline(
        sources=sources,
        num_seq_per_batch=NUM_SEQ_PER_BATCH,
        input_processing=input_processing_steps_training,
        target_processing=target_processing_steps
    )
    data_thread = DataProcess(data_pipeline)
    data_thread.start()
    running_data_processes.append(data_thread)

    net = _Net(SEQ_LENGTH, TARGET_SEQ_LENGTH)
    print(net)

    metrics_accu = MetricsAccumulator(
        on_power_threshold=ON_POWER_THRESHOLD,
        max_power=MAX_TARGET_POWER)

    # note: MSE - Mean Squared Error
    criterion = torch.nn.MSELoss()
    state_criterion = torch.nn.BCEWithLogitsLoss()

    stop_training = False
    best_mse = None

    # PREPARE DISAGGREGATOR
    if TEST_DISAGGREGATE_EVERY_N_EPOCHS is not None:
        test_disaggregator = Disaggregator(
            EVALUATION_DATA_PATH='input/evaluation_data_48h',
            TARGET_APPLIANCE = TARGET_APPLIANCE,
            ON_POWER_THRESHOLD = ON_POWER_THRESHOLD,
            MAX_TARGET_POWER = MAX_TARGET_POWER,
            pad_mains = True,
            pad_appliance = False,
            disagg_func = disag_seq2seq_sgn,
            downsample_factor = DOWNSAMPLE_FACTOR,
            disagg_kwargs = dict(
                model = net,
                input_processing=input_processing_steps,
                target_processing=target_processing_steps,
                n_seq_per_batch = NUM_SEQ_PER_BATCH,
                seq_length = SEQ_LENGTH,
                target_seq_length = TARGET_SEQ_LENGTH,
                USE_CUDA=USE_CUDA,
                stride = 1
            )
        )

    # PREPARE TENSORS, WHICH WILL BE USED DURING TRAINING AND VALIDATION
    # (reusable buffers; every batch is resized/copied into them before the forward pass)
    input_buf = torch.FloatTensor(NUM_SEQ_PER_BATCH, 1, SEQ_LENGTH)
    target = torch.FloatTensor(NUM_SEQ_PER_BATCH, 1, TARGET_SEQ_LENGTH)
    target_class = torch.FloatTensor(NUM_SEQ_PER_BATCH, 1, TARGET_SEQ_LENGTH)

    if USE_CUDA:
        # note: push to GPU
        net.cuda()
        criterion.cuda()
        input_buf, target = input_buf.cuda(), target.cuda()
        target_class = target_class.cuda()

    # setup optimizer.  TODO: Should we use 'Adam' for disaggregator?
    optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE, betas=(0.9, 0.999))
    #optimizer = optim.SGD(net.parameters(), momentum=0.9, nesterov=True, lr=LEARNING_RATE)
    scheduler = lr_scheduler.MultiStepLR(optimizer, milestones=[50,75], gamma=0.1)

    history = {}
    csvpath = os.path.join(OUTPUT_FOLDER, "history.csv")
    if os.path.exists(csvpath):
        print("Already exists: {}".format(csvpath))
        # The data process and the writer are already running at this point;
        # release them so the early exit does not leak them.
        for p in running_data_processes:
            p.stop()
        writer.close()
        return -1

    progbar_epoch = tqdm(desc="Epoch", total=EPOCHS, unit="epoch", disable=(not VERBOSE_TRAINING))
    for epoch in range(EPOCHS):
        # TRAINING
        metrics_log = {'training':{}}
        training_loss = 0.0
        progbar = tqdm(desc="Train", total=STEPS_PER_EPOCH, leave=False, disable=(not VERBOSE_TRAINING))
        for i in range(STEPS_PER_EPOCH):
            net.zero_grad()
            batch = data_thread.get_batch()
            while batch is None:
                # Poll until the background process delivers a batch.
                batch = data_thread.get_batch()
            qsize = data_thread._queue.qsize()

            aggregated_signal = torch.from_numpy(batch.after_processing.input)
            target_signal = torch.from_numpy(batch.after_processing.target)
            # Binary on/off labels derived from the unnormalised, cropped ground truth.
            target_class_np = np.float32(groundtruth_processing(batch.before_processing.target) > ON_STATE_CLASS_THRESHOLD)
            target_class_t = torch.from_numpy(target_class_np.transpose((0, 2, 1)))
            if USE_CUDA:
                aggregated_signal = aggregated_signal.cuda()
                target_signal = target_signal.cuda()
                target_class_t = target_class_t.cuda()
            input_buf.resize_as_(aggregated_signal).copy_(aggregated_signal)
            target.resize_as_(target_signal).copy_(target_signal)
            target_class.resize_as_(target_class_t).copy_(target_class_t)
            output, on_logit = net(input_buf)
            loss = criterion(output, target) + state_criterion(on_logit, target_class)
            loss.backward()
            optimizer.step()
            training_loss += loss.item()

            progbar.set_postfix(dict(
                    loss = "{:.4f}".format(loss.item()),
                    qsize = qsize
                ), refresh=False)
            progbar.update()

        metrics_log['training']['loss'] = float(training_loss/STEPS_PER_EPOCH)
        metrics_log['training']['lr'] = optimizer.param_groups[0]['lr']

        # VALIDATION
        #pr_num_thresholds = 127
        for fold in validation_batches:
            metrics_accu.reset_accumulator()
            #accumulated_pr = {}
            #for cl in ["tp", "tn", "fp", "fn"]:
            #    accumulated_pr[cl] = torch.LongTensor(pr_num_thresholds).zero_()
            for batch in validation_batches[fold]:
                aggregated_signal = torch.from_numpy(batch.after_processing.input)
                target_signal = torch.from_numpy(batch.after_processing.target)
                target_class_np = np.float32(groundtruth_processing(batch.before_processing.target) > ON_STATE_CLASS_THRESHOLD)
                target_class_t = torch.from_numpy(target_class_np.transpose((0, 2, 1)))
                if USE_CUDA:
                    aggregated_signal = aggregated_signal.cuda()
                    target_signal = target_signal.cuda()
                    target_class_t = target_class_t.cuda()
                # Refresh the buffers on every device. These copies used to sit
                # inside the CUDA branch, so CPU runs validated on the last
                # training batch instead of the validation batch.
                input_buf.resize_as_(aggregated_signal).copy_(aggregated_signal)
                target.resize_as_(target_signal).copy_(target_signal)
                target_class.resize_as_(target_class_t).copy_(target_class_t)
                with torch.no_grad():
                    output, on_logit = net(input_buf)
                    val_loss = criterion(output, target) + state_criterion(on_logit, target_class)
                    loss_value = val_loss.item()
                # other metrics
                pred_y = data_pipeline.apply_inverse_processing(output.cpu().data.numpy(), 'target')
                true_y = groundtruth_processing(batch.before_processing.target)
                metrics_accu.accumulate_metrics(true_y, pred_y, val_loss=loss_value)
                #calculate_pr_curve_torch(accumulated_pr, MAX_TARGET_POWER, true_y, pred_y, num_thresholds=pr_num_thresholds)

            # fold is indexed as a pair; results are stored as
            # metrics_log[fold[0]][metric][fold[1]].
            for key, value in metrics_accu.finalize_metrics().items():
                metrics_log.setdefault(fold[0], {}).setdefault(key, {})[fold[1]] = value

            #precision = accumulated_pr["tp"] / (accumulated_pr["tp"] + accumulated_pr["fp"])
            #recall = accumulated_pr["tp"] / (accumulated_pr["tp"] + accumulated_pr["fn"])
            #writer.add_pr_curve_raw("pr_{}/{}".format(fold[0], fold[1]),
            #    true_positive_counts=accumulated_pr["tp"],
            #    false_positive_counts=accumulated_pr["fp"],
            #    true_negative_counts=accumulated_pr["tn"],
            #    false_negative_counts=accumulated_pr["fn"],
            #    precision=precision, recall=recall,
            #    global_step=(epoch+1)*STEPS_PER_EPOCH, num_thresholds=pr_num_thresholds)

        # LR Scheduler
        # Requires a validation fold keyed ('unseen_activations', 'rss'); its
        # val_loss also drives best-model checkpointing below.
        val_loss = metrics_log['unseen_activations']['val_loss']['rss']
        #val_loss = metrics_log['mean_squared_error']['unseen_activations']['rss']
        #scheduler.step(val_loss)
        scheduler.step()

        # PRINT STATS
        if not VERBOSE_TRAINING:
            print('[{:d}/{:d}] {}'.format(epoch+1, EPOCHS, metrics_log['training']))
        else:
            progbar_epoch.set_postfix(dict(loss=metrics_log['training']['loss']), refresh=False)

        progbar_epoch.update()
        progbar.close()

        # store in history / tensorboard
        for fold, metrics_for_fold in metrics_log.items():
            for metric_name, value in metrics_for_fold.items():
                if isinstance(value, dict):
                    SW_add_scalars2(writer, "{}/{}".format(fold, metric_name), value, (epoch+1)*STEPS_PER_EPOCH)
                    for k, v in value.items():
                        name = "{}/{}/{}".format(fold, metric_name, k)
                        history.setdefault(name, []).append(v)
                else:
                    name = "{}/{}".format(fold, metric_name)
                    writer.add_scalar(name, value, (epoch+1)*STEPS_PER_EPOCH)
                    history.setdefault(name, []).append(value)

        # CHECKPOINTING
        if CHECKPOINT_BEST_MSE:
            mse = val_loss
            if best_mse is None:
                # The first epoch only sets the baseline; nothing is saved for it.
                best_mse = mse
            if best_mse > mse:
                msg = "[{:d}/{:d}] MSE improved from {:.4f} to {:.4f} (d={:f}), saving model...".format(epoch+1, EPOCHS, best_mse, mse, best_mse-mse)
                if not VERBOSE_TRAINING:
                    print(msg)
                else:
                    progbar_epoch.write(msg)
                torch.save({
                    'epoch': epoch + 1,
                    'step' : (epoch+1)*STEPS_PER_EPOCH,
                    'mse'  : mse,
                    'model': net.state_dict()}, '{}/net_best_mse.pth.tar'.format(OUTPUT_FOLDER))
                best_mse = mse

        if CHECKPOINTING_EVERY_N_EPOCHS is not None:
            if (epoch+1) % CHECKPOINTING_EVERY_N_EPOCHS == 0:
                torch.save(net.state_dict(), '{}/net_step_{:06d}.pth'.format(OUTPUT_FOLDER, (epoch+1)*STEPS_PER_EPOCH))

        if TEST_DISAGGREGATE_EVERY_N_EPOCHS is not None:
            if (epoch+1) % TEST_DISAGGREGATE_EVERY_N_EPOCHS == 0:
                scores = test_disaggregator.calculate_metrics()
                # Regroup {building: {metric: value}} into {metric: {building: value}}
                # so each metric becomes one multi-line scalar plot.
                scores_by_metric = {}
                for building_i, building in scores.items():
                    for metric, value in building.items():
                        scores_by_metric.setdefault(metric, {})[building_i] = value
                for metric, building_d in scores_by_metric.items():
                    SW_add_scalars2(writer, "test_score/{}".format(metric), building_d, (epoch+1)*STEPS_PER_EPOCH)

        if stop_training:
            break

    # CHECKPOINTING at end
    torch.save({
        'epoch': epoch + 1,
        'step' : (epoch+1)*STEPS_PER_EPOCH,
        'model': net.state_dict(),
        'optimizer': optimizer.state_dict(),
        #'scheduler': scheduler.state_dict()
        # TODO: scheduler is not saved this way, scheduler.state_dict() does not exist
    }, '{}/net_step_{:06d}.pth.tar'.format(OUTPUT_FOLDER, (epoch+1)*STEPS_PER_EPOCH))

    df = pd.DataFrame(history)
    df.to_csv(csvpath)

    for p in running_data_processes:
        p.stop()
    writer.close()

    #return 42
    return metrics_log['training']['loss']
Beispiel #2
0
def load_disaggregator(EVALUATION_DATA_PATH, MODEL_PATH, config=None, USE_CUDA=True):
    """Restore a trained ``_Net`` from disk and wrap it in a ``Disaggregator``.

    Helper function for the disaggregator script.

    Args:
        EVALUATION_DATA_PATH: forwarded unchanged to ``Disaggregator``.
        MODEL_PATH: checkpoint file read with ``torch.load``. If it ends with a
            path separator, ``net_step_<EPOCHS*STEPS_PER_EPOCH>.pth.tar`` is
            appended. Paths ending in ``tar`` are expected to hold a dict whose
            ``'model'`` entry is the state dict; any other file is used as the
            state dict directly.
        config: either the configuration dict, or a directory containing
            ``config.json`` (decoded with ``jsonpickle``). Defaults to the
            directory part of ``MODEL_PATH``.
        USE_CUDA: when truthy the network is moved to the GPU; otherwise the
            checkpoint is mapped to the CPU while loading.

    Returns:
        A ``(Disaggregator, training_state)`` tuple, where ``training_state`` is
        whatever ``torch.load`` returned. Returns a bare ``None`` (not a tuple)
        when ``config.json`` cannot be read or decoded, so callers must check
        for ``None`` before unpacking.

    Raises:
        AssertionError: if ``config`` is neither a dict nor a directory path.
    """

    if config is None:
        config = os.path.dirname(MODEL_PATH)

    if isinstance(config, str):
        config_file = os.path.join(config, 'config.json')
        try:
            # jsonpickle is only needed for this path, so import it lazily.
            import jsonpickle
            with open(config_file, 'r') as configfile:
                config = jsonpickle.decode(configfile.read())
        except Exception as err:
            # Best effort: a missing jsonpickle, missing file or undecodable
            # content all mean "no usable config". Unlike a bare ``except`` this
            # lets KeyboardInterrupt / SystemExit propagate.
            print("Could not load config from {}: {}".format(config_file, err))
            return None

    assert isinstance(config, dict), \
        "config must be a dict or a directory path, got {}".format(type(config).__name__)

    dataset = config['dataset']
    SEQ_LENGTH =            config['SEQ_LENGTH']
    TARGET_SEQ_LENGTH =     config['TARGET_SEQ_LENGTH']
    TARGET_APPLIANCE =      dataset['TARGET_APPLIANCE']
    ON_POWER_THRESHOLD =    dataset['ON_POWER_THRESHOLD']
    MAX_TARGET_POWER =      config['MAX_TARGET_POWER']
    NUM_SEQ_PER_BATCH =     config['NUM_SEQ_PER_BATCH']
    INPUT_STD =             config['INPUT_STD']
    INPUT_MEAN =            config['INPUT_MEAN']
    DOWNSAMPLE_FACTOR =     config['DOWNSAMPLE_FACTOR']
    #NUM_SEQ_PER_BATCH =     1024 # override

    net = _Net(SEQ_LENGTH, TARGET_SEQ_LENGTH)

    # Crop the target symmetrically so it matches the (shorter) network output.
    offset = (SEQ_LENGTH - TARGET_SEQ_LENGTH) // 2
    input_processing_steps = [Add(-INPUT_MEAN), DivideBy(INPUT_STD), Transpose((0, 2, 1))]
    target_processing_steps = [SubSequence(offset,-offset), Add(-INPUT_MEAN), DivideBy(INPUT_STD), Transpose((0, 2, 1))]

    # A trailing separator means "run directory": pick the final checkpoint,
    # whose step count is EPOCHS * STEPS_PER_EPOCH.
    if MODEL_PATH.endswith(("/", os.sep)):
        MODEL_PATH = MODEL_PATH + 'net_step_{:06d}.pth.tar'.format(config['EPOCHS']*config['STEPS_PER_EPOCH'])

    # NOTE: torch.load unpickles the file; only load checkpoints from trusted sources.
    training_state = torch.load(MODEL_PATH, map_location=None if USE_CUDA else 'cpu')

    if MODEL_PATH.endswith("tar"):
        model = training_state['model']
    else:
        model = training_state

    net.load_state_dict(model)
    if USE_CUDA:
        net.cuda()

    return Disaggregator(
        EVALUATION_DATA_PATH=EVALUATION_DATA_PATH,
        TARGET_APPLIANCE = TARGET_APPLIANCE,
        ON_POWER_THRESHOLD = ON_POWER_THRESHOLD,
        MAX_TARGET_POWER = MAX_TARGET_POWER,
        pad_mains = True,
        pad_appliance = False,
        disagg_func = disag_seq2seq_sgn,
        downsample_factor = DOWNSAMPLE_FACTOR,
        disagg_kwargs = dict(
            USE_CUDA=USE_CUDA,
            model = net,
            input_processing=input_processing_steps,
            target_processing=target_processing_steps,
            n_seq_per_batch = NUM_SEQ_PER_BATCH,
            seq_length = SEQ_LENGTH,
            target_seq_length = TARGET_SEQ_LENGTH,
            stride = 1
        )
    ), training_state