コード例 #1
0
def load_data_val_6(testList, W, index, batch):
    """Build one validation batch of embedded question/answer matrices.

    For each of the ``batch`` consecutive positions starting at ``index``,
    the corresponding entry of ``testList`` is split on single spaces; field
    2 (question) and field 3 (answer) are then split on ``'_'``. The first 50
    tokens of each are passed to ``ModelUtils.build_text_image`` and the
    results are reshaped to ``(50, 300)``.

    Positions at or beyond the end of ``testList`` are clamped to its last
    entry, so the final batch repeats the last sample.

    Returns:
        A tuple of three ``numpy`` arrays (question, answer, answer). The
        third array is built from the answer tokens again, exactly like the
        second one.
    """
    question_batch = []
    answer_batch = []
    answer_batch_dup = []

    for offset in range(batch):
        # Clamp to the last available row instead of running off the end.
        row = min(index + offset, len(testList) - 1)
        fields = testList[row].split(' ')
        q_words = fields[2].split('_')
        a_words = fields[3].split('_')

        q_vecs, a_vecs, a_vecs_dup = [], [], []
        for pos in range(50):
            q_vecs.append(
                ModelUtils.build_text_image(W, q_words[pos], padding=1))
            a_vecs.append(
                ModelUtils.build_text_image(W, a_words[pos], padding=1))
            a_vecs_dup.append(
                ModelUtils.build_text_image(W, a_words[pos], padding=1))

        question_batch.append(np.array(q_vecs).reshape((50, 300)))
        answer_batch.append(np.array(a_vecs).reshape((50, 300)))
        answer_batch_dup.append(np.array(a_vecs_dup).reshape((50, 300)))

    return (np.array(question_batch),
            np.array(answer_batch),
            np.array(answer_batch_dup))
コード例 #2
0
    def _validation(self, valid_loader):
        """Run one validation pass over ``valid_loader`` and log the results.

        The model is switched to eval mode and every batch is evaluated with
        gradients disabled. Each loader item is an ``(img, label)`` pair of
        tensor sequences that are concatenated with ``torch.cat`` before being
        moved to ``cfg.device``. The weighted AUC over all results collected
        so far is printed every 100 batches (including the first one) and once
        more after the loop. Finally the metrics are written under the
        ``'val'`` tag, the scheduler is stepped with the tracker's average
        loss, and the learning rate of the last parameter group is logged.
        """
        with torch.no_grad():

            self.model.eval()
            tracker = LossTracker()
            pred_scores, true_scores = [], []

            for step, (images, targets) in enumerate(valid_loader):

                images = torch.cat(images).to(cfg.device)
                targets = torch.cat(targets).to(cfg.device)

                logits = self.model(images)
                batch_loss = self.criterion(logits, targets)

                # Accumulates into pred_scores / true_scores in place.
                ModelUtils.append_results(targets, logits, pred_scores, true_scores)
                tracker.increment_loss(batch_loss)

                if step % 100 == 0:
                    running_auc = CompetitionMetric.alaska_weighted_auc(true_scores, pred_scores)
                    tracker.print_losses(self.epoch, step, len(valid_loader), running_auc)

            # Final metric over the complete validation set.
            final_auc = CompetitionMetric.alaska_weighted_auc(true_scores, pred_scores)
            summary = tracker.write_dict(final_auc)
            tracker.print_losses(self.epoch, step, len(valid_loader), final_auc)
            self.writer.write_scalars(summary, tag='val', n_iter=self.train_step)

            self.scheduler.step(metrics=tracker.loss.avg)
            current_lr = self.optimizer.param_groups[-1]['lr']
            self.writer.write_scalars({'lr': current_lr}, tag='val', n_iter=self.train_step)
コード例 #3
0
def load_data_6(W, alist, raw, size):
    """Sample ``size`` random (question, answer, negative answer) triples.

    Each iteration picks a random entry of ``raw`` (uniformly, via
    ``random.randint``) and a negative answer via ``rand_qa(alist)``. Field 2
    (question) and field 3 (answer) of the entry, as well as the negative
    answer, are split on ``'_'``; the first 50 tokens of each are passed to
    ``ModelUtils.build_text_image`` and the results reshaped to ``(50, 300)``.

    Returns:
        A tuple of three ``numpy`` arrays: questions, answers and negative
        answers, in that order.
    """
    question_batch = []
    positive_batch = []
    negative_batch = []

    for _ in range(size):
        sample = raw[random.randint(0, len(raw) - 1)]
        negative = rand_qa(alist)
        q_words = sample[2].split('_')
        a_words = sample[3].split('_')
        neg_words = negative.split('_')

        q_vecs, a_vecs, neg_vecs = [], [], []
        for pos in range(50):
            q_vecs.append(
                ModelUtils.build_text_image(W, q_words[pos], padding=1))
            a_vecs.append(
                ModelUtils.build_text_image(W, a_words[pos], padding=1))
            neg_vecs.append(
                ModelUtils.build_text_image(W, neg_words[pos], padding=1))

        question_batch.append(np.array(q_vecs).reshape((50, 300)))
        positive_batch.append(np.array(a_vecs).reshape((50, 300)))
        negative_batch.append(np.array(neg_vecs).reshape((50, 300)))

    return (np.array(question_batch),
            np.array(positive_batch),
            np.array(negative_batch))
コード例 #4
0
    def _train(self, train_loader):
        """Train the model for one pass over ``train_loader``.

        Every loader item is split into 4 sub-batches by
        ``ModelUtils.batch_splitter``; the model and criterion are applied to
        each sub-batch and the mean of the sub-batch losses is back-propagated.
        Every ``cfg.log_freq`` iterations (except iteration 0) the weighted AUC
        of the results gathered since the last log is computed, printed and
        written under the ``'train'`` tag, after which the accumulators are
        reset. Every ``cfg.save_freq`` iterations (except iteration 0) the
        model is saved.
        """
        tracker = LossTracker()
        pred_scores, true_scores = [], []

        self.model.train()

        print('Epoch: {} : LR = {}'.format(self.epoch, self.lr))

        for step, batch in enumerate(train_loader):

            self.train_step += 1

            # Per the original note: each of the 4 sub-batches holds either the
            # cover, JUNIWARD, JMiPOD or UERD version of each image.
            sub_batches, labels = ModelUtils.batch_splitter(batch, num_imgs=4)

            outputs = [self.model(sub_batch) for sub_batch in sub_batches]
            split_losses = [
                self.criterion(output, label)
                for output, label in zip(outputs, labels)
            ]
            total_loss = sum(split_losses) / len(split_losses)

            self.optimizer.zero_grad()
            total_loss.backward()
            self.optimizer.step()

            for split_labels, split_output in zip(labels, outputs):
                ModelUtils.append_results(split_labels, split_output, pred_scores, true_scores)

            tracker.increment_loss(total_loss)

            if step % cfg.log_freq == 0 and step > 0:
                weighted_auc = CompetitionMetric.alaska_weighted_auc(true_scores, pred_scores)
                # Start a fresh window of predictions for the next log period.
                pred_scores, true_scores = [], []

                summary = tracker.write_dict(weighted_auc)
                tracker.print_losses(self.epoch, step, len(train_loader), weighted_auc)
                tracker = LossTracker()  # fresh loss statistics for the next window
                self.writer.write_scalars(summary, tag='train', n_iter=self.train_step)

            if step % cfg.save_freq == 0 and step > 0:
                ModelUtils.save_model(self)
0
    def main(self):
        """Load the model, run inference on the test images and write results.

        On a CUDA device, large-model support is enabled through
        ``torch.cuda.set_enabled_lms`` and cuDNN benchmarking is switched on.
        The test set is read from ``<cfg.data_path>/test`` without shuffling,
        and the output of ``self._infere()`` is handed to
        ``self._write_results``.
        """
        if cfg.device.type == 'cuda':
            torch.cuda.set_enabled_lms(True)
            cudnn.benchmark = True

        ModelUtils.load_model(self)

        self.model = self.model.to(cfg.device)

        test_root = os.path.join(cfg.data_path, 'test')
        test_data = TestDataLoader(img_root=test_root)
        self.data_loader = data.DataLoader(
            test_data,
            batch_size=cfg.batch_size,
            shuffle=False,
            num_workers=cfg.num_workers,
        )

        # NOTE(review): the reported count is len(test_data) * batch_size —
        # confirm this matches what one TestDataLoader item represents.
        print(f'Testing {len(test_data) * cfg.batch_size} images')

        self._write_results(self._infere())
コード例 #6
0
    def main(self):
        """Set up data loaders and run the train/validation loop.

        On a CUDA device cuDNN benchmarking is enabled, and large-model
        support is switched on when ``cfg.enable_lms`` is set. An existing
        model is loaded through ``ModelUtils.load_model``; when
        ``self.train_step`` is still 0 afterwards, the SRM filters are
        initialized. Both the training and the validation dataset read from
        ``<cfg.data_path>/train`` and differ in transform and ``is_training``
        flag. Training runs from ``self.epoch`` up to ``cfg.max_epoch``;
        validation runs whenever the incremented epoch counter is a multiple
        of ``cfg.val_freq``.
        """
        if cfg.device.type == 'cuda':
            if cfg.enable_lms:
                torch.cuda.set_enabled_lms(True)
            cudnn.benchmark = True

        # Loads a model only if there is an existing file in cfg.save_path
        ModelUtils.load_model(self)

        if self.train_step == 0:
            SrmFiltersSetter.initialize_filters(self.model)
            print('SRM High Pass filters initialized')

        self.model = self.model.to(cfg.device)

        train_data = TrainDataLoader(
            img_root=os.path.join(cfg.data_path, 'train'),
            transform=AugmentatedTransform(sizes=cfg.input_size),
            is_training=True,
        )
        train_loader = data.DataLoader(
            train_data,
            batch_size=cfg.batch_size,
            shuffle=True,
            num_workers=cfg.num_workers,
        )

        val_data = TrainDataLoader(
            img_root=os.path.join(cfg.data_path, 'train'),
            transform=BaseTransform(sizes=cfg.input_size),
            is_training=False,
        )
        val_loader = data.DataLoader(
            val_data,
            batch_size=cfg.batch_size,
            shuffle=False,
            num_workers=cfg.num_workers,
        )

        print(f'Training on {len(train_data)} images')
        print('Start training.')

        # The range is fixed from the epoch counter at entry; self.epoch is
        # advanced manually after each training pass.
        for _ in range(self.epoch, cfg.max_epoch):
            self._train(train_loader)
            self.epoch += 1
            if self.epoch % cfg.val_freq == 0:
                self._validation(val_loader)
コード例 #7
0
def runModel(config, data_dictionary, data_statistics, train_test_folds):
    """Evaluate stored per-run checkpoints on all (station, init) pairs.

    For every run in ``range(config['runs'])`` the checkpoint under
    ``<output_path>/stored_models/run_<run>`` is loaded, the network is put in
    eval mode and applied to a dataset built from every combination of
    ``config['stations']`` and ``config['inits']``. Per sample, five values
    (prediction, cosmo, target, difference, target_temperature) are stored in
    an array indexed by init and station; entries never written stay NaN.
    Each run's array is stored as ``run_<run>`` in an ``xr.Dataset`` that is
    pickled to ``<output_path>/model_run_error.pkl``.

    Side effects on ``config``: ``config['model']`` is replaced by the model
    dictionary loaded from ``config['model']['path']``,
    ``config['code_commit']`` is set, and the time-invariant parameter lists
    may be emptied (see below).

    Args:
        config: experiment configuration dictionary (mutated, see above).
        data_dictionary: passed to the dataset as ``station_data_dict``.
        data_statistics: passed to ``DataUtils.getFeatureScaleFunctions``.
        train_test_folds: sequence indexed by run, each a
            ``(train_fold, test_fold)`` pair whose items carry the init at
            index 1.

    Raises:
        Exception: if the experiment output folder does not exist, or a batch
            has an unexpected number of elements.
    """
    program_start_time = time()

    # assign all program arguments to local variables
    with open(config['model']['path']) as handle:
        ModelDict = json.loads(handle.read())

    # check if station and grid time invariant features should be used and set the list of desired parameters
    if not ('grid_time_invariant' in ModelDict and ModelDict['grid_time_invariant']): config[
        'grid_time_invariant_parameters'] = []
    if not ('station_time_invariant' in ModelDict and ModelDict['station_time_invariant']): config[
        'station_parameters'] = []

    # update general static model information
    # experiment_info is an alias of config (no copy), so the assignments below
    # also change config: from here on config['model'] is ModelDict.
    experiment_info = config
    experiment_info['model'] = ModelDict
    experiment_info['code_commit'] = ModelUtils.get_git_revision_short_hash()

    # if needed, load time invariant features
    # NOTE(review): pickle is only safe on trusted files - confirm the source of this file.
    with open("%s/%s/grid_size_%s/time_invariant_data_per_station.pkl" % (
    config['input_source'], config['preprocessing'], config['original_grid_size']), "rb") as input_file:
        time_invarian_data = pkl.load(input_file)

    # initialize feature scaling function for each feature
    featureScaleFunctions = DataUtils.getFeatureScaleFunctions(ModelUtils.ParamNormalizationDict, data_statistics)

    # get optimizer config
    optimizer_config = config['optimizer']

    # generate output path for experiment information
    # config['model'] was replaced above, so 'name' is read from ModelDict here.
    setting_string = '%s_grid_%s_bs_%s_tf_%s_optim_%s_lr_%s_sl_%s' % (
        config['model']['name'], config['grid_size'], config['batch_size'], config['test_fraction'], optimizer_config['algorithm'], optimizer_config['learning_rate'], config['slice_size'])
    output_path = '%s/%s' % (config['experiment_path'], setting_string)
    # The folder must already exist because the checkpoints are read from it.
    # NOTE(review): the message says "Node folder ... has been found" although it is
    # raised when the path is missing - presumably meant "No folder ... has been found".
    if not os.path.exists(output_path):
        raise Exception('Node folder of training run has been found for "%s"' % output_path)

    # collects one DataArray per run
    ds = xr.Dataset()

    # cross validation
    for run in range(config['runs']):
        print('[Run %s] Cross-validation test fold %s' % (str(run + 1), str(run + 1)))

        stations = sorted(config['stations'])

        # take the right preprocessed train/test data set for the current run
        train_fold, test_fold = train_test_folds[run]

        # get all inits
        all_inits_set = set(config['inits'])

        # get train and test inits
        train_inits_set = set([t[1] for t in train_fold])
        test_inits_set = set([t[1] for t in test_fold])

        # get all filtered inits
        # i.e. inits that are in neither the train nor the test fold
        filtere_inits = set(
            [init for init in all_inits_set if init not in train_inits_set and init not in test_inits_set])

        # make sure, that all sets are distinct
        # (the chained symmetric difference is evaluated first, then compared to the full set)
        assert filtere_inits ^ train_inits_set ^ test_inits_set == all_inits_set

        # label every init with the split it belongs to; stored later as a DataArray attribute
        init_type_mapping = {}
        for init in train_inits_set: init_type_mapping[init] = 'train'
        for init in test_inits_set: init_type_mapping[init] = 'test'
        for init in filtere_inits: init_type_mapping[init] = 'filterd'

        # evaluate on every (station, init) combination, not only on one fold
        all_inits = sorted(list(all_inits_set))
        all_data = [(station, init) for init in all_inits for station in stations]

        n_data_points = len(all_data)

        # keep mappings from init and station to index of result numpy array
        station_index_dict = {}
        for station_idx, station in enumerate(stations): station_index_dict[station] = station_idx
        init_index_dict = {}
        for init_idx, init in enumerate(all_inits): init_index_dict[init] = init_idx

        # initialize train and test dataloaders
        dataset = DataLoaders.ErrorPredictionCosmoData(
            config=config,
            station_data_dict=data_dictionary,
            files=all_data,
            featureScaling=featureScaleFunctions,
            time_invariant_data=time_invarian_data)
        dataloader = DataLoader(dataset, batch_size=config['batch_size'], shuffle=False,
                                num_workers=config['n_loaders'], collate_fn=DataLoaders.collate_fn)

        # initialize network, optimizer and loss function
        net = Baseline.model_factory(model_dict=ModelDict, params=dataset.n_parameters, time_invariant_params=dataset.n_grid_time_invariant_parameters,
                                     grid=config['grid_size'], prediction_times=config['prediction_times'])

        if torch.cuda.device_count() > 1:
            net = nn.DataParallel(net)

        # the optimizer is only created so that load_checkpoint can restore its state; no training step is taken
        optimizer = optim.SGD(net.parameters(), lr=optimizer_config['learning_rate'], momentum=optimizer_config['momentum'])

        net, optimizer, *_ = ModelUtils.load_checkpoint(output_path + '/stored_models/run_%s' % run, model=net,
                                                        optimizer=optimizer)


        if torch.cuda.is_available():
            net.cuda()

        # we do not train, but only output the evaluation of the network on train and test data
        net.eval()

        # initialize result array of errors per init and station and initialize it with NaN
        run_error_statistics = np.empty((len(init_index_dict), len(station_index_dict), 5))
        run_error_statistics.fill(np.nan)

        # loop over complete data set
        for i, data in enumerate(dataloader, 0):
            try:
                # get training batch, e.g. label, cosmo-1 output and time inv. features for station
                DATA = data
                # DATA has only length 4 if we do not use the station time invariant features
                if len(DATA) == 4:
                    Blabel, Bip2d, BTimeData, init_station_temp = DATA
                    station_time_inv_input = None
                elif len(DATA) == 5:
                    Blabel, Bip2d, BTimeData, StationTimeInv, init_station_temp = DATA
                    station_time_inv_input = ModelUtils.getVariable(StationTimeInv).float()
                else:
                    raise Exception('Unknown data format for training...')
                input = ModelUtils.getVariable(Bip2d).float()
                time_data = ModelUtils.getVariable(BTimeData).float()
                target = ModelUtils.getVariable(Blabel).float()

            except TypeError:
                # when the batch size is small, it could happen, that all labels have been corrupted and therefore
                # collate_fn would return an empty list
                # NOTE(review): the handler catches TypeError but prints 'Value error...'.
                print('Value error...')
                continue

            out = net(input, time_data, station_time_inv_input).squeeze()
            target = target.squeeze()
            diff = (out - target).squeeze()

            # write each sample's values into the slot given by its init and station
            for item in range(Blabel.shape[0]):
                init = init_station_temp[0][item]
                station = init_station_temp[1][item].item()
                cosmo_temperature = init_station_temp[2][item].item()
                target_temperature = init_station_temp[3][item].item()
                station_idx = station_index_dict[station]
                init_idx = init_index_dict[init]
                run_error_statistics[init_idx, station_idx, :] = np.array((out[item].item(), cosmo_temperature, target[item].item(), diff[item].item(), target_temperature))

            # progress estimate: counts every batch as a full batch
            processed_samples = (i + 1)  * int(config['batch_size'])
            # report roughly once per percent of the data, and at least every batch for small data sets
            if (i+1) % np.max((1, ((n_data_points // config['batch_size']) // 100))) == 0:
                print("%s samples have been processed. [%2.1f%%]" % (processed_samples, (processed_samples / n_data_points) * 100))
                sys.stdout.flush()


        # the order of the 'data' coordinate matches the order of the values written above
        da = xr.DataArray(run_error_statistics, dims=('init', 'station', 'data'),
                          coords=[all_inits, stations, ['prediction', 'cosmo', 'target', 'difference', 'target_temperature']])
        da = da.sortby(variables='init')
        da.attrs['init_type_mapping'] = sorted(list(init_type_mapping.items()))

        ds['run_%s' % run] = da
        ds.attrs['config'] = config

        print('Error results of run %s have been processed.' % run)
        # flush output to see progress
        sys.stdout.flush()

    # same existence check (and same message) as before the run loop
    if not os.path.exists(output_path):
        raise Exception('Node folder of training run has been found for "%s"' % output_path)

    # dump experiment statistic
    with open(output_path + '/model_run_error.pkl', 'wb') as handle:
        pkl.dump(ds, handle, protocol=pkl.HIGHEST_PROTOCOL)

    # print program execution time
    m, s = divmod(time() - program_start_time, 60)
    h, m = divmod(m, 60)
    print('Experiment has successfully finished in %dh %02dmin %02ds' % (h, m, s))
コード例 #8
0
def model_eva(stock, state_dt, para_window, para_dc_window):
    """Back-test the SVM model for ``stock`` up to ``state_dt`` and store the scores.

    Args:
        stock: stock identifier (concatenated into log messages as a string).
        state_dt: evaluation date; also the end of the back-test window.
        para_window: length of the back-test window in days.
        para_dc_window: length in days of the data window collected for
            each back-test date.

    Returns:
        0 if ``DBUtils.select_ev_result`` reports an existing result for this
        stock and date, -1 if data collection raised an exception, 1 after a
        completed evaluation.
    """
    if DBUtils.select_ev_result(state_dt, stock):
        print('Already ev:' + stock + ':' + state_dt)
        return 0

    # Build the evaluation date sequence covering the last para_window days.
    window_start = Utils.to_date(state_dt) - datetime.timedelta(days=para_window)
    ev_start = Utils.date2d(window_start)
    ev_end = state_dt
    calendar_days = DBUtils.get_stock_calender(ev_start, ev_end)
    ev_dt_seq = [Utils.d2date(day) for day in calendar_days]

    # Clear the intermediate evaluation table (model_ev_mid per the original notes).
    DBUtils.clear_ev_mid()

    # Back-test: one prediction per evaluation date.
    for dc_end_dt in ev_dt_seq:
        dc_start_dt = Utils.d2date(
            Utils.to_date(dc_end_dt) - datetime.timedelta(days=para_dc_window))
        try:
            dc = DC.data_collect(stock, dc_start_dt, dc_end_dt)
            # Fewer than two distinct target values: skip this date.
            if len(set(dc.data_target)) <= 1:
                print('WARN: DC target is less than 1 record.')
                continue
        except Exception as exp:
            # Any failure during data collection aborts the whole evaluation.
            print("DC Error")
            print(exp)
            print('WARN: something maybe wrong... when svm')
            return -1

        features = dc.data_train
        labels = dc.data_target
        query = [dc.test_case]
        aresult = ModelUtils.use_svm(features, labels, query)

        # Insert the prediction into the intermediate table.
        DBUtils.insert_predict(dc_end_dt, stock, aresult)

    # Fill in the real values in the intermediate table; stop at the first failure.
    for ev_dt in ev_dt_seq:
        if DBUtils.update_ev_mid_with_real(stock, ev_dt) != 0:
            print('WARN: break ev mid with real:' + stock)
            break

    # Recall, precision, precision on negative samples, then F1.
    recall = DBUtils.count_recall()
    acc = DBUtils.count_acc()
    acc_neg = DBUtils.count_acc_neg()
    f1 = Utils.count_F1(acc, recall)

    # Store the evaluation result (result table model_ev_resu per the original notes).
    predict = DBUtils.get_predict(ev_dt_seq[-1])
    DBUtils.insert_ev_result(state_dt, stock, acc, recall, f1, acc_neg, 'svm',
                             predict)

    summary = (
        str(state_dt) + '   Precision : ' + str(acc) + '   Recall : ' +
        str(recall) + '   F1 : ' + str(f1) + '   Acc_Neg : ' + str(acc_neg))
    print(summary)

    return 1
コード例 #9
0
def runModel(config, data_dictionary, data_statistics, train_test_folds):
    program_start_time = time()

    # assign all program arguments to local variables
    with open(config['model']['path']) as handle:
        ModelDict = json.loads(handle.read())

    # check if station and grid time invariant features should be used and set the list of desired parameters
    if not ('grid_time_invariant' in ModelDict and ModelDict['grid_time_invariant']): config['grid_time_invariant_parameters'] =[]
    if not ('station_time_invariant' in ModelDict and ModelDict['station_time_invariant']): config['station_parameters'] = []

    # update general static model information
    experiment_info = config
    experiment_info['model'] = ModelDict
    experiment_info['code_commit'] = ModelUtils.get_git_revision_short_hash()


    # if needed, load time invariant features
    with open("%s/%s/grid_size_%s/time_invariant_data_per_station.pkl" % (config['input_source'], config['preprocessing'], config['original_grid_size']), "rb") as input_file:
        time_invarian_data = pkl.load(input_file)


    # initialize feature scaling function for each feature
    featureScaleFunctions = DataUtils.getFeatureScaleFunctions(ModelUtils.ParamNormalizationDict, data_statistics)

    # get optimizer config
    optimizer_config = config['optimizer']

    # generate output path for experiment information
    setting_string = '%s_grid_%s_bs_%s_tf_%s_optim_%s_lr_%s_sl_%s' % (
        config['model']['name'], config['grid_size'], config['batch_size'], config['test_fraction'], optimizer_config['algorithm'], optimizer_config['learning_rate'], config['slice_size'])
    output_path = '%s/%s' % (config['experiment_path'], setting_string)
    if not os.path.exists(output_path):
        os.makedirs(output_path)

    # time for the set up until first run
    experiment_info['set_up_time'] = time() - program_start_time
    print('[Time]: Set-up %s' % strftime("%H:%M:%S", gmtime(experiment_info['set_up_time'])))
    sys.stdout.flush()

    # initialize statistics
    error_statistics = None
    run_times = None
    skip_statistics = None
    if 'per_station_rmse' in config:
        error_per_station_statistics = None

    # keep used learning rates
    experiment_info['scheduled_learning_rates'] = []

    # cross validation
    for run in range(config['runs']):
        # logger  for tensorboardX
        train_logger = Logger(output_path + '/logs/run_%s/train' % run)
        test_logger = Logger(output_path + '/logs/run_%s/test' % run)

        print('[Run %s] Cross-validation test fold %s' % (str(run + 1), str(run + 1)))

        # take the right preprocessed train/test data set for the current run
        train_fold, test_fold = train_test_folds[run]

        # initialize best epoch test error
        best_epoch_test_rmse = float("inf")

        # use different data loader if we want to train a 3nn model approach
        if "knn" in ModelDict:
            # initialize train and test dataloaders
            trainset = DataLoaders.CosmoData3NNData(
                config=config,
                station_data_dict=data_dictionary,
                files=train_fold,
                featureScaling=featureScaleFunctions,
                time_invariant_data=time_invarian_data)
            trainloader = DataLoader(trainset, batch_size=config['batch_size'], shuffle=True,
                                     num_workers=config['n_loaders'], collate_fn=DataLoaders.collate_fn)

            testset = DataLoaders.CosmoData3NNData(
                config=config,
                station_data_dict=data_dictionary,
                files=test_fold,
                featureScaling=featureScaleFunctions,
                time_invariant_data=time_invarian_data)
            testloader = DataLoader(testset, batch_size=config['batch_size'], shuffle=True,
                                    num_workers=config['n_loaders'], collate_fn=DataLoaders.collate_fn)
        else:
            # initialize train and test dataloaders
            trainset = DataLoaders.CosmoDataGridData(
                config=config,
                station_data_dict=data_dictionary,
                files=train_fold,
                featureScaling=featureScaleFunctions,
                time_invariant_data=time_invarian_data)
            trainloader = DataLoader(trainset, batch_size=config['batch_size'], shuffle=True,
                                     num_workers=config['n_loaders'], collate_fn=DataLoaders.collate_fn)

            testset = DataLoaders.CosmoDataGridData(
                config=config,
                station_data_dict=data_dictionary,
                files=test_fold,
                featureScaling=featureScaleFunctions,
                time_invariant_data=time_invarian_data)
            testloader = DataLoader(testset, batch_size=config['batch_size'], shuffle=True,
                                    num_workers=config['n_loaders'], collate_fn=DataLoaders.collate_fn)

        # initialize network, optimizer and loss function
        net = Baseline.model_factory(ModelDict, trainset.n_parameters, trainset.n_grid_time_invariant_parameters,
                                     config['grid_size'], config['prediction_times'])
        # store class name
        experiment_info['model_class'] = net.__class__.__name__

        if torch.cuda.device_count() > 1:
            net = nn.DataParallel(net)

        if torch.cuda.is_available():
            net.cuda()

        # load number of train and test samples
        n_train_samples, n_test_samples = len(train_fold), len(test_fold)

        optimizer, scheduler = ModelUtils.initializeOptimizer(optimizer_config, net)
        criterion = nn.MSELoss()

        # keep number of processed smaples over all epochs for tensorboard
        processed_train_samples_global = 0
        processed_test_samples_global = 0

        # start learning
        for epoch in range(config['epochs']):
            epoch_train_time = np.zeros((5,))
            epoch_start_time = time()
            print('Epoch: ' + str(epoch + 1) + '\n------------------------------------------------------------')

            # adapt learning rate and store information in experiment attributes
            if scheduler is not None:
                scheduler.step()
                if run == 0: experiment_info['scheduled_learning_rates'] += scheduler.get_lr()
                print('Using learning rate %s' % str(scheduler.get_lr()))

            # TRAINING
            # initialize variables for epoch statistics
            LABELS, MODELoutputs, COSMOoutputs = None, None, None
            processed_train_samples = 0
            net.train(True)

            train_start_time = time()
            # loop over complete train set
            for i, data in enumerate(trainloader, 0):
                time_start = time()
                try:
                    # get training batch, e.g. label, cosmo-1 output and time inv. features for station
                    DATA = data
                    # DATA has only length 4 if we do not use the station time invariant features
                    if len(DATA) == 4:
                        Blabel, Bip2d, BTimeData, init_station_temp = DATA
                        station_time_inv_input = None
                    elif len(DATA) == 5:
                        Blabel, Bip2d, BTimeData, StationTimeInv, init_station_temp = DATA
                        station_time_inv_input = ModelUtils.getVariable(StationTimeInv).float()
                    else:
                        raise Exception('Unknown data format for training...')
                    input = ModelUtils.getVariable(Bip2d).float()
                    time_data = ModelUtils.getVariable(BTimeData).float()
                    target = ModelUtils.getVariable(Blabel).float()

                except TypeError:
                    # when the batch size is small, it could happen, that all labels have been corrupted and therefore
                    # collate_fn would return an empty list
                    print('Value error...')
                    continue
                time_after_data_preparation = time()

                processed_train_samples += len(Blabel)

                optimizer.zero_grad()
                out = net(input, time_data, station_time_inv_input)
                time_after_forward_pass = time()
                loss = criterion(out, target)
                loss.backward()
                optimizer.step()
                time_after_backward_pass = time()

                if LABELS is None:
                    LABELS = Blabel.data
                    MODELoutputs = out.data
                    COSMOoutputs = init_station_temp[2].data
                else:
                    LABELS = np.vstack((LABELS, Blabel.data))
                    MODELoutputs = np.vstack((MODELoutputs, out.data))
                    COSMOoutputs = np.vstack((COSMOoutputs, init_station_temp[2].data))

                time_after_label_stack = time()

                if (i + 1) % 64 == 0:

                    print('Sample: %s \t Loss: %s' % (processed_train_samples, float(np.sqrt(loss.data))))

                    # ============ TensorBoard logging ============#
                    # (1) Log the scalar values
                    info = {
                        setting_string: np.sqrt(loss.item()),
                    }

                    for tag, value in info.items():
                        train_logger.scalar_summary(tag, value, processed_train_samples_global + processed_train_samples)

                    # (2) Log values and gradients of the parameters (histogram)
                    for tag, value in net.named_parameters():
                        tag = tag.replace('.', '/')
                        train_logger.histo_summary(tag, ModelUtils.to_np(value), i + 1)
                        train_logger.histo_summary(tag + '/grad', ModelUtils.to_np(value.grad), i + 1)

                    epoch_train_time += np.array((time_start - time_end,
                                                  time_after_data_preparation - time_start,
                                                  time_after_forward_pass - time_after_data_preparation,
                                                  time_after_backward_pass - time_after_forward_pass,
                                                  time_after_label_stack - time_after_backward_pass))

                time_end = time()

            # calculate error statistic of current epoch
            diff_model = MODELoutputs - LABELS
            diff_cosmo = COSMOoutputs - LABELS
            epoch_train_rmse_model = np.apply_along_axis(func1d=ModelUtils.rmse, arr=diff_model, axis=0)
            epoch_train_rmse_cosmo = np.apply_along_axis(func1d=ModelUtils.rmse, arr=diff_cosmo, axis=0)


            # update global processed samples
            processed_train_samples_global += processed_train_samples

            if np.isnan(epoch_train_rmse_model).any():
                print("Learning rate too large resulted in NaN-error while training. Stopped training...")
                return
            # print epoch training times
            print('Timing: Waiting on data=%s, Data Preparation=%s,'
                  'Forward Pass=%s, Backward Pass=%s, Data Stacking=%s' % tuple(list(epoch_train_time / len(epoch_train_time))))

            # RMSE of epoch
            print('Train/test statistic for epoch: %s' % str(epoch + 1))
            print('Train RMSE COSMO: ' , ", ".join(["T=%s: %s" % (idx, epoch_train_rmse_cosmo[idx]) for idx in range(len(epoch_train_rmse_cosmo))]))
            print('Train RMSE Model: ' , ", ".join(["T=%s: %s" % (idx, epoch_train_rmse_model[idx]) for idx in range(len(epoch_train_rmse_model))]))
            sys.stdout.flush()

            train_time = time() - train_start_time

            # TESTING
            test_start_time = time()

            LABELS, MODELoutputs, COSMOoutputs, STATION = None, None, None, None
            processed_test_samples = 0
            net.eval()
            for i, data in enumerate(testloader, 0):
                try:
                    # get training batch, e.g. label, cosmo-1 output and time inv. features for station
                    DATA = data
                    # DATA has only length 4 if we do not use the station time invariant features
                    if len(DATA) == 4:
                        Blabel, Bip2d, BTimeData, init_station_temp = DATA
                        station_time_inv_input = None
                    elif len(DATA) == 5:
                        Blabel, Bip2d, BTimeData, StationTimeInv, init_station_temp = DATA
                        station_time_inv_input = ModelUtils.getVariable(StationTimeInv).float()
                    else:
                        raise Exception('Unknown data format for training...')
                    input = ModelUtils.getVariable(Bip2d).float()
                    time_data = ModelUtils.getVariable(BTimeData).float()
                    target = ModelUtils.getVariable(Blabel).float()

                except TypeError:
                    # when the batch size is small, it could happen, that all labels have been corrupted and therefore
                    # collate_fn would return an empty list
                    print('Value error...')
                    continue

                processed_test_samples += len(Blabel)

                out = net(input, time_data, station_time_inv_input)
                loss = criterion(out, target)

                if LABELS is None:
                    LABELS = Blabel.data
                    MODELoutputs = out.data
                    COSMOoutputs = init_station_temp[2].data
                    STATION = init_station_temp[1].data
                else:
                    LABELS = np.vstack((LABELS, Blabel.data))
                    MODELoutputs = np.vstack((MODELoutputs, out.data))
                    COSMOoutputs = np.vstack((COSMOoutputs, init_station_temp[2].data))
                    STATION = np.hstack((STATION, init_station_temp[1].data))

                if i % 16:
                    # ============ TensorBoard logging ============#
                    # (1) Log the scalar values
                    info = {
                        setting_string: np.sqrt(loss.item()),
                    }

                    for tag, value in info.items():
                        test_logger.scalar_summary(tag, value, processed_test_samples_global + processed_test_samples)

            # calculate error statistic of current epoch
            diff_model = MODELoutputs - LABELS
            diff_cosmo = COSMOoutputs - LABELS

            # rmse
            epoch_test_rmse_model = np.apply_along_axis(func1d=ModelUtils.rmse, arr=diff_model, axis=0)
            epoch_test_rmse_cosmo = np.apply_along_axis(func1d=ModelUtils.rmse, arr=diff_cosmo, axis=0)
            overall_test_rmse_model = ModelUtils.rmse(diff_model)
            overall_test_rmse_cosmo = ModelUtils.rmse(diff_cosmo)

            # mae
            epoch_test_mae_model = np.apply_along_axis(func1d=ModelUtils.mae, arr=diff_model, axis=0)
            epoch_test_mae_cosmo = np.apply_along_axis(func1d=ModelUtils.mae, arr=diff_cosmo, axis=0)
            overall_test_mae_model = ModelUtils.mae(diff_model)
            overall_test_mae_cosmo = ModelUtils.mae(diff_cosmo)

            # calculate per station rmse if desired (especially for K-fold station generalization experiment)
            if "per_station_rmse" in config:
                max_station_id = 1435

                squared_errors_per_epoch = np.array((np.square(diff_model), np.square(diff_cosmo))).squeeze()

                # the highest index of data is 1435, thus we expect at least 1435 entries, which we can access by
                # station id
                test_samples_per_station = np.bincount(STATION, minlength=max_station_id+1)
                model_squared_error_per_station = np.bincount(STATION, weights=squared_errors_per_epoch[0], minlength=max_station_id+1)
                cosmo_squared_error_per_station = np.bincount(STATION, weights=squared_errors_per_epoch[1], minlength=max_station_id+1)

                # set division by zero/NaN warning to 'ignore'
                np.seterr(divide='ignore', invalid='ignore')

                # calculate rmse per station
                rmse_per_station = np.vstack((np.sqrt(np.divide(model_squared_error_per_station, test_samples_per_station)),
                                              np.sqrt(np.divide(cosmo_squared_error_per_station, test_samples_per_station)))).T

                # set division by zero/NaN warning to 'warn'
                np.seterr(divide='warn', invalid='warn')






            # update global processed samples
            processed_test_samples_global += processed_test_samples

            # RMSE of epoch
            print('Test RMSE COSMO: ', ", ".join(
                ["T=%s: %s" % (idx, epoch_test_rmse_cosmo[idx]) for idx in range(len(epoch_test_rmse_cosmo))]),
                  " (Overall: %s" % overall_test_rmse_cosmo)
            print('Test RMSE Model: ' , ", ".join(["T=%s: %s" % (idx, epoch_test_rmse_model[idx]) for idx in range(len(epoch_test_rmse_model))]),
                  " (Overall: %s" % overall_test_rmse_model)
            # mae of epoch
            print('Test MAE COSMO: ', ", ".join(
                ["T=%s: %s" % (idx, epoch_test_mae_cosmo[idx]) for idx in range(len(epoch_test_mae_cosmo))]),
                  " (Overall: %s" % overall_test_mae_cosmo)
            print('Test MAE Model: ' , ", ".join(["T=%s: %s" % (idx, epoch_test_mae_model[idx]) for idx in range(len(epoch_test_mae_model))]),
                  " (Overall: %s" % overall_test_mae_model)

            sys.stdout.flush()

            test_time = time() - test_start_time

            # time for epoch
            epoch_time = time() - epoch_start_time

            # update error statistics
            error_statistics = ModelUtils.updateErrorStatistic(error_statistics,
                                                               np.array([epoch_train_rmse_model, epoch_test_rmse_model])[None, None, ...],
                                                               run, epoch, config['prediction_times'])
            # update run times statistic
            run_times = ModelUtils.updateRuntimeStatistic(run_times, np.array([epoch_time, train_time, test_time])[None, None, ...],
                                                          run, epoch)
            # update skip statistic
            skip_statistics = ModelUtils.updateSkipStatistic(skip_statistics,
                                                             np.array([n_train_samples, processed_train_samples,
                                                                       n_test_samples, processed_test_samples])[None, None, ...],
                                                             run, epoch)

            # update per station rmse data array over runs if desired (especially for K-fold station generalization experiment
            if "per_station_rmse" in config:
                error_per_station_statistics = ModelUtils.updatePerStationErrorStatistic(error_per_station_statistics, rmse_per_station, run, epoch, np.arange(max_station_id+1))

            # store model if it was the best yet
            is_best = overall_test_rmse_model <= best_epoch_test_rmse
            best_epoch_test_rmse = min(overall_test_rmse_model, best_epoch_test_rmse)
            ModelUtils.save_checkpoint({
                'epoch': epoch,
                'run': run,
                'arch': net.__class__.__name__,
                'state_dict': net.state_dict(),
                'overall_test_rmse': overall_test_rmse_model,
                'lead_test_rmse' : overall_test_rmse_model,
                'best_epoch_test_rmse': best_epoch_test_rmse,
                'optimizer': optimizer.state_dict(),
            }, is_best, output_path + '/stored_models/run_%s' % run)

            # flush output to see progress
            sys.stdout.flush()

    # update statistics dict
    ModelUtils.get_model_details(experiment_info, net, optimizer, criterion)

    # complete program runtime
    experiment_info['program_runtime'] = time() - program_start_time

    # generate data set of all experiment statistics and additional information
    experiment_statistic = xr.Dataset({
        'error_statistic' : error_statistics,
        'run_time_statistic': run_times,
        'samples_statistic' : skip_statistics}).assign_attrs(experiment_info)

    # dump experiment statistic
    with open(output_path + '/experiment_statistic.pkl', 'wb') as handle:
        pkl.dump(experiment_statistic, handle, protocol=pkl.HIGHEST_PROTOCOL)

    if 'per_station_rmse' in config:
        # dump experiment statistic
        with open(output_path + '/rmse_per_station.pkl', 'wb') as handle:
            pkl.dump(error_per_station_statistics, handle, protocol=pkl.HIGHEST_PROTOCOL)

    # print program execution time
    m, s = divmod(experiment_info['program_runtime'], 60)
    h, m = divmod(m, 60)
    print('Experiment has successfully finished in %dh %02dmin %02ds' % (h, m, s))
コード例 #10
0
from sklearn.linear_model import LinearRegression

import utils.AlphaVantageUtils as av
import utils.PostgresUtils as pg
import utils.ModelUtils as mdl

# Look up the symbol name for the Microsoft ticker.
name = pg.get_symbol_name(av._TIC_MICROSOFT)

# Fetch the daily Microsoft prices together with their feature columns; the
# three trailing arguments of the query are left at None.
df_prices = pg.get_prices_with_features(
    av._TIC_MICROSOFT, av._INT_DAILY, None, None, None)

# Remove the raw price/volume columns in place; only the remaining columns
# are used below.
df_prices.drop(columns=['open', 'high', 'low', 'volume'], inplace=True)

# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
df_prices.info()

# NOTE(review): the meaning of the second argument (1000) is defined by
# mdl.train_test_split -- presumably the size of the test split; confirm.
df_train, df_test = mdl.train_test_split(df_prices, 1000)

predictions = []

# Strip the datetime column from both splits.
train = df_train.drop(columns=pg._COL_DATETIME)
test = df_test.drop(columns=pg._COL_DATETIME)

print(train.shape)
print(test.shape)

# Separate the close-price column (y) from the remaining columns (X).
train_X = train.drop(columns=pg._COL_CLOSE)
train_y = train[pg._COL_CLOSE]

# Show the first row of the training inputs.
print(train_X.iloc[:1])

test_X = test.drop(columns=pg._COL_CLOSE)
コード例 #11
0
def _collect_loader_data(loader):
    """Iterate once over ``loader`` and gather all of its batches.

    Every batch is flattened into one 2-d array by concatenating, along
    axis 1, the squeezed grid input, the station time invariant input (only
    if the batch provides it), the time data, the label and
    ``init_station_temp[2]``.

    Args:
        loader: iterable yielding batches of length 4
            ``(label, grid input, time data, init_station_temp)`` or length 5
            (same, with the station time invariant features inserted before
            ``init_station_temp``).

    Returns:
        Tuple ``(data, stations, inits)``: the row-wise stacked batch arrays,
        the horizontally stacked ``init_station_temp[1]`` entries and the list
        of all ``init_station_temp[0]`` entries. ``data`` and ``stations`` are
        ``None`` if no batch could be processed.

    Raises:
        Exception: if a batch has neither 4 nor 5 elements.
    """
    data_chunks = []
    station_chunks = []
    inits = []

    for batch in loader:
        try:
            # a batch has only length 4 if we do not use the station time invariant features
            if len(batch) == 4:
                Blabel, Bip2d, BTimeData, init_station_temp = batch
                station_time_inv_input = None
            elif len(batch) == 5:
                Blabel, Bip2d, BTimeData, StationTimeInv, init_station_temp = batch
                station_time_inv_input = ModelUtils.getVariable(
                    StationTimeInv).float()
            else:
                raise Exception('Unknown data format for training...')
            grid_input = ModelUtils.getVariable(Bip2d).float()
            time_data = ModelUtils.getVariable(BTimeData).float()
            target = ModelUtils.getVariable(Blabel).float()
        except TypeError:
            # when the batch size is small, it could happen, that all labels have been corrupted and therefore
            # collate_fn would return an empty list
            print('Value error...')
            continue

        # Choose the feature layout explicitly instead of relying on a failed
        # concatenation to detect missing station features.
        # NOTE(review): squeeze() drops every size-1 axis, including the batch
        # axis of a single-sample batch -- confirm such batches cannot occur.
        features = [grid_input.squeeze()]
        if station_time_inv_input is not None:
            features.append(station_time_inv_input)
        features += [time_data, target, init_station_temp[2]]

        data_chunks.append(np.concatenate(features, axis=1))
        station_chunks.append(init_station_temp[1])
        inits.extend(init_station_temp[0])

    if not data_chunks:
        return None, None, inits

    # Stack once at the end rather than re-copying the growing array for
    # every batch.
    return np.vstack(data_chunks), np.hstack(station_chunks), inits


def CreateData(config, data_dictionary, data_statistics, train_test_folds):
    """Export the network-ready train and test data of the first fold.

    The model description is read from the JSON file at
    ``config['model']['path']``. Train and test samples of
    ``train_test_folds[0]`` are run through the data loaders, flattened into
    data frames (one row per sample, keyed by 'Station' and 'Init') and
    pickled to ``<config['input_source']>/network_ready_data/train_data`` and
    ``.../test_data``. The config is pickled next to them as ``config.pkl``.

    Args:
        config: run configuration dictionary. It is modified in place: the
            time invariant parameter lists are emptied if the model does not
            enable them, and 'code_commit', 'train_data_path' and
            'test_data_path' are added.
        data_dictionary: passed to the data sets as ``station_data_dict``.
        data_statistics: feature statistics used to build the feature scaling
            functions.
        train_test_folds: sequence of ``(train_files, test_files)`` pairs;
            only the first pair is used.

    Raises:
        ValueError: if not a single training batch could be processed.
    """
    # load the model description
    with open(config['model']['path']) as handle:
        ModelDict = json.loads(handle.read())

    # check if station and grid time invariant features should be used and set the list of desired parameters
    if not ('grid_time_invariant' in ModelDict
            and ModelDict['grid_time_invariant']):
        config['grid_time_invariant_parameters'] = []
    if not ('station_time_invariant' in ModelDict
            and ModelDict['station_time_invariant']):
        config['station_parameters'] = []

    # load time invariant features (always loaded, regardless of the model)
    time_invariant_path = (
        "%s/%s/grid_size_%s/time_invariant_data_per_station.pkl" %
        (config['input_source'], config['preprocessing'],
         config['original_grid_size']))
    with open(time_invariant_path, "rb") as input_file:
        time_invarian_data = pkl.load(input_file)

    # initialize feature scaling function for each feature
    featureScaleFunctions = DataUtils.getFeatureScaleFunctions(
        ModelUtils.ParamNormalizationDict, data_statistics)

    # add revision short hash to the config
    config['code_commit'] = ModelUtils.get_git_revision_short_hash()

    # take the right preprocessed train/test data set for the first run
    train_fold, test_fold = train_test_folds[0]

    # initialize train and test dataloaders
    trainset = DataLoaders.CosmoDataGridData(
        config=config,
        station_data_dict=data_dictionary,
        files=train_fold,
        featureScaling=featureScaleFunctions,
        time_invariant_data=time_invarian_data)
    trainloader = DataLoader(trainset,
                             batch_size=config['batch_size'],
                             shuffle=True,
                             num_workers=config['n_loaders'],
                             collate_fn=DataLoaders.collate_fn)

    testset = DataLoaders.CosmoDataGridData(
        config=config,
        station_data_dict=data_dictionary,
        files=test_fold,
        featureScaling=featureScaleFunctions,
        time_invariant_data=time_invarian_data)
    testloader = DataLoader(testset,
                            batch_size=config['batch_size'],
                            shuffle=True,
                            num_workers=config['n_loaders'],
                            collate_fn=DataLoaders.collate_fn)

    # loop over complete train set
    train_data, train_stations, train_inits = _collect_loader_data(trainloader)
    if train_data is None:
        # the column layout below is derived from the training data
        raise ValueError(
            'No valid training batch could be processed, cannot create '
            'network ready data.')

    # define column names for data frame
    column_names = [
        'Pressure', 'Wind U-Comp.', 'Wind V-Comp.', 'Wind VMAX',
        '2m-Temperature', 'Temp. of Dew Point', 'Cloud Coverage (High)',
        'Cloud Coverage (Medium)', 'Cloud Coverage (Low)',
        'Tot. Precipitation', 'ALB_RAD', 'ASOB', 'ATHB', 'HPBL',
        '2m-Temperature (Lead=0)'
    ]
    column_names += [
        'Grid Height', 'Grid-Station Height Diff.', 'Fraction of Land',
        'Soiltype', 'Latitiude', 'Longitued', 'Grid-Station 2d Distance'
    ]
    # the station columns are only added when the data has at least 31 columns
    if train_data.shape[1] >= 31:
        column_names += [
            'Station Height', 'Station Latitude', 'Station Longitude'
        ]
    column_names += [
        'Hour (Cosine)', 'Hour (Sine)', 'Month (Cosine)', 'Month (Sine)',
        'Lead-Time'
    ]
    column_names += ['Target 2m-Temp.']
    column_names += ['COSMO 2m-Temp.']

    train_keys = pd.DataFrame.from_dict({
        'Station': train_stations,
        'Init': train_inits
    })
    train_data = pd.DataFrame(data=train_data, columns=column_names)
    train_ds = pd.concat([train_keys, train_data], axis=1)

    # loop over complete test set
    test_data, test_stations, test_inits = _collect_loader_data(testloader)

    test_keys = pd.DataFrame.from_dict({
        'Station': test_stations,
        'Init': test_inits
    })
    test_data = pd.DataFrame(data=test_data, columns=column_names)
    test_ds = pd.concat([test_keys, test_data], axis=1)

    # exist_ok avoids the check-then-create race of a separate exists() test
    network_ready_data_path = config['input_source'] + '/network_ready_data'
    os.makedirs(network_ready_data_path, exist_ok=True)

    network_ready_train_data_path = network_ready_data_path + '/train_data'
    network_ready_test_data_path = network_ready_data_path + '/test_data'

    train_ds.to_pickle(network_ready_train_data_path)
    test_ds.to_pickle(network_ready_test_data_path)

    # shap specific config entries for analysis in jupyter notebook
    config['train_data_path'] = network_ready_train_data_path
    config['test_data_path'] = network_ready_test_data_path

    # dump config
    with open(network_ready_data_path + '/config.pkl', 'wb') as handle:
        pkl.dump(config, handle, protocol=pkl.HIGHEST_PROTOCOL)

    print('Network ready data analysis successfully executed.')
コード例 #12
0
elif options.script == 'validatePreprocessing':
    print('Starting to run %s' % options.script)

    S = [x for x in range(144)]
    ValidatePreprocessing.GetData(S, G, withTopo, DB, DE, T, isLocal)

# main methods to run neural network model runs
# this requires >=1 model config file in the "models" folder of an experiment and an "experiment_parameters.txt" file
# sample model configs can be found under /results/runs/neural_network
elif options.script == 'runModel':
    # take time of start of the experiment
    experiment_start = time()

    # this method executes all prerequisite steps to run a model, i.e. preparation of run config, generating/loading data splits,
    # loading the data into a dictionary for "station" preprocessed data, loading data statistic of features for normalization
    config, train_test_folds, data_dictionary, data_statistics = ModelUtils.setUpModelRun(
        options=options, G=G)

    print('Starting to run %s' % options.script)

    # validation method of feature normalization, can be ignored
    if options.model_type == "featureNormalizationValidation":
        ValidateFeatureNormalization.runModel(
            config=config,
            data_dictionary=data_dictionary,
            data_statistics=data_statistics,
            train_test_folds=train_test_folds)
        print('Finished validation of feature normalization.')
    # all model runs, or experiment with a similar setup
    else:
        # get all paths of model configuration files. This allows to define several models at a time to be run
        models = [