def minibatch_xxxy(self, inputs):
        def xpxixpy(X, y, size):
            # ridge-regularized normal equations: (X'X + 1e-4 I)^-1 X'y;
            # the small diagonal term keeps the inverse well-defined when
            # dummy columns are collinear within a mini-batch
            xpp = np.dot(
                np.linalg.inv(
                    np.dot(X.T, X) + np.diag(np.repeat(.0001, X.shape[1]))),
                np.dot(X.T, y))
            # pad with near-zero noise up to a fixed length so every batch
            # contributes the same number of extra columns
            xpp = np.append(
                np.squeeze(xpp).astype(dtype='float32'),
                np.random.normal(0, .0000001, size - len(xpp)))
            return torch.FloatTensor(xpp)

        df1 = pd.DataFrame(inputs.data.numpy())
        df1.columns = [
            'CarAge', 'DriverAge', 'ClaimNb_0', 'ClaimNb_1', 'ClaimNb_2',
            'ClaimNb_3', 'ClaimNb_4', 'Power_d', 'Power_e', 'Power_f',
            'Power_g', 'Power_h', 'Power_i', 'Power_j', 'Power_k', 'Power_l',
            'Power_m', 'Power_n', 'Power_o', 'Brand_Fiat',
            'Brand_Japanese (except Nissan) or Korean',
            'Brand_Mercedes, Chrysler or BMW',
            'Brand_Opel, General Motors or Ford',
            'Brand_Renault, Nissan or Citroen',
            'Brand_Volkswagen, Audi, Skoda or Seat', 'Brand_other',
            'Gas_Diesel', 'Gas_Regular', 'Region_R11', 'Region_R23',
            'Region_R24', 'Region_R25', 'Region_R31', 'Region_R52',
            'Region_R53', 'Region_R54', 'Region_R72', 'Region_R74',
            'ExposureCat_1', 'ExposureCat_2', 'ExposureCat_3', 'ExposureCat_4',
            'ExposureCat_5', 'ExposureCat_6', 'ExposureCat_7', 'ExposureCat_8',
            'ExposureCat_9', 'ExposureCat_10', 'ExposureCat_11',
            'ExposureCat_12', 'DensityCat_1', 'DensityCat_2', 'DensityCat_3',
            'DensityCat_4', 'DensityCat_5'
        ]
        df2 = back_from_dummies(df1)
        df2['ClaimNb'] = df2['ClaimNb'].astype('int')
        y, X = dmatrices(
            'ClaimNb ~ CarAge + DriverAge + Power + Brand + Gas + Region + DensityCat',
            data=df2,
            return_type='dataframe')
        # guard: if a batch has no claims at all, force one positive count so
        # the Poisson design matrix below is not degenerate
        if (np.sum(y) == 0).bool():
            y[:1] = 1
        # append the batch-level regression coefficients to every row so the
        # discriminator can compare regression structure, not just marginals
        return torch.cat(
            (inputs, xpxixpy(X, y, self.add_rows).repeat(len(inputs), 1)), 1)
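

# `back_from_dummies` is used throughout this listing but never defined; a
# minimal sketch that inverts pd.get_dummies by taking the argmax within each
# "prefix_level" column group (assumes the default '_' separator):
def back_from_dummies(df):
    result = pd.DataFrame(index=df.index)
    dummy_cols = [c for c in df.columns if '_' in c]
    # columns without a separator (e.g. CarAge, DriverAge) pass through as-is
    for col in df.columns:
        if col not in dummy_cols:
            result[col] = df[col]
    for prefix in sorted({c.split('_', 1)[0] for c in dummy_cols}):
        group = [c for c in dummy_cols if c.startswith(prefix + '_')]
        # the level with the largest indicator wins (tolerates soft one-hots)
        result[prefix] = df[group].idxmax(axis=1).str.split('_', n=1).str[1]
    return result
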
def train_GAN2(generator,
               discriminator,
               optim_gen,
               optim_disc,
               auto_loader,
               autoencoder,
               z_size,
               epochs=500,
               disc_epochs=2,
               gen_epochs=3,
               penalty=0.1,
               temperature=None,
               var_locs=[0, 1, 2, 3],
               save_bool=False,
               output_disc_path='./saved_parameters/discriminator2',
               output_gen_path='./saved_parameters/generator2',
               output_disc_optim_path='./saved_parameters/disc_optim2',
               output_gen_optim_path='./saved_parameters/gen_optim2',
               output_fig_save_path='./saved_parameters/fig1'):
    autoencoder.train(mode=False)
    generator.train(mode=True)
    discriminator.train(mode=True)
    disc_losses = []
    gen_losses = []
    disc_loss = torch.tensor(9999)
    gen_loss = torch.tensor(9999)
    # number of continuous columns (their positions are given by var_locs)
    cont_num = len(var_locs)
    loop = tqdm(total=epochs, position=0, leave=False)
    for epoch in range(epochs):
        for d_epoch in range(disc_epochs):
            for c1, c2 in auto_loader:
                batch = torch.cat([c2, c1], 1)
                optim_disc.zero_grad()
                # train discriminator with real data
                real_features = Variable(batch)
                real_pred = discriminator(real_features)
                # the discriminator outputs high numbers if it thinks the data
                # is real; take the negative because we minimize the loss
                real_loss = -real_pred.mean(0).view(1)
                real_loss.backward()

                # then train the discriminator only with fake data
                noise = Variable(
                    torch.FloatTensor(len(batch), z_size).normal_())
                fake_code = generator(noise)
                cat_part = fake_code[:, cont_num:(z_size + cont_num)]
                cont_part = fake_code[:, 0:cont_num]
                fake_features = torch.cat([
                    cont_part,
                    autoencoder.decode(
                        cat_part, training=False, temperature=temperature)
                ], 1)
                fake_features = fake_features.detach(
                )  # do not propagate to the generator
                fake_pred = discriminator(fake_features)
                fake_loss = fake_pred.mean(0).view(1)
                fake_loss.backward()

                # this is the magic from WGAN-GP
                gradient_penalty = calculate_gradient_penalty(
                    discriminator, penalty, real_features, fake_features)
                gradient_penalty.backward()

                # finally update the discriminator weights
                # using two separated batches is another trick to improve GAN training
                optim_disc.step()

                disc_loss = real_loss + fake_loss + gradient_penalty
                disc_losses.append(disc_loss.item())
                del gradient_penalty
                del fake_loss
                del real_loss
                del disc_loss

        for g_epoch in range(gen_epochs):
            optim_gen.zero_grad()

            noise = Variable(torch.FloatTensor(len(batch), z_size).normal_())
            gen_code = generator(noise)
            cat_partg = gen_code[:, cont_num:(z_size + cont_num)]
            cont_partg = gen_code[:, 0:cont_num]
            gen_features = torch.cat([
                cont_partg,
                autoencoder.decode(
                    cat_partg, training=False, temperature=temperature)
            ], 1)
            gen_pred = discriminator(gen_features)

            gen_loss = -gen_pred.mean(0).view(1)
            gen_loss.backward()

            optim_gen.step()

            gen_losses.append(gen_loss.item())
            del gen_loss
        loop.set_description(
            'epoch:{}, disc_loss:{:.4f}, gen_loss:{:.4f}'.format(
                epoch, disc_losses[-1], gen_losses[-1]))
        loop.update(1)

        # analyze Poisson regression parameters every 50 epochs
        if (epoch % 50 == 0):

            def generate_data(trained_generator):
                test_noise = Variable(
                    torch.FloatTensor(pol_dat.shape[0], z_size).normal_())
                with torch.no_grad():
                    test_code = trained_generator(test_noise, training=False)
                test_cat_partg = test_code[:, cont_num:(z_size + cont_num)]
                test_cont_partg = test_code[:, 0:cont_num]
                test_gen_features = torch.cat([
                    test_cont_partg,
                    autoencoder.decode(test_cat_partg,
                                       training=False,
                                       temperature=temperature)
                ], 1)
                return test_gen_features

            generated_data = generate_data(generator)
            df1 = pd.DataFrame(generated_data.data.numpy())
            df1.columns = list(pol_dat)
            df2 = back_from_dummies(df1)
            df2['ClaimNb'] = df2['ClaimNb'].astype('int')
            y_gen, X_gen = dmatrices(
                'ClaimNb ~ CarAge + DriverAge + Power + Brand + Gas + Region + DensityCat',
                data=df2,
                return_type='dataframe')
            df2['Exposure'] = df2['ExposureCat'].astype('float32') / 11
            poisson_mod_gen = sm.GLM(y_gen,
                                     X_gen,
                                     family=sm.families.Poisson(),
                                     offset=np.log(df2['Exposure'])).fit()
            pois_df = pd.concat([
                original_params.reset_index(drop=True),
                lower.reset_index(drop=True),
                upper.reset_index(drop=True),
                poisson_mod_gen.params.reset_index(drop=True),
                ((poisson_mod_gen.params > lower) &
                 (poisson_mod_gen.params < upper)).reset_index(drop=True)
            ], axis=1)

            pois_df.columns = ['Real', 'Lower', 'Upper', 'Generated', 'In 95% CI']
            pois_df.index = poisson_mod.params.index
            print(pois_df)
            if save_bool:
                # Option to save parameters to restart training
                torch.save(discriminator.state_dict(),
                           f=output_disc_path + '_' + str(epoch))
                torch.save(generator.state_dict(),
                           f=output_gen_path + '_' + str(epoch))
                torch.save(optim_disc.state_dict(),
                           f=output_disc_optim_path + '_' + str(epoch))
                torch.save(optim_gen.state_dict(),
                           f=output_gen_optim_path + '_' + str(epoch))
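

# `calculate_gradient_penalty` is called in both training loops but is not
# shown in this listing; a standard WGAN-GP penalty (Gulrajani et al., 2017)
# is sketched below, assuming it matches the call signature used above:
def calculate_gradient_penalty(discriminator, penalty, real_data, fake_data):
    # sample random interpolation points between the real and fake batches
    alpha = torch.rand(len(real_data), 1).expand(real_data.size())
    interpolates = alpha * real_data + (1 - alpha) * fake_data
    interpolates.requires_grad_(True)
    disc_interpolates = discriminator(interpolates)
    gradients = torch.autograd.grad(
        outputs=disc_interpolates,
        inputs=interpolates,
        grad_outputs=torch.ones_like(disc_interpolates),
        create_graph=True,
        retain_graph=True)[0]
    # penalize deviation of the gradient norm from 1
    return penalty * ((gradients.norm(2, dim=1) - 1) ** 2).mean()
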
policy1['ClaimNb'] = policy1['ClaimNb'].astype('category')
policy1['ExposureCat'] = policy1['Exposure_cat'].astype('category')
policy1['DensityCat'] = policy1['Density_cat'].astype('category')

policy_cat = pd.get_dummies(policy1.loc[:, [
    "ClaimNb", "Power", "Brand", "Gas", "Region", "ExposureCat", "DensityCat"
]])
cont_vars = policy1[['CarAge', 'DriverAge']]

# Scale the continuous variables to help training
cont_vars2 = (cont_vars - np.mean(cont_vars)) / np.std(cont_vars)
pol_dat = pd.concat([cont_vars2.reset_index(drop=True), policy_cat], axis=1)

# Take a sample from the original data for faster training
pol_dat = pol_dat.sample(n=10000, random_state=12)

# Fit a Poisson model for the original data
td = back_from_dummies(pol_dat)
td['ClaimNb'] = td['ClaimNb'].astype('int')
y_real, X_real = dmatrices(
    'ClaimNb ~ CarAge + DriverAge + Power + Brand + Gas + Region + DensityCat',
    data=td,
    return_type='dataframe')
td['Exposure'] = td['ExposureCat'].astype('float32') / 11
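# ExposureCat has 12 integer levels, so dividing by 11 above rescales it to an
# approximate exposure fraction; its log serves as the Poisson offset below.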


def xpxixpy(X, y):
    # OLS solution to the normal equations: (X'X)^-1 X'y
    return np.dot(np.linalg.inv(np.dot(X.T, X)), np.dot(X.T, y))
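# A numerically safer equivalent solves the system instead of forming the
# explicit inverse (a minimal alternative sketch):
# xy = np.linalg.solve(np.dot(X_real.T, X_real), np.dot(X_real.T, y_real))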


xy = xpxixpy(X_real, y_real)
disc_add_rows = xy.shape[0]
poisson_mod = sm.GLM(y_real,
                     X_real,
                     family=sm.families.Poisson(),
                     offset=np.log(td['Exposure'])).fit()
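
# train_GAN2 compares generated-data coefficients against `lower` and `upper`,
# which this listing never defines; a minimal sketch using the fitted model's
# 95% confidence interval:
original_params = poisson_mod.params
conf_int = poisson_mod.conf_int()
lower = conf_int[0]
upper = conf_int[1]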
all_inds = np.arange(pol_dat.shape[0])
test_inds = np.random.choice(all_inds, size=np.floor(pol_dat.shape[0]*.1).astype('int'), replace=False, p=None)
second_inds = np.setdiff1d(all_inds, test_inds)
val_inds = np.random.choice(second_inds, size=np.floor(pol_dat.shape[0]*.1).astype('int'), replace=False, p=None)
train_inds = np.setdiff1d(second_inds, val_inds)
train_2 = np.random.choice(train_inds, size=np.floor(pol_dat.shape[0]*.4).astype('int'), replace=False, p=None)

test = pol_dat.iloc[test_inds]
val = pol_dat.iloc[val_inds]
train = pol_dat.iloc[train_inds]
train_half = pol_dat.iloc[train_2]

pol_dat = train

# Wrangle train data
td = back_from_dummies(train_half)
td['ClaimNb'] = td['ClaimNb'].astype('int')
y_real, X_real = dmatrices('ClaimNb ~ CarAge + DriverAge + Power + Brand + Gas + Region + DensityCat',
                 data=td,
                 return_type='dataframe')
td['Exposure'] = td['ExposureCat'].astype('float32')/11


def xpxixpy(X, y):
    return np.dot(np.linalg.inv(np.dot(X.T, X)), np.dot(X.T, y))


xy = xpxixpy(X_real, y_real)
disc_add_rows = xy.shape[0]


# Fit a Poisson model
poisson_mod = sm.GLM(y_real, X_real, family=sm.families.Poisson(),
                     offset=np.log(td['Exposure'])).fit()
original_params = poisson_mod.params
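
# The second train_GAN2 below scores generated models against predictions on
# the held-out test split; a sketch of the `X_test` and `real_pois_preds`
# names it assumes:
td_test = back_from_dummies(test)
td_test['ClaimNb'] = td_test['ClaimNb'].astype('int')
_, X_test = dmatrices(
    'ClaimNb ~ CarAge + DriverAge + Power + Brand + Gas + Region + DensityCat',
    data=td_test,
    return_type='dataframe')
real_pois_preds = poisson_mod.predict(X_test)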
# Example #6
def train_GAN2(generator,
               discriminator,
               optim_gen,
               optim_disc,
               auto_loader,
               z_size,
               epochs=500,
               disc_epochs=2,
               gen_epochs=3,
               penalty=0.1,
               temperature=None,
               var_locs=[0, 1, 2, 3],
               save_bool=False,
               output_disc_path='./saved_parameters/discriminator2',
               output_gen_path='./saved_parameters/generator2',
               output_disc_optim_path='./saved_parameters/disc_optim2',
               output_gen_optim_path='./saved_parameters/gen_optim2',
               output_fig_save_path='./saved_parameters/fig1',
               output_data_save_path='./saved_parameters/data_generator2'):
    generator.train(mode=True)
    discriminator.train(mode=True)
    disc_losses = []
    gen_losses = []
    pois_metric = []
    # rf_metric = []
    # rf_imp_metric = []
    disc_loss = torch.tensor(9999)
    gen_loss = torch.tensor(9999)
    loop = tqdm(total=epochs, position=0, leave=False)
    for epoch in range(epochs):
        for d_epoch in range(disc_epochs):
            for c1, c2 in auto_loader:
                batch = torch.cat([c2, c1], 1)
                optim_disc.zero_grad()

                # train discriminator with real data
                real_features = Variable(batch)
                real_pred = discriminator(real_features)
                # the disc outputs high numbers if it thinks the data is real;
                # we take the negative because we are minimizing the loss
                real_loss = -real_pred.mean(0).view(1)
                real_loss.backward()

                # then train the discriminator only with fake data
                noise = Variable(
                    torch.FloatTensor(len(batch), z_size).normal_())
                fake_features = generator(noise)
                fake_features = fake_features.detach(
                )  # do not propagate to the generator
                fake_pred = discriminator(fake_features)
                fake_loss = fake_pred.mean(0).view(1)
                fake_loss.backward()

                # this is the magic from WGAN-GP
                gradient_penalty = calculate_gradient_penalty(
                    discriminator, penalty, real_features, fake_features)
                gradient_penalty.backward()

                # finally update the discriminator weights
                optim_disc.step()

                disc_loss = real_loss + fake_loss + gradient_penalty
                disc_losses.append(disc_loss.item())
                # Delete to prevent memory leakage
                del gradient_penalty
                del fake_loss
                del real_loss
                del disc_loss
                del real_features
                del real_pred
                del noise
                del fake_features
                del fake_pred

        for g_epoch in range(gen_epochs):
            optim_gen.zero_grad()

            noise = Variable(torch.FloatTensor(len(batch), z_size).normal_())
            gen_features = generator(noise, training=True)
            gen_pred = discriminator(gen_features)

            gen_loss = -gen_pred.mean(0).view(1)
            gen_loss.backward()

            optim_gen.step()

            gen_losses.append(gen_loss.item())
            del gen_loss
            del noise
            del gen_features
            del gen_pred

        loop.set_description(
            'epoch:{}, disc_loss:{:.4f}, gen_loss:{:.4f}'.format(
                epoch, disc_losses[-1], gen_losses[-1]))
        loop.update(1)
        # analyze Poisson regression parameters every 25 epochs
        if (epoch % 25 == 0):
            with torch.no_grad():
                generated_data = generator(Variable(
                    torch.FloatTensor(best_test.shape[0], z_size).normal_()),
                                           training=False)
            df1 = pd.DataFrame(generated_data.data.numpy())
            df1.columns = list(pol_dat)
            df2 = back_from_dummies(df1)
            df2['ClaimNb'] = df2['ClaimNb'].astype('int')
            y_gen, X_gen = dmatrices(
                'ClaimNb ~ CarAge + DriverAge + Power + Brand + Gas + Region + DensityCat',
                data=df2,
                return_type='dataframe')
            df2['Exposure'] = df2['ExposureCat'].astype('float32') / 11

            #df2.to_csv(output_data_save_path)
            # Fit poisson Model
            poisson_mod_gen = sm.GLM(y_gen,
                                     X_gen,
                                     family=sm.families.Poisson(),
                                     offset=np.log(df2['Exposure'])).fit()

            # Fit Random Forest
            # gen_features= X_gen
            # gen_features = np.array(gen_features)
            # y_gen2 = np.squeeze(y_gen)/np.squeeze(df2['Exposure'])
            # rf_gen = RandomForestRegressor(n_estimators = 1000, random_state = 42)
            # rf_gen.fit(gen_features, np.squeeze(y_gen2))

            # gen_predictions = rf_gen.predict(X_test)

            # importances_gen = rf_gen.feature_importances_

            # Calculate Errors
            errors_pois = abs(poisson_mod_gen.predict(X_test) - real_pois_preds)
            # errors_rf = abs(gen_predictions - real_predictions)
            # errors_imp = abs(importances_gen - importances_real)

            pois_metric.append(round(np.mean(errors_pois), 4))
            # rf_metric.append(round(np.mean(errors_rf), 4))
            # rf_imp_metric.append(np.mean(errors_imp))

            if (epoch > 3):
                plt.subplot(311)
                plt.plot(pois_metric, label='train')
                plt.ylabel('Poisson diff')

                # plt.subplot(312)
                # plt.plot(rf_metric, label = 'train')
                # plt.ylabel('rf pred dif')

                # plt.subplot(313)
                # plt.plot(rf_imp_metric, label = 'train')
                # plt.ylabel('rf imp dif')
                if save_bool:
                    plt.savefig(output_fig_save_path, bbox_inches='tight')
                plt.show()
                plt.clf()

            #print('Mean Absolute Difference RF:', round(np.mean(errors_rf), 4))
            print('Mean Absolute Difference Pois:',
                  round(np.mean(errors_pois), 4))
            #print('Mean Absolute Difference RF Imp:', round(np.mean(errors_imp), 4))

            del errors_pois
            #del errors_rf
            #del errors_imp
            #del rf_gen
            #del y_gen2
            del poisson_mod_gen
            del y_gen
            del X_gen
            #del importances_gen
            #del gen_predictions
            del generated_data
            del df1
            del df2
            #del gen_features

            if save_bool:
                # Option to save parameters to restart training
                torch.save(discriminator.state_dict(), f=output_disc_path)
                torch.save(generator.state_dict(), f=output_gen_path)
                torch.save(optim_disc.state_dict(), f=output_disc_optim_path)
                torch.save(optim_gen.state_dict(), f=output_gen_optim_path)
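
# A hypothetical invocation of this train_GAN2 (the Generator/Discriminator
# classes, sizes, and learning rates below are illustrative, not from the
# source):
# generator = Generator(z_size=100, output_size=pol_dat.shape[1])
# discriminator = Discriminator(input_size=pol_dat.shape[1] + disc_add_rows)
# optim_gen = torch.optim.Adam(generator.parameters(), lr=1e-4,
#                              betas=(.5, .9))
# optim_disc = torch.optim.Adam(discriminator.parameters(), lr=1e-4,
#                               betas=(.5, .9))
# train_GAN2(generator, discriminator, optim_gen, optim_disc, auto_loader,
#            z_size=100, epochs=500)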
# Example #7
# Reconstructed split (only the tail of this call survives in the source; the
# head follows the parallel split in the earlier example):
all_inds = np.arange(pol_dat.shape[0])
test_inds = np.random.choice(all_inds,
                             size=np.floor(pol_dat.shape[0] *
                                           .15).astype('int'),
                             replace=False,
                             p=None)
val_inds = np.random.choice(np.setdiff1d(all_inds, test_inds),
                            size=np.floor(pol_dat.shape[0] *
                                          .15).astype('int'),
                            replace=False,
                            p=None)
third_inds = np.setdiff1d(all_inds, np.concatenate((val_inds, test_inds)))
#best_test_inds = np.random.choice(third_inds, size=np.floor(pol_dat.shape[0]*.15).astype('int'), replace=False, p=None)
train_inds = np.setdiff1d(all_inds, np.concatenate((val_inds, test_inds)))

test = pol_dat.iloc[test_inds]
val = pol_dat.iloc[val_inds]
best_test = pol_dat.iloc[val_inds]
train = pol_dat.iloc[train_inds]

pol_dat = train

# Wrangle train data
td = back_from_dummies(train)
td['ClaimNb'] = td['ClaimNb'].astype('int')
y_real, X_real = dmatrices(
    'ClaimNb ~ CarAge + DriverAge + Power + Brand + Gas + Region + DensityCat',
    data=td,
    return_type='dataframe')
td['Exposure'] = td['ExposureCat'].astype('float32') / 11


def xpxixpy(X, y):
    return np.dot(np.linalg.inv(np.dot(X.T, X)), np.dot(X.T, y))


xy = xpxixpy(X_real, y_real)
disc_add_rows = xy.shape[0]
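
# As in the earlier examples, the listing would continue by fitting the
# Poisson model on the real training data (a sketch matching the pattern
# above):
# poisson_mod = sm.GLM(y_real, X_real, family=sm.families.Poisson(),
#                      offset=np.log(td['Exposure'])).fit()
# original_params = poisson_mod.params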