def __init__(self, random_seed,
                 embedding_sizelist, embedding_dimlist,
                 c_input_dim,
                 learning_rate, out_ratio):
        """Build an embedding + DNN regression model (nn.Module subclass).

        Args:
            random_seed: seed applied via set_random_seed for reproducibility.
            embedding_sizelist: vocabulary sizes for the id features
                (forwarded to idModel).
            embedding_dimlist: embedding dimension per id feature; the
                concatenated embedding width is sum(embedding_dimlist).
            c_input_dim: number of continuous (non-id) input features.
            learning_rate: Adam learning rate.
            out_ratio: scale factor stored on the instance (its use is not
                visible in this block).
        """
        nn.Module.__init__(self)

        self.random_seed = random_seed
        set_random_seed(random_seed)

        self.out_ratio = out_ratio

        # Embedding sub-model for the id inputs; its concatenated output
        # width is the sum of the per-feature embedding dims.
        self.id_model = idModel(embedding_sizelist, embedding_dimlist)
        id_out_size = sum(embedding_dimlist)

        # Feature extractor: widening MLP 128 -> 256 -> 512 -> 600 with a
        # light dropout before the last activation.
        self.dnn_feature_extract = nn.Sequential(
                    nn.Linear(c_input_dim + id_out_size, 128),
                    nn.ReLU(),
                    nn.Linear(128, 256),
                    nn.ReLU(),
                    nn.Linear(256, 512),
                    nn.ReLU(),
                    nn.Linear(512, 600),
                    nn.Dropout(0.05),
                    nn.ReLU(),
        )


        # Prediction head over the extracted features concatenated with the
        # raw inputs (600 + c_input_dim + id_out_size -> 1). bias=False:
        # the bias is the separate learnable out_bias parameter below.
        self.dnn_predict_val = nn.Sequential(
                    nn.Linear(600+c_input_dim + id_out_size, 1, bias=False),
        )

        # Learnable scalar output bias, initialised to 1.
        self.out_bias = torch.tensor(1, dtype=torch.float32)
        self.out_bias = nn.Parameter(self.out_bias)

        def loss_fn(x, y):
            # SMAPE-like relative error, summed over the batch. Both tensors
            # are shifted toward a floor around 1: predictions x go through
            # leaky_relu so gradients still flow below 1; targets y use relu
            # (no gradient needed through the targets).
            x = torch.nn.functional.leaky_relu(x -1) + 1
            y = torch.nn.functional.relu(y -1) + 1

            # NOTE(review): leaky_relu is unbounded below, so for very
            # negative predictions the denominator |x + y| can approach 0 —
            # assumed not to occur for this data; confirm the input range.
            re = torch.abs(x - y)/(torch.abs(x + y))

            return torch.sum(re)

        self.loss_fn = loss_fn
        # self.loss_fn = lambda x, y: torch.sum(torch.abs(torch.abs(x -1) - torch.abs(y - 1))/(torch.abs(x - 1) + torch.abs(y - 1) + 2))
        # self.loss_fn = torch.nn.MSELoss(reduction='sum')
        # self.loss_fn = lambda x, y: torch.sum((x - y)**2)
        self.optimizer = torch.optim.Adam(self.parameters(), lr=learning_rate)
        # Learning rate decays as 1/sqrt(epoch + 1).
        self.scheduler = torch.optim.lr_scheduler.LambdaLR(self.optimizer, lambda epoch_num: 1/math.sqrt(epoch_num+1))

        # Visdom environment name; left None here, set externally to enable
        # the plotting branches in fit().
        self.vis = None
Ejemplo n.º 2
0
    def unsupervisedTraining(self, x, y, epoch_nums, batch_size, train_ratio, custom_metric=None, plot_fold=None):
        """Shuffle (x, y), split it by train_ratio, and run fit() in unsupervised mode."""
        set_random_seed(self.random_seed)

        n_samples = x.shape[0]
        cut = int(train_ratio * n_samples)

        # Random permutation of row indices; first `cut` rows train, rest validate.
        shuffled = np.random.permutation(np.arange(n_samples))
        fit_idx = shuffled[:cut]
        holdout_idx = shuffled[cut:]

        self.fit(x[fit_idx], y[fit_idx],
                 epoch_nums, batch_size,
                 x[holdout_idx], y[holdout_idx],
                 custom_metric, plot_fold,
                 unsupervised_flag=True)
Ejemplo n.º 3
0
from torch_geometric.nn import GCNConv
from torch_geometric.nn import ChebConv
from torch_geometric.nn import SAGEConv
from torch_geometric.nn import GraphConv
from torch_geometric.nn import GATConv
from torch_geometric.nn import ARMAConv
from torch_geometric.nn import APPNP
from specialTools.GatedGraphConv import GatedGraphConv

from specialTools.STChebConv import STChebConv

from torch_geometric.data import Batch
from lxyTools.pytorchTools import set_random_seed


set_random_seed(2018)

# Load the preprocessed train/test tables.
raw_train = pd.read_csv('./processedData/train.csv')
test = pd.read_csv('./processedData/test.csv')


# Metro station adjacency ("road map") matrix.
roadmap = pd.read_csv('./rawdata/Metro_roadMap.csv')
#
# roadmap.loc[roadmap.index==55, '53'] = 1
# roadmap.loc[np.array(roadmap['53']==1)&np.array(roadmap.index==54), '53'] = 0
#
# roadmap.loc[roadmap.index==53, '55'] = 1
# roadmap.loc[np.array(roadmap['55']==1)&np.array(roadmap.index==54), '55'] = 0
#
# roadmap.loc[roadmap['54']==1, '54'] = 0
from lxyTools.pytorchTools import BiInteraction
from lxyTools.pytorchTools import CrossNet
from lxyTools.pytorchTools import FM

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

import lightgbm as lgb

from tqdm import tqdm

tqdm.pandas(desc="my bar!")

set_random_seed(1546)

# Load the data.
train = pd.read_csv('./processedData/train.csv')
train.create_time = pd.to_datetime(train.create_time)
train.days = pd.to_datetime(train.days)

valid = pd.read_csv('./processedData/valid.csv')
valid.create_time = pd.to_datetime(valid.create_time)
valid.days = pd.to_datetime(valid.days)

# Merge the training and validation data so the model has enough data to train on.
train = pd.concat([train, valid])
train = train.reset_index(drop=True)

# Downsample the samples that are zero.
Ejemplo n.º 5
0
    def fit(self, x, y, epoch_nums, batch_size, valid_x, valid_y, custom_metric=None, plot_fold=None, unsupervised_flag=False):
        """Train on (x, y) and evaluate on (valid_x, valid_y) every epoch.

        Args:
            x, y: training inputs/targets (array-like; converted to float32
                CUDA tensors).
            epoch_nums: number of training epochs.
            batch_size: mini-batch size (also used to normalise the reported
                losses, since the loss is batch-summed).
            valid_x, valid_y: validation inputs/targets.
            custom_metric: optional callable(y_true, y_pred) -> float scored
                on the validation predictions each epoch.
            plot_fold: window-name suffix for visdom plots (required when
                self.vis is set).
            unsupervised_flag: when True, use self.unsupervised_loss_fn /
                self.unsupervised_optimizer and the model's second output
                head; otherwise the supervised loss/optimizer and the first
                head.
        """
        if self.vis is not None:
            vis = visdom.Visdom(env=self.vis)

        set_random_seed(self.random_seed)

        self.batch_size = batch_size

        x_train = torch.tensor(x, dtype=torch.float32).cuda()
        y_train = torch.tensor(y, dtype=torch.float32).cuda()
        x_val = torch.tensor(valid_x, dtype=torch.float32).cuda()
        y_val = torch.tensor(valid_y, dtype=torch.float32).cuda()

        # Pick the loss/optimizer pair for the requested training mode.
        loss_fn = self.supervised_loss_fn
        optimizer = self.optimizer

        if unsupervised_flag:
            loss_fn = self.unsupervised_loss_fn
            optimizer = self.unsupervised_optimizer

        scheduler = self.scheduler

        train = torch.utils.data.TensorDataset(x_train, y_train)
        valid = torch.utils.data.TensorDataset(x_val, y_val)

        train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True)
        valid_loader = torch.utils.data.DataLoader(valid, batch_size=batch_size, shuffle=False)

        for epoch in range(epoch_nums):
            start_time = time.time()
            self.train()
            avg_loss = 0.
            avg_l2_reg = 0.
            for x_batch, y_batch in tqdm(train_loader, disable=True):

                # The model returns a pair of heads: index 0 is the
                # supervised output, index 1 the unsupervised one.
                y_pred = self(x_batch)
                if unsupervised_flag:
                    y_pred = y_pred[1]
                else:
                    y_pred = y_pred[0]

                batch_loss = loss_fn(y_pred, y_batch)

                optimizer.zero_grad()
                batch_loss.backward()
                optimizer.step()

                avg_loss += batch_loss.item() / len(train_loader)

            # BUGFIX: step the LR scheduler AFTER the epoch's optimizer
            # updates (required ordering since PyTorch 1.1). Previously it
            # was stepped at the top of the epoch loop, so the base learning
            # rate was never used for training.
            scheduler.step()
            # print('lr:\t', scheduler.get_lr()[0])

            self.eval()
            valid_preds = np.zeros((x_val.size(0)))

            avg_val_loss = 0.
            for i, (x_batch, y_batch) in enumerate(valid_loader):
                y_pred = self(x_batch)
                if unsupervised_flag:
                    y_pred = y_pred[1]
                else:
                    y_pred = y_pred[0]
                y_pred = y_pred.detach()
                avg_val_loss += loss_fn(y_pred, y_batch).item() / len(valid_loader)
                # NOTE(review): assumes y_pred is 2-D with the value in
                # column 0 — confirm against the model's forward().
                valid_preds[i * batch_size:(i+1) * batch_size] = y_pred.cpu().numpy()[:, 0]

            if custom_metric is not None:
                score = custom_metric(valid_y, valid_preds)

            elapsed_time = time.time() - start_time

            if self.vis is not None:
                vis.line(X=torch.Tensor([[epoch, epoch]]),
                         Y=torch.Tensor([[avg_loss/batch_size, avg_val_loss/batch_size]]),
                         win='loss'+plot_fold,
                         opts={'legend':['local_loss', 'valid_loss'],
                               'xlabel': 'epoch',
                               'title': 'train'+plot_fold},
                         update='append' if epoch > 0 else None)

            if custom_metric is not None:
                if self.vis is not None:
                    vis.line(X=torch.Tensor([epoch]),
                             Y=torch.Tensor([score]),
                             win='score'+plot_fold,
                             opts={'legend':['score'],
                                   'xlabel': 'epoch',
                                   'title': 'valid'+plot_fold},
                             update='append' if epoch > 0 else None)

            if custom_metric is not None:
                print('Epoch {}/{} \t loss={:.4f}  \t l2={:.4f} \t val_loss={:.4f} \t score={:.4f} \t time={:.2f}s'.format(
                    epoch + 1, epoch_nums, avg_loss/batch_size, avg_l2_reg, avg_val_loss/batch_size, score, elapsed_time))
            else:
                print('Epoch {}/{} \t loss={:.4f}  \t l2={:.4f} \t val_loss={:.4f} \t time={:.2f}s'.format(
                    epoch + 1, epoch_nums, avg_loss/batch_size, avg_l2_reg, avg_val_loss/batch_size, elapsed_time))
    def fit(self, train_x, train_y, valid_x, valid_y, kfold, train_epochs,
            batch_size):
        """K-fold training driver: builds one model per fold via self.modelFun,
        trains it with a hand-rolled CUDA loop, plots losses/scores to visdom,
        and collects out-of-fold predictions.

        Args:
            train_x, train_y: full training arrays (indexable by fold indices).
            valid_x, valid_y: held-out validation set scored every epoch.
            kfold: sklearn-style splitter exposing .split(X, y).
            train_epochs: epochs per fold.
            batch_size: mini-batch size (also used to normalise printed
                losses, since the loss is batch-summed).
        """
        # Per-fold random seeds: 10086, 10586, 11086, ...
        seed_start = 10086
        seed_step = 500

        self.fittedModelslist = []

        # NOTE(review): Online_Metric appears to return a tuple whose second
        # element is the score — confirm against its definition.
        custom_metric = lambda x, y: Online_Metric(x, y)[1]
        # Out-of-fold predictions over the whole training set.
        train_preds = np.zeros((len(train_x)))

        for fold_num, (train_idx,
                       valid_idx) in enumerate(kfold.split(train_x, train_y)):

            x_train_fold = train_x[train_idx]
            y_train_fold = train_y[train_idx]
            x_val_fold = train_x[valid_idx]
            y_val_fold = train_y[valid_idx]

            # Fresh model per fold with a fold-specific seed.
            model = self.modelFun(random_seed=seed_start +
                                  fold_num * seed_step,
                                  **self.param)
            model = model.cuda()
            model.batch_size = batch_size
            vis = visdom.Visdom(env=model.vis)

            set_random_seed(model.random_seed)

            x_train_fold_tensor = torch.tensor(x_train_fold,
                                               dtype=torch.float32).cuda()
            y_train_fold_tensor = torch.tensor(y_train_fold,
                                               dtype=torch.float32).cuda()
            x_val_fold_tensor = torch.tensor(x_val_fold,
                                             dtype=torch.float32).cuda()
            y_val_fold_tensor = torch.tensor(y_val_fold,
                                             dtype=torch.float32).cuda()

            # Loss/optimizer/scheduler all come from the model instance.
            loss_fn = model.loss_fn
            optimizer = model.optimizer
            scheduler = model.scheduler

            train_fold = torch.utils.data.TensorDataset(
                x_train_fold_tensor, y_train_fold_tensor)
            valid_fold = torch.utils.data.TensorDataset(
                x_val_fold_tensor, y_val_fold_tensor)

            train_fold_loader = torch.utils.data.DataLoader(
                train_fold, batch_size=batch_size, shuffle=True)
            valid_fold_loader = torch.utils.data.DataLoader(
                valid_fold, batch_size=batch_size, shuffle=False)

            for epoch in range(train_epochs):
                # NOTE(review): stepping the scheduler before any
                # optimizer.step() is the pre-PyTorch-1.1 ordering; the base
                # learning rate is never used for training.
                scheduler.step()

                start_time = time.time()
                model.train()
                avg_loss = 0.
                for x_batch, y_batch in tqdm(train_fold_loader, disable=True):

                    y_pred = model(x_batch)
                    loss = loss_fn(y_pred, y_batch)

                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

                    avg_loss += loss.item() / len(train_fold_loader)
                # Loss is summed over each batch, so normalise for reporting.
                avg_loss /= batch_size

                model.eval()
                valid_fold_preds = np.zeros((y_val_fold.shape[0]))

                avg_val_loss = 0.
                for i, (x_batch, y_batch) in enumerate(valid_fold_loader):
                    y_pred = model(x_batch)
                    y_pred = y_pred.detach()
                    avg_val_loss += loss_fn(
                        y_pred, y_batch).item() / len(valid_fold_loader)
                    valid_fold_preds[i * batch_size:(i + 1) *
                                     batch_size] = y_pred.cpu().numpy()

                avg_val_loss /= batch_size

                # Score on the fold's holdout and on the external valid set.
                kfold_score = custom_metric(y_val_fold, valid_fold_preds)
                valid_score = custom_metric(valid_y,
                                            model.predict_proba(valid_x))

                elapsed_time = time.time() - start_time

                vis.line(X=torch.Tensor([[epoch, epoch]]),
                         Y=torch.Tensor([[avg_loss, avg_val_loss]]),
                         win='loss' + '_' + str(fold_num),
                         opts={
                             'legend': ['local_loss', 'valid_loss'],
                             'xlabel': 'epoch',
                             'title': 'train' + '_' + str(fold_num)
                         },
                         update='append' if epoch > 0 else None)

                vis.line(X=torch.Tensor([[epoch, epoch]]),
                         Y=torch.Tensor([[kfold_score, valid_score]]),
                         win='score' + '_' + str(fold_num),
                         opts={
                             'legend': ['kfold score', 'valid score'],
                             'xlabel': 'epoch',
                             'title': 'valid' + '_' + str(fold_num)
                         },
                         update='append' if epoch > 0 else None)

                print(
                    'Epoch {}/{} \t loss={:.4f} \t val_fold_loss={:.4f} \t fold_score={:.4f} \t valid_score={:.4f} \t time={:.2f}s'
                    .format(epoch + 1, train_epochs, avg_loss, avg_val_loss,
                            kfold_score, valid_score, elapsed_time))

            # Keep out-of-fold predictions and the fitted fold model.
            train_preds[valid_idx] = model.predict_proba(x_val_fold)
            self.fittedModelslist.append(model)
            print('-' * 50)

        print('-' * 20, 'finished', '-' * 20)
        print('kfold score:\t', custom_metric(train_y, train_preds))
        print('valid score:\t', custom_metric(valid_y, self.predict(valid_x)))
        print('-' * 50)
    def fit(self, train_x, train_y, cat_features, valid_x, valid_y, kfold,
            train_epochs, batch_size):
        """K-fold training of a LightGBM -> NN stacking pipeline.

        Per fold: fit a LightGBM model, extract the per-tree leaf indices
        (pred_leaf) for the fold-train/fold-test/valid rows, offset the leaf
        ids so every tree owns a disjoint id range, then train an embedding NN
        on those leaf indices.

        Args:
            train_x, train_y: pandas objects indexable with .iloc.
            cat_features: categorical feature names for LightGBM.
            valid_x, valid_y: held-out validation data scored per fold.
            kfold: sklearn-style splitter exposing .split(X, y).
            train_epochs, batch_size: NN training settings.
        """
        # Per-fold random seeds: 10086, 10586, 11086, ...
        seed_start = 10086
        seed_step = 500

        self.fittedLgbModellist = []
        self.fittedNNModellist = []

        # Out-of-fold NN predictions over the whole training set.
        self.train_pred = np.zeros(train_x.shape[0])

        for fold_num, (kTrainIndex,
                       kTestIndex) in enumerate(kfold.split(train_x, train_y)):

            kTrain_x = train_x.iloc[kTrainIndex]
            kTrain_y = train_y.iloc[kTrainIndex]

            kTest_x = train_x.iloc[kTestIndex]
            kTest_y = train_y.iloc[kTestIndex]

            self.lgb_param['random_state'] = seed_start + fold_num * seed_step
            lgb_model = self.lgb_model_fun(**self.lgb_param)

            lgb_model.fit(kTrain_x,
                          kTrain_y,
                          categorical_feature=cat_features,
                          eval_metric=Online_Metric,
                          eval_set=[(kTest_x, kTest_y), (valid_x, valid_y)],
                          verbose=50)

            # pred_leaf=True returns, per row, the index of the leaf hit in
            # each tree.
            kTrain_x_leaves = lgb_model.predict(kTrain_x, pred_leaf=True)
            kTest_x_leaves = lgb_model.predict(kTest_x, pred_leaf=True)
            valid_x_leaves = lgb_model.predict(valid_x, pred_leaf=True)

            # Shift tree i's leaf ids into [i*num_leaves, (i+1)*num_leaves)
            # so the NN embedding gets a globally unique id per (tree, leaf).
            for i in range(self.lgb_param['n_estimators']):
                kTrain_x_leaves[:, i] += i * self.lgb_param['num_leaves']
                kTest_x_leaves[:, i] += i * self.lgb_param['num_leaves']
                valid_x_leaves[:, i] += i * self.lgb_param['num_leaves']

            self.nn_param['random_seed'] = seed_start + fold_num * seed_step
            set_random_seed(self.nn_param['random_seed'])
            nn_model = self.nn_model_fun(**self.nn_param).cuda()

            nn_model.fit(kTrain_x_leaves,
                         kTrain_y.values,
                         train_epochs,
                         batch_size,
                         kTest_x_leaves,
                         kTest_y.values,
                         custom_metric=lambda x, y: Online_Metric(x, y)[1],
                         plot_fold=str(fold_num))

            valid_pre = nn_model.predict(valid_x_leaves)
            self.train_pred[kTestIndex] = nn_model.predict(kTest_x_leaves)

            # BUGFIX: message typo 'socre' -> 'score'; separator normalised
            # from '--' * 50 (100 dashes) to '-' * 50 to match the other
            # fit() implementations in this project.
            print('valid score:\t', Online_Metric(valid_y, valid_pre)[1])
            print('-' * 50)
            self.fittedLgbModellist.append(lgb_model)
            self.fittedNNModellist.append(nn_model)

        print('train kfold score:\t', Online_Metric(train_y, self.train_pred))
        print('valid score:\t', Online_Metric(valid_y, self.predict(valid_x)))
    def fit(self, train_x, train_y, valid_x, valid_y, kfold, train_epochs,
            batch_size):
        """K-fold training driver for a two-input model.

        train_x is a list of two aligned arrays: train_x[0] carries integer
        id features (fed to the model as torch.long), train_x[1] continuous
        features (torch.float32). One model is built per fold via
        self.modelFun and trained with a hand-rolled CUDA loop; out-of-fold
        predictions are collected for a final kfold score.

        Args:
            train_x: list [id_array, float_array] of training inputs.
            train_y: training targets.
            valid_x, valid_y: held-out validation set scored every epoch.
            kfold: sklearn-style splitter exposing .split(X, y).
            train_epochs: epochs per fold.
            batch_size: mini-batch size (also used to normalise printed
                losses, since the loss is batch-summed).
        """
        # Per-fold random seeds: 10086, 10586, 11086, ...
        seed_start = 10086
        seed_step = 500

        self.fittedModelslist = []

        # NOTE(review): Online_Metric appears to return a tuple whose second
        # element is the score — confirm against its definition.
        custom_metric = lambda x, y: Online_Metric(x, y)[1]
        # Out-of-fold predictions; length matches the first input array.
        train_preds = np.zeros((len(train_x[0])))

        for fold_num, (train_idx, valid_idx) in enumerate(
                kfold.split(train_x[0], train_y)):
            # Apply the same fold indices to every input array.
            x_train_fold = [data[train_idx] for data in train_x]
            y_train_fold = train_y[train_idx]

            x_val_fold = [data[valid_idx] for data in train_x]
            y_val_fold = train_y[valid_idx]

            # Fresh model per fold with a fold-specific seed.
            model = self.modelFun(random_seed=seed_start +
                                  fold_num * seed_step,
                                  **self.param)
            model = model.cuda()
            model.batch_size = batch_size

            set_random_seed(model.random_seed)

            # id features as long (embedding lookups), continuous as float32.
            x_train_fold_tensor = [
                torch.tensor(x_train_fold[0], dtype=torch.long).cuda(),
                torch.tensor(x_train_fold[1], dtype=torch.float32).cuda(),
            ]

            x_val_fold_tensor = [
                torch.tensor(x_val_fold[0], dtype=torch.long).cuda(),
                torch.tensor(x_val_fold[1], dtype=torch.float32).cuda(),
            ]

            y_train_fold_tensor = torch.tensor(y_train_fold,
                                               dtype=torch.float32).cuda()
            y_val_fold_tensor = torch.tensor(y_val_fold,
                                             dtype=torch.float32).cuda()

            # Loss/optimizer/scheduler all come from the model instance.
            loss_fn = model.loss_fn
            optimizer = model.optimizer
            scheduler = model.scheduler

            train_fold = torch.utils.data.TensorDataset(
                *x_train_fold_tensor, y_train_fold_tensor)
            valid_fold = torch.utils.data.TensorDataset(
                *x_val_fold_tensor, y_val_fold_tensor)

            train_fold_loader = torch.utils.data.DataLoader(
                train_fold, batch_size=batch_size, shuffle=True)
            valid_fold_loader = torch.utils.data.DataLoader(
                valid_fold, batch_size=batch_size, shuffle=False)

            for epoch in range(train_epochs):
                # NOTE(review): stepping the scheduler before any
                # optimizer.step() is the pre-PyTorch-1.1 ordering; the base
                # learning rate is never used for training.
                scheduler.step()

                start_time = time.time()
                model.train()
                avg_loss = 0.
                for x_batch_1, x_batch_2, y_batch in tqdm(train_fold_loader,
                                                          disable=True):

                    x_batch = [x_batch_1, x_batch_2]
                    y_pred = model(*x_batch)
                    loss = loss_fn(y_pred, y_batch)

                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

                    avg_loss += loss.item() / len(train_fold_loader)
                # Loss is summed over each batch, so normalise for reporting.
                avg_loss /= batch_size

                model.eval()
                valid_fold_preds = np.zeros((y_val_fold.shape[0]))

                avg_val_loss = 0.
                for i, (x_batch_1, x_batch_2,
                        y_batch) in enumerate(valid_fold_loader):
                    x_batch = [x_batch_1, x_batch_2]
                    y_pred = model(*x_batch)
                    y_pred = y_pred.detach()
                    avg_val_loss += loss_fn(
                        y_pred, y_batch).item() / len(valid_fold_loader)
                    valid_fold_preds[i * batch_size:(i + 1) *
                                     batch_size] = y_pred.cpu().numpy()

                avg_val_loss /= batch_size

                # Score on the fold's holdout and on the external valid set.
                kfold_score = custom_metric(y_val_fold, valid_fold_preds)
                valid_score = custom_metric(valid_y, model.predict(valid_x))

                elapsed_time = time.time() - start_time

                print(
                    'Epoch {}/{} \t loss={:.4f} \t val_fold_loss={:.4f} \t fold_score={:.4f} \t valid_score={:.4f} \t time={:.2f}s'
                    .format(epoch + 1, train_epochs, avg_loss, avg_val_loss,
                            kfold_score, valid_score, elapsed_time))

            # Keep out-of-fold predictions and the fitted fold model.
            train_preds[valid_idx] = model.predict(x_val_fold)
            self.fittedModelslist.append(model)
            print('-' * 50)

        print('-' * 20, 'finished', '-' * 20)
        print('kfold score:\t', custom_metric(train_y, train_preds))
        print('valid score:\t', custom_metric(valid_y, self.predict(valid_x)))
        print('-' * 50)