Example #1
    def train_epoch(self, epoch):
        self.model.train()
        losses = []

        def reset_hidden(hidden, mask):
            """Helper function that resets hidden state when some sessions terminate"""
            if len(mask) != 0:
                hidden[:, mask, :] = 0
            return hidden

        hidden = self.model.init_hidden()
        dataloader = lib.DataLoader(self.train_data, self.batch_size)
        for ii, (input, target, mask) in tqdm(
                enumerate(dataloader),
                total=len(dataloader.dataset.df) // dataloader.batch_size,
                miniters=1000):
            input = input.to(self.device)
            target = target.to(self.device)
            self.optim.zero_grad()
            hidden = reset_hidden(hidden, mask).detach()
            logit, hidden = self.model(input, hidden)
            # output sampling
            logit_sampled = logit[:, target.view(-1)]
            loss = self.loss_func(logit_sampled)
            losses.append(loss.item())
            loss.backward()
            self.optim.step()

        mean_losses = np.mean(losses)
        return mean_losses
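
The loss function itself is not shown in this example: self.loss_func receives the sampled score matrix whose diagonal holds the scores of the true next items. A minimal sketch, assuming a GRU4Rec-style TOP1 loss (an assumption, not the actual lib code), could look like this:

import torch
import torch.nn as nn

class TOP1Loss(nn.Module):
    """Hypothetical TOP1 loss over sampled logits (GRU4Rec style).

    Assumes logit is a (B, B) score matrix whose diagonal contains the
    scores of the true next items, as produced by logit[:, target.view(-1)].
    """
    def forward(self, logit):
        diag = logit.diag().view(-1, 1).expand_as(logit)
        # pairwise ranking term plus a regulariser pushing negative scores towards zero
        return torch.sigmoid(logit - diag).mean() + torch.sigmoid(logit ** 2).mean()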
Example #2
    def eval(self, eval_data, batch_size):
        self.model.eval()
        losses = []
        recalls = []
        mrrs = []
        dataloader = lib.DataLoader(eval_data, batch_size)
        with torch.no_grad():
            hidden = self.model.init_hidden()
            for ii, (input, target, mask) in tqdm(
                    enumerate(dataloader),
                    total=len(dataloader.dataset.df) // dataloader.batch_size,
                    miniters=1000):
                input = input.to(self.device)
                target = target.to(self.device)
                logit, hidden = self.model(input, hidden)
                logit_sampled = logit[:, target.view(-1)]
                loss = self.loss_func(logit_sampled)
                recall, mrr = lib.evaluate(logit, target, k=self.topk)

                # torch.Tensor.item() to get a Python number from a tensor containing a single value
                losses.append(loss.item())
                recalls.append(recall)
                mrrs.append(mrr.item())
        mean_losses = np.mean(losses)
        mean_recall = np.mean(recalls)
        mean_mrr = np.mean(mrrs)

        return mean_losses, mean_recall, mean_mrr
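
lib.evaluate is called with the raw logits, the targets, and k=self.topk, and returns a recall value plus an MRR tensor. Its implementation is not part of this example; a plausible hand-rolled version (an assumption, not the library's actual code) is:

import torch

def evaluate(logit, target, k=20):
    """Hypothetical recall@k / MRR@k for one mini-batch.

    logit: (batch, n_items) scores, target: (batch,) true next-item indices.
    """
    _, topk_idx = logit.topk(k, dim=-1)                 # (batch, k) best-scoring items
    hits = (topk_idx == target.view(-1, 1)).nonzero()   # (n_hits, 2): [row, rank position]
    recall = hits.size(0) / target.size(0)
    if hits.size(0) == 0:
        return recall, torch.tensor(0.0)
    ranks = hits[:, 1].float() + 1.0                    # 1-based rank of each hit
    mrr = torch.reciprocal(ranks).sum() / target.size(0)
    return recall, mrr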
Example #3
    def train_epoch(self, epoch):
        self.model.train()
        losses = []

        def reset_hidden(hidden, mask):
            """Helper function that resets hidden state when some sessions terminate"""
            if len(mask) != 0:
                hidden[:, mask, :] = 0
            return hidden

        hidden = self.model.init_hidden()
        dataloader = lib.DataLoader(self.train_data)
        for input, target, mask in dataloader:
            input = input.to(self.device)
            target = target.to(self.device)
            self.optim.zero_grad()
            hidden = reset_hidden(hidden, mask).detach()
            logit, hidden = self.model(input, hidden)
            # output sampling
            logit_sampled = logit[:, target.view(-1)]
            loss = self.loss_func(logit_sampled)
            losses.append(loss.item())
            loss.backward()
            self.optim.step()

        mean_losses = np.mean(losses)
        return mean_losses
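
The training and evaluation loops above only require the model to expose init_hidden() and forward(input, hidden) -> (logit, hidden). A minimal GRU-based session model satisfying that interface (hypothetical layer sizes and names, not the actual lib model) might be:

import torch
import torch.nn as nn

class SessionGRU(nn.Module):
    """Hypothetical GRU session model: one step per call, hidden state carried between calls."""
    def __init__(self, n_items, hidden_size=100, num_layers=1, batch_size=50, device='cpu'):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.batch_size = batch_size
        self.device = device
        self.emb = nn.Embedding(n_items, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, num_layers)
        self.out = nn.Linear(hidden_size, n_items)

    def init_hidden(self):
        # (num_layers, batch, hidden), indexable as hidden[:, mask, :] for per-session resets
        return torch.zeros(self.num_layers, self.batch_size, self.hidden_size, device=self.device)

    def forward(self, input, hidden):
        x = self.emb(input).unsqueeze(0)       # (1, batch, hidden): one click per session
        output, hidden = self.gru(x, hidden)
        logit = self.out(output.squeeze(0))    # (batch, n_items)
        return logit, hidden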
Example #4
import lib

if __name__ == '__main__':
    data_file = "data/suumo_sess_data_eval.csv"
    batch_size = 2
    data = lib.Dataset(data_file)
    print(data.get_click_offset())
    dataloader = lib.DataLoader(data, batch_size)

    for i, (input, target, mask) in enumerate(dataloader):
        if i < 10:
            print(input, target, mask)
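
Each batch slot of this loader follows a different session in parallel; mask lists the slots whose session just ended and was replaced by a fresh one, which is why the trainers above reset those rows of the hidden state. A rough sketch of such a session-parallel loader, assuming the Dataset's DataFrame is sorted by session and carries session_id/item_idx columns (an assumption about lib.Dataset), is:

import numpy as np
import torch

class SessionParallelLoader:
    """Hypothetical session-parallel loader yielding (input, target, mask)."""

    def __init__(self, df, batch_size=2):
        self.df = df
        self.batch_size = batch_size
        # offsets[i] is the first row of session i; offsets[-1] == len(df)
        sizes = df.groupby('session_id', sort=False).size().values
        self.offsets = np.zeros(len(sizes) + 1, dtype=np.int64)
        self.offsets[1:] = np.cumsum(sizes)

    def __iter__(self):
        items = self.df['item_idx'].values
        iters = np.arange(self.batch_size)      # which session each batch slot holds
        maxiter = iters.max()
        start = self.offsets[iters]
        end = self.offsets[iters + 1]
        mask = []                               # slots whose session just finished
        finished = False
        while not finished:
            minlen = (end - start).min()
            idx_target = items[start]
            for i in range(minlen - 1):
                idx_input = idx_target
                idx_target = items[start + i + 1]
                yield torch.LongTensor(idx_input), torch.LongTensor(idx_target), mask
            start = start + (minlen - 1)
            # slots that ran out of clicks are refilled with the next unseen session
            mask = np.arange(self.batch_size)[(end - start) <= 1]
            for slot in mask:
                maxiter += 1
                if maxiter >= len(self.offsets) - 1:
                    finished = True
                    break
                iters[slot] = maxiter
                start[slot] = self.offsets[maxiter]
                end[slot] = self.offsets[maxiter + 1]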
Example #5

# Map raw visitor ids to contiguous integer indices and persist the mapping
visitor2idx = pd.Series(data=np.arange(len(visitor_ids)), index=visitor_ids)
visitormap = pd.DataFrame({
    'visitorid': visitor_ids,
    'visitor_idx': visitor2idx[visitor_ids].values
})

item_dic_file_path = os.path.join(args.data_folder, args.item_dic_data)
np.savetxt(item_dic_file_path, itemmap, fmt='%d')

visitor_dic_file_path = os.path.join(args.data_folder, args.visitor_dic_data)
np.savetxt(visitor_dic_file_path, visitormap, fmt='%d')

train_data = lib.Dataset(os.path.join(args.data_folder, args.train_data),
                         visitormap=visitormap,
                         itemmap=itemmap)
train_dataloader = lib.DataLoader(train_data, args.batch_size)
train_dataloader.dataset.df.to_csv("./train.csv")

valid_data = lib.Dataset(os.path.join(args.data_folder, args.valid_data),
                         visitormap=visitormap,
                         itemmap=itemmap)
valid_dataloader = lib.DataLoader(valid_data, args.batch_size)
valid_dataloader.dataset.df.to_csv("./valid.csv")

all_df_data = pd.concat(
    [train_dataloader.dataset.df, valid_dataloader.dataset.df])
all_df_data.to_csv("./all.csv")

# create train data
input_filepath = os.path.join(args.data_folder, args.train_input_data)
target_filepath = os.path.join(args.data_folder, args.train_target_data)
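
itemmap is saved and reused above but its construction is not included in this snippet; presumably it is built symmetrically to visitormap from the raw clickstream frame, e.g. (hypothetical, assuming an itemid column in df):

item_ids = df['itemid'].unique()
item2idx = pd.Series(data=np.arange(len(item_ids)), index=item_ids)
itemmap = pd.DataFrame({
    'itemid': item_ids,
    'item_idx': item2idx[item_ids].values
})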
Example #6
def main():
    print("Loading train data from {}".format(args.raw_data))
    df = pd.read_csv(args.raw_data)
    df_train_input_sc, df_train_target, df_test_input_sc, df_test_target = lib.clear_data(df, args)

    if args.algo == 'decisiontree':
        # min_samples_leaf: 0.05, min_samples_split: 10, class_weight: None, splitter: best, max_features: 10
        # criterion: entropy, max_depth: 7 for all features

        # min_samples_leaf: 0.05, min_samples_split: 3, class_weight: None, splitter: best, max_features: 8
        # criterion: entropy, max_depth: 6 for discrete features

        model = tree.DecisionTreeClassifier(min_samples_leaf=0.05,
                                            min_samples_split=3,
                                            class_weight=None,
                                            splitter="best",
                                            max_features=8,
                                            criterion="entropy",
                                            max_depth=6)
        model.fit(df_train_input_sc, df_train_target)
        y_pred = model.predict(df_test_input_sc)

    if args.algo == 'randomforest':
        #  random_state: 42, n_estimators: 1000, criterion: gini, max_depth: 7, bootstrap: True, max_features: 5,
        #  min_samples_leaf: 7, min_samples_split: 7 for all features

        #  random_state: 42, n_estimators: 100, criterion: gini, max_depth: 7, bootstrap: True, max_features: 5,
        #  min_samples_leaf: 7, min_samples_split: 7 for discrete features

        model = RandomForestClassifier(random_state=42,  # params tuned for all features
                                       n_estimators=1000,
                                       criterion="gini",
                                       max_depth=7,
                                       bootstrap=True,
                                       max_features=5,
                                       min_samples_leaf=7,
                                       min_samples_split=7)

        model.fit(df_train_input_sc, df_train_target)
        y_pred = model.predict(df_test_input_sc)

    if args.algo == 'logisticregression':
        # penalty: l1, random_state: 42, C: 0.05, tol: 0.01, intercept_scaling: 3, fit_intercept: True,
        # max_iter: 10 for all features

        # penalty: l2, random_state: 42, C: 0.05, tol: 0.1, intercept_scaling: 1, fit_intercept: True,
        # max_iter: 10 for discrete features

        model = LogisticRegression(penalty="l1",
                                   random_state=42,
                                   C=.05,
                                   tol=0.01,
                                   intercept_scaling=3,
                                   fit_intercept=True,
                                   max_iter=10)

        model.fit(df_train_input_sc, df_train_target)
        y_pred = model.predict(df_test_input_sc)

    if args.algo == 'ADA':
        model = AdaBoostClassifier()
        model.fit(df_train_input_sc, df_train_target)
        y_pred = model.predict(df_test_input_sc)

    if args.algo == 'XGB':
        model = XGBClassifier()
        model.fit(df_train_input_sc, df_train_target)
        y_pred = model.predict(df_test_input_sc)

    if args.algo == 'FFN':
        model = lib.FFN(df_train_input_sc.shape[1], args.output_dim, args.num_classes)
        optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)
        dataloader = lib.DataLoader(df_train_input_sc, df_train_target, args.batchsize)

        # training
        model.train()
        for epoch in range(args.num_epochs):
            sum_loss = 0
            cnt = 0
            for it, (input_data, target_data) in enumerate(dataloader):
                cnt += 1
                input_data = torch.Tensor(input_data)
                target_data = torch.LongTensor(target_data)
                optimizer.zero_grad()
                logit = model(input_data)
                loss = F.nll_loss(logit, target_data)
                sum_loss += loss.item()
                loss.backward()
                optimizer.step()
            print("Epoch: {} - loss: {}".format(epoch, float(sum_loss) / cnt))

        # testing
        model.eval()
        with torch.no_grad():
            input_data_test = torch.Tensor(df_test_input_sc)
            target_data_test = torch.LongTensor(df_test_target)
            logit = model(input_data_test)
            loss = F.nll_loss(logit, target_data_test)
            y_pred = logit.data.max(1)[1]

    print(classification_report(df_test_target, y_pred))
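
In the 'FFN' branch, lib.FFN is constructed as FFN(input_dim, output_dim, num_classes) and its output is passed straight to F.nll_loss, so it has to return log-probabilities. A hypothetical minimal implementation consistent with that usage (not the actual lib code):

import torch.nn as nn
import torch.nn.functional as F

class FFN(nn.Module):
    """Hypothetical two-layer feed-forward classifier returning log-probabilities."""

    def __init__(self, input_dim, output_dim, num_classes):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, output_dim)
        self.fc2 = nn.Linear(output_dim, num_classes)

    def forward(self, x):
        h = F.relu(self.fc1(x))
        return F.log_softmax(self.fc2(h), dim=-1)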