def train_epoch(self, epoch):
    self.model.train()
    losses = []

    def reset_hidden(hidden, mask):
        """Helper function that resets hidden state when some sessions terminate"""
        if len(mask) != 0:
            hidden[:, mask, :] = 0
        return hidden

    hidden = self.model.init_hidden()
    dataloader = lib.DataLoader(self.train_data, self.batch_size)
    for ii, (input, target, mask) in tqdm(
            enumerate(dataloader),
            total=len(dataloader.dataset.df) // dataloader.batch_size,
            miniters=1000):
        input = input.to(self.device)
        target = target.to(self.device)
        self.optim.zero_grad()
        # reset hidden state for the sessions that just finished and detach it
        # from the previous mini-batch's graph before the next forward pass
        hidden = reset_hidden(hidden, mask).detach()
        logit, hidden = self.model(input, hidden)
        # output sampling: score each example only against the other targets
        # in the same mini-batch, which act as negative samples
        logit_sampled = logit[:, target.view(-1)]
        loss = self.loss_func(logit_sampled)
        losses.append(loss.item())
        loss.backward()
        self.optim.step()

    mean_losses = np.mean(losses)
    return mean_losses
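# The loss above operates on the in-batch sampled score matrix `logit_sampled`,
# whose diagonal holds each example's score for its own target and whose
# off-diagonal entries are in-batch negatives. What `self.loss_func` actually is
# is configured elsewhere; the following is only a minimal sketch, assuming the
# TOP1 ranking loss from GRU4Rec as an illustrative choice.
import torch
import torch.nn as nn

class TOP1Loss(nn.Module):
    """TOP1 ranking loss over a (batch_size, batch_size) sampled score matrix."""

    def forward(self, logit):
        # difference between each negative score and the positive (diagonal) score
        diff = -(logit.diag().view(-1, 1).expand_as(logit) - logit)
        # rank term plus a regularization term pushing negative scores toward zero
        loss = torch.sigmoid(diff).mean() + torch.sigmoid(logit ** 2).mean()
        return loss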
def eval(self, eval_data, batch_size):
    self.model.eval()
    losses = []
    recalls = []
    mrrs = []
    dataloader = lib.DataLoader(eval_data, batch_size)
    with torch.no_grad():
        hidden = self.model.init_hidden()
        for ii, (input, target, mask) in tqdm(
                enumerate(dataloader),
                total=len(dataloader.dataset.df) // dataloader.batch_size,
                miniters=1000):
            input = input.to(self.device)
            target = target.to(self.device)
            logit, hidden = self.model(input, hidden)
            logit_sampled = logit[:, target.view(-1)]
            loss = self.loss_func(logit_sampled)
            recall, mrr = lib.evaluate(logit, target, k=self.topk)

            # torch.Tensor.item() to get a Python number from a tensor containing a single value
            losses.append(loss.item())
            recalls.append(recall)
            mrrs.append(mrr.item())

    mean_losses = np.mean(losses)
    mean_recall = np.mean(recalls)
    mean_mrr = np.mean(mrrs)

    return mean_losses, mean_recall, mean_mrr
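# `lib.evaluate` is referenced above but not shown. The sketch below shows how
# recall@k and MRR@k could be computed from the full logit matrix and the target
# indices; the function name, tie handling, and return types are assumptions,
# not the library's actual implementation.
import torch

def evaluate(logit, target, k=20):
    """Compute recall@k and MRR@k for one mini-batch.

    logit:  (batch_size, n_items) scores over all items
    target: (batch_size,) index of the true next item
    """
    # indices of the k highest-scoring items per row
    _, topk_indices = logit.topk(k, dim=1)
    hits = (topk_indices == target.view(-1, 1))      # (batch_size, k) boolean
    hit_any = hits.any(dim=1)
    recall = hit_any.float().mean().item()           # fraction of rows with the target in the top k
    # reciprocal rank of the target where it appears in the top k, else 0
    ranks = hits.float().argmax(dim=1) + 1            # 1-based rank, only meaningful where hit_any
    rr = torch.where(hit_any, 1.0 / ranks.float(),
                     torch.zeros_like(ranks, dtype=torch.float))
    mrr = rr.mean()
    return recall, mrr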
def train_epoch(self, epoch):
    self.model.train()
    losses = []

    def reset_hidden(hidden, mask):
        """Helper function that resets hidden state when some sessions terminate"""
        if len(mask) != 0:
            hidden[:, mask, :] = 0
        return hidden

    hidden = self.model.init_hidden()
    dataloader = lib.DataLoader(self.train_data)
    for input, target, mask in dataloader:
        input = input.to(self.device)
        target = target.to(self.device)
        self.optim.zero_grad()
        hidden = reset_hidden(hidden, mask).detach()
        logit, hidden = self.model(input, hidden)
        # output sampling
        logit_sampled = logit[:, target.view(-1)]
        loss = self.loss_func(logit_sampled)
        losses.append(loss.item())
        loss.backward()
        self.optim.step()

    mean_losses = np.mean(losses)
    return mean_losses
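# The train_epoch / eval methods above would typically be driven by an outer
# per-epoch loop. The following is only an illustrative sketch: `trainer`,
# `valid_data`, `args.n_epochs`, `args.batch_size`, and the checkpoint path are
# assumed names, not taken from the source.
import torch

for epoch in range(args.n_epochs):
    train_loss = trainer.train_epoch(epoch)
    valid_loss, recall, mrr = trainer.eval(valid_data, args.batch_size)
    print("Epoch {:3d} | train loss {:.4f} | valid loss {:.4f} | recall {:.4f} | MRR {:.4f}"
          .format(epoch, train_loss, valid_loss, recall, mrr))
    # keep a checkpoint per epoch (path is an assumption)
    torch.save(trainer.model.state_dict(), "checkpoint/model_{:05d}.pt".format(epoch))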
import lib

if __name__ == '__main__':
    data_file = "data/suumo_sess_data_eval.csv"
    batch_size = 2

    data = lib.Dataset(data_file)
    print(data.get_click_offset())

    dataloader = lib.DataLoader(data, batch_size)
    # print only the first 10 mini-batches, then stop iterating
    for i, (input, target, mask) in enumerate(dataloader):
        if i >= 10:
            break
        print(input, target, mask)
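# The (input, target, mask) batches come from session-parallel mini-batching
# driven by the click offsets printed above. The generator below is only a
# sketch of that idea; the names, the assumption that every session has at
# least two clicks, and the exact refill policy are illustrative, not the
# actual lib.DataLoader implementation.
import numpy as np
import torch

def session_parallel_batches(item_idx, click_offsets, batch_size):
    """Yield (input, target, mask) over sessions laid out contiguously.

    item_idx:      1-D int array of item indices, clicks sorted by (session, time)
    click_offsets: click_offsets[i] is the first click row of session i;
                   click_offsets[-1] == len(item_idx)
    """
    n_sessions = len(click_offsets) - 1
    cur = np.arange(batch_size)               # session currently assigned to each batch slot
    start = click_offsets[cur].copy()         # current click position of each slot
    end = click_offsets[cur + 1].copy()       # end (exclusive) of each slot's session
    next_session = batch_size                 # next unused session id
    mask = np.array([], dtype=np.int64)

    while True:
        input = torch.LongTensor(item_idx[start])       # current item per slot
        target = torch.LongTensor(item_idx[start + 1])  # next item per slot (prediction target)
        yield input, target, mask                       # mask: slots whose hidden state must be reset
        start += 1

        # slots whose session has no (input, target) pair left get a fresh session
        mask = np.where(end - start <= 1)[0]
        for slot in mask:
            if next_session >= n_sessions:
                return                                  # no unused sessions left
            cur[slot] = next_session
            start[slot] = click_offsets[next_session]
            end[slot] = click_offsets[next_session + 1]
            next_session += 1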
visitor2idx = pd.Series(data=np.arange(len(visitor_ids)), index=visitor_ids)
visitormap = pd.DataFrame({
    'visitorid': visitor_ids,
    'visitor_idx': visitor2idx[visitor_ids].values
})

item_dic_file_path = os.path.join(args.data_folder, args.item_dic_data)
np.savetxt(item_dic_file_path, itemmap, fmt='%d')
visitor_dic_file_path = os.path.join(args.data_folder, args.visitor_dic_data)
np.savetxt(visitor_dic_file_path, visitormap, fmt='%d')

train_data = lib.Dataset(os.path.join(args.data_folder, args.train_data),
                         visitormap=visitormap, itemmap=itemmap)
train_dataloader = lib.DataLoader(train_data, args.batch_size)
train_dataloader.dataset.df.to_csv("./train.csv")

valid_data = lib.Dataset(os.path.join(args.data_folder, args.valid_data),
                         visitormap=visitormap, itemmap=itemmap)
valid_dataloader = lib.DataLoader(valid_data, args.batch_size)
valid_dataloader.dataset.df.to_csv("./valid.csv")

all_df_data = pd.concat(
    [train_dataloader.dataset.df, valid_dataloader.dataset.df])
all_df_data.to_csv("./all.csv")

# create train data
input_filepath = os.path.join(args.data_folder, args.train_input_data)
target_filepath = os.path.join(args.data_folder, args.train_target_data)
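# The snippet above assumes `visitor_ids` and `itemmap` already exist. A sketch
# of how they could be derived from the raw click dataframe, mirroring the
# visitor2idx / visitormap construction; `df` and the column name 'itemid' are
# assumptions for illustration.
visitor_ids = df['visitorid'].unique()    # unique visitors in the raw clicks
item_ids = df['itemid'].unique()          # unique items in the raw clicks
item2idx = pd.Series(data=np.arange(len(item_ids)), index=item_ids)
itemmap = pd.DataFrame({
    'itemid': item_ids,
    'item_idx': item2idx[item_ids].values
})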
def main():
    print("Loading train data from {}".format(args.raw_data))
    df = pd.read_csv(args.raw_data)
    df_train_input_sc, df_train_target, df_test_input_sc, df_test_target = lib.clear_data(df, args)

    if args.algo == 'decisiontree':
        # min_samples_leaf: 0.05, min_samples_split: 10, class_weight: None, splitter: best, max_features: 10,
        # criterion: entropy, max_depth: 7 for all features
        # min_samples_leaf: 0.05, min_samples_split: 3, class_weight: None, splitter: best, max_features: 8,
        # criterion: entropy, max_depth: 6 for discrete features
        model = tree.DecisionTreeClassifier(min_samples_leaf=0.05,
                                            min_samples_split=3,
                                            class_weight=None,
                                            splitter="best",
                                            max_features=8,
                                            criterion="entropy",
                                            max_depth=6)
        model.fit(df_train_input_sc, df_train_target)
        y_pred = model.predict(df_test_input_sc)

    elif args.algo == 'randomforest':
        # random_state: 42, n_estimators: 1000, criterion: gini, max_depth: 7, bootstrap: True, max_features: 5,
        # min_samples_leaf: 7, min_samples_split: 7 for all features
        # random_state: 42, n_estimators: 100, criterion: gini, max_depth: 7, bootstrap: True, max_features: 5,
        # min_samples_leaf: 7, min_samples_split: 7 for discrete features
        model = RandomForestClassifier(random_state=42,  # params for using all features
                                       n_estimators=1000,
                                       criterion="gini",
                                       max_depth=7,
                                       bootstrap=True,
                                       max_features=5,
                                       min_samples_leaf=7,
                                       min_samples_split=7)
        model.fit(df_train_input_sc, df_train_target)
        y_pred = model.predict(df_test_input_sc)

    elif args.algo == 'logisticregression':
        # penalty: l1, random_state: 42, C: 0.05, tol: 0.01, intercept_scaling: 3, fit_intercept: True,
        # max_iter: 10 for all features
        # penalty: l2, random_state: 42, C: 0.05, tol: 0.1, intercept_scaling: 1, fit_intercept: True,
        # max_iter: 10 for discrete features
        model = LogisticRegression(penalty="l1",
                                   solver="liblinear",  # the l1 penalty requires the liblinear (or saga) solver
                                   random_state=42,
                                   C=0.05,
                                   tol=0.01,
                                   intercept_scaling=3,
                                   fit_intercept=True,
                                   max_iter=10)
        model.fit(df_train_input_sc, df_train_target)
        y_pred = model.predict(df_test_input_sc)

    elif args.algo == 'ADA':
        model = AdaBoostClassifier()
        model.fit(df_train_input_sc, df_train_target)
        y_pred = model.predict(df_test_input_sc)

    elif args.algo == 'XGB':
        model = XGBClassifier()
        model.fit(df_train_input_sc, df_train_target)
        y_pred = model.predict(df_test_input_sc)

    elif args.algo == 'FFN':
        model = lib.FFN(df_train_input_sc.shape[1], args.output_dim, args.num_classes)
        optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)
        dataloader = lib.DataLoader(df_train_input_sc, df_train_target, args.batchsize)

        # training
        model.train()
        for epoch in range(args.num_epochs):
            sum_loss = 0
            cnt = 0
            for it, (input_data, target_data) in enumerate(dataloader):
                cnt += 1
                input_data = torch.Tensor(input_data)
                target_data = torch.LongTensor(target_data)
                optimizer.zero_grad()
                logit = model(input_data)  # FFN is expected to return log-probabilities
                loss = F.nll_loss(logit, target_data)
                sum_loss += loss.item()
                loss.backward()
                optimizer.step()
            print("Epoch: {} - loss: {}".format(epoch, float(sum_loss) / cnt))

        # testing
        model.eval()
        with torch.no_grad():
            input_data_test = torch.Tensor(df_test_input_sc)
            target_data_test = torch.LongTensor(df_test_target)
            logit = model(input_data_test)
            loss = F.nll_loss(logit, target_data_test)
            y_pred = logit.data.max(1)[1]

    print(classification_report(df_test_target, y_pred))
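# `lib.FFN` is referenced above but not shown. Because the training loop feeds
# its output straight into F.nll_loss, the model must return log-probabilities;
# the sketch below follows the constructor call lib.FFN(input_dim, output_dim,
# num_classes) above, while the hidden-layer details are assumptions.
import torch.nn as nn
import torch.nn.functional as F

class FFN(nn.Module):
    """Simple two-layer feed-forward classifier returning log-probabilities."""

    def __init__(self, input_dim, output_dim, num_classes):
        super(FFN, self).__init__()
        self.fc1 = nn.Linear(input_dim, output_dim)    # hidden layer
        self.fc2 = nn.Linear(output_dim, num_classes)  # class scores

    def forward(self, x):
        x = F.relu(self.fc1(x))
        # log_softmax so the output can be passed directly to F.nll_loss
        return F.log_softmax(self.fc2(x), dim=1)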