def test(model, data_loader, criterion, posp, max_fpr):
    """Score ``model`` on ``data_loader`` and report AUC figures plus the weighted loss.

    The model is put into eval mode for the pass and switched back to train
    mode before returning. Every batch is moved to CUDA.

    :param model: called as ``model(ids, values, seqlength, seq_mask, 'tgt')``;
        only the first of its two returned values is used
    :param data_loader: iterable yielding ``(ids, values, seqlength, label, seq_mask)``
    :param criterion: element-wise loss called as ``criterion(y, label)``
    :param posp: weight applied to positive labels; negatives get ``1 - posp``
    :param max_fpr: forwarded to ``roc_auc_score`` for the partial AUC
    :return: ``(roc_auc_score(..., max_fpr=max_fpr), loss.item(), roc_auc_score(...))``
    """
    model.eval()

    loss_meter = Averager()
    y_true, y_score = [], []
    pos_weight = torch.FloatTensor([posp]).cuda()
    unit = torch.FloatTensor([1]).cuda()

    with torch.no_grad():
        for ids, values, seqlength, label, seq_mask in data_loader:
            label = label.cuda().float()
            y, _ = model(ids.cuda(), values.cuda(), seqlength, seq_mask.cuda(), 'tgt')

            # per-sample weight: posp where label == 1, (1 - posp) where label == 0
            sample_weight = pos_weight * label + (unit - pos_weight) * (unit - label)
            weighted_loss = sample_weight * criterion(y, label)
            loss_meter.add(torch.mean(weighted_loss).item())

            y_true.extend(label.tolist())
            y_score.extend(y.tolist())

    model.train()

    partial_auc = roc_auc_score(y_true, y_score, max_fpr=max_fpr)
    mean_loss = loss_meter.item()
    full_auc = roc_auc_score(y_true, y_score)
    return partial_auc, mean_loss, full_auc
def validation(model, ctc_criterion, attn_criterion, evaluation_loader, ctc_converter, attn_converter, opt):
    """Evaluate a two-headed (CTC + attention) recogniser over ``evaluation_loader``.

    Every parameter of ``model`` has ``requires_grad`` set to ``False`` and is
    left that way; a caller that keeps training must switch it back on.

    :param model: called as ``model(image, text_for_pred)``; must return
        ``(ctc_preds, attn_preds)``
    :param ctc_criterion: called as ``(log_probs, targets, input_lengths, target_lengths)``
    :param attn_criterion: called as ``(flattened_logits, flattened_targets)``
    :param evaluation_loader: iterable of ``(image_tensors, labels)`` batches
    :param ctc_converter: provides ``encode(labels)`` and ``decode(indices, lengths)``
        for the CTC head
    :param attn_converter: same interface for the attention head
    :param opt: ``batch_max_length`` and ``ctc_weight`` are read
    :return: tuple of
        ``valid_loss_avg.val()``,
        attention accuracy in percent,
        CTC accuracy in percent,
        summed length-normalised edit distance of the attention head,
        raw attention predictions of the last batch,
        raw attention labels of the last batch,
        accumulated forward time in seconds,
        number of samples seen.
        With an empty loader both accuracies are ``0.0`` and both lists are empty.
    """
    for p in model.parameters():
        p.requires_grad = False

    n_correct = 0
    ctc_correct = 0
    norm_ED = 0
    length_of_data = 0
    infer_time = 0
    valid_loss_avg = Averager()
    # Pre-bound so that an empty loader yields empty lists instead of UnboundLocalError.
    attn_preds_str, attn_labels = [], []

    for image_tensors, labels in evaluation_loader:
        batch_size = image_tensors.size(0)
        length_of_data = length_of_data + batch_size
        image = image_tensors.to(device)

        # Decoder input for max-length prediction: all-zero token ids.
        length_for_pred = torch.IntTensor([opt.batch_max_length] * batch_size).to(device)
        text_for_pred = torch.LongTensor(batch_size, opt.batch_max_length + 1).fill_(0).to(device)

        ctc_text_for_loss, ctc_length_for_loss = ctc_converter.encode(labels)
        attn_text_for_loss, attn_length_for_loss = attn_converter.encode(labels)

        start_time = time.time()
        ctc_preds, attn_preds = model(image, text_for_pred)
        forward_time = time.time() - start_time

        # ---- CTC head ----
        ctc_preds = ctc_preds.log_softmax(2)
        preds_size = torch.IntTensor([ctc_preds.size(1)] * batch_size)
        ctc_preds = ctc_preds.permute(1, 0, 2)  # to use CTCLoss format
        ctc_cost = ctc_criterion(ctc_preds, ctc_text_for_loss, preds_size, ctc_length_for_loss)

        # Greedy decoding: pick the max-probability index, then map indices to characters.
        _, preds_index = ctc_preds.max(2)
        preds_index = preds_index.transpose(1, 0).contiguous().view(-1)
        ctc_preds_str = ctc_converter.decode(preds_index.data, preds_size.data)

        # ---- attention head ----
        attn_preds = attn_preds[:, :attn_text_for_loss.shape[1] - 1, :]
        target = attn_text_for_loss[:, 1:]  # without [GO] symbol
        attn_cost = attn_criterion(attn_preds.contiguous().view(-1, attn_preds.shape[-1]),
                                   target.contiguous().view(-1))

        _, attn_preds_index = attn_preds.max(2)
        attn_preds_str = attn_converter.decode(attn_preds_index, length_for_pred)
        attn_labels = attn_converter.decode(attn_text_for_loss[:, 1:], attn_length_for_loss)

        cost = opt.ctc_weight * ctc_cost + (1.0 - opt.ctc_weight) * attn_cost
        infer_time += forward_time
        valid_loss_avg.add(cost)

        # ---- accuracy ----
        for pred, gt, attn_pred, attn_gt in zip(ctc_preds_str, labels, attn_preds_str, attn_labels):
            # Prune everything from the end-of-sentence token ([s]) onwards.
            # partition() keeps the whole string when the token is absent, whereas
            # slicing with find()'s -1 would silently drop the last character.
            attn_pred = attn_pred.partition('[s]')[0]
            attn_gt = attn_gt.partition('[s]')[0]

            if pred == gt:
                ctc_correct += 1
            if attn_pred == attn_gt:
                n_correct += 1
            # max(..., 1) avoids ZeroDivisionError for an empty ground truth and
            # leaves the value unchanged for every non-empty one.
            norm_ED += edit_distance(attn_pred, attn_gt) / max(len(attn_gt), 1)

    if length_of_data:
        accuracy = n_correct / float(length_of_data) * 100
        ctc_accuracy = ctc_correct / float(length_of_data) * 100
    else:
        accuracy = 0.0
        ctc_accuracy = 0.0

    return (valid_loss_avg.val(), accuracy, ctc_accuracy, norm_ED,
            attn_preds_str, attn_labels, infer_time, length_of_data)
# Script fragment: load a checkpoint into a Convnet, gather every batch of
# `trainset` on the GPU, and (when cfg.progalambda > 0) draw a random 10%
# subset of it. The fragment appears to continue beyond what is visible here.
propagate_loader = DataLoader(dataset=trainset, batch_size=1280, shuffle=True, num_workers=24, pin_memory=True)
# NOTE(review): `premodel` is not referenced in the visible part of the script,
# and the checkpoint is loaded a second time on the `copyModel` line below.
premodel = torch.load(cfg.load)
# model = Convnet()
model = copyModel(torch.load(cfg.load), Convnet()).cuda()
# model = copyModel(Convnet(), torch.load(cfg.oad)).cuda()
model.eval()
allacc = []
ave_acc = Averager()
# Starts as a list of per-batch tensors and is rebound to one concatenated tensor.
allExtraData = []
with torch.no_grad():
    for i, batch in enumerate(propagate_loader, 1):
        # Each batch unpacks into two items; both are moved to CUDA, only the first is kept.
        ext_data, _ = [_.cuda() for _ in batch]
        allExtraData.append(ext_data)
    allExtraData = torch.cat(allExtraData)
if cfg.progalambda > 0:
    with torch.no_grad():
        n = 1280
        allExtraproto = []
        # Random permutation of row indices, truncated to one tenth of the rows.
        index = torch.randperm(allExtraData.shape[0])[:int(allExtraData.shape[0] / 10)]
        extraDatatemp = allExtraData[index]
def train(opt):
    """Train a single-head text recogniser (CTC or attention) and validate periodically.

    ``opt.select_data`` and ``opt.batch_ratio`` are split on ``'-'`` in place, and
    ``opt.num_class`` (plus ``opt.input_channel`` when ``opt.rgb`` is set) is
    written back onto ``opt``. Options, training log and checkpoints are written
    under ``./saved_models/<opt.experiment_name>/``. Validation runs whenever the
    iteration counter is a multiple of ``opt.valInterval`` (iteration 0 included).
    The function never returns: it calls ``sys.exit()`` once the counter equals
    ``opt.num_iter``.

    :param opt: option namespace; read and partly mutated as described above
    """
    # ---------------- dataset preparation ----------------
    opt.select_data = opt.select_data.split('-')
    opt.batch_ratio = opt.batch_ratio.split('-')
    train_dataset = Batch_Balanced_Dataset(opt)

    valid_collate = AlignCollate(imgH=opt.imgH, imgW=opt.imgW, keep_ratio_with_pad=opt.PAD)
    valid_dataset = hierarchical_dataset(root=opt.valid_data, opt=opt)
    valid_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=opt.batch_size,
        shuffle=True,  # shuffled so training progress can be checked with the validation function
        num_workers=int(opt.workers),
        collate_fn=valid_collate,
        pin_memory=True,
    )
    print('-' * 80)

    # ---------------- model configuration ----------------
    converter_cls = CTCLabelConverter if 'CTC' in opt.Prediction else AttnLabelConverter
    converter = converter_cls(opt.character)
    opt.num_class = len(converter.character)

    if opt.rgb:
        opt.input_channel = 3
    model = Model(opt)
    print('model input parameters', opt.imgH, opt.imgW, opt.num_fiducial, opt.input_channel,
          opt.output_channel, opt.hidden_size, opt.num_class, opt.batch_max_length,
          opt.Transformation, opt.FeatureExtraction, opt.SequenceModeling, opt.Prediction)

    # Weight initialisation: zero biases, Kaiming-normal weights.
    for name, param in model.named_parameters():
        if 'localization_fc2' in name:
            print(f'Skip {name} as it is already initialized')
            continue
        try:
            if 'bias' in name:
                init.constant_(param, 0.0)
            elif 'weight' in name:
                init.kaiming_normal_(param)
        except Exception:
            # Fallback used when the initialiser rejects the tensor (batchnorm weights).
            if 'weight' in name:
                param.data.fill_(1)

    # Data parallel for multi-GPU.
    model = torch.nn.DataParallel(model).cuda()
    model.train()

    if opt.continue_model != '':
        if opt.without_prediction:
            load_model_without_prediction(opt.continue_model, model)
            print(f'loading pretrained model from {opt.continue_model}, without prediction layer')
        else:
            print(f'loading pretrained model from {opt.continue_model}')
            model.load_state_dict(torch.load(opt.continue_model))
    print("Model:")
    print(model)

    # ---------------- loss ----------------
    if 'CTC' in opt.Prediction:
        criterion = torch.nn.CTCLoss(zero_infinity=True).cuda()
    else:
        # index 0 is the [GO] token and is ignored by the loss
        criterion = torch.nn.CrossEntropyLoss(ignore_index=0).cuda()
    loss_avg = Averager()

    # Only parameters that require gradients are handed to the optimizer.
    trainable = [weight for weight in model.parameters() if weight.requires_grad]
    trainable_sizes = [np.prod(weight.size()) for weight in trainable]
    print('Trainable params num : ', sum(trainable_sizes))

    # ---------------- optimizer ----------------
    if opt.adam:
        optimizer = optim.Adam(trainable, lr=opt.lr, betas=(opt.beta1, 0.999))
    else:
        optimizer = optim.Adadelta(trainable, lr=opt.lr, rho=opt.rho, eps=opt.eps)
    print("Optimizer:")
    print(optimizer)

    # ---------------- dump final options ----------------
    with open(f'./saved_models/{opt.experiment_name}/opt.txt', 'a') as opt_file:
        option_lines = ''.join(f'{str(k)}: {str(v)}\n' for k, v in vars(opt).items())
        options_dump = ('------------ Options -------------\n'
                        + option_lines
                        + '---------------------------------------\n')
        print(options_dump)
        opt_file.write(options_dump)

    # ---------------- training loop ----------------
    start_iter = 0
    if opt.continue_model != '':
        print(f'continue to train, start_iter: {start_iter}')

    start_time = time.time()
    best_accuracy = -1
    best_norm_ED = 1e+6
    i = start_iter

    while True:
        # Re-enable gradients on every parameter before each training step.
        for weight in model.parameters():
            weight.requires_grad = True

        batch_images, batch_labels = train_dataset.get_batch()
        image = batch_images.cuda()
        encoded, encoded_lengths = converter.encode(batch_labels)
        batch_size = image.size(0)

        if 'CTC' in opt.Prediction:
            log_probs = model(image, encoded).log_softmax(2)
            input_lengths = torch.IntTensor([log_probs.size(1)] * batch_size)
            log_probs = log_probs.permute(1, 0, 2)  # to use CTCLoss format
            cost = criterion(log_probs, encoded, input_lengths, encoded_lengths)
        else:
            logits = model(image, encoded)
            shifted_target = encoded[:, 1:]  # without [GO] symbol
            cost = criterion(logits.view(-1, logits.shape[-1]),
                             shifted_target.contiguous().view(-1))

        model.zero_grad()
        cost.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), opt.grad_clip)  # gradient clipping
        optimizer.step()
        loss_avg.add(cost)

        # ---------------- validation ----------------
        if i % opt.valInterval == 0:
            elapsed_time = time.time() - start_time
            logging.info(f'[{i}/{opt.num_iter}] Loss: {loss_avg.val():0.5f} elapsed_time: {elapsed_time:0.5f}')

            with open(f'./saved_models/{opt.experiment_name}/log_train.txt', 'a') as log:
                log.write(f'[{i}/{opt.num_iter}] Loss: {loss_avg.val():0.5f} elapsed_time: {elapsed_time:0.5f}\n')
                loss_avg.reset()

                model.eval()
                (valid_loss, current_accuracy, current_norm_ED,
                 sample_preds, sample_labels, _infer_time, _length_of_data) = validation(
                    model, criterion, valid_loader, converter, opt)
                model.train()

                # Show and log the first five prediction / ground-truth pairs.
                for pred, gt in zip(sample_preds[:5], sample_labels[:5]):
                    if 'Attn' in opt.Prediction:
                        pred = pred[:pred.find('[s]')]
                        gt = gt[:gt.find('[s]')]
                    print(f'{pred:20s}, gt: {gt:20s}, {str(pred == gt)}')
                    log.write(f'{pred:20s}, gt: {gt:20s}, {str(pred == gt)}\n')

                valid_log = f'[{i}/{opt.num_iter}] valid loss: {valid_loss:0.5f}'
                valid_log += f' accuracy: {current_accuracy:0.3f}, norm_ED: {current_norm_ED:0.2f}'
                log.write(valid_log + '\n')

                # Keep the best model by accuracy and, separately, by norm_ED.
                if current_accuracy > best_accuracy:
                    best_accuracy = current_accuracy
                    torch.save(model.state_dict(), f'./saved_models/{opt.experiment_name}/mtl_best_accuracy.pth')
                if current_norm_ED < best_norm_ED:
                    best_norm_ED = current_norm_ED
                    torch.save(model.state_dict(), f'./saved_models/{opt.experiment_name}/best_norm_ED.pth')

                best_model_log = f'best_accuracy: {best_accuracy:0.3f}, best_norm_ED: {best_norm_ED:0.2f}'
                logging.info(best_model_log)
                log.write(best_model_log + '\n')

        # Periodic snapshot every 50000 iterations.
        if (i + 1) % 50000 == 0:
            torch.save(model.state_dict(), f'./saved_models/{opt.experiment_name}/iter_{i+1}.pth')

        if i == opt.num_iter:
            logging.info('end the training')
            sys.exit()
        i += 1
def train(opt):
    """Train a multi-task (CTC + attention) recogniser and validate periodically.

    ``opt.num_class``, ``opt.ctc_num_class`` (and ``opt.input_channel`` when
    ``opt.rgb`` is set) are written back onto ``opt``. Options, training log and
    checkpoints are written under ``<opt.outPath>/<opt.experiment_name>/``.
    The loss is ``opt.ctc_weight * ctc + (1 - opt.ctc_weight) * attention``.
    Validation runs whenever the iteration counter is a multiple of
    ``opt.valInterval`` (iteration 0 included). The function never returns: it
    calls ``sys.exit()`` once the counter equals ``opt.num_iter``.

    :param opt: option namespace; read and partly mutated as described above
    """
    # logging.info(opt)
    train_dataset = Batch_Dataset(opt)

    AlignCollate_valid = AlignCollate(imgH=opt.imgH, imgW=opt.imgW, keep_ratio_with_pad=opt.PAD)
    valid_dataset = LmdbDataset(root=opt.valid_data, opt=opt)
    valid_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=opt.batch_size,
        shuffle=True,
        num_workers=int(opt.workers),
        collate_fn=AlignCollate_valid,
        pin_memory=True)
    print('-' * 80)

    # ---------------- model configuration ----------------
    ctc_converter = CTCLabelConverter(opt.character, opt.subword)
    attn_converter = AttnLabelConverter(opt.character, opt.subword, opt.batch_max_length)
    opt.num_class = len(attn_converter.character)
    opt.ctc_num_class = len(ctc_converter.character)
    print("ctc num class {}".format(len(ctc_converter.character)))
    print("attention num class {}".format(len(attn_converter.character)))

    if opt.rgb:
        opt.input_channel = 3
    model = MyModel(opt)

    # Weight initialisation: zero biases, Kaiming-normal weights.
    for name, param in model.named_parameters():
        if 'localization_fc2' in name:
            # The template uses a *named* field, so the value must be passed by
            # keyword; passing it positionally raises KeyError('name').
            print('Skip {name} as it is already initialized'.format(name=name))
            continue
        try:
            if 'bias' in name:
                init.constant_(param, 0.0)
            elif 'weight' in name:
                init.kaiming_normal_(param)
        except Exception:
            # Fallback used when the initialiser rejects the tensor.
            if 'weight' in name:
                param.data.fill_(1)
            continue

    model = torch.nn.DataParallel(model).to(device)
    model.train()
    if opt.continue_model != '':
        print('loading pretrained model from {}'.format(opt.continue_model))
        model.load_state_dict(torch.load(opt.continue_model))

    # ---------------- loss ----------------
    ctc_criterion = torch.nn.CTCLoss(zero_infinity=True).to(device)
    attn_criterion = torch.nn.CrossEntropyLoss(ignore_index=0).to(device)
    loss_avg = Averager()

    # Only parameters that require gradients are handed to the optimizer.
    filtered_parameters = []
    params_num = []
    for p in filter(lambda p: p.requires_grad, model.parameters()):
        filtered_parameters.append(p)
        params_num.append(np.prod(p.size()))
    print('Trainable params num : ', sum(params_num))

    if opt.adam:
        optimizer = optim.Adam(filtered_parameters, lr=opt.lr, betas=(opt.beta1, 0.999))
    else:
        optimizer = optim.Adadelta(filtered_parameters, lr=opt.lr, rho=opt.rho, eps=opt.eps)
    print("Optimizer:")
    print(optimizer)

    # ---------------- dump final options ----------------
    with open(osj(opt.outPath, '{}/opt.txt'.format(opt.experiment_name)), 'a') as opt_file:
        opt_log = '------------ Options -------------\n'
        args = vars(opt)
        for k, v in args.items():
            opt_log += '{}: {}\n'.format(str(k), str(v))
        opt_log += '---------------------------------------\n'
        print(opt_log)
        opt_file.write(opt_log)

    # ---------------- training loop ----------------
    start_iter = 0
    if opt.continue_model != '':
        print('continue to train, start_iter: {}'.format(start_iter))

    start_time = time.time()
    best_accuracy = -1
    i = start_iter

    while True:
        # Re-enable gradients on every parameter before each training step.
        for p in model.parameters():
            p.requires_grad = True

        image_tensors, labels = train_dataset.get_batch()
        image = image_tensors.to(device)
        ctc_text, ctc_length = ctc_converter.encode(labels)
        attn_text, attn_length = attn_converter.encode(labels)
        batch_size = image.size(0)

        # CTC loss
        ctc_preds, attn_preds = model(image, attn_text)
        ctc_preds = ctc_preds.log_softmax(2)
        preds_size = torch.IntTensor([ctc_preds.size(1)] * batch_size)
        ctc_preds = ctc_preds.permute(1, 0, 2)  # to use CTCLoss format
        ctc_cost = ctc_criterion(ctc_preds, ctc_text, preds_size, ctc_length)

        # attention loss (target is the text without its first, [GO], column)
        target = attn_text[:, 1:]
        attn_cost = attn_criterion(attn_preds.view(-1, attn_preds.shape[-1]),
                                   target.contiguous().view(-1))

        cost = opt.ctc_weight * ctc_cost + (1.0 - opt.ctc_weight) * attn_cost

        model.zero_grad()
        cost.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), opt.grad_clip)  # gradient clipping
        optimizer.step()
        loss_avg.add(cost)

        # ---------------- validation ----------------
        if i % opt.valInterval == 0:
            elapsed_time = time.time() - start_time
            logging.info('[{}/{}] Loss: {:0.5f} elapsed_time: {:0.5f}'.format(
                i, opt.num_iter, loss_avg.val(), elapsed_time))

            with open(osj(opt.outPath, '{}/log_train.txt'.format(opt.experiment_name)), 'a') as log:
                log.write('[{}/{}] Loss: {:0.5f} elapsed_time: {:0.5f}\n'.format(
                    i, opt.num_iter, loss_avg.val(), elapsed_time))
                loss_avg.reset()

                model.eval()
                with torch.no_grad():
                    # mtl_validation is expected to return this 8-tuple.
                    valid_loss, current_accuracy, ctc_accuracy, current_norm_ED, preds, labels, infer_time, length_of_data \
                        = mtl_validation(model, ctc_criterion, attn_criterion, valid_loader,
                                         ctc_converter, attn_converter, opt)
                model.train()

                # Show and log the first five prediction / ground-truth pairs.
                for pred, gt in zip(preds[:5], labels[:5]):
                    pred = pred[:pred.find('[s]')]
                    gt = gt[:gt.find('[s]')]
                    print('{:20s}, gt: {:20s}, {}'.format(pred, gt, str(pred == gt)))
                    log.write('{:20s}, gt: {:20s}, {}\n'.format(pred, gt, str(pred == gt)))

                valid_log = '[{}/{}] valid loss: {:0.5f}'.format(i, opt.num_iter, valid_loss)
                valid_log += ' accuracy: {:0.3f}'.format(current_accuracy)
                log.write(valid_log + '\n')

                # Save the best model by attention accuracy.
                if current_accuracy > best_accuracy:
                    best_accuracy = current_accuracy
                    torch.save(
                        model.state_dict(),
                        osj(opt.outPath, '{}/best_accuracy.pth'.format(opt.experiment_name)))

                best_model_log = 'best_accuracy: {:0.3f}'.format(best_accuracy)
                logging.info(best_model_log)
                log.write(best_model_log + '\n')

        # Periodic snapshot every 50000 iterations.
        if (i + 1) % 50000 == 0:
            torch.save(
                model.state_dict(),
                osj(opt.outPath, '{}/iter_{}.pth'.format(opt.experiment_name, i + 1)))

        if i == opt.num_iter:
            logging.info('end the training')
            sys.exit()
        i += 1
def validation(model, criterion, evaluation_loader, converter, opt):
    """Evaluate a single-head recogniser (CTC or attention) over ``evaluation_loader``.

    Every parameter of ``model`` has ``requires_grad`` set to ``False`` and is
    left that way; a caller that keeps training must switch it back on.

    :param model: called as ``model(image, text_for_pred)`` when ``'CTC'`` is in
        ``opt.Prediction``, otherwise as ``model(image, text_for_pred, is_train=False)``
    :param criterion: CTC-style loss in the CTC branch, otherwise a loss over
        flattened logits and flattened targets
    :param evaluation_loader: iterable of ``(image_tensors, labels)`` batches
    :param converter: provides ``encode(labels)`` and ``decode(indices, lengths)``
    :param opt: ``Prediction`` and ``batch_max_length`` are read
    :return: tuple of
        ``valid_loss_avg.val()``,
        accuracy in percent,
        summed length-normalised edit distance,
        predictions of the last batch,
        labels of the last batch (decoded from the encoded text in the attention branch),
        accumulated forward time in seconds,
        number of samples seen.
        With an empty loader the accuracy is ``0.0`` and both lists are empty.
    """
    for p in model.parameters():
        p.requires_grad = False

    n_correct = 0
    norm_ED = 0
    length_of_data = 0
    infer_time = 0
    valid_loss_avg = Averager()
    # Pre-bound so that an empty loader yields empty lists instead of UnboundLocalError.
    preds_str, labels = [], []

    for image_tensors, labels in evaluation_loader:
        batch_size = image_tensors.size(0)
        length_of_data = length_of_data + batch_size

        with torch.no_grad():
            image = image_tensors.cuda()
            # Decoder input for max-length prediction: all-zero token ids.
            length_for_pred = torch.cuda.IntTensor([opt.batch_max_length] * batch_size)
            text_for_pred = torch.cuda.LongTensor(batch_size, opt.batch_max_length + 1).fill_(0)

        text_for_loss, length_for_loss = converter.encode(labels)

        start_time = time.time()
        if 'CTC' in opt.Prediction:
            preds = model(image, text_for_pred).log_softmax(2)
            forward_time = time.time() - start_time

            # Evaluation loss for the CTC decoder.
            preds_size = torch.IntTensor([preds.size(1)] * batch_size)
            preds = preds.permute(1, 0, 2)  # to use CTCLoss format
            cost = criterion(preds, text_for_loss, preds_size, length_for_loss)

            # Greedy decoding: pick the max-probability index, then map indices to characters.
            _, preds_index = preds.max(2)
            preds_index = preds_index.transpose(1, 0).contiguous().view(-1)
            preds_str = converter.decode(preds_index.data, preds_size.data)
        else:
            preds = model(image, text_for_pred, is_train=False)
            forward_time = time.time() - start_time

            preds = preds[:, :text_for_loss.shape[1] - 1, :]
            target = text_for_loss[:, 1:]  # without [GO] symbol
            cost = criterion(preds.contiguous().view(-1, preds.shape[-1]),
                             target.contiguous().view(-1))

            # Greedy decoding: pick the max-probability index, then map indices to characters.
            _, preds_index = preds.max(2)
            preds_str = converter.decode(preds_index, length_for_pred)
            labels = converter.decode(text_for_loss[:, 1:], length_for_loss)

        infer_time += forward_time
        valid_loss_avg.add(cost)

        # ---- accuracy ----
        for pred, gt in zip(preds_str, labels):
            if 'Attn' in opt.Prediction:
                # Prune everything from the end-of-sentence token ([s]) onwards.
                # partition() keeps the whole string when the token is absent, whereas
                # slicing with find()'s -1 would silently drop the last character.
                pred = pred.partition('[s]')[0]
                gt = gt.partition('[s]')[0]

            if pred == gt:
                n_correct += 1
            # max(..., 1) avoids ZeroDivisionError for an empty ground truth and
            # leaves the value unchanged for every non-empty one.
            norm_ED += edit_distance(pred, gt) / max(len(gt), 1)

    accuracy = n_correct / float(length_of_data) * 100 if length_of_data else 0.0

    return valid_loss_avg.val(), accuracy, norm_ED, preds_str, labels, infer_time, length_of_data
# Script fragment: gather every batch of `propagate_loader` on the GPU, then
# start the epoch loop. The inner training loop is cut off at the end of the
# visible text, so nothing after the last assignment is documented here.
# Starts as a list of per-batch tensors and is rebound to one concatenated tensor.
allExtraData = []
with torch.no_grad():
    for i, batch in enumerate(propagate_loader, 1):
        # Each batch unpacks into two items; both are moved to CUDA, only the first is kept.
        ext_data, _ = [_.cuda() for _ in batch]
        allExtraData.append(ext_data)
    allExtraData = torch.cat(allExtraData)
timer = Timer()
for epoch in range(1, cfg.max_epoch + 1):
    # torch.cuda.empty_cache()
    # NOTE(review): the scheduler is stepped at the start of the epoch, before any
    # optimizer step is visible here — confirm this ordering is intended.
    lr_scheduler.step()
    model.train()
    tl = Averager()
    ta = Averager()
    for i, batch in enumerate(train_loader, 1):
        # time.sleep(100)
        torch.cuda.empty_cache()
        data, _ = [_.cuda() for _ in batch]
        # The first shot * train_way rows form the support ("shot") part, the rest the query part.
        p = cfg.shot * cfg.train_way
        data_shot, data_query = data[:p], data[p:]
        proto = model(data_shot)
        # Reshape to (shot, train_way, -1) and average over the shot axis.
        proto = proto.reshape(cfg.shot, cfg.train_way, -1).mean(dim=0)
        query_proto = model(data_query)
        # `p` is rebound here: from the integer split point to the scaled prototypes.
        p = (1 - cfg.progalambda) * proto
def train(model,
          optimizer,
          src_loader,
          tgt_loader,
          valid_loader,
          criterion,
          log_interval=1000,
          val_interval=50,
          posp=1,
          nagp=0.5,
          params_cls=0.5,
          params_da=0.5,
          da_type='cmmd',
          max_fpr=0.01):
    """Jointly train on a source and a target loader with a domain-adaptation penalty.

    Per step the loss is
    ``params_cls * src_loss + tgt_loss + params_da * lambd * da_loss`` where
    ``lambd = 2 / (1 + exp(-5 * i / len(src_loader))) - 1``. The loop runs for
    ``i`` in ``range(1, len(src_loader) * 20)`` unless ``Stoper.add`` returns a
    truthy value after a validation round. The module-level ``max_auc``,
    ``max_auchead`` and ``min_loss`` are updated, and the whole model is saved to
    ``f'{save_dir}/tmp.pt'`` (``save_dir`` is a module-level name) whenever the
    partial AUC improves.

    :param model: called as ``model(ids, values, seqlength, seq_mask, domain)``;
        returns two values for ``'src'`` and three for ``'tgt'``
    :param optimizer: optimizer stepped once per iteration
    :param src_loader: source-domain loader yielding
        ``(ids, values, seqlength, label, seq_mask)``
    :param tgt_loader: target-domain loader with the same batch layout
    :param valid_loader: loader handed to ``test`` for validation
    :param criterion: element-wise loss called as ``criterion(y, label)``
    :param log_interval: running losses are printed every this many steps
    :param val_interval: validation runs every this many steps
    :param posp: loss weight for positive labels
    :param nagp: loss weight for negative labels
    :param params_cls: weight of the source classification loss
    :param params_da: weight of the domain-adaptation loss
    :param da_type: one of ``'cmmd'``, ``'mmd'``, ``'coral'``, ``'euclidian'``,
        ``'c_euclidian'``, ``'nometric'``, ``'ced'``
    :param max_fpr: forwarded to ``test``
    :raises ValueError: if ``da_type`` is not one of the supported names
    """
    global max_auc
    global max_auchead
    global min_loss

    posp = torch.FloatTensor([posp]).cuda()
    nagp = torch.FloatTensor([nagp]).cuda()
    one = torch.FloatTensor([1]).cuda()

    iter_src = iter(src_loader)
    iter_tgt = iter(tgt_loader)
    num_iter = len(src_loader)

    stoper = Stoper()
    avg_all_loss = Averager()
    avg_src_loss = Averager()
    avg_tgt_loss = Averager()
    avg_da_loss = Averager()

    start_time = time.time()
    for i in range(1, num_iter * 20):
        model.train()

        # Builtin next() is the iterator protocol; the `.next()` method is only a
        # Python-2 compatibility alias and is not available on every iterator.
        src_ids, src_values, src_seqlength, src_label, src_seq_mask = next(iter_src)
        src_ids, src_values, src_label = src_ids.cuda(), src_values.cuda(), src_label.cuda().float()
        src_seq_mask = src_seq_mask.cuda()

        # Restart a loader's iterator each time the step count reaches a multiple of its length.
        if i % len(src_loader) == 0:
            iter_src = iter(src_loader)
        if i % len(tgt_loader) == 0:
            iter_tgt = iter(tgt_loader)

        # Per-sample weight: posp for positives, nagp for negatives.
        src_p = posp * src_label + nagp * (one - src_label)
        src_y, src_fea_LSTM = model(src_ids, src_values, src_seqlength, src_seq_mask, 'src')
        src_loss = torch.mean(src_p * criterion(src_y, src_label))

        tgt_ids, tgt_values, tgt_seqlength, tgt_label, tgt_seq_mask = next(iter_tgt)
        tgt_ids, tgt_values, tgt_label = tgt_ids.cuda(), tgt_values.cuda(), tgt_label.cuda().float()
        tgt_seq_mask = tgt_seq_mask.cuda()

        tgt_p = posp * tgt_label + nagp * (one - tgt_label)
        # tgt_spey is returned by the model but not used in the loss.
        tgt_y, tgt_fea_LSTM, tgt_spey = model(tgt_ids, tgt_values, tgt_seqlength, tgt_seq_mask, 'tgt')
        tgt_loss = torch.mean(tgt_p * criterion(tgt_y, tgt_label))

        # Domain-adaptation penalty between source and target features.
        if da_type == 'cmmd':
            da_loss = cmmd(src_fea_LSTM, tgt_fea_LSTM, src_label.long(), tgt_label.long())
        elif da_type == 'mmd':
            da_loss = mmd_rbf_noaccelerate(src_fea_LSTM, tgt_fea_LSTM)
        elif da_type == 'coral':
            da_loss = coral(src_fea_LSTM, tgt_fea_LSTM)
        elif da_type == 'euclidian':
            da_loss = euclidian(src_fea_LSTM, tgt_fea_LSTM)
        elif da_type == 'c_euclidian':
            da_loss = c_euclidian(src_fea_LSTM, tgt_fea_LSTM, src_label.long(), tgt_label.long())
        elif da_type == 'nometric':
            da_loss = nometric(src_fea_LSTM, tgt_fea_LSTM)
        elif da_type == 'ced':
            da_loss = ced(src_fea_LSTM, tgt_fea_LSTM, src_label.long(), tgt_label.long())
        else:
            # Without this branch an unknown name surfaces later as an
            # UnboundLocalError on `da_loss`, which hides the real cause.
            raise ValueError(f'unsupported da_type: {da_type!r}')

        # Ramp the adaptation weight from ~0 towards 1 as training progresses.
        lambd = 2 / (1 + math.exp((-5 * i) / (len(src_loader)))) - 1
        loss = params_cls * src_loss + tgt_loss + params_da * lambd * da_loss

        model.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        avg_all_loss.add(loss.item())
        avg_src_loss.add(src_loss.item())
        avg_tgt_loss.add(tgt_loss.item())
        avg_da_loss.add(da_loss.item())

        if (i + 1) % log_interval == 0:
            print(
                'step: {}, loss: {:.4f}, src_loss: {:.4f}, tgt_loss: {:.4f}, {}_loss:, {:.4f}, lambda: {}'
                .format(i + 1, avg_all_loss.item(), avg_src_loss.item(),
                        avg_tgt_loss.item(), da_type, avg_da_loss.item(), lambd))
            # Start fresh running averages for the next logging window.
            avg_all_loss = Averager()
            avg_src_loss = Averager()
            avg_tgt_loss = Averager()
            avg_da_loss = Averager()

        if (i + 1) % val_interval == 0:
            end_time = time.time()
            print('train time (s):', end_time - start_time)
            start_time = time.time()

            # `loss` is rebound to the validation loss from here on.
            auc_head, loss, auc = test(model, valid_loader, criterion, posp, max_fpr)
            if loss < min_loss:
                min_loss = loss
            if auc > max_auc:
                max_auc = auc
            if auc_head > max_auchead:
                torch.save(model, f'{save_dir}/tmp.pt')
                max_auchead = auc_head
            print(
                'dev --- auchead: {:.4f}, max_auchead: {:.4f}, auc: {:.4f}, max_auc: {:.4f}, loss: {:.4f}, minloss: {:.4f}'
                .format(auc_head, max_auchead, auc, max_auc, loss, min_loss))

            end_time = time.time()
            print('dev time (s):', end_time - start_time)
            start_time = time.time()

            if stoper.add(auc_head):
                print('training end')
                break
def transform(self, stu, que, ans) -> pd.DataFrame:
    """
    Main method to calculate and preprocess students' features and append textual embeddings.

    The three input frames are modified in place: ``stu`` gains
    ``students_state``, ``que`` gains ``questions_body_length`` and ``ans`` gains
    ``answers_body_length``.

    :param stu: students dataframe with preprocessed textual columns
    :param que: questions dataframe with preprocessed textual columns
    :param ans: answers dataframe with preprocessed textual columns
    :return: dataframe of student id, timestamp and model-friendly student
        features as of that timestamp (columns ``students_id``,
        ``students_time`` followed by ``self.features['all']``)
    """
    stu['students_state'] = stu['students_location'].apply(lambda s: str(s).split(', ')[-1])
    que['questions_body_length'] = que['questions_body'].apply(lambda s: len(str(s)))
    ans['answers_body_length'] = ans['answers_body'].apply(lambda s: len(str(s)))

    # prepare all the dataframes needed for iteration
    que_change = stu.merge(que, left_on='students_id', right_on='questions_author_id')
    ans_change = que_change.merge(ans, left_on='questions_id', right_on='answers_question_id') \
        .rename(columns={'answers_date_added': 'students_time'})

    # add a column telling which kind of change each stacked row corresponds to
    ans_change['change_type'] = 'answer'
    que_change['change_type'] = 'question'
    que_change = que_change.rename(columns={'questions_date_added': 'students_time'})

    # stack the two frames into one, ordered by the time of the change
    df = pd.concat([que_change, ans_change], ignore_index=True, sort=True).sort_values('students_time')

    # data maps a student's id to the list of his feature snapshots;
    # each snapshot is a dict mapping feature name to its value at a particular moment
    data = {}
    avgs = {}

    for _, row in stu.iterrows():
        cur_stu = row['students_id']

        # DEFAULT CASE
        # student's feature values before he asked any question
        if cur_stu not in data:
            new = {
                'students_questions_asked': 0,
                'students_previous_question_time': row['students_date_joined']
            }
            for feature in ['students_time'] + self.features['numerical']['mean']:
                new[feature] = None
            data[cur_stu] = [new]
            avgs[cur_stu] = {feature: Averager() for feature in self.features['numerical']['mean']}

    for _, row in df.iterrows():
        cur_stu = row['students_id']

        # start from the snapshot at the previous timestamp
        prv = data[cur_stu][-1]
        new = prv.copy()
        new['students_time'] = row['students_time']

        # UPDATE RULES
        if row['change_type'] == 'question':
            # new question: update question-dependent features
            new['students_questions_asked'] += 1
            # On question rows the question's date lives in 'students_time':
            # 'questions_date_added' was renamed on the question frame before the
            # concat, so reading it here would always yield a missing value.
            new['students_previous_question_time'] = row['students_time']
            new['students_average_question_body_length'] = row['questions_body_length']
            averaged = ['students_average_question_body_length']
        else:
            # new answer: update answer-dependent features
            new['students_average_answer_body_length'] = row['answers_body_length']
            new['students_average_answer_amount'] = new['students_average_answer_amount'] + 1 \
                if new['students_average_answer_amount'] is not None else 1
            averaged = ['students_average_answer_body_length', 'students_average_answer_amount']

        # NORMALIZE AVERAGE FEATURES
        # feed the raw value into the student's averager and store what it reports
        for feature in averaged:
            avgs[cur_stu][feature].upd(new[feature])
            new[feature] = avgs[cur_stu][feature].get()

        data[cur_stu].append(new)

    # construct a DataFrame out of the dict of lists of feature dicts
    df = pd.DataFrame([{**f, **{'students_id': stu_id}}
                       for (stu_id, fs) in data.items() for f in fs])
    df = df.merge(stu, on='students_id')

    # launch feature pre-processing (its return value is not used)
    self.preprocess(df)

    # re-order the columns
    df = df[['students_id', 'students_time'] + self.features['all']]
    return df