def compute_pivot_rep(model_dir, input_doc_list, param_fpathin_voca2index,
                      mtype='TopicalWordEmbedding'):
    '''
    given a list of documents, transfer the documents into instances,
    enumerate the instances and compute the pivot representations.
    ====================
    params:
    ----------
    model_dir: saved model dir
    input_doc_list: input documents, unparsed
    param_fpathin_voca2index: path of the vocabulary-to-index json
    mtype: model name
    return:
    ----------
    (pivot word list, rep list, topic rep list)
    '''
    # ----------load the voca2index
    # fpointerInVocabulary2Index = open(
    #     param_fpathin_voca2index,
    #     'rt',
    #     encoding='utf8')
    # dictVocabulary2Index = \
    #     json.load(fpointerInVocabulary2Index)
    # fpointerInVocabulary2Index.close()
    # ----------get a list of (pivot word, xn, wc)
    oYelpPreprocessor = YelpPreprocessor()
    parsed_list = oYelpPreprocessor.yelpDoclist2Parsedlist(
        paramDocList=input_doc_list,
        paramFpathInVocabulary2Index=param_fpathin_voca2index)
    (list_pivot, list_xn, list_wc) = zip(*parsed_list)
    # ----------load the trained model
    config = DefaultConfig()
    config.set_attrs({'batch_size': len(list_pivot)})
    model_path = '%s/model' % model_dir
    model = topicalWordEmbedding.TopicalWordEmbedding(
        param_on_cuda=config.on_cuda,
        param_half_window_size=HALF_WINDOW_SIZE,
        param_vocabulary_size=VOCABULARY_SIZE,
        param_hidden_layer_size=HIDDEN_LAYER_SIZE,
        param_encoder_pi_size=DIM_ENCODER,
        param_topic_count=TOPIC_COUNT)
    print('Loading trained model')
    if config.on_cuda:
        model.load(model_path)
        model = model.cuda()
    else:
        model.load_cpu_from_gputrained(model_path)
        model = model.cpu()
    # ----------compute the representation list
    # scatter the context-word indices of each instance into a
    # bag-of-words row over the vocabulary
    arr_xn = numpy.zeros((len(list_xn), VOCABULARY_SIZE), dtype=numpy.int32)
    for list_xn_linenum, list_xn_vocabindex in enumerate(list_xn):
        arr_xn[list_xn_linenum, list_xn_vocabindex] += 1
    arr_xn = arr_xn.astype(numpy.float32)
    arr_wc = numpy.array(list_wc).astype(numpy.float32)
    if config.on_cuda:
        var_xn = Variable(torch.from_numpy(arr_xn)).cuda()
        var_wc = Variable(torch.from_numpy(arr_wc)).cuda()
    else:
        var_xn = Variable(torch.from_numpy(arr_xn)).cpu()
        var_wc = Variable(torch.from_numpy(arr_wc)).cpu()
    var_rep = model.forward_obtain_xn_rep(var_xn, var_wc)
    var_zeta = model.forward_obtain_xn_zeta(var_xn, var_wc)
    arr_rep = var_rep.data.cpu().numpy()
    arr_zeta = var_zeta.data.cpu().numpy()
    return list_pivot, arr_rep, arr_zeta
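

# Illustrative usage sketch (an assumption, not part of the original
# pipeline): given a model directory saved under SAVE_DIR and the
# vocabulary-to-index json produced at preprocessing time,
# compute_pivot_rep returns one pivot word, one dense representation and
# one topic-proportion vector per extracted context window. The save id
# and the json path below are hypothetical placeholders.
def _demo_compute_pivot_rep():
    docs = ['the pizza was great but the service was slow',
            'friendly staff and very quick delivery']
    list_pivot, arr_rep, arr_zeta = compute_pivot_rep(
        model_dir='%s/0' % SAVE_DIR,  # hypothetical save id 0
        input_doc_list=docs,
        param_fpathin_voca2index='vocabulary2index.json')  # hypothetical path
    for pivot, rep, zeta in zip(list_pivot, arr_rep, arr_zeta):
        # rep: pivot representation; zeta: topic proportions of its context
        print(pivot, rep.shape, zeta.shape)

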
def train(**kwargs):
    '''
    begin training the model

    *args:    train(1, 2, 3, 4, 5) => args == (1, 2, 3, 4, 5),
              the extra positional arguments are collected into a tuple
    **kwargs: train(a=1, b=2, c=3, d=4) =>
              kwargs == {'a': 1, 'b': 2, 'c': 3, 'd': 4},
              the extra keyword arguments are collected into a dict
    a function taking both must be declared as:
              def train(arg, *args, **kwargs)
    here each keyword argument overrides the corresponding attribute of
    DefaultConfig (see config.set_attrs(kwargs) below);
    a usage sketch is given after this function.
    '''
    saveid = latest_save_num() + 1
    # the save_path is SAVE_DIR/<saveid>
    save_path = '%s/%d' % (SAVE_DIR, saveid)
    print("logger save path: %s" % (save_path))
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    log_path_each_save = '%s/log.txt' % save_path
    model_path_each_save = '%s/model' % save_path
    logger = get_logger(log_path_each_save)
    config = DefaultConfig()
    # settings here, also about whether to run on cuda
    config.set_attrs(kwargs)
    # print(config.get_attrs())
    epochs = config.epochs
    batch_size = config.batch_size
    # determine whether to run on cuda
    if config.on_cuda:
        config.on_cuda = torch.cuda.is_available()
        if not config.on_cuda:
            logger.info('Cuda is unavailable; although the model was '
                        'requested on cuda, it will still run on CPU')
    # 300 in our model; 1024 is the Elmo size.
    # The concatenated hidden size is supposed to match the Elmo size,
    # however, any size is OK; it depends on the setting.
    # The attention size should be a smoothed representation of the
    # character embedding.
    if config.model == 'TopicalWordEmbedding':
        model = topicalWordEmbedding.TopicalWordEmbedding(
            param_on_cuda=config.on_cuda,
            param_half_window_size=HALF_WINDOW_SIZE,
            param_vocabulary_size=VOCABULARY_SIZE,
            param_hidden_layer_size=HIDDEN_LAYER_SIZE,
            param_encoder_pi_size=DIM_ENCODER,
            param_topic_count=TOPIC_COUNT)
    if config.on_cuda:
        logger.info('Model run on GPU')
        model = model.cuda()
        logger.info('Model initialized on GPU')
    else:
        logger.info('Model run on CPU')
        model = model.cpu()
        logger.info('Model initialized on CPU')
    # print('logger-setted', file=sys.stderr)
    # output the string information to the log
    logger.info(model.modelname)
    logger.info(str(config.get_attrs()))
    # read in the trainset and the trial set
    # Train Set
    train_data_manager = DataManager(batch_size, TRAINING_INSTANCES)
    train_data_manager.load_dataframe_from_file(TRAIN_SET_PATH)
    # set the optimizer parameters,
    # such as learning rate and weight_decay;
    # Adam is a method for stochastic optimization
    # load the learning rate from config, that is settings.py
    lr = config.learning_rate
    # params_iterator_requires_grad can only be iterated once
    params_iterator_requires_grad = filter(
        lambda trainingParams: trainingParams.requires_grad,
        model.parameters())
    # print(len(list(params_iterator_requires_grad)))  # 25 parameters
    # weight decay is the L2 penalty (L2 regularization),
    # usually added to a cost (loss) function,
    # for example C = C_0 + penalty, to avoid overfitting
    optimizer = torch.optim.Adam(
        params_iterator_requires_grad,
        lr=lr,
        weight_decay=config.weight_decay)
    # By default, the losses are averaged over observations
    # for each minibatch.
    # However, if the field size_average is set to False,
    # the losses are instead summed for each minibatch.
    # The CrossEntropyLoss:
    # my selector in my notebook = loss + selecting strategy
    # (often selecting the least loss)
    # criterion = torch.nn.CrossEntropyLoss(size_average=False)
    # once you have the loss function, you also have
    # to train the parameters in g(x),
    # which will be used for prediction
    # the loss calculated after the smoothing method,
    # that is the L2 penalty mentioned in torch.optim.Adam
    loss_meter = meter.AverageValueMeter()
    # get the confusion matrix, which looks as follows:
    # confusion_matrix = meter.ConfusionMeter(CLASS_COUNT)
    '''
                         class1 pred  class2 pred  class3 pred
    class1 ground truth  [[4,          1,           1]
    class2 ground truth   [2,          3,           1]
    class3 ground truth   [1,          2,           9]]
    '''
    model.train()
    # pre_loss = 1e100
    # best_acc = 0
    smallest_loss = 0x7fffffffffffffffffffffffffffffff
    for epoch in range(epochs):
        '''
        an epoch, that is, train on all batches (all the data) once
        '''
        loss_meter.reset()
        # confusion_matrix.reset()
        train_data_manager.reshuffle_dataframe()
        # it was ceiled, so it is "instances / batch_size + 1"
        n_batch = train_data_manager.n_batches()
        batch_index = 0
        for batch_index in range(0, n_batch - 1):
            # this operation is time consuming
            xn, wc = train_data_manager.next_batch()
            # long seems to trigger a cuda error, it cannot handle long
            # Variable by default has requires_grad=False
            # t = torch.Tensor(1)
            # t.to(torch.float32) <=> t.float()
            # t.to(torch.int64) <=> t.long()
            var_xn = Variable(torch.from_numpy(xn).float())
            # print(x.size())
            var_wc = Variable(torch.from_numpy(wc).float(),
                              requires_grad=False)
            # y = y - 1
            # print(y.size())
            # logger.info('Begin fetching a batch')
            loss = eval_batch(model, var_xn, var_wc, config.on_cuda)
            # logger.info('End fetching a batch, begin optimizer')
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # logger.info('End optimizer')
            # .data is the tensor;
            # on a 0-dim tensor .item() gets the python number,
            # on a 1-dim tensor .tolist() gets a list
            loss_meter.add(loss.data.item())
            # confusion_matrix.add(scores.data, y.data)
            # every 200 batches, log the running loss of the epoch
            if (batch_index + 1) % 200 == 0:
                # for 2 LongTensors, 17 / 18 = 0
                # accuracy = corrects.float() / config.batch_size
                # .value()[0] is the loss value
                logger.info('TRAIN\tepoch: %d/%d\tbatch: %d/%d\tloss: %f' % (
                    epoch, epochs, batch_index, n_batch,
                    loss_meter.value()[0]))
        # abandon the tail batch, because it will trigger a duplicate
        # context window thus causing loss == nan
        if TRAINING_INSTANCES % batch_size == 0:
            train_data_manager.set_current_cursor_in_dataframe_zero()
        else:
            # train_data_manager.set_current_cursor_in_dataframe_zero()
            print('!!!!!!!!!!!Enter tail batch')
            # the value of batch_index is inherited from the loop above
            batch_index += 1
            (xn, wc) = train_data_manager.tail_batch_nobatchpadding()
            # long seems to trigger a cuda error
            # t = torch.Tensor(1)
            # t.to(torch.float32) <=> t.float()
            # t.to(torch.int64) <=> t.long()
            var_xn = Variable(torch.from_numpy(xn).float())
            var_wc = Variable(torch.from_numpy(wc).float(),
                              requires_grad=False)
            # y = y - 1
            loss = eval_batch(model, var_xn, var_wc, config.on_cuda)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            loss_meter.add(loss.data.item())
            # confusion_matrix.add(scores.data, y.data)
            # every 200 batches, log the running loss of the epoch
            if (batch_index + 1) % 200 == 0:
                # for 2 LongTensors, 17 / 18 = 0
                # accuracy = corrects.float() / config.batch_size
                # print("accuracy = %f, corrects = %d" % (accuracy, corrects))
                # .value()[0] is the loss value
                # y = Variable(torch.LongTensor(y), requires_grad=False)
                logger.info('TRAIN\tepoch: %d/%d\tbatch: %d/%d\tloss: %f' % (
                    epoch, epochs, batch_index, n_batch,
                    loss_meter.value()[0]))
            # print('!!!!!!!!!!!Exit tail batch')
        # after an epoch the model should be evaluated
        # switch to evaluation mode
        model.eval()
        # if (batch_index + 1) % 25 == 0:
        # every 50 batches peek the accuracy and keep the best accuracy
        # confusion_matrix_value = confusion_matrix.value()
        # acc = 0
        # for i in range(CLASS_COUNT):
        #     acc += confusion_matrix_value[i][i]  # correct prediction count
        # the overall accuracy in an epoch
        # acc = acc / confusion_matrix_value.sum()
        # a 1-dim value with length 1,
        # so you have to access the element by [0]
        # loss_meter.value() = (mean, std); mean is the average among batches
        the_overall_averaged_loss_in_epoch = loss_meter.value()[0]
        logger.info('epoch: %d/%d\taverage_loss: %f' % (
            epoch, epochs, the_overall_averaged_loss_in_epoch))
        # switch back to training mode
        model.train()
        # if the epoch loss decreased, save the model and
        # decay the learning rate
        if loss_meter.value()[0] < smallest_loss:
            # save the model
            model.save(model_path_each_save)
            logger.info('model saved to %s' % model_path_each_save)
            # change the learning rate
            if epoch < 4:
                lr = lr * config.lr_decay
            else:
                if epoch < 8:
                    lr = lr * 0.97
                else:
                    lr = lr * 0.99
            logger.info('learning_rate changed to %f' % lr)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
            smallest_loss = loss_meter.value()[0]
        else:
            print('the loss_meter = ', loss_meter.value()[0])
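

# Illustrative usage sketch (an assumption, not part of the original module):
# train() takes its hyper-parameters as keyword arguments, each of which
# overrides the corresponding attribute of DefaultConfig through
# config.set_attrs(kwargs). The attribute names used below (epochs,
# batch_size, learning_rate, on_cuda) are the ones train() itself reads;
# the concrete values are made up for illustration.
def _demo_train():
    # equivalent to training with DefaultConfig except for these overrides;
    # on_cuda=True still falls back to CPU when cuda is unavailable
    train(epochs=5,
          batch_size=128,
          learning_rate=1e-3,
          on_cuda=True)

# Note on the learning-rate schedule inside train(): whenever the epoch loss
# improves, lr is multiplied by config.lr_decay for epochs 0-3, by 0.97 for
# epochs 4-7, and by 0.99 afterwards.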