def aggr_and_output_all_word_zeta(model_dir,
                                  param_fpathin_index2vocab,
                                  param_fpathout_aggrd_all_wordrep,
                                  mtype='TopicalWordEmbedding'):
    '''
    Aggregate the topic vector (zeta) of every pivot word over the
    training set (read from TRAIN_SET_PATH) and write the result to a file.
    ====================
    params:
    ----------
    model_dir: saved model dir
    param_fpathin_index2vocab: path of the index-to-vocabulary json file
    param_fpathout_aggrd_all_wordrep: output path for the aggregated
        word representations
    mtype: model name

    return:
    ----------
    None
    '''
    # ----------load the index2vocabulary
    fpointerInIndex2Vocabulary = open(param_fpathin_index2vocab,
                                      'rt',
                                      encoding='utf8')
    dictIndex2Vocab = \
        json.load(fpointerInIndex2Vocabulary)
    fpointerInIndex2Vocabulary.close()
    config = DefaultConfig()

    batch_size = config.batch_size

    # ----------Compute the wordrep
    dictIndex2Wordvec = dict()
    for i in range(VOCABULARY_SIZE):
        dictIndex2Wordvec[i] = numpy.zeros(shape=TOPIC_COUNT,
                                           dtype=numpy.float32)

    # determine whether to run on cuda
    if config.on_cuda:
        config.on_cuda = torch.cuda.is_available()
        if not config.on_cuda:
            logger.info('CUDA is unavailable although the config asks '
                        'for it; the model will run on the CPU instead')

    model_path = '%s/model' % model_dir

    if config.model == 'TopicalWordEmbedding':
        model = topicalWordEmbedding.TopicalWordEmbedding(
            param_on_cuda=config.on_cuda,
            param_half_window_size=HALF_WINDOW_SIZE,
            param_vocabulary_size=VOCABULARY_SIZE,
            param_hidden_layer_size=HIDDEN_LAYER_SIZE,
            param_encoder_pi_size=DIM_ENCODER,
            param_topic_count=TOPIC_COUNT)

    print('Loading trained model')
    if config.on_cuda:
        model.load(model_path)
        model = model.cuda()
    else:
        model.load_cpu_from_gputrained(model_path)
        model = model.cpu()

    train_data_manager = DataManager(batch_size, TRAINING_INSTANCES)
    train_data_manager.load_dataframe_from_file(TRAIN_SET_PATH)
    n_batch = train_data_manager.n_batches()
    batch_index = 0
    for batch_index in range(0, n_batch - 1):
        # this operation is time consuming
        xn, wc = train_data_manager.next_batch()
        idx = numpy.argmax(xn, axis=1)
        if config.on_cuda:
            var_xn = Variable(torch.from_numpy(xn).float()).cuda()
            # print( x.size() )
            var_wc = Variable(torch.from_numpy(wc).float(),
                              requires_grad=False).cuda()
        else:
            var_xn = Variable(torch.from_numpy(xn).float()).cpu()
            # print( x.size() )
            var_wc = Variable(torch.from_numpy(wc).float(),
                              requires_grad=False).cpu()
        var_zeta = model.forward_obtain_xn_zeta(var_xn, var_wc)
        # var_zeta_softmaxd = softmax(var_zeta, dim=1)
        arr_zeta = var_zeta.data.cpu().numpy()
        for row_idx, pivot_idx in enumerate(idx):
            pivot_zeta = arr_zeta[row_idx]
            dictIndex2Wordvec[pivot_idx] += pivot_zeta
            # += pivot_rep
        for pivot_idx in idx:
            dictIndex2Wordvec[pivot_idx] = softmax_np(
                dictIndex2Wordvec[pivot_idx])
        # y = y - 1
        # print(y.size())

    # reset the cursor regardless of whether a tail batch exists
    train_data_manager.set_current_cursor_in_dataframe_zero()

    # ----------Output the dict
    fpointerOutWordRep = open(param_fpathout_aggrd_all_wordrep,
                              'wt',
                              encoding='utf8')
    for an_word_idx in dictIndex2Wordvec:
        arr_word_rep = dictIndex2Wordvec[an_word_idx]
        arr_word_rep = arr_word_rep.astype(dtype=str)
        str_word_rep = ' '.join(arr_word_rep)
        str_vocab = dictIndex2Vocab[str(an_word_idx)]
        str4output = str_vocab + ' ' + str_word_rep + '\n'
        fpointerOutWordRep.write(str4output)
    fpointerOutWordRep.close()
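
# softmax_np is used above but not defined in this listing; the sketch
# below is an assumed, numerically stable NumPy version matching how it
# is called here (a 1-D array in, a 1-D probability vector out), relying
# on the module's existing numpy import.
def softmax_np(param_arr):
    # shift by the max before exponentiating to avoid overflow
    arr_shifted = param_arr - numpy.max(param_arr)
    arr_exp = numpy.exp(arr_shifted)
    return arr_exp / numpy.sum(arr_exp)
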
def subst_compute_originalpivot_rep(
        model_dir,
        fpathIn_instances,
        fpathIn_labels,
        fpathOut_topcandidate,
        param_fpathin_voca2index,
        param_fpathin_subst_voca2index,
        mtype='TopicalWordEmbedding'):
    '''
    For each line in fpathIn_instances, substitute every candidate pivot
    from the candidate dictionary, compute the representations, and write
    the closest substitutes for the original pivot to fpathOut_topcandidate.
    ====================
    params:
    ----------
    model_dir: saved model dir
    fpathIn_instances: input filepath, parsed instances
    fpathIn_labels: input filepath whose first token per line is the pivot
    fpathOut_topcandidate: output filepath for the best candidate pivots
    param_fpathin_voca2index: the vocabulary-to-index dictionary
    param_fpathin_subst_voca2index: the dictionary of candidate substitutes
    mtype: model name

    return:
    ----------
    None, the result is written to fpathOut_topcandidate
    '''

    # ----------load the subst_voca2index
    fpointerInSubstVocabulary2Index = open(
        param_fpathin_subst_voca2index,
        'rt',
        encoding='utf8')
    dictSubstVocabulary2Index = \
        json.load(fpointerInSubstVocabulary2Index)
    fpointerInSubstVocabulary2Index.close()

    # ---------- load the instance list
    fpointerIn_instances = open(fpathIn_instances, 'rt', encoding='utf8')
    list_instances = list(map(str.strip, fpointerIn_instances.readlines()))
    fpointerIn_instances.close()

    # ---------- load the pivot list
    fpointerIn_labels = open(fpathIn_labels, 'rt', encoding='utf8')
    list_pivots = list(map(str.strip, fpointerIn_labels.readlines()))
    for idx, aline_in_pivot in enumerate(list_pivots):
        list_pivots[idx] = aline_in_pivot.split(' ')[0]
    fpointerIn_labels.close()

    # ----------load the trained model
    config = DefaultConfig()
    # config.set_attrs({'batch_size': len(list_pivot)})
    model_path = '%s/model' % model_dir

    model = topicalWordEmbedding.TopicalWordEmbedding(
        param_on_cuda=config.on_cuda,
        param_half_window_size=HALF_WINDOW_SIZE,
        param_vocabulary_size=VOCABULARY_SIZE,
        param_hidden_layer_size=HIDDEN_LAYER_SIZE,
        param_encoder_pi_size=DIM_ENCODER,
        param_topic_count=TOPIC_COUNT)
    print('Loading trained model')

    if config.on_cuda:
        model.load(model_path)
        model = model.cuda()
    else:
        model.load_cpu_from_gputrained(model_path)
        model = model.cpu()

    # ----------iterate over each instance to find the best, and output
    fpointerOut_topcandidate = open(
        fpathOut_topcandidate, 'wt', encoding='utf8')
    for idx_instance, a_candidate_instance in enumerate(list_instances):
        a_candidate_pivot = list_pivots[idx_instance]
        list_possible_instances = [a_candidate_instance]
        list_possible_pivots = [a_candidate_pivot]

        head_start_index = a_candidate_instance.find('<head>') + 6
        head_end_index = a_candidate_instance.find('</head>')

        for a_possible_pivot in dictSubstVocabulary2Index:
            if a_possible_pivot == a_candidate_pivot:
                continue

            a_possible_instance = a_candidate_instance[:head_start_index]\
                + a_possible_pivot + a_candidate_instance[head_end_index:]
            list_possible_instances.append(a_possible_instance)
            list_possible_pivots.append(a_possible_pivot)

        # ----------get a list of (pivot word, xn, wc)
        parsed_list = yelpDoclist2Parsedlist_noTokenize(
            paramDocList=list_possible_instances,
            paramPivotList=list_possible_pivots,
            paramFpathInVocabulary2Index=param_fpathin_voca2index)
        # print(len(parsed_list))

        (list_pivot, list_xn, list_wc) = zip(*parsed_list)
        # print(list_pivot)

        # ----------compute the representation list
        arr_xn = numpy.zeros((len(list_xn),
                              VOCABULARY_SIZE),
                             dtype=numpy.int32)
        for list_xn_linenum, list_xn_vocabindex in enumerate(list_xn):
            arr_xn[list_xn_linenum, list_xn_vocabindex] += 1
        arr_xn = arr_xn.astype(numpy.float32)
        arr_wc = numpy.array(list_wc).astype(numpy.float32)
        if config.on_cuda:
            var_xn = Variable(torch.from_numpy(arr_xn)).cuda()
            var_wc = Variable(torch.from_numpy(arr_wc)).cuda()
        else:
            var_xn = Variable(torch.from_numpy(arr_xn)).cpu()
            var_wc = Variable(torch.from_numpy(arr_wc)).cpu()
        var_rep = model.forward_obtain_xn_rep(var_xn, var_wc)
        # var_zeta = model.forward_obtain_xn_zeta(var_xn, var_wc)
        arr_rep = var_rep.data.cpu().numpy()
        # arr_zeta = var_zeta.data.cpu().numpy()

        best_pivots, best_reps = find_the_best_possible_pivot(
            list_pivot, arr_rep)
        # print(list_possible_instances)
        # break
        # print(list_pivot[0], best_pivots)
        fpointerOut_topcandidate.write(' '.join(best_pivots) + '\n')

    fpointerOut_topcandidate.close()
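
# A minimal illustration (hypothetical sentence and substitute pivot, not
# project data) of the <head>...</head> splice performed inside the loop of
# subst_compute_originalpivot_rep above:
def _demo_head_substitution():
    instance = 'the food was <head>great</head> overall'
    start = instance.find('<head>') + 6   # index just past '<head>'
    end = instance.find('</head>')        # index of '</head>'
    # returns 'the food was <head>excellent</head> overall'
    return instance[:start] + 'excellent' + instance[end:]
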
def display_sorted_topic_matrix(model_dir,
                                param_fpathin_index2vocab,
                                mtype='TopicalWordEmbedding'):
    '''
    Get the topic matrix; for each topic, pair each vocabulary index with
    its weight, sort, and print the top 50 words mapped back to vocabulary.
    ====================
    params:
    ----------
    model_dir: saved model dir
    param_fpathin_index2vocab: path of the index-to-vocabulary json file
    mtype: model name

    return:
    ----------
    None, output to the console
    '''
    # ----------load the index2vocab
    fpointerInIndex2Vocabulary = open(param_fpathin_index2vocab,
                                      'rt',
                                      encoding='utf8')
    dictIndex2Vocabulary = \
        json.load(fpointerInIndex2Vocabulary)
    fpointerInIndex2Vocabulary.close()

    # ----------load the trained model
    config = DefaultConfig()
    # config.set_attrs({'batch_size': len(list_pivot)})
    model_path = '%s/model' % model_dir

    model = topicalWordEmbedding.TopicalWordEmbedding(
        param_on_cuda=config.on_cuda,
        param_half_window_size=HALF_WINDOW_SIZE,
        param_vocabulary_size=VOCABULARY_SIZE,
        param_hidden_layer_size=HIDDEN_LAYER_SIZE,
        param_encoder_pi_size=DIM_ENCODER,
        param_topic_count=TOPIC_COUNT)
    print('Loading trained model')

    if config.on_cuda:
        model.load(model_path)
        model = model.cuda()
    else:
        model.load_cpu_from_gputrained(model_path)
        model = model.cpu()

    # ----------get the topic matrix
    var_topic_matrix = model.vae_decoder.MATRIX_decoder_beta
    arr_topic_matrix = var_topic_matrix.data.cpu().numpy()
    itemgetter_1 = operator.itemgetter(1)
    for topic_index in range(TOPIC_COUNT):
        list_voca = list(range(VOCABULARY_SIZE))
        list_topicvoca = arr_topic_matrix[topic_index, :].tolist()
        # concatenate
        list_voca_topicvoca = list(zip(list_voca, list_topicvoca))
        list_voca_topicvoca.sort(key=itemgetter_1, reverse=True)
        (list_voca, list_topicvoca) = zip(*list_voca_topicvoca)
        top_list_voca = list_voca[0:50]
        top_list_voca_mapped = [
            dictIndex2Vocabulary[str(i)] for i in top_list_voca
        ]
        print(top_list_voca_mapped)

        list_voca = None
        list_topicvoca = None
        list_voca_topicvoca = None

    return None
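
# An equivalent, more compact route (a sketch using numpy.argsort, not the
# author's code) to the per-topic top-k vocabulary indices that the
# zip/sort/unzip round trip above computes:
def top_indices_per_topic(param_arr_topic_matrix, param_top_k=50):
    # argsort ascending along the vocabulary axis, reverse, keep top-k
    arr_order = numpy.argsort(param_arr_topic_matrix, axis=1)[:, ::-1]
    return arr_order[:, :param_top_k]
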
def compute_pivot_rep(model_dir,
                      input_doc_list,
                      param_fpathin_voca2index,
                      mtype='TopicalWordEmbedding'):
    '''
    Given a list of documents, turn the documents into instances,
    enumerate the instances and compute the pivot representations.
    ====================
    params:
    ----------
    model_dir: saved model dir
    input_doc_list: input documents, unparsed
    param_fpathin_voca2index: path of the vocabulary-to-index json file
    mtype: model name

    return:
    ----------
    (pivot word list, rep list, topic rep list)
    '''

    # ----------load the voca2index
    # fpointerInVocabulary2Index = open(
    #     param_fpathin_voca2index,
    #     'rt',
    #     encoding='utf8')
    # dictVocabulary2Index = \
    #     json.load(fpointerInVocabulary2Index)
    # fpointerInVocabulary2Index.close()

    # ----------get a list of (pivot word, xn, wc)
    oYelpPreprocessor = YelpPreprocessor()
    parsed_list = oYelpPreprocessor.yelpDoclist2Parsedlist(
        paramDocList=input_doc_list,
        paramFpathInVocabulary2Index=param_fpathin_voca2index)

    (list_pivot, list_xn, list_wc) = zip(*parsed_list)
    # ----------load the trained model
    config = DefaultConfig()
    config.set_attrs({'batch_size': len(list_pivot)})
    model_path = '%s/model' % model_dir

    model = topicalWordEmbedding.TopicalWordEmbedding(
        param_on_cuda=config.on_cuda,
        param_half_window_size=HALF_WINDOW_SIZE,
        param_vocabulary_size=VOCABULARY_SIZE,
        param_hidden_layer_size=HIDDEN_LAYER_SIZE,
        param_encoder_pi_size=DIM_ENCODER,
        param_topic_count=TOPIC_COUNT)
    print('Loading trained model')

    if config.on_cuda:
        model.load(model_path)
        model = model.cuda()
    else:
        model.load_cpu_from_gputrained(model_path)
        model = model.cpu()

    # ----------compute the representation list
    arr_xn = numpy.zeros((len(list_xn), VOCABULARY_SIZE), dtype=numpy.int32)
    for list_xn_linenum, list_xn_vocabindex in enumerate(list_xn):
        arr_xn[list_xn_linenum, list_xn_vocabindex] += 1
    arr_xn = arr_xn.astype(numpy.float32)
    arr_wc = numpy.array(list_wc).astype(numpy.float32)
    if config.on_cuda:
        var_xn = Variable(torch.from_numpy(arr_xn)).cuda()
        var_wc = Variable(torch.from_numpy(arr_wc)).cuda()
    else:
        var_xn = Variable(torch.from_numpy(arr_xn)).cpu()
        var_wc = Variable(torch.from_numpy(arr_wc)).cpu()
    var_rep = model.forward_obtain_xn_rep(var_xn, var_wc)
    var_zeta = model.forward_obtain_xn_zeta(var_xn, var_wc)
    arr_rep = var_rep.data.cpu().numpy()
    arr_zeta = var_zeta.data.cpu().numpy()

    return list_pivot, arr_rep, arr_zeta
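
# A hedged usage sketch for compute_pivot_rep; the directory and file below
# are placeholders, not paths taken from this project:
#
#     list_pivot, arr_rep, arr_zeta = compute_pivot_rep(
#         model_dir='save/0',
#         input_doc_list=['the food was great overall'],
#         param_fpathin_voca2index='voca2index.json')
#     # arr_rep: one encoder representation per pivot word
#     # arr_zeta: one topic vector of length TOPIC_COUNT per pivot word
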
def output_sorted_topic_matrix(model_dir,
                               param_fpathin_index2vocab,
                               param_fpathout_topic_matrix,
                               mtype='TopicalWordEmbedding'):
    '''
    Get the topic matrix; for each topic, sort the vocabulary by weight,
    drop tokens that contain digits or punctuation, and write the top 10
    remaining words to a file.
    ====================
    params:
    ----------
    model_dir: saved model dir
    param_fpathin_index2vocab: path of the index-to-vocabulary json file
    param_fpathout_topic_matrix: output path for the topic matrix summary
    mtype: model name

    return:
    ----------
    None, the result is written to param_fpathout_topic_matrix
    '''
    # ----------load the index2vocab
    fpointerInIndex2Vocabulary = open(param_fpathin_index2vocab,
                                      'rt',
                                      encoding='utf8')
    dictIndex2Vocabulary = \
        json.load(fpointerInIndex2Vocabulary)
    fpointerInIndex2Vocabulary.close()

    # ----------load the trained model
    config = DefaultConfig()
    # config.set_attrs({'batch_size': len(list_pivot)})
    model_path = '%s/model' % model_dir

    model = topicalWordEmbedding.TopicalWordEmbedding(
        param_on_cuda=config.on_cuda,
        param_half_window_size=HALF_WINDOW_SIZE,
        param_vocabulary_size=VOCABULARY_SIZE,
        param_hidden_layer_size=HIDDEN_LAYER_SIZE,
        param_encoder_pi_size=DIM_ENCODER,
        param_topic_count=TOPIC_COUNT)
    print('Loading trained model')

    if config.on_cuda:
        model.load(model_path)
        model = model.cuda()
    else:
        model.load_cpu_from_gputrained(model_path)
        model = model.cpu()

    # ----------get and output the topic matrix
    fpointerOutTopicMatrix = open(param_fpathout_topic_matrix,
                                  'wt',
                                  encoding='utf8')

    var_topic_matrix = model.vae_decoder.MATRIX_decoder_beta
    arr_topic_matrix = var_topic_matrix.data.cpu().numpy()
    itemgetter_1 = operator.itemgetter(1)
    for topic_index in range(TOPIC_COUNT):
        list_voca = list(range(VOCABULARY_SIZE))
        list_topicvoca = arr_topic_matrix[topic_index, :].tolist()
        # concatenate
        list_voca_topicvoca = list(zip(list_voca, list_topicvoca))
        list_voca_topicvoca.sort(key=itemgetter_1, reverse=True)
        (list_voca, list_topicvoca) = zip(*list_voca_topicvoca)
        top_list_voca = list_voca[0:50]
        top_list_voca_mapped = [
            dictIndex2Vocabulary[str(i)] for i in top_list_voca
        ]
        # drop any token that contains a digit or one of these
        # punctuation characters
        unwanted_chars = '~!@#$%^&*()0123456789-+_=.,/?\\":\';|<>[]'
        top_list_voca_cleaned = [
            vocastr for vocastr in top_list_voca_mapped
            if not any(char in vocastr for char in unwanted_chars)
        ]
        top_list_voca_top10 = top_list_voca_cleaned[:10]
        fpointerOutTopicMatrix.write('topic %03d ' % topic_index +
                                     ' '.join(top_list_voca_top10) + '\n')

        list_voca = None
        list_topicvoca = None
        list_voca_topicvoca = None
    fpointerOutTopicMatrix.close()
    return None
def train(**kwargs):
    '''
    begin training the model
    *args: train(1, 2, 3, 4, 5) =>
    args[0] = 1, args[1] = 2, ...; args is essentially a tuple
    **kwargs: train(a=1, b=2, c=3, d=4) =>
    kwargs['a'] = 1, kwargs['b'] = 2, kwargs['c'] = 3,
    kwargs['d'] = 4; kwargs is essentially a dict
    a function taking a positional arg, *args and **kwargs must be
    written as def train(arg, *args, **kwargs)
    '''
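    # A minimal illustration of the distinction described above, using a
    # hypothetical helper that is not part of this module:
    #
    #     def demo(*args, **kwargs):
    #         return args, kwargs
    #
    #     demo(1, 2, c=3)   # -> ((1, 2), {'c': 3})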

    saveid = latest_save_num() + 1
    # the save path for this training run is SAVE_DIR/saveid
    save_path = '%s/%d' % (SAVE_DIR, saveid)
    print("logger save path: %s" % (save_path))
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    log_path_each_save = '%s/log.txt' % save_path
    model_path_each_save = '%s/model' % save_path
    logger = get_logger(log_path_each_save)

    config = DefaultConfig()

    # settings here, also about whether to run on cuda
    config.set_attrs(kwargs)
    # print(config.get_attrs())

    epochs = config.epochs
    batch_size = config.batch_size

    # determine whether to run on cuda
    if config.on_cuda:
        config.on_cuda = torch.cuda.is_available()
        if not config.on_cuda:
            logger.info('CUDA is unavailable although the config asks '
                        'for it; the model will run on the CPU instead')

    # 300 in our model
    # 1024 is the Elmo size; the concatenated hidden size is supposed to
    # match the Elmo size, but any size is OK, it depends on the setting
    # the attention size should be a smoothed representation of the
    # character embedding

    if config.model == 'TopicalWordEmbedding':
        model = topicalWordEmbedding.TopicalWordEmbedding(
            param_on_cuda=config.on_cuda,
            param_half_window_size=HALF_WINDOW_SIZE,
            param_vocabulary_size=VOCABULARY_SIZE,
            param_hidden_layer_size=HIDDEN_LAYER_SIZE,
            param_encoder_pi_size=DIM_ENCODER,
            param_topic_count=TOPIC_COUNT)

    if config.on_cuda:
        logger.info('Model run on GPU')
        model = model.cuda()
        logger.info('Model initialized on GPU')
    else:
        logger.info('Model run on CPU')
        model = model.cpu()
        logger.info('Model initialized on CPU')

    # print('logger-setted',file=sys.stderr)
    # output the string information to the log
    logger.info(model.modelname)
    # output the string information to the log
    logger.info(str(config.get_attrs()))

    # read in the trainset and the trial set
    # Train Set
    train_data_manager = DataManager(batch_size, TRAINING_INSTANCES)

    train_data_manager.load_dataframe_from_file(TRAIN_SET_PATH)

    # set the optimizer parameters,
    # such as learning rate and weight_decay,
    # using Adam, a method for stochastic optimization

    # load the learning rate in config, that is settings.py
    lr = config.learning_rate
    # params_iterator_requires_grad can only be iterated once
    params_iterator_requires_grad = filter(
        lambda trainingParams: trainingParams.requires_grad,
        model.parameters())
    # print(len(list(params_iterator_requires_grad)))

    # 25 parameters
    # weight decay, i.e. the L2 penalty (L2 regularization), is usually
    # added to the cost (loss) function, e.g. C = C_0 + penalty,
    # to avoid overfitting
    optimizer = torch.optim.Adam(
        params_iterator_requires_grad,
        lr=lr,
        weight_decay=config.weight_decay)
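    # A sketch of what weight_decay does here (not PyTorch's literal code):
    # at each step the gradient of every parameter p is augmented with
    # config.weight_decay * p, which is equivalent to adding an L2 penalty
    # of 0.5 * weight_decay * sum(p ** 2) to the loss.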

    # By default, the losses are averaged over observations
    # for each minibatch.
    # However, if the field size_average is set to False,
    # the losses are instead
    # summed for each minibatch

    # The CrossEntropyLoss;
    # the selector in my notebook = loss + selecting strategy
    # (often selecting the smallest loss)

    # criterion = torch.nn.CrossEntropyLoss(size_average=False)

    # once you have the loss function, you also have
    # to train the parameters in g(x),
    # which will be used for prediction

    # the loss is computed after the smoothing term,
    # i.e. the L2 penalty mentioned for torch.optim.Adam above
    loss_meter = meter.AverageValueMeter()
    # get the confusion matrix, which looks like the following:
    # confusion_matrix = meter.ConfusionMeter(
    #     CLASS_COUNT)
    '''                    class1 predicted class2 predicted class3 predicted
    class1 ground truth  [[4,               1,               1]
    class2 ground truth   [2,               3,               1]
    class3 ground truth   [1,               2,               9]]
    '''
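    # A minimal sketch of how the commented-out confusion matrix would be
    # used, assuming torchnet's meter API and hypothetical score/label
    # tensors:
    #
    #     confusion_matrix = meter.ConfusionMeter(CLASS_COUNT)
    #     confusion_matrix.add(scores.data, y.data)
    #     print(confusion_matrix.value())  # CLASS_COUNT x CLASS_COUNT counts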
    model.train()
    # pre_loss = 1e100
    # best_acc = 0
    smallest_loss = float('inf')

    for epoch in range(epochs):
        '''
        an epoch trains on all the data,
        i.e. all batches, exactly once
        '''

        loss_meter.reset()
        # confusion_matrix.reset()

        train_data_manager.reshuffle_dataframe()

        # n_batches() is ceiled, i.e. instances / batch_size rounded up
        n_batch = train_data_manager.n_batches()

        batch_index = 0
        for batch_index in range(0, n_batch - 1):
            # this operation is time consuming
            xn, wc = train_data_manager.next_batch()

            # long tensors seem to trigger a cuda error,
            # so the inputs are kept as float
            # a Variable has requires_grad = False by default
            # t = torch.Tensor(1)
            # t.to(torch.float32) <=> t.float()
            # t.to(torch.int64) <=> t.long()
            var_xn = Variable(torch.from_numpy(xn).float())
            # print( x.size() )
            var_wc = Variable(torch.from_numpy(wc).float(),
                              requires_grad=False)
            # y = y - 1
            # print(y.size())

            # #########################logger.info('Begin fetching a batch')
            loss = eval_batch(model, var_xn, var_wc, config.on_cuda)
            # #########################logger.info(
            #     'End fetching a batch, begin optimizer')
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # #########################logger.info('End optimizer')
            # loss.data is the tensor;
            # for a 0-dim tensor, .item() gets the Python number,
            # for a 1-dim tensor, .tolist() gets a Python list
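            # e.g. torch.tensor(3.14).item() returns the Python float 3.14,
            # while torch.tensor([1.0, 2.0]).tolist() returns [1.0, 2.0]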
            loss_meter.add(loss.data.item())

            # confusion_matrix.add(scores.data, y.data)
            # every 200 batches, log the running average loss
            if (batch_index + 1) % 200 == 0:
                # for 2 LongTensors,  17 / 18 = 0
                # accuracy = corrects.float() / config.batch_size
                # .value()[0] is the loss value
                logger.info('TRAIN\tepoch: %d/%d\tbatch: %d/%d\tloss: %f' % (
                    epoch, epochs, batch_index, n_batch,
                    loss_meter.value()[0]))
        # handle the tail batch without padding, because a padded tail
        # batch would duplicate context windows and make the loss nan
        if TRAINING_INSTANCES % batch_size == 0:
            train_data_manager.set_current_cursor_in_dataframe_zero()
        else:
            # train_data_manager.set_current_cursor_in_dataframe_zero()
            print('!!!!!!!!!!!Enter tail batch')
            # batch_index carries over from the loop above
            batch_index += 1
            (xn, wc) = train_data_manager.tail_batch_nobatchpadding()
            # long seems to trigger
            # t = torch.Tensor(1)
            # t.to(torch.float32) <=> t.float()
            # t.to(torch.int64) <=> t.long()
            var_xn = Variable(torch.from_numpy(xn).float())
            var_wc = Variable(torch.from_numpy(wc).float(),
                              requires_grad=False)
            # y = y - 1
            loss = eval_batch(model, var_xn, var_wc, config.on_cuda)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            loss_meter.add(loss.data.item())
            # confusion_matrix.add( scores.data , y.data )
            # every 200 batches, log the running average loss
            if (batch_index + 1) % 200 == 0:
                # for 2 LongTensors,  17 / 18 = 0
                # accuracy = corrects.float() / config.batch_size
                # print("accuracy = %f, corrects = %d"%(accuracy, corrects))
                # .value()[0] is the loss value
                logger.info('TRAIN\tepoch: %d/%d\tbatch: %d/%d\tloss: %f' % (
                    epoch, epochs, batch_index, n_batch,
                    loss_meter.value()[0]))
        # print('!!!!!!!!!!!Exit tail batch')
        # after an epoch the model should be evaluated
        # switch to evaluation mode
        model.eval()
        # if (batch_index + 1) % 25 == 0:
        # every 25 batches peek the accuracy and keep the best one
        # confusion_matrix_value=confusion_matrix.value()
        # acc = 0
        # for i in range(CLASS_COUNT):
        #     correct prediction count
        #     acc += confusion_matrix_value[i][i]
        # the accuracy, overall accuracy in an epoch
        # acc = acc / confusion_matrix_value.sum()

        # loss_meter.value() returns (mean, std),
        # so access the mean by [0];
        # the mean is the average loss over batches
        the_overall_averaged_loss_in_epoch = loss_meter.value()[0]
        logger.info('epoch: %d/%d\taverage_loss: %f' % (
            epoch, epochs, the_overall_averaged_loss_in_epoch))
        # switch back to training mode
        model.train()

        # if the loss decreased, then save the model and
        # decay the learning rate
        if loss_meter.value()[0] < smallest_loss:
            # save the model
            model.save(model_path_each_save)
            logger.info('model saved to %s' % model_path_each_save)

            # change the learning rate
            if epoch < 4:
                lr = lr * config.lr_decay
            elif epoch < 8:
                lr = lr * 0.97
            else:
                lr = lr * 0.99
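            # a worked illustration of the schedule, assuming a hypothetical
            # config.lr_decay of 0.95: whenever the epoch loss improves, the
            # learning rate is multiplied by 0.95 for epochs 0-3, by 0.97 for
            # epochs 4-7, and by 0.99 from epoch 8 on, so the decay becomes
            # gentler as training proceeds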
            logger.info('learning_rate changed to %f' % lr)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr

            smallest_loss = loss_meter.value()[0]
        else:
            print('the loss_meter = ', loss_meter.value()[0])