Esempio n. 1
0
def runTest(n_layers, pre_modelFile, hidden_size, reverse, modelFile,
            beam_size, input, corpus, diff_corpus):
    """Restore a trained encoder/decoder and evaluate it with word2vec vectors.

    Args:
        n_layers: Layer count handed to ``EncoderRNN`` and
            ``LuongAttnDecoderRNN``.
        pre_modelFile: Path to a binary word2vec file, read with
            ``KeyedVectors.load_word2vec_format``.
        hidden_size: Embedding dimension and hidden size of both networks.
        reverse: Forwarded to ``evaluateRandomly``.
        modelFile: Checkpoint path; its ``'en'`` and ``'de'`` entries are
            loaded as the encoder and decoder state dicts.
        beam_size: Forwarded to the evaluation helper.
        input: When truthy ``evaluateInput`` is called, otherwise
            ``evaluateRandomly`` on the pairs of ``diff_corpus``.
        corpus: Passed to ``loadPrepareData``; supplies the vocabulary.
        diff_corpus: Passed to ``loadPrepareData``; supplies the pairs used
            by ``evaluateRandomly``.
    """
    voc, pairs = loadPrepareData(corpus)
    diff_voc, diff_pairs = loadPrepareData(diff_corpus)

    # Both networks share one embedding module built with 300 entries.
    # (A variant that used a pretrained NGramLanguageModeler as the embedding
    # was disabled in this function and is not executed.)
    embedding = nn.Embedding(300, hidden_size)
    if USE_CUDA:
        embedding = embedding.cuda()

    encoder = EncoderRNN(300, hidden_size, embedding, n_layers)
    attn_model = 'dot'
    decoder = LuongAttnDecoderRNN(attn_model, embedding, hidden_size,
                                  voc.n_words, n_layers)

    # Without CUDA the stored tensors are remapped onto the CPU while loading.
    checkpoint = torch.load(modelFile,
                            map_location=None if USE_CUDA else 'cpu')
    encoder.load_state_dict(checkpoint['en'])
    decoder.load_state_dict(checkpoint['de'])

    # Inference mode: only affects layers such as dropout and batch norm.
    encoder.eval()
    decoder.eval()

    if USE_CUDA:
        encoder = encoder.cuda()
        decoder = decoder.cuda()

    print('Loading w2v_model ...')
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    w2v_model = gensim.models.KeyedVectors.load_word2vec_format(
        pre_modelFile, binary=True)
    print("Loading complete!")

    if not input:
        evaluateRandomly(encoder, decoder, voc, diff_pairs, reverse, beam_size,
                         w2v_model, 20)
    else:
        evaluateInput(encoder, decoder, voc, beam_size, w2v_model)
Esempio n. 2
0
def draw_2D_word_vector(modelFile, corpus, EMBEDDING_DIM, CONTEXT_SIZE, frequency_boundary, batch_size):
    """Collect the vectors of the frequent vocabulary words and hand them to ``tsne``.

    Args:
        modelFile: Checkpoint path; its ``'w2v'`` entry is loaded into an
            ``NGramLanguageModeler``. The part of the file name before the
            first ``'_'`` is passed to ``tsne`` as the iteration label.
        corpus: Passed to ``loadPrepareData`` and to ``tsne``.
        EMBEDDING_DIM: Embedding dimension of the model.
        CONTEXT_SIZE: Context size of the model.
        frequency_boundary: Words whose count is ``<=`` this value are left
            out of the plot.
        batch_size: Forwarded to ``tsne``.
    """
    checkpoint = torch.load(modelFile)
    voc, pairs = loadPrepareData(corpus)
    model = NGramLanguageModeler(voc.n_words, EMBEDDING_DIM, CONTEXT_SIZE)
    model.load_state_dict(checkpoint['w2v'])
    model.train(False)

    def vector_of(word):
        """Return the model's vector for ``word`` as a NumPy array."""
        return np.array(get_word_vector(model, word, voc, EMBEDDING_DIM).data)

    # The word at index 0 always seeds the plot, regardless of its frequency.
    # NOTE(review): the loop below also starts at index 0, so this word is
    # plotted twice when it is above the frequency boundary -- confirm whether
    # that duplicate is intended.
    first_word = voc.index2word[0]
    labels = [first_word]
    rows = [vector_of(first_word)]

    low_frequency_count = 0
    for i in range(voc.n_words):
        word = voc.index2word[i]
        if voc.word2count[word] <= frequency_boundary:
            low_frequency_count += 1
        else:
            labels.append(word)
            rows.append(vector_of(word))

    # Stack once at the end instead of re-copying the array for every word.
    vectors2D = np.array(rows)

    print("{} words out of {} words are in low frequency({} times).".format(
        low_frequency_count, voc.n_words, frequency_boundary))
    print("{} words left".format(voc.n_words - low_frequency_count))
    print("Shape of vectors2D: {}".format(vectors2D.shape))
    file_name = 'b{}vectors2D.png'.format(frequency_boundary)
    iteration = os.path.split(modelFile)[-1].split('_')[0]
    tsne(corpus, voc.n_words, vectors2D, labels, file_name, iteration, batch_size, EMBEDDING_DIM)
Esempio n. 3
0
def prep_net():
    """Build the chatbot encoder/decoder from a fixed checkpoint for inference.

    The checkpoint and corpus paths are hard-coded. Gradient tracking is
    switched off globally via ``torch.set_grad_enabled(False)``.

    Returns:
        A tuple ``(beam_size, encoder, decoder, voc)`` where ``beam_size`` is
        1 and both networks are in evaluation mode on ``device``.
    """
    model_path = './save/model/movie_subtitles/1-1_512/50000_backup_bidir_model.tar'
    corpus_path = './corpus/movie_subtitles.txt'
    beam_size = 1

    # Network dimensions are recovered from the checkpoint's file name.
    n_layers, hidden_size, _reverse = parseFilename(model_path, True)

    torch.set_grad_enabled(False)

    voc, _pairs = loadPrepareData(corpus_path)
    shared_embedding = nn.Embedding(voc.n_words, hidden_size)
    encoder = EncoderRNN(voc.n_words, hidden_size, shared_embedding, n_layers)
    decoder = LuongAttnDecoderRNN('dot', shared_embedding, hidden_size,
                                  voc.n_words, n_layers)

    state = torch.load(model_path, map_location='cpu')
    encoder.load_state_dict(state['en'])
    decoder.load_state_dict(state['de'])

    # Evaluation mode only changes layers such as dropout and batch norm.
    encoder.eval()
    decoder.eval()

    return beam_size, encoder.to(device), decoder.to(device), voc
Esempio n. 4
0
def runTest(n_layers, hidden_size, reverse, modelFile, beam_size, input,
            corpus):
    """Rebuild the encoder/decoder from a checkpoint and run an evaluation.

    Args:
        n_layers: Layer count handed to ``EncoderRNN`` and
            ``LuongAttnDecoderRNN``.
        hidden_size: Embedding dimension and hidden size of both networks.
        reverse: Forwarded to ``evaluateRandomly``.
        modelFile: Checkpoint path; its ``'en'`` and ``'de'`` entries are
            loaded as the encoder and decoder state dicts.
        beam_size: Forwarded to the evaluation helper.
        input: When truthy ``evaluateInput`` is called, otherwise
            ``evaluateRandomly``.
        corpus: Passed to ``loadPrepareData``.
    """
    voc, pairs = loadPrepareData(corpus)
    embedding = nn.Embedding(voc.n_words, hidden_size)
    encoder = EncoderRNN(voc.n_words, hidden_size, embedding, n_layers)
    attn_model = 'dot'
    decoder = LuongAttnDecoderRNN(attn_model, embedding, hidden_size,
                                  voc.n_words, n_layers)

    # Remap stored tensors onto the CPU when CUDA is not in use; otherwise a
    # checkpoint saved from a GPU cannot be deserialized on a CPU-only host.
    checkpoint = torch.load(modelFile,
                            map_location=None if USE_CUDA else 'cpu')
    encoder.load_state_dict(checkpoint['en'])
    decoder.load_state_dict(checkpoint['de'])

    # train mode set to false, effect only on dropout, batchNorm
    encoder.train(False)
    decoder.train(False)

    if USE_CUDA:
        encoder = encoder.cuda()
        decoder = decoder.cuda()

    if input:
        evaluateInput(encoder, decoder, voc, beam_size)
    else:
        evaluateRandomly(encoder, decoder, voc, pairs, reverse, beam_size, 20)
Esempio n. 5
0
def runTest(n_layers, hidden_size, reverse, modelFile, beam_size, inp, corpus):
    """Rebuild the encoder/decoder from a checkpoint and run an evaluation.

    Gradient tracking is switched off globally before anything is built.

    Args:
        n_layers: Layer count handed to ``EncoderRNN`` and
            ``LuongAttnDecoderRNN``.
        hidden_size: Embedding dimension and hidden size of both networks.
        reverse: Forwarded to ``evaluateRandomly``.
        modelFile: Checkpoint path; its ``'en'`` and ``'de'`` entries are
            loaded as the encoder and decoder state dicts.
        beam_size: Forwarded to the evaluation helper.
        inp: When truthy ``evaluateInput`` is called, otherwise
            ``evaluateRandomly``.
        corpus: Passed to ``loadPrepareData``.
    """
    # TODO: beam_size controls how many answers are produced per input. When
    # beam_size is not 1, is the EOS symbol emitted as well??
    torch.set_grad_enabled(False)
    voc, pairs = loadPrepareData(corpus)
    # nn.Embedding is trainable and maps (embeds) one space into another:
    # nn.Embedding(voc.n_words, hidden_size) takes voc.n_words input elements
    # (words) and embeds them into a hidden_size-dimensional space, i.e. each
    # word goes from a one-hot encoding to a hidden_size-dimensional vector.
    embedding = nn.Embedding(voc.n_words, hidden_size)
    encoder = EncoderRNN(voc.n_words, hidden_size, embedding, n_layers)
    attn_model = 'dot'
    decoder = LuongAttnDecoderRNN(attn_model, embedding, hidden_size,
                                  voc.n_words, n_layers)

    # Map stored tensors straight onto the target device so a checkpoint
    # saved from a GPU can still be loaded when ``device`` is the CPU.
    checkpoint = torch.load(modelFile, map_location=device)
    encoder.load_state_dict(checkpoint['en'])
    decoder.load_state_dict(checkpoint['de'])

    # train mode set to false, effect only on dropout, batchNorm
    encoder.train(False)
    decoder.train(False)

    encoder = encoder.to(device)
    decoder = decoder.to(device)

    if inp:
        evaluateInput(encoder, decoder, voc, beam_size)
    else:
        evaluateRandomly(encoder, decoder, voc, pairs, reverse, beam_size, 20)
Esempio n. 6
0
def runTest(n_layers, hidden_size, batch_size, reverse, modelFile, beam_size,
            inp, corpus):
    """Rebuild the encoder/decoder from a checkpoint and run an evaluation.

    Gradient tracking is switched off globally before anything is built.

    Args:
        n_layers: Layer count handed to ``EncoderRNN`` and
            ``LuongAttnDecoderRNN``.
        hidden_size: Passed to both network constructors.
        batch_size: Passed to both network constructors.
        reverse: Forwarded to ``evaluateRandomly``.
        modelFile: Checkpoint path; its ``'en'`` and ``'de'`` entries are
            loaded as the encoder and decoder state dicts.
        beam_size: Forwarded to the evaluation helper.
        inp: When truthy ``evaluateInput`` is called, otherwise
            ``evaluateRandomly``.
        corpus: Passed to ``loadPrepareData``.
    """
    torch.set_grad_enabled(False)

    voc, pairs = loadPrepareData(corpus)

    encoder = EncoderRNN(hidden_size, batch_size, n_layers)
    attn_model = 'dot'
    decoder = LuongAttnDecoderRNN(attn_model, hidden_size, batch_size,
                                  voc.loc_count, n_layers)

    # Map stored tensors straight onto the target device so a checkpoint
    # saved from a GPU can still be loaded when ``device`` is the CPU.
    checkpoint = torch.load(modelFile, map_location=device)
    encoder.load_state_dict(checkpoint['en'])
    decoder.load_state_dict(checkpoint['de'])

    # train mode set to false, effect only on dropout, batchNorm
    encoder.train(False)
    decoder.train(False)

    encoder = encoder.to(device)
    decoder = decoder.to(device)

    if inp:
        evaluateInput(encoder, decoder, voc, beam_size)
    else:
        evaluateRandomly(encoder, decoder, voc, pairs, reverse, beam_size, 20)
Esempio n. 7
0
def runTest(n_layers, pre_modelFile, hidden_size, reverse, modelFile, beam_size, input, corpus, diff_corpus):
    """Evaluate an encoder/decoder whose embedding is a pretrained n-gram model.

    Args:
        n_layers: Layer count handed to ``EncoderRNN`` and
            ``LuongAttnDecoderRNN``.
        pre_modelFile: Checkpoint path whose ``'w2v'`` entry is loaded into an
            ``NGramLanguageModeler`` that serves as the embedding.
        hidden_size: Hidden size of both networks; must equal 300, otherwise
            the process exits through ``sys.exit``.
        reverse: Forwarded to ``evaluateRandomly``.
        modelFile: Checkpoint path; its ``'en'`` and ``'de'`` entries are
            loaded as the encoder and decoder state dicts.
        beam_size: Forwarded to the evaluation helper.
        input: When truthy ``evaluateInput`` is called, otherwise
            ``evaluateRandomly`` on the pairs of ``diff_corpus``.
        corpus: Passed to ``loadPrepareData``; supplies the vocabulary.
        diff_corpus: Passed to ``loadPrepareData``; supplies the pairs used
            by ``evaluateRandomly``.
    """
    voc, pairs = loadPrepareData(corpus)
    diff_voc, diff_pairs = loadPrepareData(diff_corpus)

    EMBEDDING_DIM = 300  # Should be the same as hidden_size!
    if EMBEDDING_DIM != hidden_size:
        sys.exit("EMBEDDING_DIM do not equal to hidden_size. Please correct it.")
    CONTEXT_SIZE = 2

    # Both checkpoints are remapped onto the CPU when CUDA is not in use, so
    # files saved from a GPU can be deserialized on a CPU-only host.
    map_location = None if USE_CUDA else 'cpu'

    pre_checkpoint = torch.load(pre_modelFile, map_location=map_location)
    pretrained_model = NGramLanguageModeler(voc.n_words, EMBEDDING_DIM, CONTEXT_SIZE)
    pretrained_model.load_state_dict(pre_checkpoint['w2v'])
    pretrained_model.train(False)
    # The pretrained model itself is used as the embedding module.
    embedding = pretrained_model
    if USE_CUDA:
        embedding = embedding.cuda()

    encoder = EncoderRNN(voc.n_words, hidden_size, embedding, n_layers)
    attn_model = 'dot'
    decoder = LuongAttnDecoderRNN(attn_model, embedding, hidden_size, voc.n_words, n_layers)

    checkpoint = torch.load(modelFile, map_location=map_location)
    encoder.load_state_dict(checkpoint['en'])
    decoder.load_state_dict(checkpoint['de'])

    # train mode set to false, effect only on dropout, batchNorm
    encoder.train(False)
    decoder.train(False)

    if USE_CUDA:
        encoder = encoder.cuda()
        decoder = decoder.cuda()

    if input:
        evaluateInput(encoder, decoder, voc, beam_size)
    else:
        evaluateRandomly(encoder, decoder, voc, diff_pairs, reverse, beam_size, 20)
Esempio n. 8
0
def load(filename, reverse=False, n_iteration=1, batch_size=64):
    """Generate random training batches for a corpus and print them.

    Args:
        filename: Corpus path passed to ``loadPrepareData``.
        reverse: Forwarded to ``batch2TrainData``.
        n_iteration: Number of batches to generate.
        batch_size: Number of randomly chosen pairs per batch.

    Returns:
        None. The batch count and the batches themselves are printed.
    """
    voc, pairs = loadPrepareData(filename)

    # File name up to its first dot; not used further in this function.
    corpus_name = os.path.basename(filename).split('.')[0]

    print('Training pairs not found, generating ...')
    training_batches = []
    for _ in range(n_iteration):
        # Pairs are sampled with replacement for every batch.
        sampled_pairs = []
        for _ in range(batch_size):
            sampled_pairs.append(random.choice(pairs))
        training_batches.append(batch2TrainData(voc, sampled_pairs, reverse))

    print(len(training_batches))
    print(training_batches)
Esempio n. 9
0
def test_word_vector(modelFile, corpus, EMBEDDING_DIM, CONTEXT_SIZE):
    """Interactively look up vocabulary words by index and print their counts.

    Reads indices from standard input until ``'q'`` is entered. Input that is
    not an integer, or an index/word that raises ``KeyError``, produces a
    message instead of ending the loop.

    Args:
        modelFile: Checkpoint path; its ``'w2v'`` entry is loaded into an
            ``NGramLanguageModeler``.
        corpus: Passed to ``loadPrepareData``.
        EMBEDDING_DIM: Embedding dimension of the model.
        CONTEXT_SIZE: Context size of the model.
    """
    checkpoint = torch.load(modelFile)
    voc, pairs = loadPrepareData(corpus)
    model = NGramLanguageModeler(voc.n_words, EMBEDDING_DIM, CONTEXT_SIZE)
    model.load_state_dict(checkpoint['w2v'])
    model.train(False)
    while True:
        test_word = input('>')
        if test_word == 'q':
            break
        try:
            # Parse and resolve the index once instead of on every use.
            index = int(test_word)
            word = voc.index2word[index]
            # The returned vector is not displayed; the call is kept so that
            # any KeyError it raises is still reported below.
            get_word_vector(model, word, voc, EMBEDDING_DIM)
            print("Word frequency of '{}': {}".format(word, voc.word2count[word]))
        except KeyError:
            print("This index is vacant.")
        except ValueError:
            print("Please input an index.")
Esempio n. 10
0
def draw_manually(modelFile, corpus, EMBEDDING_DIM, CONTEXT_SIZE, frequency_boundary, batch_size):
    """Plot the vectors of words typed by the user via ``tsne``.

    Args:
        modelFile: Checkpoint path; its ``'w2v'`` entry is loaded into an
            ``NGramLanguageModeler``. The part of the file name before the
            first ``'_'`` is passed to ``tsne`` as the iteration label.
        corpus: Passed to ``loadPrepareData`` and to ``tsne``.
        EMBEDDING_DIM: Embedding dimension of the model.
        CONTEXT_SIZE: Context size of the model.
        frequency_boundary: Unused here; kept for a signature parallel to
            ``draw_2D_word_vector``.
        batch_size: Forwarded to ``tsne``.
    """
    checkpoint = torch.load(modelFile)
    voc, pairs = loadPrepareData(corpus)
    model = NGramLanguageModeler(voc.n_words, EMBEDDING_DIM, CONTEXT_SIZE)
    model.load_state_dict(checkpoint['w2v'])
    model.train(False)

    words = input("Input space-separated words: ").split()
    if not words:
        # Nothing was typed: there is no first/last word to name the file by.
        print("No words given; nothing to draw.")
        return

    labels = list(words)
    # Build all rows first and stack once, rather than growing the array
    # with a fresh copy per word.
    vectors2D = np.array([
        np.array(get_word_vector(model, w, voc, EMBEDDING_DIM).data)
        for w in words
    ])
    print("Shape of vectors2D: {}".format(vectors2D.shape))
    file_name = 'manually_{}2{}.png'.format(words[0], words[-1])
    iteration = os.path.split(modelFile)[-1].split('_')[0]
    tsne(corpus, len(words), vectors2D, labels, file_name, iteration, batch_size, EMBEDDING_DIM)
Esempio n. 11
0
def test_vector_relation(modelFile, corpus, EMBEDDING_DIM, CONTEXT_SIZE):
    """Print the vocabulary words whose vectors are closest to a typed word.

    One word is read from standard input. Every vocabulary vector is compared
    with it by mean squared difference, and the (up to) four closest words
    are printed in ascending order of distance. Ties go to the lower index.

    Args:
        modelFile: Checkpoint path; its ``'w2v'`` entry is loaded into an
            ``NGramLanguageModeler``.
        corpus: Passed to ``loadPrepareData``.
        EMBEDDING_DIM: Embedding dimension of the model.
        CONTEXT_SIZE: Context size of the model.
    """
    import heapq

    checkpoint = torch.load(modelFile)
    voc, pairs = loadPrepareData(corpus)
    model = NGramLanguageModeler(voc.n_words, EMBEDDING_DIM, CONTEXT_SIZE)
    model.load_state_dict(checkpoint['w2v'])
    model.train(False)

    query_word = input('>')
    # The target is the query word's own vector. (An analogy-style target,
    # good - (heaven - hell), was an earlier experiment and is not computed.)
    query_vector = np.array(
        get_word_vector(model, query_word, voc, EMBEDDING_DIM).data)

    def scored_indices():
        """Yield ``(distance, index)`` for every vocabulary entry."""
        for i in tqdm(range(0, voc.n_words)):
            i_vector = np.array(
                get_word_vector(model, voc.index2word[i], voc, EMBEDDING_DIM).data)
            yield ((i_vector - query_vector) ** 2).mean(axis=None), i

    # Tuples sort by distance first and index second, so equal distances keep
    # the lower index ahead. Fewer than four entries simply yields fewer hits.
    nearest = heapq.nsmallest(4, scored_indices())
    ranking = [voc.index2word[i] for _, i in nearest] + ["other_words"]
    print("Most likely words of {}: {}".format(query_word, " > ".join(ranking)))
Esempio n. 12
0
def trainWord2vec(corpus, iteration, hidden_size, frequency_boundary):
    """Train a gensim ``Word2Vec`` model on a corpus and save it to disk.

    Only the first element of each pair is used as a training sentence. The
    model is written to
    ``<save_dir>/model/<corpus name>/gensim/hi<hidden_size>fb<frequency_boundary>/mymodel<iteration>``.

    Args:
        corpus: Corpus path passed to ``loadPrepareData``; its file name up to
            the first dot names the output folder.
        iteration: Passed as ``iter`` to ``Word2Vec``.
        hidden_size: Passed as ``size`` to ``Word2Vec``.
        frequency_boundary: Passed as ``min_count`` to ``Word2Vec``.
    """
    voc, pairs = loadPrepareData(corpus)
    corpus_name = os.path.basename(corpus).split('.')[0]

    # Tokenize on single spaces, one token list per pair.
    sentences = [pair[0].split(' ') for pair in pairs]
    print("Sentences ready, start training...")

    w2v = Word2Vec(iter=iteration,
                   size=hidden_size,
                   window=10,
                   min_count=frequency_boundary,
                   workers=4)
    w2v.build_vocab(sentences)
    w2v.train(sentences,
              total_examples=w2v.corpus_count,
              epochs=w2v.iter)

    run_folder = 'hi{}fb{}'.format(hidden_size, frequency_boundary)
    out_dir = os.path.join(save_dir, 'model', corpus_name, 'gensim',
                           run_folder)
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    w2v.save(os.path.join(out_dir, 'mymodel{}'.format(iteration)))
def runTest(n_layers, hidden_size, reverse, modelFile, attn_model, beam_size,
            k, p, v, inp, corpus):
    """Rebuild the networks from a checkpoint and run an evaluation.

    Gradient tracking is switched off globally before anything is built.

    Args:
        n_layers: Layer count handed to the encoder and decoder.
        hidden_size: Embedding dimension and encoder hidden size; the
            variational decoder uses twice this value.
        reverse: Forwarded to ``evaluateScore``.
        modelFile: Checkpoint path; ``'en'`` and ``'de'`` are always loaded,
            ``'hv'`` only when ``v`` is truthy.
        attn_model: Attention type handed to the decoder.
        beam_size: Forwarded to the evaluation helper.
        k: Forwarded to ``evaluateInput``.
        p: Forwarded to ``evaluateInput``.
        v: When truthy, build ``DecoderRNN`` plus a ``LatentVariation``
            module; otherwise build ``LuongAttnDecoderRNN`` and pass ``None``
            as the latent module.
        inp: When truthy ``evaluateInput`` is called, otherwise
            ``evaluateScore``.
        corpus: Passed to ``loadPrepareData``.
    """
    torch.set_grad_enabled(False)

    voc, pairs = loadPrepareData(corpus)
    embedding = nn.Embedding(voc.num_words, hidden_size)

    # Load first: the variational branch below reads ``checkpoint['hv']``.
    # Tensors are mapped straight onto ``device`` so a GPU-saved checkpoint
    # can still be loaded when ``device`` is the CPU.
    checkpoint = torch.load(modelFile, map_location=device)

    if v:
        embedding_decoder = nn.Embedding(voc.num_words, hidden_size * 2)
        encoder = EncoderRNN(hidden_size, embedding, n_layers)
        decoder = DecoderRNN(attn_model, embedding_decoder, hidden_size * 2,
                             voc.num_words, n_layers)
        hidvar = LatentVariation(hidden_size * 2, hidden_size)

        hidvar.load_state_dict(checkpoint['hv'])
        # NOTE(review): hidvar is not switched to evaluation mode the way the
        # encoder and decoder are below -- confirm whether that is intended.
        hidvar = hidvar.to(device)
    else:
        # No latent module in this configuration.
        # NOTE(review): assumes the evaluation helpers accept None here --
        # verify against their definitions.
        hidvar = None
        encoder = EncoderRNN(voc.num_words, hidden_size, embedding, n_layers)
        decoder = LuongAttnDecoderRNN(attn_model, embedding, hidden_size,
                                      voc.num_words, n_layers)

    encoder.load_state_dict(checkpoint['en'])
    decoder.load_state_dict(checkpoint['de'])

    # train mode set to false, effect only on dropout, batchNorm
    encoder.train(False)
    decoder.train(False)

    encoder = encoder.to(device)
    decoder = decoder.to(device)

    if inp:
        evaluateInput(encoder, decoder, hidvar, voc, beam_size, k, p, hidvar)
    else:
        evaluateScore(encoder, decoder, hidvar, voc, pairs, reverse, beam_size,
                      hidvar)
Esempio n. 14
0
def runTest(corpus, rnn_layers, hidden_size, embed_size, node_size, capsule_size, gcn_layers, gcn_filters, capsule_num,
            saved_aspect_model, saved_review_model, beam_size, max_length, min_length, save_dir):
    """Restore the aspect and review models and evaluate them on the test pairs.

    Args:
        corpus: Passed to ``loadPrepareData`` together with ``save_dir``.
        rnn_layers, hidden_size, embed_size, node_size: Passed to both model
            constructors.
        capsule_size, gcn_layers, gcn_filters, capsule_num: Passed to the
            ``AspectModel`` constructor only.
        saved_aspect_model: Checkpoint path whose ``'aspect_model'`` entry is
            loaded into the aspect model.
        saved_review_model: Checkpoint path whose ``'review_model'`` entry is
            loaded into the review model.
        beam_size, max_length, min_length: Forwarded to ``evaluateRandomly``.
        save_dir: Passed to ``loadPrepareData`` and ``evaluateRandomly``.
    """
    vocabs, train_pairs, valid_pairs, test_pairs = loadPrepareData(corpus, save_dir)

    print('Building aspect model ...')
    aspect_model = AspectModel(vocabs, embed_size, node_size, hidden_size, capsule_size,
                               gcn_layers, gcn_filters, rnn_layers, capsule_num).to(device)

    print('Building review model ...')
    review_model = ReviewModel(vocabs, embed_size, node_size, hidden_size, rnn_layers).to(device)

    # Map stored tensors straight onto ``device`` so checkpoints saved from a
    # GPU can still be loaded when ``device`` is the CPU.
    checkpoint = torch.load(saved_aspect_model, map_location=device)
    aspect_model.load_state_dict(checkpoint['aspect_model'])

    checkpoint = torch.load(saved_review_model, map_location=device)
    review_model.load_state_dict(checkpoint['review_model'])

    # train mode set to false, effect only on dropout, batchNorm
    aspect_model.train(False)
    review_model.train(False)

    # Every test pair is evaluated: the count passed is len(test_pairs).
    evaluateRandomly(aspect_model, review_model, vocabs, test_pairs, len(test_pairs), beam_size,
                     max_length, min_length, save_dir)
Esempio n. 15
0
def predict_word(modelFile, corpus, EMBEDDING_DIM, CONTEXT_SIZE):
    """Interactively predict the word that follows two typed words.

    Reads lines from standard input until the first token is ``'q'``. Each
    line must hold exactly two words; anything else, including an empty
    line, prints a reminder and prompts again.

    Args:
        modelFile: Checkpoint path; its ``'w2v'`` entry is loaded into an
            ``NGramLanguageModeler``.
        corpus: Passed to ``loadPrepareData``.
        EMBEDDING_DIM: Embedding dimension of the model.
        CONTEXT_SIZE: Context size of the model.
    """
    checkpoint = torch.load(modelFile)
    voc, pairs = loadPrepareData(corpus)
    model = NGramLanguageModeler(voc.n_words, EMBEDDING_DIM, CONTEXT_SIZE)
    model.load_state_dict(checkpoint['w2v'])
    model.train(False)
    print("Please input 2 space-separated words(input 'q' to exit)")
    while True:
        test_word = input('>').split()
        # An empty line has no first token to compare against 'q'.
        if test_word and test_word[0] == 'q':
            break
        if len(test_word) != 2:
            print("You should input 2 words!")
            continue
        try:
            test_word_idxs = [voc.word2index[w] for w in test_word]
            test_word_var = Variable(torch.LongTensor(test_word_idxs))
            log_probs, embeds = model(test_word_var)
            _, i_predicted_word = torch.max(log_probs, 1)
            # Convert to a plain int: a tensor element is not a usable key
            # for the index2word lookup and would be misreported below as an
            # unseen word.
            predicted_index = int(i_predicted_word.data[0])
            print("The next word of '{} {}' is '{}'".format(
                test_word[0], test_word[1], voc.index2word[predicted_index]))
        except KeyError:
            print("Incorrect spelling or unseen word.")
Esempio n. 16
0
def trainIters(n_iteration,
               learning_rate,
               batch_size,
               n_layers,
               hidden_size,
               attn_model='dot',
               decoder_learning_ratio=5.0):
    """Set up data, networks and optimizers, then walk the training batches.

    Args:
        n_iteration: Number of training batches to build and iterate over.
        learning_rate: Adam learning rate of the encoder.
        batch_size: Number of randomly chosen pairs in the sampled batch.
        n_layers: Layer count handed to both networks.
        hidden_size: Hidden size handed to both networks.
        attn_model: Overwritten with ``'dot'`` before the decoder is built,
            so the passed value has no effect.
        decoder_learning_ratio: Multiplier applied to ``learning_rate`` for
            the decoder's optimizer.
    """
    voc, pairs = loadPrepareData()

    # NOTE(review): the pairs are sampled once and the same selection is
    # converted for every iteration -- confirm that identical batches are
    # intended.
    sampled_pairs = []
    for _ in range(batch_size):
        sampled_pairs.append(random.choice(pairs))
    training_batches = []
    for _ in range(n_iteration):
        training_batches.append(batch2TrainData(voc, sampled_pairs))

    # Networks.
    checkpoint = None
    print('Building encoder and decoder ...')
    encoder = EncoderRNN(voc, hidden_size, n_layers)
    attn_model = 'dot'
    decoder = LuongAttnDecoderRNN(voc, attn_model, hidden_size, n_layers)

    # Optimizers: the decoder's rate is scaled by decoder_learning_ratio.
    print('Building optimizers ...')
    decoder_lr = learning_rate * decoder_learning_ratio
    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=decoder_lr)

    # Bookkeeping.
    print('Initializing ...')
    start_iteration = 1
    perplexity = []
    print_loss = 0

    # Iterations are numbered from start_iteration; batch i-1 belongs to
    # iteration i.
    for iteration, training_batch in enumerate(tqdm(training_batches),
                                               start=start_iteration):
        (input_variable, lengths, target_variable, mask,
         max_target_len) = training_batch
Esempio n. 17
0
def runTest(n_layers, hidden_size, reverse, modelFile, beam_size, inp, corpus):
    """Rebuild the encoder/decoder from a checkpoint and run an evaluation.

    Args:
        n_layers: Layer count handed to ``EncoderRNN`` and
            ``LuongAttnDecoderRNN``.
        hidden_size: Embedding dimension and hidden size of both networks.
        reverse: Forwarded to ``evaluateRandomly``.
        modelFile: Checkpoint path; its ``'en'`` and ``'de'`` entries are
            loaded as the encoder and decoder state dicts.
        beam_size: Forwarded to the evaluation helper.
        inp: When truthy ``evaluateInput`` is called, otherwise
            ``evaluateRandomly``.
        corpus: Passed to ``loadPrepareData``.
    """
    voc, pairs = loadPrepareData(corpus)
    embedding = nn.Embedding(voc.n_words, hidden_size)
    encoder = EncoderRNN(voc.n_words, hidden_size, embedding, n_layers)
    attn_model = 'dot'
    decoder = LuongAttnDecoderRNN(attn_model, embedding, hidden_size, voc.n_words, n_layers)

    # Remap stored tensors onto the CPU when CUDA is not in use; otherwise a
    # checkpoint saved from a GPU cannot be deserialized on a CPU-only host.
    checkpoint = torch.load(modelFile,
                            map_location=None if USE_CUDA else 'cpu')
    encoder.load_state_dict(checkpoint['en'])
    decoder.load_state_dict(checkpoint['de'])

    # train mode set to false, effect only on dropout, batchNorm
    encoder.train(False)
    decoder.train(False)

    if USE_CUDA:
        encoder = encoder.cuda()
        decoder = decoder.cuda()

    if inp:
        evaluateInput(encoder, decoder, voc, beam_size)
    else:
        evaluateRandomly(encoder, decoder, voc, pairs, reverse, beam_size, 20)
Esempio n. 18
0
def trainIters(args,
               corpus,
               reverse,
               n_epoch,
               learning_rate,
               batch_size,
               n_layers,
               hidden_size,
               print_every,
               loadFilename=None,
               attn_model='dot',
               decoder_learning_ratio=1.0):
    """Train two encoders and one decoder with validation-based early stopping.

    Each epoch runs every training batch through ``train``, then averages
    ``evaluate`` over the validation batches. When the validation loss is the
    best so far a checkpoint is written and the test batches are evaluated.
    Training stops at the first epoch whose validation loss is worse than the
    best one seen.

    Args:
        args: Options object; ``corpus``, ``encoder_dropout`` and
            ``decoder_dropout`` are read from it, and it is passed to
            ``loadPrepareData``.
        corpus: Only printed.
        reverse: Printed and used in the checkpoint file name.
        n_epoch: Exclusive upper bound of the epoch counter.
        learning_rate: Adam learning rate of both encoders.
        batch_size: Batch size for ``batchify``, ``train`` and ``evaluate``.
        n_layers: Layer count handed to all three networks.
        hidden_size: Embedding dimension and hidden size of the networks.
        print_every: Number of batches between progress lines.
        loadFilename: Optional checkpoint to resume from (networks,
            optimizers, epoch counter and loss history).
        attn_model: Overwritten with ``'dot'`` before the decoder is built,
            so the passed value has no effect.
        decoder_learning_ratio: Multiplier applied to ``learning_rate`` for
            the decoder's optimizer.
    """
    print(args)

    currentDT = datetime.datetime.now()
    directory = os.path.join(
        save_dir, args.corpus, 'model',
        '{}_{}_{}'.format(n_layers, hidden_size,
                          currentDT.strftime('%Y-%m-%d-%H:%M:%S')))
    print(directory)

    print(
        "corpus: {}, reverse={}, n_epoch={}, learning_rate={}, batch_size={}, n_layers={}, hidden_size={}, decoder_learning_ratio={}"
        .format(corpus, reverse, n_epoch, learning_rate, batch_size, n_layers,
                hidden_size, decoder_learning_ratio))

    data, length = loadPrepareData(args)
    print('load data...')

    print(len(data.train))
    print(len(data.dev))
    print(len(data.test))

    user_length, item_length = length  #, user_length2, item_length2 = length
    train_batches = batchify(data.train,
                             data.user_text,
                             user_length,
                             data.item_text,
                             item_length,
                             batch_size,
                             train_mask_idx=data.train_mask_idx,
                             shuffle=True)
    val_batches = batchify(data.dev, data.user_text, user_length,
                           data.item_text, item_length, batch_size)
    test_batches = batchify(data.test, data.user_text, user_length,
                            data.item_text, item_length, batch_size)

    # model
    checkpoint = None
    print('Building encoder and decoder ...')
    # One embedding module is shared by both encoders and the decoder.
    embedding = nn.Embedding(data.voc.n_words, hidden_size)
    encoderU = EncoderRNNlinear(data.voc.n_words, hidden_size, embedding,
                                data.dmax, n_layers, args.encoder_dropout)
    encoderB = EncoderRNNlinear(data.voc.n_words, hidden_size, embedding,
                                data.dmax, n_layers, args.encoder_dropout)

    attn_model = 'dot'
    decoder = LuongAttnDecoderRNN(attn_model, embedding, hidden_size,
                                  data.voc.n_words, n_layers,
                                  args.decoder_dropout)
    if loadFilename:
        checkpoint = torch.load(loadFilename)
        encoderU.load_state_dict(checkpoint['enU'])
        encoderB.load_state_dict(checkpoint['enB'])
        decoder.load_state_dict(checkpoint['de'])
    # use cuda
    if USE_CUDA:
        encoderU = encoderU.cuda()
        encoderB = encoderB.cuda()
        decoder = decoder.cuda()

    # optimizer
    print('Building optimizers ...')
    encoderU_optimizer = optim.Adam(encoderU.parameters(), lr=learning_rate)
    encoderB_optimizer = optim.Adam(encoderB.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(),
                                   lr=learning_rate * decoder_learning_ratio)
    if loadFilename:
        encoderU_optimizer.load_state_dict(checkpoint['enU_opt'])
        encoderB_optimizer.load_state_dict(checkpoint['enB_opt'])
        decoder_optimizer.load_state_dict(checkpoint['de_opt'])

    # initialize
    print('Initializing ...')
    start_epoch = 0
    perplexity = []
    best_val_loss = None
    print_loss = 0
    if loadFilename:
        start_epoch = checkpoint['epoch'] + 1
        perplexity = checkpoint['plt']

    for epoch in range(start_epoch, n_epoch):
        epoch_start_time = time.time()
        # train epoch
        encoderU.train()
        encoderB.train()
        decoder.train()

        print_loss = 0
        start_time = time.time()
        for batch, training_batch in enumerate(train_batches):
            input_variable, lengths, target_variable, mask, max_target_len = training_batch
            user_input_variable, business_input_variable = input_variable
            user_lengths, business_lengths = lengths
            # Parenthesized so the modulo applies to the sum: this fires on
            # every 1000th batch rather than only on batch 0.
            if (batch + 5) % 1000 == 5:
                print("user_lengths: ", user_lengths)

            loss = train(user_input_variable, business_input_variable,
                         user_lengths, business_lengths, target_variable, mask,
                         max_target_len, encoderU, encoderB, decoder,
                         embedding, encoderU_optimizer, encoderB_optimizer,
                         decoder_optimizer, batch_size)
            print_loss += loss
            perplexity.append(loss)
            #print("batch {} loss={}".format(batch, loss))
            if batch % print_every == 0 and batch > 0:
                cur_loss = print_loss / print_every
                elapsed = time.time() - start_time

                print(
                    '| epoch {:3d} | {:5d}/{:5d} batches | lr {:05.5f} | ms/batch {:5.2f} | '
                    'loss {:5.2f} | ppl {:8.2f}'.format(
                        epoch, batch, len(train_batches), learning_rate,
                        elapsed * 1000 / print_every, cur_loss,
                        math.exp(cur_loss)))

                print_loss = 0
                start_time = time.time()

        # evaluate
        val_loss = 0
        for val_batch in val_batches:
            input_variable, lengths, target_variable, mask, max_target_len = val_batch
            user_input_variable, business_input_variable = input_variable
            user_lengths, business_lengths = lengths

            loss = evaluate(user_input_variable, business_input_variable,
                            user_lengths, business_lengths, target_variable,
                            mask, max_target_len, encoderU, encoderB, decoder,
                            embedding, encoderU_optimizer, encoderB_optimizer,
                            decoder_optimizer, batch_size)
            val_loss += loss
        val_loss /= len(val_batches)

        print('-' * 89)
        print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
              'valid ppl {:8.2f}'.format(epoch,
                                         (time.time() - epoch_start_time),
                                         val_loss, math.exp(val_loss)))
        print('-' * 89)
        # Save the model if the validation loss is the best we've seen so far.
        # Compared against None explicitly so a best loss of 0.0 is not
        # mistaken for "no best loss yet".
        if best_val_loss is None or val_loss < best_val_loss:
            if not os.path.exists(directory):
                os.makedirs(directory)

            torch.save(
                {
                    'epoch': epoch,
                    'enU': encoderU.state_dict(),
                    'enB': encoderB.state_dict(),
                    'de': decoder.state_dict(),
                    'enU_opt': encoderU_optimizer.state_dict(),
                    'enB_opt': encoderB_optimizer.state_dict(),
                    'de_opt': decoder_optimizer.state_dict(),
                    # Loss of the last validation batch, not the epoch mean.
                    'loss': loss,
                    'plt': perplexity
                },
                os.path.join(
                    directory,
                    '{}_{}.tar'.format(epoch,
                                       filename(reverse, 'expansion_model'))))
            best_val_loss = val_loss

            # Run on test data.
            test_loss = 0
            for test_batch in test_batches:
                input_variable, lengths, target_variable, mask, max_target_len = test_batch
                user_input_variable, business_input_variable = input_variable
                user_lengths, business_lengths = lengths

                loss = evaluate(user_input_variable, business_input_variable,
                                user_lengths, business_lengths,
                                target_variable, mask, max_target_len,
                                encoderU, encoderB, decoder, embedding,
                                encoderU_optimizer, encoderB_optimizer,
                                decoder_optimizer, batch_size)
                test_loss += loss
            test_loss /= len(test_batches)
            print('-' * 89)
            print('| test loss {:5.2f} | test ppl {:8.2f}'.format(
                test_loss, math.exp(test_loss)))
            print('-' * 89)

        if val_loss > best_val_loss:  # early stop
            break
Esempio n. 19
0
def trainIters(corpus,
               reverse,
               n_epoch,
               learning_rate,
               batch_size,
               n_layers,
               hidden_size,
               print_every,
               loadFilename=None,
               attn_model='dot',
               decoder_learning_ratio=5.0):
    """Train the encoder/decoder with per-epoch validation and early stopping.

    Training, validation and test batches are loaded from a cache under
    ``save_dir/data/attr2seq``; when a cache file is missing the batches are
    built with ``batchify`` and written there. After every epoch the model is
    scored on the validation batches. When the validation loss improves, a
    checkpoint is saved and the test batches are scored; the loop stops at
    the first epoch whose validation loss is worse than the best seen so far.

    Args:
        corpus: Corpus identifier handed to ``loadPrepareData``.
        reverse: Forwarded to ``batchify`` and ``filename``.
        n_epoch: Exclusive upper bound on the epoch index.
        learning_rate: Adam learning rate of the encoder.
        batch_size: Size of the training batches.
        n_layers: Layer count given to the encoder and the decoder.
        hidden_size: Width of the embedding and of both networks.
        print_every: Report the running training loss every this many
            batches.
        loadFilename: Optional checkpoint file to resume from.
        attn_model: Kept for interface compatibility; ``DecoderRNN`` is not
            given an attention model, so the value is not used.
        decoder_learning_ratio: Factor applied to ``learning_rate`` for the
            decoder optimizer.
    """
    print(
        "corpus: {}, reverse={}, n_epoch={}, learning_rate={}, batch_size={}, n_layers={}, hidden_size={}, decoder_learning_ratio={}"
        .format(corpus, reverse, n_epoch, learning_rate, batch_size, n_layers,
                hidden_size, decoder_learning_ratio))

    voc, pairs, valid_pairs, test_pairs = loadPrepareData(corpus)
    print('load data...')

    batch_dir = os.path.join(save_dir, "data/attr2seq")

    def _load_or_build(tag, size, label, build):
        """Return the cached batches for *tag*, building and caching them if absent."""
        cache_file = os.path.join(
            batch_dir, '{}_{}.tar'.format(filename(reverse, tag), size))
        try:
            return torch.load(cache_file)
        except FileNotFoundError:
            print('{} pairs not found, generating ...'.format(label))
            batches = build()
            print('Complete building {} pairs ...'.format(label.lower()))
            # torch.save does not create missing parent directories.
            os.makedirs(batch_dir, exist_ok=True)
            torch.save(batches, cache_file)
            return batches

    # training data
    training_batches = _load_or_build(
        'training_batches', batch_size, 'Training',
        lambda: batchify(pairs, batch_size, voc, reverse))

    # validation/test data
    eval_batch_size = 10
    val_batches = _load_or_build(
        'val_batches', eval_batch_size, 'Validation',
        lambda: batchify(valid_pairs, eval_batch_size, voc, reverse,
                         evaluation=True))
    test_batches = _load_or_build(
        'test_batches', eval_batch_size, 'Test',
        lambda: batchify(test_pairs, eval_batch_size, voc, reverse,
                         evaluation=True))

    # model
    checkpoint = None
    print('Building encoder and decoder ...')
    embedding = nn.Embedding(voc.n_words, hidden_size)
    encoder = EncoderRNN(voc.n_words, hidden_size, embedding, n_layers)
    decoder = DecoderRNN(embedding, hidden_size, voc.n_words, n_layers)
    if loadFilename:
        # Remap storages to the CPU when CUDA is not in use so that a
        # checkpoint written on a GPU host can still be resumed.
        checkpoint = torch.load(loadFilename,
                                map_location=None if USE_CUDA else 'cpu')
        encoder.load_state_dict(checkpoint['en'])
        decoder.load_state_dict(checkpoint['de'])
    # use cuda
    if USE_CUDA:
        encoder = encoder.cuda()
        decoder = decoder.cuda()

    # optimizer
    print('Building optimizers ...')
    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(),
                                   lr=learning_rate * decoder_learning_ratio)
    if loadFilename:
        encoder_optimizer.load_state_dict(checkpoint['en_opt'])
        decoder_optimizer.load_state_dict(checkpoint['de_opt'])

    # initialize
    print('Initializing ...')
    start_epoch = 0
    perplexity = []  # per-batch training losses, persisted in checkpoints
    best_val_loss = None
    if loadFilename:
        start_epoch = checkpoint['epoch'] + 1
        perplexity = checkpoint['plt']

    for epoch in range(start_epoch, n_epoch):
        epoch_start_time = time.time()
        # train epoch
        encoder.train()
        decoder.train()
        window_loss = 0
        window_batches = 0
        start_time = time.time()
        for batch, training_batch in enumerate(training_batches):
            # The first element of each batch is not consumed by this model.
            _, input_variable, lengths, target_variable, mask, max_target_len = training_batch

            loss = train(input_variable, lengths, target_variable, mask,
                         max_target_len, encoder, decoder, embedding,
                         encoder_optimizer, decoder_optimizer, batch_size)
            window_loss += loss
            window_batches += 1
            perplexity.append(loss)
            if batch % print_every == 0 and batch > 0:
                # Average over the batches actually accumulated: the first
                # window holds print_every + 1 of them (batch 0 included).
                cur_loss = window_loss / window_batches
                elapsed = time.time() - start_time

                print(
                    '| epoch {:3d} | {:5d}/{:5d} batches | lr {:05.5f} | ms/batch {:5.2f} | '
                    'loss {:5.2f} | ppl {:8.2f}'.format(
                        epoch, batch, len(training_batches), learning_rate,
                        elapsed * 1000 / window_batches, cur_loss,
                        math.exp(cur_loss)))

                window_loss = 0
                window_batches = 0
                start_time = time.time()

        # evaluate
        val_loss = 0
        for val_batch in val_batches:
            _, input_variable, lengths, target_variable, mask, max_target_len = val_batch
            loss = evaluate(input_variable, lengths, target_variable, mask,
                            max_target_len, encoder, decoder, embedding,
                            encoder_optimizer, decoder_optimizer,
                            eval_batch_size)
            val_loss += loss
        val_loss /= len(val_batches)

        print('-' * 89)
        print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
              'valid ppl {:8.2f}'.format(epoch,
                                         (time.time() - epoch_start_time),
                                         val_loss, math.exp(val_loss)))
        print('-' * 89)

        # Save the model if the validation loss is the best we've seen so far.
        # Compare against None explicitly: a best loss of 0.0 is falsy but
        # still a valid best value.
        if best_val_loss is None or val_loss < best_val_loss:
            directory = os.path.join(save_dir, 'model',
                                     '{}_{}'.format(n_layers, hidden_size))
            os.makedirs(directory, exist_ok=True)
            torch.save(
                {
                    'epoch': epoch,
                    'en': encoder.state_dict(),
                    'de': decoder.state_dict(),
                    'en_opt': encoder_optimizer.state_dict(),
                    'de_opt': decoder_optimizer.state_dict(),
                    # loss of the last validation batch of this epoch
                    'loss': loss,
                    'plt': perplexity
                },
                os.path.join(
                    directory,
                    '{}_{}.tar'.format(epoch,
                                       filename(reverse,
                                                'text_decoder_model'))))
            best_val_loss = val_loss

            # Run on test data.
            test_loss = 0
            for test_batch in test_batches:
                _, input_variable, lengths, target_variable, mask, max_target_len = test_batch
                loss = evaluate(input_variable, lengths, target_variable, mask,
                                max_target_len, encoder, decoder, embedding,
                                encoder_optimizer, decoder_optimizer,
                                eval_batch_size)
                test_loss += loss
            test_loss /= len(test_batches)
            print('-' * 89)
            print('| test loss {:5.2f} | test ppl {:8.2f}'.format(
                test_loss, math.exp(test_loss)))
            print('-' * 89)

        # Early stop: validation got worse than the best epoch so far.
        if val_loss > best_val_loss:
            break
Esempio n. 20
0
def train_word_vector(corpus, n_iteration, hidden_size, context_size, learning_rate, batch_size, loadFilename=None):
    """Train an ``NGramLanguageModeler`` on n-grams sampled from the corpus.

    ``batch_size`` source sentences are drawn at random (with replacement)
    from the loaded pairs, wrapped in ``SOS``/``EOS`` markers and cut into
    (context, target) examples with ``context_size`` context words each. The
    example list is cached under ``save_dir/training_data/<corpus_name>``.
    The model is then trained with SGD, one example per optimizer step; a
    checkpoint is written every 500 iterations and once more at the end.

    Args:
        corpus: Path of the corpus; passed to ``loadPrepareData`` and its
            base name (without extension) names the output directories.
        n_iteration: Number of passes over the example list.
        hidden_size: Embedding dimension of the model.
        context_size: Number of context words per example.
        learning_rate: SGD learning rate.
        batch_size: Number of sentences sampled to build the examples.
        loadFilename: Optional checkpoint file to resume from.

    NOTE(review): the cache file name does not encode ``context_size``, so a
    cache written for a different context width is loaded as-is.
    """
    voc, pairs = loadPrepareData(corpus)
    corpus_name = os.path.split(corpus)[-1].split('.')[0]
    data_dir = os.path.join(save_dir, 'training_data', corpus_name)
    cache_file = os.path.join(
        data_dir,
        '{}_{}_{}.tar'.format(n_iteration, 'training_batches', batch_size))
    try:
        trigrams = torch.load(cache_file)
    except FileNotFoundError:
        sentences = []
        for _ in range(batch_size):
            pair = random.choice(pairs)
            sentences.append(["SOS"] + pair[0].split() + ["EOS"])
        # Slide a window of context_size words over every sentence; the word
        # right after the window is the prediction target.
        trigrams = [
            (sentence[i:i + context_size], sentence[i + context_size])
            for sentence in sentences
            for i in range(len(sentence) - context_size)
        ]
        # torch.save does not create missing parent directories.
        os.makedirs(data_dir, exist_ok=True)
        torch.save(trigrams, cache_file)

    losses = []
    loss_function = nn.NLLLoss()
    model = NGramLanguageModeler(voc.n_words, hidden_size, context_size)

    checkpoint = None
    if loadFilename:
        checkpoint = torch.load(loadFilename)
        model.load_state_dict(checkpoint['w2v'])

    optimizer = optim.SGD(model.parameters(), lr=learning_rate)
    if loadFilename:
        optimizer.load_state_dict(checkpoint['w2v_opt'])
    print("There are {} trigrams.".format(len(trigrams)))
    print("Total {} iterations.".format(n_iteration))

    start_iteration = 1
    if loadFilename:
        start_iteration = checkpoint['iteration'] + 1
        losses = checkpoint['losses']
        print("{} iterations left...".format(n_iteration - start_iteration + 1))

    word2index = voc.word2index

    def _index_of(word):
        """Map a word to its vocabulary index, falling back to 'UNK'."""
        if word in word2index:
            return word2index[word]
        return word2index['UNK']

    directory = os.path.join(save_dir, 'model', corpus_name,
                             'hi{}_ba{}'.format(hidden_size, batch_size))

    def _save_checkpoint(iteration, last_loss):
        """Write model/optimizer state and the loss history for *iteration*."""
        os.makedirs(directory, exist_ok=True)
        torch.save({
            'iteration': iteration,
            'w2v': model.state_dict(),
            'w2v_opt': optimizer.state_dict(),
            'loss': last_loss,
            'losses': losses
        }, os.path.join(directory, '{}_{}.tar'.format(iteration, 'backup_w2v_model')))

    save_every = 500
    # Stays None when no training step runs (nothing left to train, or no
    # examples), so the final checkpoint can still be written.
    loss = None
    for iteration in tqdm(range(start_iteration, n_iteration + 1)):
        total_loss = torch.Tensor([0])
        for context, target in trigrams:
            # Step 1. Turn the context words into integer indices.
            context_var = Variable(
                torch.LongTensor([_index_of(w) for w in context]))
            # Step 2. Torch accumulates gradients; clear those of the
            # previous example before the new forward pass.
            model.zero_grad()
            # Step 3. Forward pass: log probabilities over the next word
            # (the second return value is not needed here).
            log_probs, _ = model(context_var)
            # Step 4. Loss against the (possibly UNK-mapped) target index.
            loss = loss_function(
                log_probs, Variable(torch.LongTensor([_index_of(target)])))
            # Step 5. Backward pass and parameter update.
            loss.backward()
            optimizer.step()
            total_loss += loss.data
        losses.append(total_loss)
        if iteration % save_every == 0:
            _save_checkpoint(iteration, loss)

    print('\n')
    print("Training completed!")
    print('\n')
    print("Loss: {}".format(losses))
    _save_checkpoint(n_iteration, loss)
def trainIters(corpus,
               learning_rate,
               lr_decay_epoch,
               lr_decay_ratio,
               batch_size,
               n_layers,
               hidden_size,
               embed_size,
               attr_size,
               attr_num,
               overall,
               save_dir,
               loadFilename=None):
    """Train the attribute encoder, sketch encoder and review decoder.

    Batches for the three data splits are loaded from ``save_dir/batches`` or
    built with ``batchify`` and cached there. Each pass of the outer loop
    trains one epoch, scores the validation batches, and, when the validation
    loss is the best so far, writes a checkpoint under ``save_dir/model`` and
    scores the test batches. The loop has no epoch limit: it ends at the
    first epoch whose validation loss exceeds the best one. Scalars are
    logged to a ``SummaryWriter`` rooted at ``'ckpt/' + corpus``.

    Args:
        corpus: Corpus identifier for ``loadPrepareData``; also names the
            log directory.
        learning_rate: Initial Adam learning rate of all three optimizers.
        lr_decay_epoch: Forwarded to ``adjust_learning_rate``.
        lr_decay_ratio: Forwarded to ``adjust_learning_rate``.
        batch_size: Training batch size.
        n_layers: Layer count of the encoders and the decoder.
        hidden_size: Hidden width of the encoders and the decoder.
        embed_size: Width of the topic/sketch/word embeddings.
        attr_size: Width of the user/item/rating attribute embeddings.
        attr_num: Forwarded to ``AttributeEncoder``.
        overall: Number of rows of the fixed rating embedding.
        save_dir: Root directory of cached batches, pickled lookup tables
            and model checkpoints.
        loadFilename: Optional checkpoint file to resume from.
    """
    print((
        "corpus={}, learning_rate={}, lr_decay_epoch={}, lr_decay_ratio={}, batch_size={}, n_layers={}, "
        "    hidden_size={}, embed_size={}, attr_size={}, attr_num={}, overall={}, save_dir={}"
    ).format(corpus, learning_rate, lr_decay_epoch, lr_decay_ratio,
             batch_size, n_layers, hidden_size, embed_size, attr_size,
             attr_num, overall, save_dir))

    print('load data...')
    vocab, train_pairs, valid_pairs, test_pairs = loadPrepareData(
        corpus, save_dir)
    print('finish load data...')

    data_path = os.path.join(save_dir, "batches")

    def _load_or_build(tag, size, label, build):
        """Return the cached batches for *tag*, building and caching them if absent."""
        cache_file = os.path.join(data_path, '{}_{}.tar'.format(tag, size))
        try:
            return torch.load(cache_file)
        except FileNotFoundError:
            print('{} pairs not found, generating ...'.format(label))
            batches = build()
            print('Complete building {} pairs ...'.format(label.lower()))
            # torch.save does not create missing parent directories.
            os.makedirs(data_path, exist_ok=True)
            torch.save(batches, cache_file)
            return batches

    # training data
    corpus_name = corpus
    training_batches = _load_or_build(
        'training_batches', batch_size, 'Training',
        lambda: batchify(train_pairs, batch_size, vocab))

    # validation/test data (evaluation=True: no gradients needed when testing)
    eval_batch_size = 10
    val_batches = _load_or_build(
        'val_batches', eval_batch_size, 'Validation',
        lambda: batchify(valid_pairs, eval_batch_size, vocab,
                         evaluation=True))
    test_batches = _load_or_build(
        'test_batches', eval_batch_size, 'Test',
        lambda: batchify(test_pairs, eval_batch_size, vocab,
                         evaluation=True))

    # aspect
    # NOTE(review): pickle.load executes arbitrary code from the file; these
    # files are read from save_dir and must come from a trusted source.
    with open(os.path.join(save_dir, 'aspect_ids.pkl'), 'rb') as fp:
        ids = pickle.load(fp)

    # model
    checkpoint = None
    print('Building encoder and decoder ...')

    # topic encoder
    with open(os.path.join(save_dir, 'user.pkl'), 'rb') as fp:
        user_dict = pickle.load(fp)
    with open(os.path.join(save_dir, 'item.pkl'), 'rb') as fp:
        item_dict = pickle.load(fp)

    num_user = len(user_dict)
    num_item = len(item_dict)
    num_over = overall

    uemb = nn.Embedding(num_user, attr_size)
    iemb = nn.Embedding(num_item, attr_size)
    # Rating embedding: identity block padded with zeros up to attr_size.
    remb = from_pretrained(
        torch.cat(
            (torch.eye(num_over), torch.zeros(num_over, attr_size - num_over)),
            dim=1))
    attr_embeddings = [uemb, iemb, remb]

    if USE_CUDA:
        for attr_embedding in attr_embeddings:
            # Module.cuda() moves the parameters in place.
            attr_embedding.cuda()

    encoder = AttributeEncoder(attr_size, attr_num, hidden_size,
                               attr_embeddings, n_layers)

    # sketch encoder
    sketch_embedding = nn.Embedding(vocab.n_sketchs, embed_size)
    if USE_CUDA:
        sketch_embedding = sketch_embedding.cuda()

    birnn_encoder = EncoderRNN(embed_size, hidden_size, sketch_embedding,
                               n_layers)

    # review decoder
    topic_embedding = nn.Embedding(vocab.n_topics, embed_size)
    # NOTE(review): this is a second, independent sketch embedding; the
    # decoder does not share weights with the sketch encoder above.
    # Confirm this is intended.
    sketch_embedding = nn.Embedding(vocab.n_sketchs, embed_size)
    word_embedding = nn.Embedding(vocab.n_words, embed_size)

    if USE_CUDA:
        topic_embedding = topic_embedding.cuda()
        sketch_embedding = sketch_embedding.cuda()
        word_embedding = word_embedding.cuda()

    aspect_ids = nn.Embedding(vocab.n_topics - 3,
                              100)  # remove [SOS] [EOS] [PAD]
    aspect_ids.weight.data.copy_(torch.from_numpy(np.array(ids)))
    aspect_ids.weight.requires_grad = False  # fixed lookup table

    attn_model = 'dot'
    review_decoder = ReviewAttnDecoderRNN(topic_embedding, sketch_embedding,
                                          word_embedding, embed_size,
                                          hidden_size, attr_size,
                                          vocab.n_words, aspect_ids, n_layers)

    if loadFilename:
        # Remap storages to the CPU when CUDA is not in use so that a
        # checkpoint written on a GPU host can still be resumed.
        checkpoint = torch.load(loadFilename,
                                map_location=None if USE_CUDA else 'cpu')
        encoder.load_state_dict(checkpoint['encoder'])
        birnn_encoder.load_state_dict(checkpoint['birnn_encoder'])
        review_decoder.load_state_dict(checkpoint['review_decoder'])

    # use cuda
    if USE_CUDA:
        encoder = encoder.cuda()
        birnn_encoder = birnn_encoder.cuda()
        review_decoder = review_decoder.cuda()

    # optimizer (frozen parameters are excluded)
    print('Building optimizers ...')
    encoder_optimizer = optim.Adam(filter(lambda p: p.requires_grad,
                                          encoder.parameters()),
                                   lr=learning_rate)
    birnn_encoder_optimizer = optim.Adam(filter(lambda p: p.requires_grad,
                                                birnn_encoder.parameters()),
                                         lr=learning_rate)
    review_decoder_optimizer = optim.Adam(filter(lambda p: p.requires_grad,
                                                 review_decoder.parameters()),
                                          lr=learning_rate)

    if loadFilename:
        encoder_optimizer.load_state_dict(checkpoint['encoder_opt'])
        birnn_encoder_optimizer.load_state_dict(
            checkpoint['birnn_encoder_opt'])
        review_decoder_optimizer.load_state_dict(
            checkpoint['review_decoder_opt'])

    # initialize
    print('Initializing ...')
    step = 0
    epoch = 0
    perplexity = []
    _loss = []

    log_path = os.path.join('ckpt/' + corpus_name)
    os.makedirs(log_path, exist_ok=True)
    writer = SummaryWriter(log_path)

    best_val_loss = None

    if loadFilename:
        step = checkpoint['step']
        epoch = checkpoint['epoch'] + 1
        perplexity = checkpoint['plt']
        _loss = checkpoint['loss']
        # Replay the stored history. Live training logs the k-th batch at
        # step k (step is incremented before logging), so start at 1 to put
        # the replayed points on the same axis.
        for i, (past_loss, past_ppl) in enumerate(zip(_loss, perplexity),
                                                  start=1):
            writer.add_scalar("Train/loss", past_loss, i)
            writer.add_scalar("Train/perplexity", past_ppl, i)

    directory = os.path.join(
        save_dir, "model",
        '{}_{}_{}'.format(n_layers, hidden_size, batch_size))

    while True:

        # learning rate adjust
        adjust_learning_rate(encoder_optimizer, epoch, learning_rate,
                             lr_decay_epoch, lr_decay_ratio)
        adjust_learning_rate(birnn_encoder_optimizer, epoch, learning_rate,
                             lr_decay_epoch, lr_decay_ratio)
        adjust_learning_rate(review_decoder_optimizer, epoch, learning_rate,
                             lr_decay_epoch, lr_decay_ratio)

        # train epoch
        encoder.train()
        birnn_encoder.train()
        review_decoder.train()

        tr_loss = 0
        for batch_idx, training_batch in enumerate(training_batches):
            attr_input, topic_input, sketch_output, review_input, review_output, mask = training_batch

            loss = train(attr_input, topic_input, sketch_output, review_input,
                         review_output, mask, encoder, birnn_encoder,
                         review_decoder, encoder_optimizer,
                         birnn_encoder_optimizer, review_decoder_optimizer)
            step += 1

            tr_loss += loss

            _loss.append(loss)
            perplexity.append(math.exp(loss))

            writer.add_scalar("Train/loss", loss, step)
            writer.add_scalar("Train/perplexity", math.exp(loss), step)

            print(
                "epoch {} batch {} loss={} perplexity={} en_lr={:05.5f} bi_lr={:05.5f} de_lr={:05.5f}"
                .format(epoch, batch_idx, loss, math.exp(loss),
                        encoder_optimizer.param_groups[0]['lr'],
                        birnn_encoder_optimizer.param_groups[0]['lr'],
                        review_decoder_optimizer.param_groups[0]['lr']))

        cur_loss = tr_loss / len(training_batches)

        print('\n' + '-' * 30)
        print(
            'train | epoch {:3d} | average loss {:5.5f} | average ppl {:8.3f}'.
            format(epoch, cur_loss, math.exp(cur_loss)))
        print('-' * 30)

        # evaluate
        vl_loss = 0
        for val_batch in val_batches:
            attr_input, topic_input, sketch_output, review_input, review_output, mask = val_batch

            loss = evaluate(attr_input, topic_input, sketch_output,
                            review_input, review_output, mask, encoder,
                            birnn_encoder, review_decoder, encoder_optimizer,
                            birnn_encoder_optimizer, review_decoder_optimizer)

            vl_loss += loss
        vl_loss /= len(val_batches)

        writer.add_scalar("Valid/loss", vl_loss, step)

        print('\n' + '-' * 30)
        print('valid | epoch {:3d} | valid loss {:5.5f} | valid ppl {:8.3f}'.
              format(epoch, vl_loss, math.exp(vl_loss)))
        print('-' * 30)

        # Save the model if the validation loss is the best we've seen so far.
        # Compare against None explicitly: a best loss of 0.0 is falsy but
        # still a valid best value.
        if best_val_loss is None or vl_loss < best_val_loss:
            os.makedirs(directory, exist_ok=True)
            torch.save(
                {
                    'step': step,
                    'epoch': epoch,
                    'encoder': encoder.state_dict(),
                    'encoder_opt': encoder_optimizer.state_dict(),
                    'birnn_encoder': birnn_encoder.state_dict(),
                    'birnn_encoder_opt': birnn_encoder_optimizer.state_dict(),
                    'review_decoder': review_decoder.state_dict(),
                    'review_decoder_opt':
                    review_decoder_optimizer.state_dict(),
                    'loss': _loss,
                    'plt': perplexity
                },
                os.path.join(directory,
                             '{}_{}.tar'.format(epoch, 'review_model')))
            best_val_loss = vl_loss

            # Run on test data.
            ts_loss = 0
            for test_batch in test_batches:
                attr_input, topic_input, sketch_output, review_input, review_output, mask = test_batch

                loss = evaluate(attr_input, topic_input, sketch_output,
                                review_input, review_output, mask, encoder,
                                birnn_encoder, review_decoder,
                                encoder_optimizer, birnn_encoder_optimizer,
                                review_decoder_optimizer)

                ts_loss += loss
            ts_loss /= len(test_batches)
            writer.add_scalar("Test/loss", ts_loss, step)

            print('\n' + '-' * 30)
            print('| test loss {:5.2f} | test ppl {:8.2f}'.format(
                ts_loss, math.exp(ts_loss)))
            print('-' * 30 + '\n')

        # Early stop: validation got worse than the best epoch so far.
        if vl_loss > best_val_loss:
            print(
                'validation loss is larger than best validation loss. Break!')
            break

        epoch += 1
Esempio n. 22
0
def trainIters(corpus,
               reverse,
               n_iteration,
               learning_rate,
               batch_size,
               n_layers,
               hidden_size,
               print_every,
               save_every,
               dropout,
               loadFilename=None,
               attn_model='dot',
               decoder_learning_ratio=5.0):
    """Train the encoder and attention decoder for a fixed number of iterations.

    One training batch per iteration is loaded from a cache under
    ``save_dir/training_data/<corpus_name>``; if the cache is missing, each
    batch is built from ``batch_size`` randomly chosen pairs and the list is
    cached. Every ``save_every`` iterations a checkpoint is written under
    ``save_dir/model/<corpus_name>``.

    Args:
        corpus: Path of the corpus; passed to ``loadPrepareData`` and its
            base name (without extension) names the output directories.
        reverse: Forwarded to ``batch2TrainData`` and ``filename``.
        n_iteration: Total number of training iterations (one batch each).
        learning_rate: Adam learning rate of the encoder.
        batch_size: Number of pairs per batch.
        n_layers: Layer count of the encoder and the decoder.
        hidden_size: Hidden width of the encoder and the decoder.
        print_every: Print the running metric every this many iterations.
        save_every: Write a checkpoint every this many iterations.
        dropout: Dropout value given to the encoder and the decoder.
        loadFilename: Optional checkpoint file to resume from.
        attn_model: Attention variant handed to ``LuongAttnDecoderRNN``.
        decoder_learning_ratio: Factor applied to ``learning_rate`` for the
            decoder optimizer.
    """
    voc, pairs = loadPrepareData(corpus)
    embedding_dict = concate_embedding(pairs, voc, hidden_size)

    # training data
    corpus_name = os.path.split(corpus)[-1].split('.')[0]
    data_dir = os.path.join(save_dir, 'training_data', corpus_name)
    cache_file = os.path.join(
        data_dir,
        '{}_{}_{}.tar'.format(n_iteration,
                              filename(reverse, 'training_batches'),
                              batch_size))
    try:
        training_batches = torch.load(cache_file)
    except FileNotFoundError:
        print('Generating training batches...')
        training_batches = [
            batch2TrainData([random.choice(pairs)
                             for _ in range(batch_size)], voc, reverse)
            for _ in range(n_iteration)
        ]
        # torch.save does not create missing parent directories.
        os.makedirs(data_dir, exist_ok=True)
        torch.save(training_batches, cache_file)

    # model
    checkpoint = None
    print('Building encoder and decoder ...')
    encoder = EncoderRNN(hidden_size, batch_size, n_layers, dropout)
    # attn_model is used as passed by the caller (it defaults to 'dot').
    decoder = LuongAttnDecoderRNN(attn_model, hidden_size, batch_size,
                                  voc.loc_count, n_layers, dropout)
    if loadFilename:
        # Load straight onto the target device so a checkpoint written on a
        # GPU host can be resumed on a CPU-only one.
        checkpoint = torch.load(loadFilename, map_location=device)
        encoder.load_state_dict(checkpoint['en'])
        decoder.load_state_dict(checkpoint['de'])
    # use cuda
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # optimizer
    print('Building optimizers ...')
    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(),
                                   lr=learning_rate * decoder_learning_ratio)
    if loadFilename:
        encoder_optimizer.load_state_dict(checkpoint['en_opt'])
        decoder_optimizer.load_state_dict(checkpoint['de_opt'])

    # initialize
    print('Initializing ...')
    start_iteration = 1
    perplexity = []  # per-iteration training losses, persisted in checkpoints
    print_loss = 0
    if loadFilename:
        start_iteration = checkpoint['iteration'] + 1
        perplexity = checkpoint['plt']

    directory = os.path.join(
        save_dir, 'model', corpus_name,
        '{}-{}_{}'.format(n_layers, batch_size, hidden_size))

    for iteration in tqdm(range(start_iteration, n_iteration + 1)):
        # Iterations are 1-based, the batch list is 0-based.
        input_vec, input_lengths, target_vec, max_target_len = training_batches[iteration - 1]

        loss = train(input_vec, input_lengths, target_vec, max_target_len,
                     encoder, decoder, embedding_dict, encoder_optimizer,
                     decoder_optimizer, batch_size)
        print_loss += loss
        perplexity.append(loss)

        if iteration % print_every == 0:
            # exp of the mean loss over the last print_every iterations
            print_loss_avg = math.exp(print_loss / print_every)
            print('%d %d%% %.4f' %
                  (iteration, iteration / n_iteration * 100, print_loss_avg))
            print_loss = 0

        if iteration % save_every == 0:
            os.makedirs(directory, exist_ok=True)
            torch.save(
                {
                    'iteration': iteration,
                    'en': encoder.state_dict(),
                    'de': decoder.state_dict(),
                    'en_opt': encoder_optimizer.state_dict(),
                    'de_opt': decoder_optimizer.state_dict(),
                    'loss': loss,
                    'plt': perplexity
                },
                os.path.join(
                    directory,
                    '{}_{}.tar'.format(iteration,
                                       filename(reverse,
                                                'backup_bidir_model'))))
Esempio n. 23
0
    new_pair_batch = []
    end_of_group = []
    count = 0
    dict_pairs = {}
    for group in pair_batch:
        for pair in group:
            new_pair_batch.append(pair)
            count += 1
            dict_pairs[count - 1] = pair
        end_of_group.append(count)
    return new_pair_batch, end_of_group, dict_pairs


# Module-level data preparation: these statements run when the module is
# executed/imported and leave `voc` and `pairs` as globals.
corpus_index = './data/movie_conversations.txt'
corpus = './data/movie_lines.txt'
# NOTE(review): the meaning of the third argument (3) is defined by
# loadPrepareData, which is not visible here — confirm before changing it.
voc, pairs = loadPrepareData(corpus, corpus_index, 3)
#pprint.pprint(pairs[0])
#print("length of pair: ",len(pairs))
# Replace the loaded pairs with the output of pairs_transform.
pairs = pairs_transform(pairs)
#pprint.pprint(pairs[:5])


class EncoderRNN(nn.Module):
    # NOTE(review): this excerpt ends inside __init__; only the n_layers
    # attribute is set in the visible part. input_size, hidden_size,
    # embedding and dropout are accepted but not used in the lines shown —
    # presumably they are consumed by code that follows; verify against the
    # full source.
    def __init__(self,
                 input_size,
                 hidden_size,
                 embedding,
                 n_layers=1,
                 dropout=0.1):
        super(EncoderRNN, self).__init__()
        # Number of layers, kept on the instance.
        self.n_layers = n_layers
Esempio n. 24
0
def trainIters(corpus, learning_rate, lr_decay_epoch, lr_decay_ratio, weight_decay, batch_size, rnn_layers,
               hidden_size, embed_size, node_size, epochs, save_dir, load_file=None):
    """Train the review model epoch by epoch with validation and early stopping.

    Every epoch feeds all training batches through ``train``, evaluates on the
    validation batches, saves a checkpoint whenever the validation loss
    improves, and stops as soon as the validation loss is worse than the best
    one seen in this run. Per-step and per-epoch metrics are written with a
    ``SummaryWriter`` under ``ckpt/<corpus>``.

    Args:
        corpus: Corpus identifier handed to ``loadPrepareData``; also used as
            the log sub-directory name under ``ckpt/``.
        learning_rate: Learning rate of the Adam optimizer; also forwarded to
            ``adjust_learning_rate``.
        lr_decay_epoch: Forwarded to ``adjust_learning_rate``.
        lr_decay_ratio: Forwarded to ``adjust_learning_rate``.
        weight_decay: ``weight_decay`` of the Adam optimizer.
        batch_size: Batch size used to build the training batches; part of the
            cache file name and of the checkpoint directory name.
        rnn_layers: Passed to ``ReviewModel``.
        hidden_size: Passed to ``ReviewModel``.
        embed_size: Passed to ``ReviewModel``.
        node_size: Passed to ``ReviewModel``.
        epochs: Number of the last epoch to run (inclusive; epochs start at 1).
        save_dir: Root directory for the cached batches (``batches/``) and the
            model checkpoints (``model/``); also passed to ``loadPrepareData``.
        load_file: Optional checkpoint path to resume from. Model weights, step
            and epoch counters and the loss/perplexity histories are restored;
            the optimizer state and the best validation loss are not part of
            the checkpoint and therefore start fresh.

    Returns:
        None. Results are the checkpoints and the event files on disk.
    """
    print('load data...')
    # test_pairs is returned by the loader but is not used during training.
    vocabs, train_pairs, valid_pairs, test_pairs = loadPrepareData(corpus, save_dir)
    print('load data finish...')

    data_path = os.path.join(save_dir, "batches")
    os.makedirs(data_path, exist_ok=True)

    corpus_name = corpus

    # Batches are cached on disk, keyed by their batch size.
    training_file = os.path.join(data_path, '{}_{}.tar'.format('training_batches', batch_size))
    try:
        training_batches = torch.load(training_file)
    except FileNotFoundError:
        print('Training pairs not found, generating ...')
        training_batches = batchify(train_pairs, batch_size, vocabs)
        print('Complete building training pairs ...')
        torch.save(training_batches, training_file)

    # validation/test data
    eval_batch_size = 10
    val_file = os.path.join(data_path, '{}_{}.tar'.format('val_batches', eval_batch_size))
    try:
        val_batches = torch.load(val_file)
    except FileNotFoundError:
        print('Validation pairs not found, generating ...')
        val_batches = batchify(valid_pairs, eval_batch_size, vocabs)
        print('Complete building validation pairs ...')
        torch.save(val_batches, val_file)

    print('Building review model ...')
    review_model = ReviewModel(vocabs, embed_size, node_size, hidden_size, rnn_layers).to(device)

    print('Building optimizers ...')
    review_optimizer = optim.Adam(review_model.parameters(), lr=learning_rate, weight_decay=weight_decay)

    print('Initializing ...')
    global_step = 1
    last_epoch = 1
    perplexities = []
    losses = []
    best_val_loss = None

    log_path = 'ckpt/' + corpus_name
    os.makedirs(log_path, exist_ok=True)
    writer = SummaryWriter(log_path)

    # All checkpoints of a run share one directory named after the
    # hyper-parameters.
    directory = os.path.join(save_dir, "model", '{}_{}_{}'.format(batch_size, hidden_size, rnn_layers))

    # try/finally guarantees the event file is flushed and closed on every exit
    # path (normal end, early-stopping break, or an exception).
    try:
        if load_file:
            # map_location lets a checkpoint saved on another device be resumed here.
            checkpoint = torch.load(load_file, map_location=device)
            review_model.load_state_dict(checkpoint['review_model'])
            global_step = checkpoint['global_step']
            last_epoch = checkpoint['epoch'] + 1
            perplexities = checkpoint['perplexity']
            losses = checkpoint['loss']
            # Replay the stored history so the curves of the resumed run are complete.
            for past_step, (past_loss, past_ppl) in enumerate(zip(losses, perplexities)):
                writer.add_scalar("Train/loss", past_loss, past_step)
                writer.add_scalar("Train/perplexity", past_ppl, past_step)

        for epoch in tqdm(range(last_epoch, epochs+1), desc="Epoch: ", leave=True):

            # train epoch
            review_model.train()

            tr_loss = 0
            steps = trange(len(training_batches), desc="Train Loss")
            for step in steps:
                context_input, aspect_input, review_input, review_output, extend_input = training_batches[step]

                loss = train(context_input, aspect_input, review_input, review_output, extend_input,
                             review_model, review_optimizer)
                ppl = math.exp(loss)

                global_step += 1
                tr_loss += loss

                losses.append(loss)
                perplexities.append(ppl)

                writer.add_scalar("Train/loss", loss, global_step)
                writer.add_scalar("Train/perplexity", ppl, global_step)

                steps.set_description("ReviewModel (Loss=%g, PPL=%g)" % (round(loss, 4), round(ppl, 4)))

            cur_loss = tr_loss / len(training_batches)
            cur_ppl = math.exp(cur_loss)

            print('\nTrain | Epoch: {:3d} | Avg Loss={:4.4f} | Avg PPL={:4.4f}\n'.format(epoch, cur_loss, cur_ppl))

            # evaluate
            review_model.eval()
            with torch.no_grad():
                vl_loss = 0
                for val_batch in val_batches:
                    context_input, aspect_input, review_input, review_output, extend_input = val_batch

                    vl_loss += evaluate(context_input, aspect_input, review_input, review_output,
                                        extend_input, review_model)
                vl_loss /= len(val_batches)
                vl_ppl = math.exp(vl_loss)

            writer.add_scalar("Valid/loss", vl_loss, global_step)
            writer.add_scalar("Valid/perplexity", vl_ppl, global_step)

            print('\nValid | Epoch: {:3d} | Avg Loss={:4.4f} | Avg PPL={:4.4f}\n'.format(epoch, vl_loss, vl_ppl))

            # Save the model if the validation loss is the best we've seen so far.
            # "is None" (rather than truthiness) keeps a best loss of exactly
            # 0.0 from being mistaken for "nothing recorded yet".
            if best_val_loss is None or vl_loss < best_val_loss:
                os.makedirs(directory, exist_ok=True)
                torch.save({
                    'global_step': global_step,
                    'epoch': epoch,
                    'review_model': review_model.state_dict(),
                    'loss': losses,
                    'perplexity': perplexities
                }, os.path.join(directory, '{}_{}_{}.tar'.format(epoch, round(vl_loss, 4), 'review_model')))
                best_val_loss = vl_loss
            elif vl_loss > best_val_loss:
                # Early stopping: the first epoch that is worse than the best ends training.
                print('validation loss is larger than best validation loss. Break!')
                break

            # learning rate adjust (epoch index is counted from the start of this run)
            adjust_learning_rate(review_optimizer, epoch-last_epoch+1, learning_rate, lr_decay_epoch, lr_decay_ratio)
    finally:
        writer.close()
Esempio n. 25
0
    decoder = decoder.to(device)

    if inp:
        evaluateInput(encoder, decoder, voc, beam_size)
    else:
        evaluateRandomly(encoder, decoder, voc, pairs, reverse, beam_size, 20)


if __name__ == '__main__':
    # Turn off gradient tracking from here on; the visible part of this script
    # only restores a checkpoint and switches the models out of training mode.
    torch.set_grad_enabled(False)

    # Model hyper-parameters. Presumably they must match the ones used when the
    # checkpoint below was trained (its path contains "1-1_512") — TODO confirm.
    hidden_size = 512
    n_layers = 1
    attn_mode = 'dot'

    # The loader returns two vocabularies: one sizes the encoder-side embedding,
    # the other sizes the decoder-side embedding and the decoder output.
    pinyin_voc, word_voc, pairs = loadPrepareData('data/touchpal_done.txt')
    pinyin_embedding = nn.Embedding(pinyin_voc.n_words, hidden_size)
    word_embedding = nn.Embedding(word_voc.n_words, hidden_size)
    encoder = EncoderRNN(pinyin_voc.n_words, hidden_size, pinyin_embedding,
                         n_layers)
    decoder = LuongAttnDecoderRNN(attn_mode, word_embedding, hidden_size,
                                  word_voc.n_words, n_layers)

    # Restore the trained weights from a hard-coded checkpoint path.
    # NOTE(review): no map_location is given, so a checkpoint saved on a GPU
    # needs a GPU to be loaded.
    checkpoint = torch.load(
        'save/model/touchpal_done/1-1_512/6000_backup_bidir_model.tar')
    encoder.load_state_dict(checkpoint['en'])
    decoder.load_state_dict(checkpoint['de'])

    # Leave training mode before running inference.
    encoder.train(False)
    decoder.train(False)
Esempio n. 26
0
                    help="Pretrained fine-tuned model.")

# Parse the command-line options declared above and echo them for the log.
args = parser.parse_args()
print(args)

# Load pre-trained model (weights)
model_version = 'bert-base-uncased'
model = BertMLMDecoder.from_pretrained(model_version)
# Overwrite the pre-trained weights with the fine-tuned ones given on the
# command line.
# NOTE(review): torch.load is called without map_location, so a checkpoint
# saved on a GPU needs a GPU to be loaded.
model_file = args.model_file
model.load_state_dict(torch.load(model_file))
model.eval()
# Move the model to the GPU when one is available.
cuda = torch.cuda.is_available()
if cuda:
    model = model.cuda()

# loadPrepareData returns the data together with a pair of lengths.
data, length = loadPrepareData(args)
user_length, item_length = length  #, user_length2, item_length2 = length

# Load pre-trained model tokenizer (vocabulary)
# Lower-casing is enabled exactly when the model name ends with "uncased".
tokenizer = BertTokenizer.from_pretrained(
    model_version, do_lower_case=model_version.endswith("uncased"))


def tokenize_batch(batch):
    """Convert every token sequence in ``batch`` to vocabulary ids.

    Each element of ``batch`` is passed to the module-level tokenizer's
    ``convert_tokens_to_ids``; the results are returned as a list in the same
    order as the input.
    """
    id_sequences = []
    for tokens in batch:
        id_sequences.append(tokenizer.convert_tokens_to_ids(tokens))
    return id_sequences


def untokenize_batch(batch):
    """Convert every id sequence in ``batch`` back to tokens.

    Each element of ``batch`` is passed to the module-level tokenizer's
    ``convert_ids_to_tokens``; the results are returned as a list in the same
    order as the input.
    """
    token_sequences = []
    for ids in batch:
        token_sequences.append(tokenizer.convert_ids_to_tokens(ids))
    return token_sequences

Esempio n. 27
0
def trainIters(corpus,
               reverse,
               n_iteration,
               learning_rate,
               batch_size,
               n_layers,
               hidden_size,
               print_every,
               save_every,
               dropout,
               loadFilename=None,
               attn_model='dot',
               decoder_learning_ratio=5.0):
    """Train the encoder/decoder pair for ``n_iteration`` iterations.

    One pre-built batch is consumed per iteration. Every ``print_every``
    iterations a progress line is appended to ``log.txt`` in the current
    working directory, and every ``save_every`` iterations a checkpoint is
    written below ``save_dir/model/<corpus_name>/``.

    Args:
        corpus: Path of the corpus file; its base name without extension
            becomes ``corpus_name``.
        reverse: Forwarded to ``batch2TrainData`` and ``filename``.
        n_iteration: Total number of training iterations (and of batches).
        learning_rate: Learning rate of the encoder's Adam optimizer.
        batch_size: Number of randomly drawn pairs per batch.
        n_layers: Passed to the encoder and the decoder.
        hidden_size: Embedding/hidden size of both networks.
        print_every: Logging interval, in iterations.
        save_every: Checkpoint interval, in iterations.
        dropout: Passed to the encoder and the decoder.
        loadFilename: Optional checkpoint to resume from (weights, optimizer
            states, iteration counter and loss history).
        attn_model: Accepted for interface compatibility; see the note in the
            body — the value is currently overridden with ``'dot'``.
        decoder_learning_ratio: Multiplier applied to ``learning_rate`` for
            the decoder's optimizer.

    Returns:
        None.
    """
    # Function-scope import: the original imported ``time`` inside the logging
    # branch, so a file-level import is not assumed to exist.
    import time

    voc, pairs = loadPrepareData(corpus)
    # voc is the string-to-index dictionary; pairs are the dialogues that
    # still have to be converted to tensors.

    # training data
    corpus_name = os.path.split(corpus)[-1].split('.')[0]
    # TODO: no epoch scheme is used. batch2TrainData turns the training pairs
    # prepared by load.py into input/output Variables; every iteration consumes
    # one batch of batch_size randomly drawn pairs (64 in the original note),
    # for n_iteration iterations in total. This sampling scheme still needs to
    # be reworked.
    batches_file = os.path.join(
        save_dir, 'training_data', corpus_name,
        '{}_{}_{}.tar'.format(n_iteration,
                              filename(reverse, 'training_batches'),
                              batch_size))
    try:
        training_batches = torch.load(batches_file)
    except FileNotFoundError:
        # Unlike the cached case, freshly generated batches are not written
        # back to disk by this function.
        print('Training pairs not found, generating ...')
        training_batches = [
            batch2TrainData(voc,
                            [random.choice(pairs)
                             for _ in range(batch_size)], reverse)
            for _ in range(n_iteration)
        ]

    # model
    checkpoint = None
    print('Building encoder and decoder ...')

    # Encoder and decoder share one embedding table.
    embedding = nn.Embedding(voc.n_words, hidden_size)
    encoder = EncoderRNN(voc.n_words, hidden_size, embedding, n_layers,
                         dropout)
    # NOTE(review): this overrides the attn_model argument, so callers cannot
    # currently select another attention type. Kept as-is because existing
    # checkpoints were produced with 'dot'.
    attn_model = 'dot'
    decoder = LuongAttnDecoderRNN(attn_model, embedding, hidden_size,
                                  voc.n_words, n_layers, dropout)
    if loadFilename:
        # map_location lets a checkpoint saved on another device be resumed here.
        checkpoint = torch.load(loadFilename, map_location=device)
        encoder.load_state_dict(checkpoint['en'])
        decoder.load_state_dict(checkpoint['de'])
    # use cuda
    # (Multi-GPU wrapping with nn.DataParallel was tried here and is disabled.)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # optimizer
    print('Building optimizers ...')
    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(),
                                   lr=learning_rate * decoder_learning_ratio)
    if loadFilename:
        encoder_optimizer.load_state_dict(checkpoint['en_opt'])
        decoder_optimizer.load_state_dict(checkpoint['de_opt'])

    # initialize
    print('Initializing ...')
    start_iteration = 1
    perplexity = []  # per-iteration losses, despite the name
    print_loss = 0
    if loadFilename:
        start_iteration = checkpoint['iteration'] + 1
        perplexity = checkpoint['plt']

    template = ' Iter: {:0>6d} process: {:.2f} avg_loss: {:.4f} time: {}\n'
    # "<encoder layers>-<decoder layers>_<hidden size>"; both networks use n_layers.
    directory = os.path.join(
        save_dir, 'model', corpus_name,
        '{}-{}_{}'.format(n_layers, n_layers, hidden_size))

    for iteration in range(start_iteration, n_iteration + 1):
        training_batch = training_batches[iteration - 1]
        input_variable, lengths, target_variable, mask, max_target_len = training_batch

        loss = train(input_variable, lengths, target_variable, mask,
                     max_target_len, encoder, decoder, embedding,
                     encoder_optimizer, decoder_optimizer, batch_size)
        print_loss += loss
        perplexity.append(loss)

        if iteration % print_every == 0:
            # exp of the mean loss over the last print_every iterations
            print_loss_avg = math.exp(print_loss / print_every)
            log_line = template.format(
                iteration, iteration / n_iteration * 100, print_loss_avg,
                time.asctime(time.localtime(time.time())))
            with open('log.txt', 'a') as f:
                f.write(log_line)
            print_loss = 0

        if iteration % save_every == 0:
            os.makedirs(directory, exist_ok=True)
            torch.save(
                {
                    'iteration': iteration,
                    'en': encoder.state_dict(),
                    'de': decoder.state_dict(),
                    'en_opt': encoder_optimizer.state_dict(),
                    'de_opt': decoder_optimizer.state_dict(),
                    'loss': loss,
                    'plt': perplexity
                },
                os.path.join(
                    directory,
                    '{}_{}.tar'.format(iteration,
                                       filename(reverse,
                                                'backup_bidir_model'))))
Esempio n. 28
0
def trainIters(corpus,
               pre_modelFile,
               reverse,
               n_iteration,
               learning_rate,
               batch_size,
               n_layers,
               hidden_size,
               print_every,
               save_every,
               loadFilename=None,
               attn_model='dot',
               decoder_learning_ratio=5.0):
    """Train the encoder/decoder pair with a pre-trained word2vec model.

    One pre-built batch is consumed per iteration. The batches are cached
    under ``save_dir/training_data/<corpus_name>/``; checkpoints are written
    every ``save_every`` iterations under ``save_dir/model/<corpus_name>/``.

    Args:
        corpus: Path of the corpus file; its base name without extension
            becomes ``corpus_name``.
        pre_modelFile: Path of a binary word2vec file, loaded with gensim's
            ``KeyedVectors.load_word2vec_format`` and handed to ``train``.
        reverse: Forwarded to ``batch2TrainData`` and ``filename``.
        n_iteration: Total number of training iterations (and of batches).
        learning_rate: Learning rate of the encoder's Adam optimizer.
        batch_size: Number of randomly drawn pairs per batch.
        n_layers: Passed to the encoder and the decoder.
        hidden_size: Hidden size of both networks.
        print_every: Progress-print interval, in iterations.
        save_every: Checkpoint interval, in iterations.
        loadFilename: Optional checkpoint to resume from (weights, optimizer
            states, iteration counter and loss history).
        attn_model: Accepted for interface compatibility; see the note in the
            body — the value is currently overridden with ``'dot'``.
        decoder_learning_ratio: Multiplier applied to ``learning_rate`` for
            the decoder's optimizer.

    Returns:
        None.
    """
    voc, pairs = loadPrepareData(corpus)

    # training data
    corpus_name = os.path.split(corpus)[-1].split('.')[0]
    batches_dir = os.path.join(save_dir, 'training_data', corpus_name)
    batches_file = os.path.join(
        batches_dir,
        '{}_{}_{}.tar'.format(n_iteration,
                              filename(reverse, 'training_batches'),
                              batch_size))
    try:
        training_batches = torch.load(batches_file)
    except FileNotFoundError:
        print('Training pairs not found, generating ...')
        training_batches = [
            batch2TrainData(voc,
                            [random.choice(pairs)
                             for _ in range(batch_size)], reverse)
            for _ in range(n_iteration)
        ]
        # The cache directory may not exist on a first run; without it the
        # save below fails right after the expensive generation step.
        os.makedirs(batches_dir, exist_ok=True)
        torch.save(training_batches, batches_file)

    # model
    checkpoint = None
    #print('Building pretrained word2vector model...')
    embedding = nn.Embedding(
        300, hidden_size)  #The dimension of google's model is 300
    # An earlier experiment (removed here) replaced this embedding with a
    # pre-trained NGramLanguageModeler restored from pre_modelFile and required
    # hidden_size to be 300.
    if USE_CUDA:
        embedding = embedding.cuda()

    #replace embedding by pretrained_model
    print('Building encoder and decoder ...')
    encoder = EncoderRNN(300, hidden_size, embedding, n_layers)
    # NOTE(review): this overrides the attn_model argument, so callers cannot
    # currently select another attention type. Kept as-is because existing
    # checkpoints were produced with 'dot'.
    attn_model = 'dot'
    decoder = LuongAttnDecoderRNN(attn_model, embedding, hidden_size,
                                  voc.n_words, n_layers)
    if loadFilename:
        # Same device handling as runTest: without CUDA, remap the stored
        # tensors to the CPU so GPU-trained checkpoints can still be resumed.
        if USE_CUDA:
            checkpoint = torch.load(loadFilename)
        else:
            checkpoint = torch.load(loadFilename, map_location='cpu')
        encoder.load_state_dict(checkpoint['en'])
        decoder.load_state_dict(checkpoint['de'])
    # use cuda
    if USE_CUDA:
        encoder = encoder.cuda()
        decoder = decoder.cuda()

    # optimizer
    print('Building optimizers ...')
    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(),
                                   lr=learning_rate * decoder_learning_ratio)
    if loadFilename:
        encoder_optimizer.load_state_dict(checkpoint['en_opt'])
        decoder_optimizer.load_state_dict(checkpoint['de_opt'])

    # Load Google's pre-trained Word2Vec model.
    print('Loading w2v_model ...')
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    w2v_model = gensim.models.KeyedVectors.load_word2vec_format(pre_modelFile,
                                                                binary=True)
    print("Loading complete!")

    # initialize
    print('Initializing ...')
    start_iteration = 1
    perplexity = []  # per-iteration losses, despite the name
    print_loss = 0
    if loadFilename:
        start_iteration = checkpoint['iteration'] + 1
        perplexity = checkpoint['plt']

    # "<encoder layers>-<decoder layers>_<hidden size>"; both networks use n_layers.
    directory = os.path.join(
        save_dir, 'model', corpus_name,
        '{}-{}_{}'.format(n_layers, n_layers, hidden_size))

    for iteration in tqdm(range(start_iteration, n_iteration + 1)):
        training_batch = training_batches[iteration - 1]
        input_variable, lengths, target_variable, mask, max_target_len = training_batch

        loss = train(input_variable, lengths, target_variable, mask,
                     max_target_len, encoder, decoder, embedding,
                     encoder_optimizer, decoder_optimizer, batch_size,
                     w2v_model, voc)
        print_loss += loss
        perplexity.append(loss)

        if iteration % print_every == 0:
            # exp of the mean loss over the last print_every iterations
            print_loss_avg = math.exp(print_loss / print_every)
            print('%d %d%% %.4f' %
                  (iteration, iteration / n_iteration * 100, print_loss_avg))
            print_loss = 0

        if iteration % save_every == 0:
            os.makedirs(directory, exist_ok=True)
            torch.save(
                {
                    'iteration': iteration,
                    'en': encoder.state_dict(),
                    'de': decoder.state_dict(),
                    'en_opt': encoder_optimizer.state_dict(),
                    'de_opt': decoder_optimizer.state_dict(),
                    'loss': loss,
                    'plt': perplexity
                },
                os.path.join(
                    directory,
                    '{}_{}.tar'.format(iteration,
                                       filename(reverse,
                                                'backup_bidir_model'))))
Esempio n. 29
0
                    'de': decoder.state_dict(),
                    'en_opt': encoder_optimizer.state_dict(),
                    'de_opt': decoder_optimizer.state_dict(),
                    'loss': loss,
                    'plt': perplexity
                },
                os.path.join(
                    directory,
                    '{}_{}.tar'.format(iteration,
                                       filename(reverse,
                                                'backup_bidir_model'))))


if __name__ == "__main__":
    # Manual smoke test on a small corpus.
    corpus = "data/greeting.txt"
    voc, pairs = loadPrepareData(corpus)
    # Base name of the corpus file without its extension ("greeting").
    corpus_name = os.path.split(corpus)[-1].split('.')[0]
    # print(corpus_name)

    hidden_size = 768

    # Build the embedding lookup with concate_embedding (defined elsewhere) and
    # print its size plus the entries at keys/indices 3 and 0 as a sanity check.
    embedding_dict = concate_embedding(pairs, voc, hidden_size)
    print(len(embedding_dict))
    print(embedding_dict[3])
    print(embedding_dict[0])

    # Settings for the batch generation announced by the message below.
    print('Generating training batches...')
    n_iteration = 10
    batch_size = 16
    reverse = False
Esempio n. 30
0
def trainIters(corpus,
               reverse,
               n_iteration,
               learning_rate,
               batch_size,
               n_layers,
               hidden_size,
               print_every,
               save_every,
               loadFilename=None,
               attn_model='dot',
               decoder_learning_ratio=5.0):
    """Train the encoder/decoder pair for ``n_iteration`` iterations.

    One pre-built batch is consumed per iteration. The batches are cached
    under ``save_dir/training_data/<corpus_name>/``; checkpoints are written
    every ``save_every`` iterations under ``save_dir/model/<corpus_name>/``.

    Args:
        corpus: Path of the corpus file; its base name without extension
            becomes ``corpus_name``.
        reverse: Forwarded to ``batch2TrainData`` and ``filename``.
        n_iteration: Total number of training iterations (and of batches).
        learning_rate: Learning rate of the encoder's Adam optimizer.
        batch_size: Number of randomly drawn pairs per batch.
        n_layers: Passed to the encoder and the decoder.
        hidden_size: Embedding/hidden size of both networks.
        print_every: Progress-print interval, in iterations.
        save_every: Checkpoint interval, in iterations.
        loadFilename: Optional checkpoint to resume from (weights, optimizer
            states, iteration counter and loss history).
        attn_model: Accepted for interface compatibility; see the note in the
            body — the value is currently overridden with ``'dot'``.
        decoder_learning_ratio: Multiplier applied to ``learning_rate`` for
            the decoder's optimizer.

    Returns:
        None.
    """
    voc, pairs = loadPrepareData(corpus)

    # training data
    corpus_name = os.path.split(corpus)[-1].split('.')[0]
    batches_dir = os.path.join(save_dir, 'training_data', corpus_name)
    batches_file = os.path.join(
        batches_dir,
        '{}_{}_{}.tar'.format(n_iteration,
                              filename(reverse, 'training_batches'),
                              batch_size))
    try:
        training_batches = torch.load(batches_file)
    except Exception:
        # Deliberately broader than FileNotFoundError (any failed load of the
        # cache triggers a rebuild), but no longer BaseException, which also
        # swallowed KeyboardInterrupt and SystemExit.
        print('Training pairs not found, generating ...')
        training_batches = [
            batch2TrainData(voc,
                            [random.choice(pairs)
                             for _ in range(batch_size)], reverse)
            for _ in range(n_iteration)
        ]
        # The cache directory may not exist on a first run; without it the
        # save below fails right after the expensive generation step.
        os.makedirs(batches_dir, exist_ok=True)
        torch.save(training_batches, batches_file)

    # model
    checkpoint = None
    print('Building encoder and decoder ...')
    # Encoder and decoder share one embedding table.
    embedding = nn.Embedding(voc.n_words, hidden_size)
    encoder = EncoderRNN(voc.n_words, hidden_size, embedding, n_layers)
    # NOTE(review): this overrides the attn_model argument, so callers cannot
    # currently select another attention type. Kept as-is because existing
    # checkpoints were produced with 'dot'.
    attn_model = 'dot'
    decoder = LuongAttnDecoderRNN(attn_model, embedding, hidden_size,
                                  voc.n_words, n_layers)
    if loadFilename:
        # Without CUDA, remap the stored tensors to the CPU so that
        # GPU-trained checkpoints can still be resumed.
        if USE_CUDA:
            checkpoint = torch.load(loadFilename)
        else:
            checkpoint = torch.load(loadFilename, map_location='cpu')
        encoder.load_state_dict(checkpoint['en'])
        decoder.load_state_dict(checkpoint['de'])
    # use cuda
    if USE_CUDA:
        encoder = encoder.cuda()
        decoder = decoder.cuda()

    # optimizer
    print('Building optimizers ...')
    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(),
                                   lr=learning_rate * decoder_learning_ratio)
    if loadFilename:
        encoder_optimizer.load_state_dict(checkpoint['en_opt'])
        decoder_optimizer.load_state_dict(checkpoint['de_opt'])

    # initialize
    print('Initializing ...')
    start_iteration = 1
    perplexity = []
    print_loss = 0
    if loadFilename:
        start_iteration = checkpoint['iteration'] + 1
        perplexity = checkpoint['plt']

    # "<encoder layers>-<decoder layers>_<hidden size>"; both networks use n_layers.
    directory = os.path.join(
        save_dir, 'model', corpus_name,
        '{}-{}_{}'.format(n_layers, n_layers, hidden_size))

    for iteration in tqdm(range(start_iteration, n_iteration + 1)):
        training_batch = training_batches[iteration - 1]
        input_variable, lengths, target_variable, mask, max_target_len = training_batch

        loss = train(input_variable, lengths, target_variable, mask,
                     max_target_len, encoder, decoder, embedding,
                     encoder_optimizer, decoder_optimizer, batch_size)
        print_loss += loss
        perplexity.append(loss)

        if iteration % print_every == 0:
            # exp of the mean loss over the last print_every iterations
            print_loss_avg = math.exp(print_loss / print_every)
            # NOTE(review): the same list already receives the raw loss of
            # every iteration above, so it ends up mixing raw losses with
            # exp(mean loss) values. Left unchanged because the list is stored
            # in checkpoints under 'plt' — confirm which series is intended.
            perplexity.append(print_loss_avg)
            print('%d %d%% %.4f' %
                  (iteration, iteration / n_iteration * 100, print_loss_avg))
            print_loss = 0

        if iteration % save_every == 0:
            os.makedirs(directory, exist_ok=True)
            torch.save(
                {
                    'iteration': iteration,
                    'en': encoder.state_dict(),
                    'de': decoder.state_dict(),
                    'en_opt': encoder_optimizer.state_dict(),
                    'de_opt': decoder_optimizer.state_dict(),
                    'loss': loss,
                    'plt': perplexity
                },
                os.path.join(
                    directory,
                    '{}_{}.tar'.format(iteration,
                                       filename(reverse,
                                                'backup_bidir_model'))))
Esempio n. 31
0
def trainIters(corpus,
               reverse,
               n_epoch,
               learning_rate,
               batch_size,
               n_layers,
               hidden_size,
               print_every,
               loadFilename=None,
               attn_model='dot',
               decoder_learning_ratio=1.0):
    print(
        "corpus: {}, reverse={}, n_epoch={}, learning_rate={}, batch_size={}, n_layers={}, hidden_size={}, decoder_learning_ratio={}"
        .format(corpus, reverse, n_epoch, learning_rate, batch_size, n_layers,
                hidden_size, decoder_learning_ratio))

    voc, pairs, valid_pairs, test_pairs = loadPrepareData(corpus)
    print('load data...')

    path = "data/expansion"
    # training data
    corpus_name = corpus
    training_batches = None
    try:
        training_batches = torch.load(
            os.path.join(
                save_dir, path,
                '{}_{}.tar'.format(filename(reverse, 'training_batches'),
                                   batch_size)))
    except FileNotFoundError:
        print('Training pairs not found, generating ...')
        training_batches = batchify(pairs, batch_size, voc, reverse)
        print('Complete building training pairs ...')
        torch.save(
            training_batches,
            os.path.join(
                save_dir, path,
                '{}_{}.tar'.format(filename(reverse, 'training_batches'),
                                   batch_size)))

    # validation/test data
    eval_batch_size = 10
    try:
        val_batches = torch.load(
            os.path.join(
                save_dir, path,
                '{}_{}.tar'.format(filename(reverse, 'val_batches'),
                                   eval_batch_size)))
    except FileNotFoundError:
        print('Validation pairs not found, generating ...')
        val_batches = batchify(valid_pairs,
                               eval_batch_size,
                               voc,
                               reverse,
                               evaluation=True)
        print('Complete building validation pairs ...')
        torch.save(
            val_batches,
            os.path.join(
                save_dir, path,
                '{}_{}.tar'.format(filename(reverse, 'val_batches'),
                                   eval_batch_size)))

    try:
        test_batches = torch.load(
            os.path.join(
                save_dir, path,
                '{}_{}.tar'.format(filename(reverse, 'test_batches'),
                                   eval_batch_size)))
    except FileNotFoundError:
        print('Test pairs not found, generating ...')
        test_batches = batchify(test_pairs,
                                eval_batch_size,
                                voc,
                                reverse,
                                evaluation=True)
        print('Complete building test pairs ...')
        torch.save(
            test_batches,
            os.path.join(
                save_dir, path,
                '{}_{}.tar'.format(filename(reverse, 'test_batches'),
                                   eval_batch_size)))

    # model
    checkpoint = None
    print('Building encoder and decoder ...')
    # aspect
    with open(os.path.join(save_dir, '15_aspect.pkl'), 'rb') as fp:
        aspect_ids = pickle.load(fp)
    aspect_num = 15  # 15 | 20 main aspects and each of them has 100 words
    aspect_ids = Variable(
        torch.LongTensor(aspect_ids), requires_grad=False
    )  # convert list into torch Variable, used to index word embedding
    # attribute embeddings
    attr_size = 64  #
    attr_num = 2

    print(
        "corpus: {}, reverse={}, n_words={}, n_epoch={}, learning_rate={}, batch_size={}, n_layers={}, hidden_size={}, decoder_learning_ratio={}, attr_size={}, aspect_num={}"
        .format(corpus, reverse, voc.n_words, n_epoch, learning_rate,
                batch_size, n_layers, hidden_size, decoder_learning_ratio,
                attr_size, aspect_num))
    with open(os.path.join(save_dir, 'user_item.pkl'), 'rb') as fp:
        user_dict, item_dict = pickle.load(fp)
    num_user = len(user_dict)
    num_item = len(item_dict)
    attr_embeddings = []
    attr_embeddings.append(nn.Embedding(num_user, attr_size))
    attr_embeddings.append(nn.Embedding(num_item, attr_size))
    aspect_embeddings = []
    aspect_embeddings.append(nn.Embedding(num_user, aspect_num))
    aspect_embeddings.append(nn.Embedding(num_item, aspect_num))
    if USE_CUDA:
        for attr_embedding in attr_embeddings:
            attr_embedding = attr_embedding.cuda()
        for aspect_embedding in aspect_embeddings:
            aspect_embedding = aspect_embedding.cuda()
        aspect_ids = aspect_ids.cuda()

    encoder1 = AttributeEncoder(attr_size, attr_num, hidden_size,
                                attr_embeddings, n_layers)
    encoder2 = AttributeEncoder(aspect_num, attr_num, hidden_size,
                                aspect_embeddings, n_layers)
    embedding = nn.Embedding(voc.n_words, hidden_size)
    encoder3 = EncoderRNN(voc.n_words, hidden_size, embedding, n_layers)
    attn_model = 'dot'
    decoder = LuongAttnDecoderRNN(attn_model, embedding, hidden_size,
                                  attr_size, voc.n_words, aspect_ids, n_layers)
    if loadFilename:
        checkpoint = torch.load(loadFilename)
        encoder1.load_state_dict(checkpoint['en1'])
        encoder2.load_state_dict(checkpoint['en2'])
        encoder3.load_state_dict(checkpoint['en3'])
        decoder.load_state_dict(checkpoint['de'])
    # use cuda
    if USE_CUDA:
        encoder1 = encoder1.cuda()
        encoder2 = encoder2.cuda()
        encoder3 = encoder3.cuda()
        decoder = decoder.cuda()

    # optimizer
    print('Building optimizers ...')
    encoder1_optimizer = optim.Adam(encoder1.parameters(), lr=learning_rate)
    encoder2_optimizer = optim.Adam(encoder2.parameters(), lr=learning_rate)
    encoder3_optimizer = optim.Adam(encoder3.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(),
                                   lr=learning_rate * decoder_learning_ratio)
    if loadFilename:
        encoder1_optimizer.load_state_dict(checkpoint['en1_opt'])
        encoder2_optimizer.load_state_dict(checkpoint['en2_opt'])
        encoder3_optimizer.load_state_dict(checkpoint['en3_opt'])
        decoder_optimizer.load_state_dict(checkpoint['de_opt'])

    # initialize
    print('Initializing ...')
    start_epoch = 0
    perplexity = []
    best_val_loss = None
    print_loss = 0
    if loadFilename:
        start_epoch = checkpoint['epoch'] + 1
        perplexity = checkpoint['plt']

    for epoch in range(start_epoch, n_epoch):
        epoch_start_time = time.time()
        # train epoch
        encoder1.train()
        encoder2.train()
        encoder3.train()
        decoder.train()
        print_loss = 0
        start_time = time.time()
        for batch, training_batch in enumerate(training_batches):
            attr_input, summary_input, summary_input_lengths, title_input, title_input_lengths, target_variable, mask, max_target_len = training_batch

            loss = train(attr_input, summary_input, summary_input_lengths,
                         title_input, title_input_lengths, target_variable,
                         mask, max_target_len, encoder1, encoder2, encoder3,
                         decoder, embedding, encoder1_optimizer,
                         encoder2_optimizer, encoder3_optimizer,
                         decoder_optimizer, batch_size)
            print_loss += loss
            perplexity.append(loss)
            #print("batch {} loss={}".format(batch, loss))
            if batch % print_every == 0 and batch > 0:
                cur_loss = print_loss / print_every
                elapsed = time.time() - start_time

                print(
                    '| epoch {:3d} | {:5d}/{:5d} batches | lr {:05.5f} | ms/batch {:5.2f} | '
                    'loss {:5.2f} | ppl {:8.2f}'.format(
                        epoch, batch, len(training_batches), learning_rate,
                        elapsed * 1000 / print_every, cur_loss,
                        math.exp(cur_loss)))

                print_loss = 0
                start_time = time.time()
        # evaluate
        val_loss = 0
        for val_batch in val_batches:
            attr_input, summary_input, summary_input_lengths, title_input, title_input_lengths, target_variable, mask, max_target_len = val_batch
            loss = evaluate(attr_input, summary_input, summary_input_lengths,
                            title_input, title_input_lengths, target_variable,
                            mask, max_target_len, encoder1, encoder2, encoder3,
                            decoder, embedding, encoder1_optimizer,
                            encoder2_optimizer, encoder3_optimizer,
                            decoder_optimizer, batch_size)
            val_loss += loss
        val_loss /= len(val_batches)

        print('-' * 89)
        print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
              'valid ppl {:8.2f}'.format(epoch,
                                         (time.time() - epoch_start_time),
                                         val_loss, math.exp(val_loss)))
        print('-' * 89)
        # Save the model if the validation loss is the best we've seen so far.
        if not best_val_loss or val_loss < best_val_loss:
            directory = os.path.join(save_dir, 'model',
                                     '{}_{}'.format(n_layers, hidden_size))
            if not os.path.exists(directory):
                os.makedirs(directory)
            torch.save(
                {
                    'epoch': epoch,
                    'en1': encoder1.state_dict(),
                    'en2': encoder2.state_dict(),
                    'en3': encoder3.state_dict(),
                    'de': decoder.state_dict(),
                    'en1_opt': encoder1_optimizer.state_dict(),
                    'en2_opt': encoder2_optimizer.state_dict(),
                    'en3_opt': encoder3_optimizer.state_dict(),
                    'de_opt': decoder_optimizer.state_dict(),
                    'loss': loss,
                    'plt': perplexity
                },
                os.path.join(
                    directory, '{}_{}.tar'.format(
                        epoch,
                        filename(reverse, 'lexicon_title_expansion_model'))))
            best_val_loss = val_loss

            # Run on test data.
            test_loss = 0
            for test_batch in test_batches:
                attr_input, summary_input, summary_input_lengths, title_input, title_input_lengths, target_variable, mask, max_target_len = test_batch
                loss = evaluate(attr_input, summary_input,
                                summary_input_lengths, title_input,
                                title_input_lengths, target_variable, mask,
                                max_target_len, encoder1, encoder2, encoder3,
                                decoder, embedding, encoder1_optimizer,
                                encoder2_optimizer, encoder3_optimizer,
                                decoder_optimizer, batch_size)
                test_loss += loss
            test_loss /= len(test_batches)
            print('-' * 89)
            print('| test loss {:5.2f} | test ppl {:8.2f}'.format(
                test_loss, math.exp(test_loss)))
            print('-' * 89)

        if val_loss > best_val_loss:
            break