def test(opt):
    """Build a greedy-search decoder from a saved checkpoint.

    Args:
        opt: Configuration object. This function reads ``opt.model_ckpt``
            (checkpoint path) and ``opt.device``; ``opt`` is also handed to
            ``get_dataloader``, ``EncoderRNN`` and ``LuongAttnDecoderRNN``.

    Returns:
        tuple: ``(searcher, sos, eos, unknown, word2ix, ix2word)``. The three
        special-token values are looked up with ``word2ix.get`` and are
        therefore ``None`` when the token is not in the vocabulary.

    Raises:
        ValueError: If ``opt.model_ckpt`` is ``None``.
    """
    # Fail fast: without a checkpoint there is nothing to evaluate, so do not
    # pay for building the data loader and the models first.
    if opt.model_ckpt is None:
        raise ValueError('model_ckpt is None.')

    # Data
    dataloader = get_dataloader(opt)
    _data = dataloader.dataset._data
    word2ix, ix2word = _data['word2ix'], _data['ix2word']
    sos = word2ix.get(_data.get('sos'))
    eos = word2ix.get(_data.get('eos'))
    unknown = word2ix.get(_data.get('unknown'))
    voc_length = len(word2ix)

    # Define the models
    encoder = EncoderRNN(opt, voc_length)
    decoder = LuongAttnDecoderRNN(opt, voc_length)

    # Load the weights; the map_location callable keeps every storage on CPU.
    checkpoint = torch.load(opt.model_ckpt,
                            map_location=lambda storage, loc: storage)
    encoder.load_state_dict(checkpoint['en'])
    decoder.load_state_dict(checkpoint['de'])

    # NOTE: no_grad only covers the setup below; it is no longer active once
    # this function has returned the searcher.
    with torch.no_grad():
        # Move to the target device and switch to evaluation mode
        encoder = encoder.to(opt.device)
        decoder = decoder.to(opt.device)
        encoder.eval()
        decoder.eval()
        # Define the searcher
        searcher = GreedySearchDecoder(encoder, decoder)
        return searcher, sos, eos, unknown, word2ix, ix2word
Example #2
0
def runTest(n_layers, hidden_size, reverse, modelFile, beam_size, inp, corpus):
    """Restore a saved encoder/decoder pair and evaluate it.

    Args:
        n_layers: Layer count passed to both the encoder and the decoder.
        hidden_size: Embedding dimension and hidden size of both models.
        reverse: Forwarded to ``evaluateRandomly``.
        modelFile: Path of the checkpoint holding the ``'en'`` and ``'de'``
            state dicts.
        beam_size: Forwarded to the evaluation helper.
        inp: If truthy, run ``evaluateInput``; otherwise run
            ``evaluateRandomly`` with a final argument of 20.
        corpus: Passed to ``loadPrepareData``.
    """
    # Autograd is switched off globally and is not re-enabled on return.
    torch.set_grad_enabled(False)

    voc, pairs = loadPrepareData(corpus)

    # One embedding instance is handed to both the encoder and the decoder.
    shared_embedding = nn.Embedding(voc.num_words, hidden_size)
    encoder = EncoderRNN(hidden_size, shared_embedding, n_layers)
    decoder = LuongAttnDecoderRNN('dot', shared_embedding, hidden_size,
                                  voc.num_words, n_layers)

    # The map_location callable returns the storage unchanged (stays on CPU).
    saved_state = torch.load(modelFile,
                             map_location=lambda storage, loc: storage)
    encoder.load_state_dict(saved_state['en'])
    decoder.load_state_dict(saved_state['de'])

    # Leave training mode (only matters for dropout / batch-norm layers).
    for module in (encoder, decoder):
        module.train(False)

    encoder, decoder = encoder.to(device), decoder.to(device)

    if not inp:
        evaluateRandomly(encoder, decoder, voc, pairs, reverse, beam_size, 20)
        return
    evaluateInput(encoder, decoder, voc, beam_size)
Example #3
0
def eval():
    """Generate a reply for every input batch read from ``./test/test.txt``.

    All settings come from a fresh ``Config()``; the vocabulary is read from
    ``./data/voc.pkl``.

    NOTE(review): the name shadows the builtin ``eval``; it is kept because
    callers refer to this function by that name.

    Returns:
        list: The ``token`` value returned by ``generate`` for each input
        batch, in input order.

    Raises:
        ValueError: If ``Config().model_ckpt`` is ``None``.
    """
    parameter = Config()

    # Fail fast: nothing below is useful without a checkpoint to load.
    loadFilename = parameter.model_ckpt
    if loadFilename is None:
        raise ValueError('model_ckpt is None.')

    # Load the parameters that are actually used here
    pretrained_embedding_path = parameter.pretrained_embedding_path
    dropout = parameter.dropout
    hidden_size = parameter.hidden_size
    num_layers = parameter.num_layers
    attn_model = parameter.method

    # Load the vocabulary and the embedding
    voc = read_voc_file('./data/voc.pkl')
    embedding = get_weight(voc, pretrained_embedding_path)

    # Inputs (the lengths returned alongside the batches are not needed here)
    inputs = get_input_line('./test/test.txt')
    input_batches, _ = get_batch_id(inputs)

    encoder = EncoderRNN(hidden_size, embedding, num_layers, dropout)
    decoder = LuongAttnDecoderRNN(attn_model, embedding, hidden_size,
                                  len(voc), num_layers, dropout)

    # The map_location callable keeps every storage on CPU while loading.
    checkpoint = torch.load(loadFilename,
                            map_location=lambda storage, loc: storage)
    print(checkpoint['plt'])
    encoder.load_state_dict(checkpoint['en'])
    decoder.load_state_dict(checkpoint['de'])

    answer = []
    with torch.no_grad():
        encoder.to(device)
        decoder.to(device)
        # Switch to evaluation mode
        encoder.eval()
        decoder.eval()
        search = GreedySearchDecoder(encoder, decoder)
        for input_batch in input_batches:
            # The score returned with the tokens is not used.
            token, _ = generate(input_batch, search, GO_ID, EOS_ID, device)
            print(token)
            answer.append(token)
        print(answer)
    return answer
def eval(**kwargs):
    """Run an interactive chat loop against a saved model.

    Reads sentences from standard input until the user types ``q`` or
    ``quit`` and prints the generated reply for each one.

    NOTE(review): the name shadows the builtin ``eval``; it is kept because
    callers refer to this function by that name.

    Args:
        **kwargs: Attribute overrides applied to a fresh ``Config()`` with
            ``setattr`` before anything else runs.

    Raises:
        ValueError: If the resulting ``opt.model_ckpt`` is ``None``.
    """
    opt = Config()
    for k, v in kwargs.items():  # apply the overrides
        setattr(opt, k, v)

    # Fail fast: there is nothing to chat with unless a checkpoint is given.
    if opt.model_ckpt is None:
        raise ValueError('model_ckpt is None.')

    # Data
    dataloader = get_dataloader(opt)
    _data = dataloader.dataset._data
    word2ix, ix2word = _data['word2ix'], _data['ix2word']
    sos = word2ix.get(_data.get('sos'))
    eos = word2ix.get(_data.get('eos'))
    unknown = word2ix.get(_data.get('unknown'))
    voc_length = len(word2ix)

    # Define the models
    encoder = EncoderRNN(opt, voc_length)
    decoder = LuongAttnDecoderRNN(opt, voc_length)

    # Load the weights; the map_location callable keeps storages on CPU.
    checkpoint = torch.load(opt.model_ckpt,
                            map_location=lambda storage, loc: storage)
    encoder.load_state_dict(checkpoint['en'])
    decoder.load_state_dict(checkpoint['de'])

    # Compiled once instead of on every turn. The pattern is kept exactly as
    # it was: inside the character class only the first '^' negates, so a
    # literal '^' in the input is preserved along with CJK, letters, digits.
    cop = re.compile("[^\u4e00-\u9fa5^a-z^A-Z^0-9]")

    with torch.no_grad():
        # Move to the target device and switch to evaluation mode
        encoder = encoder.to(opt.device)
        decoder = decoder.to(opt.device)
        encoder.eval()
        decoder.eval()
        # Define the searcher
        searcher = GreedySearchDecoder(encoder, decoder)

        while True:
            input_sentence = input('> ')
            if input_sentence in ('q', 'quit'):
                break
            # Strip unwanted characters, then segment into words
            input_seq = jieba.lcut(cop.sub("", input_sentence))
            # Truncate to the configured length and append the end marker
            input_seq = input_seq[:opt.max_input_length] + ['</EOS>']
            # Out-of-vocabulary words map to the ``unknown`` index
            input_seq = [word2ix.get(word, unknown) for word in input_seq]
            tokens = generate(input_seq, searcher, sos, eos, opt)
            output_words = ''.join([ix2word[token.item()] for token in tokens])
            print('BOT: ', output_words)
def train(**kwargs):
    """Train the encoder/decoder pair, optionally resuming from a checkpoint.

    Args:
        **kwargs: Attribute overrides applied to a fresh ``Config()`` with
            ``setattr``. Attributes read here: ``model_ckpt``, ``device``,
            ``learning_rate``, ``decoder_learning_ratio``, ``epoch``,
            ``print_every``, ``save_every`` and ``prefix``.

    Side effects:
        Prints a running average loss and, every ``opt.save_every`` epochs,
        writes a checkpoint named ``'{prefix}_{time}'`` with the model and
        optimizer state dicts under ``'en'``, ``'de'``, ``'en_opt'`` and
        ``'de_opt'``.
    """
    opt = Config()
    for k, v in kwargs.items():  # apply the overrides
        setattr(opt, k, v)

    # Data
    dataloader = get_dataloader(opt)
    _data = dataloader.dataset._data
    word2ix = _data['word2ix']
    sos = word2ix.get(_data.get('sos'))
    voc_length = len(word2ix)

    # Define the models
    encoder = EncoderRNN(opt, voc_length)
    decoder = LuongAttnDecoderRNN(opt, voc_length)

    # Resume from a checkpoint if one is configured
    checkpoint = None
    if opt.model_ckpt:
        checkpoint = torch.load(opt.model_ckpt)
        encoder.load_state_dict(checkpoint['en'])
        decoder.load_state_dict(checkpoint['de'])

    # Move to the target device and switch to training mode
    encoder = encoder.to(opt.device)
    decoder = decoder.to(opt.device)
    encoder.train()
    decoder.train()

    # Optimizers are created after the models were moved to the device
    # (the original comment warns not to swap that order).
    encoder_optimizer = torch.optim.Adam(encoder.parameters(),
                                         lr=opt.learning_rate)
    decoder_optimizer = torch.optim.Adam(decoder.parameters(),
                                         lr=opt.learning_rate *
                                         opt.decoder_learning_ratio)
    if checkpoint is not None:
        encoder_optimizer.load_state_dict(checkpoint['en_opt'])
        decoder_optimizer.load_state_dict(checkpoint['de_opt'])

    # Running loss and the number of batches it covers. The count is tracked
    # explicitly because a report does not always cover ``print_every``
    # batches: the one at ``ii == 0`` covers a single batch in the first
    # epoch and, later, the tail of the previous epoch plus one batch.
    print_loss = 0
    batches_since_print = 0

    for epoch in range(opt.epoch):
        for ii, data in enumerate(dataloader):
            # Train on one batch
            loss = train_by_batch(sos, opt, data, encoder_optimizer,
                                  decoder_optimizer, encoder, decoder)
            print_loss += loss
            batches_since_print += 1
            # Report the loss
            if ii % opt.print_every == 0:
                print_loss_avg = print_loss / batches_since_print
                print(
                    "Epoch: {}; Epoch Percent complete: {:.1f}%; Average loss: {:.4f}"
                    .format(epoch, epoch / opt.epoch * 100, print_loss_avg))
                print_loss = 0
                batches_since_print = 0

        # Save a checkpoint
        if epoch % opt.save_every == 0:
            checkpoint_path = '{prefix}_{time}'.format(
                prefix=opt.prefix, time=time.strftime('%m%d_%H%M'))
            torch.save(
                {
                    'en': encoder.state_dict(),
                    'de': decoder.state_dict(),
                    'en_opt': encoder_optimizer.state_dict(),
                    'de_opt': decoder_optimizer.state_dict(),
                }, checkpoint_path)
Example #6
0
def trainIters(corpus,
               reverse,
               n_iteration,
               learning_rate,
               batch_size,
               n_layers,
               hidden_size,
               print_every,
               save_every,
               dropout,
               loadFilename=None,
               attn_model='dot',
               decoder_learning_ratio=5.0):
    """Train for ``n_iteration`` batches, logging to ``log.txt``.

    Args:
        corpus: Corpus path; passed to ``loadPrepareData`` and its base name
            (without extension) is used in the output paths.
        reverse: Forwarded to ``batch2TrainData`` and ``filename``.
        n_iteration: Total number of iterations (one batch each).
        learning_rate: Encoder learning rate.
        batch_size: Number of pairs sampled per batch.
        n_layers: Layer count for both the encoder and the decoder.
        hidden_size: Embedding dimension and hidden size.
        print_every: Log interval, in iterations.
        save_every: Checkpoint interval, in iterations.
        dropout: Dropout value passed to both models.
        loadFilename: Optional checkpoint path to resume from.
        attn_model: NOTE(review): currently ignored; the body always uses
            ``'dot'``. Confirm whether the override is intentional.
        decoder_learning_ratio: Multiplier applied to ``learning_rate`` for
            the decoder optimizer.

    Side effects:
        Appends progress lines to ``log.txt`` and writes checkpoints under
        ``save_dir/model/<corpus_name>/``.
    """
    import time  # only needed for the log timestamps

    # voc is the string-to-index dictionary; pairs are the dialogue pairs
    # still to be converted.
    voc, pairs = loadPrepareData(corpus)

    # Training data
    corpus_name = os.path.split(corpus)[-1].split('.')[0]
    # TODO: no epochs are used here. Each iteration gets one batch built by
    # batch2TrainData from ``batch_size`` randomly sampled pairs, for
    # ``n_iteration`` batches in total; this is flagged for rework.
    batches_path = os.path.join(
        save_dir, 'training_data', corpus_name,
        '{}_{}_{}.tar'.format(n_iteration,
                              filename(reverse, 'training_batches'),
                              batch_size))
    try:
        training_batches = torch.load(batches_path)
    except FileNotFoundError:
        print('Training pairs not found, generating ...')
        training_batches = [
            batch2TrainData(voc,
                            [random.choice(pairs)
                             for _ in range(batch_size)], reverse)
            for _ in range(n_iteration)
        ]

    # Model
    checkpoint = None
    print('Building encoder and decoder ...')

    embedding = nn.Embedding(voc.n_words, hidden_size)
    encoder = EncoderRNN(voc.n_words, hidden_size, embedding, n_layers,
                         dropout)
    # Overrides the ``attn_model`` argument (see the docstring note).
    attn_model = 'dot'
    decoder = LuongAttnDecoderRNN(attn_model, embedding, hidden_size,
                                  voc.n_words, n_layers, dropout)
    if loadFilename:
        checkpoint = torch.load(loadFilename)
        encoder.load_state_dict(checkpoint['en'])
        decoder.load_state_dict(checkpoint['de'])
    # Move to the target device (nn.DataParallel wrapping was left disabled).
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Optimizers
    print('Building optimizers ...')
    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(),
                                   lr=learning_rate * decoder_learning_ratio)
    if loadFilename:
        encoder_optimizer.load_state_dict(checkpoint['en_opt'])
        decoder_optimizer.load_state_dict(checkpoint['de_opt'])

    # Initialize the loop state, resuming from the checkpoint if present
    print('Initializing ...')
    start_iteration = 1
    perplexity = []
    print_loss = 0
    if loadFilename:
        start_iteration = checkpoint['iteration'] + 1
        perplexity = checkpoint['plt']

    for iteration in range(start_iteration, n_iteration + 1):
        training_batch = training_batches[iteration - 1]
        input_variable, lengths, target_variable, mask, max_target_len = training_batch

        loss = train(input_variable, lengths, target_variable, mask,
                     max_target_len, encoder, decoder, embedding,
                     encoder_optimizer, decoder_optimizer, batch_size)
        print_loss += loss
        perplexity.append(loss)

        if iteration % print_every == 0:
            # exp of the mean loss over the last ``print_every`` iterations
            print_loss_avg = math.exp(print_loss / print_every)
            template = ' Iter: {:0>6d} process: {:.2f} avg_loss: {:.4f} time: {}\n'
            log_line = template.format(
                iteration, iteration / n_iteration * 100, print_loss_avg,
                time.asctime(time.localtime(time.time())))
            with open('log.txt', 'a') as f:
                f.write(log_line)
            print_loss = 0

        if iteration % save_every == 0:
            directory = os.path.join(
                save_dir, 'model', corpus_name,
                '{}-{}_{}'.format(n_layers, n_layers, hidden_size))
            # exist_ok avoids the check-then-create race
            os.makedirs(directory, exist_ok=True)
            torch.save(
                {
                    'iteration': iteration,
                    'en': encoder.state_dict(),
                    'de': decoder.state_dict(),
                    'en_opt': encoder_optimizer.state_dict(),
                    'de_opt': decoder_optimizer.state_dict(),
                    'loss': loss,
                    'plt': perplexity
                },
                os.path.join(
                    directory,
                    '{}_{}.tar'.format(iteration,
                                       filename(reverse,
                                                'backup_bidir_model'))))
Example #7
0
def trainIters(corpus,
               reverse,
               n_iteration,
               learning_rate,
               batch_size,
               n_layers,
               hidden_size,
               print_every,
               save_every,
               dropout,
               loadFilename=None,
               attn_model='dot',
               decoder_learning_ratio=5.0):
    """Train for ``n_iteration`` batches, caching the sampled batches on disk.

    Args:
        corpus: Corpus path; passed to ``loadPrepareData`` and its base name
            (without extension) is used in the output paths.
        reverse: Forwarded to ``batch2TrainData`` and ``filename``.
        n_iteration: Total number of iterations (one batch each).
        learning_rate: Encoder learning rate.
        batch_size: Number of pairs sampled per batch.
        n_layers: Layer count for both the encoder and the decoder.
        hidden_size: Embedding dimension and hidden size.
        print_every: Report interval, in iterations.
        save_every: Checkpoint interval, in iterations.
        dropout: Dropout value passed to both models.
        loadFilename: Optional checkpoint path to resume from.
        attn_model: NOTE(review): currently ignored; the body always uses
            ``'dot'``. Confirm whether the override is intentional.
        decoder_learning_ratio: Multiplier applied to ``learning_rate`` for
            the decoder optimizer.

    Side effects:
        Writes the batch cache under ``save_dir/training_data/<corpus_name>/``
        and checkpoints under ``save_dir/model/<corpus_name>/``; prints
        progress.
    """
    voc, pairs = loadPrepareData(corpus)

    # Training data: reuse the cached batches when present, else sample them.
    corpus_name = os.path.split(corpus)[-1].split('.')[0]
    batches_path = os.path.join(
        save_dir, 'training_data', corpus_name,
        '{}_{}_{}.tar'.format(n_iteration,
                              filename(reverse, 'training_batches'),
                              batch_size))
    try:
        training_batches = torch.load(batches_path)
    except FileNotFoundError:
        print('Training pairs not found, generating ...')
        training_batches = [
            batch2TrainData(voc,
                            [random.choice(pairs)
                             for _ in range(batch_size)], reverse)
            for _ in range(n_iteration)
        ]
        # The cache directory may not exist yet; without this the save below
        # fails on the first run.
        os.makedirs(os.path.dirname(batches_path), exist_ok=True)
        torch.save(training_batches, batches_path)

    # Model
    checkpoint = None
    print('Building encoder and decoder ...')
    embedding = nn.Embedding(voc.n_words, hidden_size)
    encoder = EncoderRNN(voc.n_words, hidden_size, embedding, n_layers,
                         dropout)
    # Overrides the ``attn_model`` argument (see the docstring note).
    attn_model = 'dot'
    decoder = LuongAttnDecoderRNN(attn_model, embedding, hidden_size,
                                  voc.n_words, n_layers, dropout)
    if loadFilename:
        checkpoint = torch.load(loadFilename)
        encoder.load_state_dict(checkpoint['en'])
        decoder.load_state_dict(checkpoint['de'])
    # Move to the target device
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Optimizers
    print('Building optimizers ...')
    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(),
                                   lr=learning_rate * decoder_learning_ratio)
    if loadFilename:
        encoder_optimizer.load_state_dict(checkpoint['en_opt'])
        decoder_optimizer.load_state_dict(checkpoint['de_opt'])

    # Initialize the loop state, resuming from the checkpoint if present
    print('Initializing ...')
    start_iteration = 1
    perplexity = []
    print_loss = 0
    if loadFilename:
        start_iteration = checkpoint['iteration'] + 1
        perplexity = checkpoint['plt']

    for iteration in tqdm(range(start_iteration, n_iteration + 1)):
        training_batch = training_batches[iteration - 1]
        input_variable, lengths, target_variable, mask, max_target_len = training_batch

        loss = train(input_variable, lengths, target_variable, mask,
                     max_target_len, encoder, decoder, embedding,
                     encoder_optimizer, decoder_optimizer, batch_size)
        print_loss += loss
        perplexity.append(loss)

        if iteration % print_every == 0:
            # exp of the mean loss over the last ``print_every`` iterations
            print_loss_avg = math.exp(print_loss / print_every)
            print('%d %d%% %.4f' %
                  (iteration, iteration / n_iteration * 100, print_loss_avg))
            print_loss = 0

        if iteration % save_every == 0:
            directory = os.path.join(
                save_dir, 'model', corpus_name,
                '{}-{}_{}'.format(n_layers, n_layers, hidden_size))
            # exist_ok avoids the check-then-create race
            os.makedirs(directory, exist_ok=True)
            torch.save(
                {
                    'iteration': iteration,
                    'en': encoder.state_dict(),
                    'de': decoder.state_dict(),
                    'en_opt': encoder_optimizer.state_dict(),
                    'de_opt': decoder_optimizer.state_dict(),
                    'loss': loss,
                    'plt': perplexity
                },
                os.path.join(
                    directory,
                    '{}_{}.tar'.format(iteration,
                                       filename(reverse,
                                                'backup_bidir_model'))))
Example #8
0
# Script fragment: builds the embedding, encoder and decoder, optionally
# restores their saved state, and sets the training hyper-parameters.
# It relies on names bound earlier in the file (voc, hidden_size,
# loadFilename, embedding_sd, encoder_sd, decoder_sd, attn_model, dropout,
# encoder_n_layers, decoder_n_layers, device).
print('Building encoder and decoder ...')
# Initialize the word embeddings
embedding = nn.Embedding(voc.num_words, hidden_size)
if loadFilename:
    embedding.load_state_dict(embedding_sd)
# Initialize the encoder & decoder models
encoder = EncoderRNN(hidden_size, embedding, encoder_n_layers, dropout)
decoder = LuongAttnDecoderRNN(attn_model, embedding, hidden_size,
                              voc.num_words, decoder_n_layers, dropout)
if loadFilename:
    encoder.load_state_dict(encoder_sd)
    decoder.load_state_dict(decoder_sd)
# Use the appropriate device
encoder = encoder.to(device)
decoder = decoder.to(device)
print('Models built and ready to go!')

# step8: do train
# Configure training / optimization
clip = 50.0
teacher_forcing_ratio = 1.0
learning_rate = 0.0001
decoder_learning_ratio = 5.0
n_iteration = 4000
print_every = 1
save_every = 500

# Make sure the dropout layers are in training mode
encoder.train()
decoder.train()
Example #9
0
def train():
    """Train the encoder/decoder pair using settings from ``Config()``.

    The training batches are loaded from a cache file under ``save_dir`` when
    it exists; otherwise they are generated with ``get_batch`` and cached.
    Training resumes from ``Config().model_ckpt`` when that value is truthy.

    Side effects:
        Creates or truncates ``record.txt``, prints progress, and writes
        checkpoints under ``save_dir/model/<model_name>/`` every
        ``save_every`` iterations.
    """
    parameter = Config()
    model_name = parameter.model_name
    save_dir = parameter.save_dir
    loadFilename = parameter.model_ckpt

    pretrained_embedding_path = parameter.pretrained_embedding_path
    max_input_length = parameter.max_input_length
    max_generate_length = parameter.max_generate_length
    embedding_dim = parameter.embedding_dim
    batch_size = parameter.batch_size
    hidden_size = parameter.hidden_size
    attn_model = parameter.method
    dropout = parameter.dropout
    clip = parameter.clip
    num_layers = parameter.num_layers

    learning_rate = parameter.learning_rate
    teacher_forcing_ratio = parameter.teacher_forcing_ratio
    decoder_learning_ratio = parameter.decoder_learning_ratio
    n_iteration = parameter.epoch
    print_every = parameter.print_every
    save_every = parameter.save_every
    print(max_input_length,max_generate_length)

    # Data
    voc = read_voc_file()  # read the vocabulary from the saved vocabulary file
    print(voc)
    pairs = get_pairs()

    batches_path = os.path.join(
        save_dir,
        '{}_{}_{}.tar'.format(n_iteration, 'training_batches', batch_size))
    try:
        training_batches = torch.load(batches_path)
    except FileNotFoundError:
        training_batches = [
            get_batch(voc, batch_size, pairs, max_input_length,
                      max_generate_length) for _ in range(n_iteration)
        ]
        # Make sure the cache directory exists before saving into it.
        batches_dir = os.path.dirname(batches_path)
        if batches_dir:
            os.makedirs(batches_dir, exist_ok=True)
        torch.save(training_batches, batches_path)

    # Model
    checkpoint = None
    print('Building encoder and decoder ...')
    if pretrained_embedding_path is None:
        embedding = nn.Embedding(len(voc), embedding_dim)
    else:
        embedding = get_weight(voc, pretrained_embedding_path, embedding_dim)
    print('embedding加载完成')
    encoder = EncoderRNN(hidden_size, embedding, num_layers, dropout)
    decoder = LuongAttnDecoderRNN(attn_model, embedding, hidden_size,
                                  len(voc), num_layers, dropout)
    if loadFilename:
        checkpoint = torch.load(loadFilename)
        encoder.load_state_dict(checkpoint['en'])
        decoder.load_state_dict(checkpoint['de'])
    # Move to the target device
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Optimizers
    print('Building optimizers ...')
    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(),
                                   lr=learning_rate * decoder_learning_ratio)
    if loadFilename:
        encoder_optimizer.load_state_dict(checkpoint['en_opt'])
        decoder_optimizer.load_state_dict(checkpoint['de_opt'])

    # Initialize the loop state, resuming from the checkpoint if present
    print('Initializing ...')
    start_iteration = 1
    perplexity = []
    print_loss = 0
    if loadFilename:
        start_iteration = checkpoint['iteration'] + 1
        perplexity = checkpoint['plt']

    # The original opened record.txt for writing and never wrote to it or
    # closed it. Keep the create/truncate side effect, but release the handle.
    with open('record.txt', 'w', encoding='utf-8'):
        pass

    for iteration in tqdm(range(start_iteration, n_iteration + 1)):
        training_batch = training_batches[iteration - 1]
        input_variable, lengths, target_variable, mask, max_target_len = training_batch
        loss = train_by_batch(input_variable, lengths, target_variable, mask,
                              max_target_len, encoder, decoder, embedding,
                              encoder_optimizer, decoder_optimizer,
                              batch_size, clip, teacher_forcing_ratio)
        print_loss += loss
        perplexity.append(loss)

        if iteration % print_every == 0:
            # exp of the mean loss over the last ``print_every`` iterations
            print_loss_avg = math.exp(print_loss / print_every)
            print('%d %d%% %.4f' % (iteration, iteration / n_iteration * 100, print_loss_avg))
            print_loss = 0

        if iteration % save_every == 0:
            directory = os.path.join(
                save_dir, 'model', model_name,
                '{}-{}_{}'.format(num_layers, num_layers, hidden_size))
            # exist_ok avoids the check-then-create race
            os.makedirs(directory, exist_ok=True)
            torch.save({
                'iteration': iteration,
                'en': encoder.state_dict(),
                'de': decoder.state_dict(),
                'en_opt': encoder_optimizer.state_dict(),
                'de_opt': decoder_optimizer.state_dict(),
                'loss': loss,
                'plt': perplexity
            }, os.path.join(directory, '{}_{}.tar'.format(iteration,  'backup_bidir_model')))
            print(perplexity)
Example #10
0
def main():
    """Load a trained chatbot checkpoint and start the interactive chat.

    The checkpoint path is derived from the hard-coded model settings below
    (``data/save/cb_model/<corpus>/2-2_500/4000_checkpoint.tar``).
    """
    USE_CUDA = torch.cuda.is_available()
    device = torch.device("cuda" if USE_CUDA else "cpu")

    # Load the vocabulary (the sentence pairs are not needed for chatting)
    corpus_name = "cornell movie-dialogs corpus"
    corpus = os.path.join("data", corpus_name)
    datafile = os.path.join(corpus, "formatted_movie_lines.txt")
    voc, _ = loadPrepareData(corpus_name, datafile)

    # Model parameters
    save_dir = os.path.join("data", "save")
    model_name = 'cb_model'
    attn_model = 'dot'
    encoder_n_layers = 2
    decoder_n_layers = 2
    hidden_size = 500
    checkpoint_iter = 4000
    loadFilename = os.path.join(
        save_dir, model_name, corpus_name,
        '{}-{}_{}'.format(encoder_n_layers, decoder_n_layers, hidden_size),
        '{}_checkpoint.tar'.format(checkpoint_iter))

    # Load the model if a loadFilename is provided
    if loadFilename:
        # map_location=device lets a checkpoint saved on a GPU be loaded on
        # a CPU-only machine as well as on the machine it was trained on.
        checkpoint = torch.load(loadFilename, map_location=device)
        encoder_sd = checkpoint['en']
        decoder_sd = checkpoint['de']
        embedding_sd = checkpoint['embedding']
        # Replace the vocabulary's attributes with the ones saved at training
        voc.__dict__ = checkpoint['voc_dict']

    print('Building encoder and decoder ...')
    # Initialize the word embeddings
    embedding = nn.Embedding(voc.num_words, hidden_size)
    if loadFilename:
        embedding.load_state_dict(embedding_sd)
    # Initialize the encoder & decoder models (dropout disabled for inference)
    encoder = EncoderRNN(hidden_size, embedding, encoder_n_layers, dropout=0)
    decoder = LuongAttnDecoderRNN(attn_model,
                                  embedding,
                                  hidden_size,
                                  voc.num_words,
                                  decoder_n_layers,
                                  dropout=0)
    if loadFilename:
        encoder.load_state_dict(encoder_sd)
        decoder.load_state_dict(decoder_sd)
    # Use the appropriate device
    encoder = encoder.to(device)
    decoder = decoder.to(device)
    print('Models built and ready to go!')

    # Set dropout layers to eval mode
    encoder.eval()
    decoder.eval()

    # Initialize the search module
    searcher = GreedySearchDecoder(encoder, decoder, device)

    # Begin chatting
    evaluateInput(device, encoder, decoder, searcher, voc)
Example #11
0
def main():
    """Configure and launch chatbot training on the Cornell movie corpus.

    Builds the vocabulary and trimmed sentence pairs, constructs the models
    and optimizers (restoring them from ``loadFilename`` when it is set),
    then hands everything to ``trainIters``.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # --- data -------------------------------------------------------------
    corpus_name = "cornell movie-dialogs corpus"
    datafile = os.path.join("data", corpus_name, "formatted_movie_lines.txt")
    voc, all_pairs = loadPrepareData(corpus_name, datafile)
    # Trim the vocabulary and the pairs
    pairs = trimRareWords(voc, all_pairs, MIN_COUNT)

    # --- model configuration ---------------------------------------------
    model_name = 'cb_model'
    attn_model = 'dot'  # other values seen in the original: 'general', 'concat'
    hidden_size = 500
    encoder_n_layers = 2
    decoder_n_layers = 2
    dropout = 0.1
    batch_size = 64

    # Checkpoint to resume from; None means start from scratch.
    loadFilename = None
    resume = bool(loadFilename)

    checkpoint = None
    if resume:
        # Loads onto the devices the tensors were saved from.
        checkpoint = torch.load(loadFilename)
        encoder_sd, decoder_sd = checkpoint['en'], checkpoint['de']
        encoder_optimizer_sd = checkpoint['en_opt']
        decoder_optimizer_sd = checkpoint['de_opt']
        embedding_sd = checkpoint['embedding']
        voc.__dict__ = checkpoint['voc_dict']

    # --- build models -----------------------------------------------------
    print('Building encoder and decoder ...')
    embedding = nn.Embedding(voc.num_words, hidden_size)
    if resume:
        embedding.load_state_dict(embedding_sd)
    encoder = EncoderRNN(hidden_size, embedding, encoder_n_layers, dropout)
    decoder = LuongAttnDecoderRNN(attn_model, embedding, hidden_size,
                                  voc.num_words, decoder_n_layers, dropout)
    if resume:
        encoder.load_state_dict(encoder_sd)
        decoder.load_state_dict(decoder_sd)
    encoder, decoder = encoder.to(device), decoder.to(device)
    print('Models built and ready to go!')

    # --- training / optimization configuration ----------------------------
    clip = 50.0
    teacher_forcing_ratio = 1.0
    learning_rate = 0.0001
    decoder_learning_ratio = 5.0
    n_iteration = 4000
    print_every = 1
    save_every = 500

    # Dropout layers must be in train mode
    for module in (encoder, decoder):
        module.train()

    # --- optimizers -------------------------------------------------------
    print('Building optimizers ...')
    decoder_lr = learning_rate * decoder_learning_ratio
    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=decoder_lr)
    if resume:
        encoder_optimizer.load_state_dict(encoder_optimizer_sd)
        decoder_optimizer.load_state_dict(decoder_optimizer_sd)

    # --- run --------------------------------------------------------------
    print("Starting Training!")
    save_dir = os.path.join("data", "save")
    trainIters(model_name, voc, pairs, encoder, decoder, encoder_optimizer,
               decoder_optimizer, embedding, encoder_n_layers,
               decoder_n_layers, save_dir, n_iteration, batch_size,
               print_every, save_every, clip, corpus_name, checkpoint,
               hidden_size, teacher_forcing_ratio, device)
Example #12
0
def trainIters(corpus,
               reverse,
               n_iteration,
               learning_rate,
               batch_size,
               n_layers,
               hidden_size,
               print_every,
               save_every,
               dropout,
               loadFilename=None,
               attn_model='dot',
               decoder_learning_ratio=5.0):
    """Sample ``n_iteration`` random batches and train on them in order.

    Args:
        corpus: Corpus path passed to ``loadPrepareData``.
        reverse: Forwarded to ``batch2TrainData``.
        n_iteration: Number of batches to sample and the last iteration index.
        learning_rate: Encoder learning rate.
        batch_size: Number of pairs drawn (with replacement) per batch.
        n_layers: Layer count for both the encoder and the decoder.
        hidden_size: Embedding dimension and hidden size.
        print_every: Not used in the visible part of this function.
        save_every: Not used in the visible part of this function.
        dropout: Dropout value passed to both models.
        loadFilename: Optional checkpoint path to resume from.
        attn_model: Not used; the decoder is always built with ``'dot'``.
        decoder_learning_ratio: Multiplier applied to ``learning_rate`` for
            the decoder optimizer.
    """
    voc, pairs = loadPrepareData(corpus)

    # Base name of the corpus file without its extension; not used further in
    # the visible part of this function.
    corpus_name = os.path.split(corpus)[-1].split('.')[0]

    # Training data: one freshly sampled batch per iteration.
    training_batches = []
    for _ in range(n_iteration):
        sampled_pairs = [random.choice(pairs) for _ in range(batch_size)]
        training_batches.append(batch2TrainData(voc, sampled_pairs, reverse))

    # Model
    print('Building encoder and decoder ...')
    embedding = nn.Embedding(voc.n_words, hidden_size)
    encoder = EncoderRNN(voc.n_words, hidden_size, embedding, n_layers,
                         dropout)
    decoder = LuongAttnDecoderRNN('dot', embedding, hidden_size,
                                  voc.n_words, n_layers, dropout)
    checkpoint = None
    if loadFilename:
        checkpoint = torch.load(loadFilename)
        encoder.load_state_dict(checkpoint['en'])
        decoder.load_state_dict(checkpoint['de'])

    # Move both models to the target device
    encoder, decoder = encoder.to(device), decoder.to(device)

    # Optimizers
    print('Building optimizers ...')
    decoder_lr = learning_rate * decoder_learning_ratio
    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=decoder_lr)
    if loadFilename:
        encoder_optimizer.load_state_dict(checkpoint['en_opt'])
        decoder_optimizer.load_state_dict(checkpoint['de_opt'])

    # Loop state: start fresh, or continue after the checkpointed iteration
    print('Initializing ...')
    if loadFilename:
        start_iteration = checkpoint['iteration'] + 1
        perplexity = checkpoint['plt']
    else:
        start_iteration = 1
        perplexity = []
    print_loss = 0

    # tqdm shows a progress bar over the remaining iterations
    for iteration in tqdm(range(start_iteration, n_iteration + 1)):
        # Batches are stored 0-based while iterations count from 1
        (input_variable, lengths, target_variable, mask,
         max_target_len) = training_batches[iteration - 1]

        loss = train(input_variable, lengths, target_variable, mask,
                     max_target_len, encoder, decoder, embedding,
                     encoder_optimizer, decoder_optimizer, batch_size)
        print_loss += loss
        perplexity.append(loss)
Example #13
0
def _read_tsv_pairs(path):
    """Return the rows of a tab-separated text file as lists of fields.

    Each line has its trailing newline removed and is then split on tabs.
    """
    with open(path, "r") as file_obj:
        return [line.rstrip("\n").split("\t") for line in file_obj]


def _load_pickled_voc(path):
    """Unpickle and return the vocabulary object stored at ``path``."""
    # NOTE(review): pickle executes arbitrary code on load; only point this
    # at vocabulary files produced by this project.
    with open(path, "rb") as f:
        return pickle.load(f)


def main():
    """Train, evaluate, or chat with the seq2seq model, depending on ``run_mode``.

    The mode and all hyper-parameters are read from module-level settings
    (``run_mode``, ``args``, ``datafiles``, ``split_path``, ``HIDDEN_SIZE``,
    ...), not from arguments.

    * ``'train'``: loads the training pairs and the pickled vocabulary,
      shuffles the pairs with a fixed seed, builds a fresh encoder/decoder
      and their optimizers, calls ``run`` and, if ``run`` returns a truthy
      value, saves the model. Training loss/BLEU are written afterwards
      (``run`` is expected to have stored them in ``phase["train"]``).
    * ``'test'``: restores the models from the checkpoint at
      ``args['model_path']``, loads the train and test pairs, calls ``run``
      without optimizers, and writes loss/BLEU for both splits.
    * ``'chat'``: restores the models and starts an interactive session
      through a ``GreedySearchDecoder``.

    Raises:
        ValueError: if ``args['optimizer']`` is neither ``"ADAM"`` nor
            ``"SGD"`` in train mode, or if ``run_mode`` is not one of
            ``'train'``, ``'test'``, ``'chat'``.
    """
    phase = {"train": {"pairs": []}, "test": {"pairs": []}}

    if run_mode == 'train':
        phase["train"]["pairs"].extend(_read_tsv_pairs(datafiles["qr_train"]))
        phase["train"]["voc"] = _load_pickled_voc(
            os.path.join(split_path, 'voc.pickle'))

        # Shuffle ONCE, with a fixed seed, so every training run sees the
        # same ordering of pairs.
        random.seed(1)  # seed can be any number
        random.shuffle(phase["train"]["pairs"])

        print('Building training set encoder and decoder ...')
        # One embedding table is shared by the encoder and the decoder.
        embedding = nn.Embedding(phase["train"]["voc"].num_words,
                                 HIDDEN_SIZE).to(device)

        # Initialize encoder & decoder models
        encoder = EncoderRNN(HIDDEN_SIZE,
                             embedding,
                             ENCODER_N_LAYERS,
                             DROPOUT,
                             gate=encoder_name,
                             bidirectional=BIDIRECTION)
        decoder = LuongAttnDecoderRNN(attn_model,
                                      embedding,
                                      HIDDEN_SIZE,
                                      phase["train"]["voc"].num_words,
                                      DECODER_N_LAYERS,
                                      DROPOUT,
                                      gate=decoder_name)

        # Move to the configured device and enable training behaviour
        # (dropout active).
        encoder = encoder.to(device)
        decoder = decoder.to(device)
        encoder.train()
        decoder.train()
        print('Models built and ready to go!')

        # Initialize optimizers
        print('Building optimizers ...')
        optimizer_name = args.get('optimizer')
        if optimizer_name == "ADAM":
            encoder_optimizer = optim.Adam(encoder.parameters(),
                                           lr=LR,
                                           weight_decay=WD)
            decoder_optimizer = optim.Adam(decoder.parameters(),
                                           lr=LR,
                                           weight_decay=WD)
        elif optimizer_name == "SGD":
            encoder_optimizer = optim.SGD(encoder.parameters(), lr=LR)
            decoder_optimizer = optim.SGD(decoder.parameters(), lr=LR)
        else:
            raise ValueError(
                "Wrong optimizer type has been given as an argument.")

        # Keep any tensors held in optimizer state on the same device as the
        # model. Uses ``device`` rather than an unconditional ``.cuda()`` so
        # this is also safe on CPU-only hosts.
        for optimizer in (encoder_optimizer, decoder_optimizer):
            for state in optimizer.state.values():
                for k, v in state.items():
                    if isinstance(v, torch.Tensor):
                        state[k] = v.to(device)

        print("Starting Training!")
        save_model = run(encoder,
                         decoder,
                         encoder_optimizer,
                         decoder_optimizer,
                         EPOCH_NUM,
                         BATCH_SIZE,
                         CLIP,
                         phase,
                         evaluation=True)
        if save_model:
            # Saving is best-effort: a failure is reported but must not
            # prevent the training results below from being written.
            try:
                save_seq2seq(encoder, decoder, encoder_name, decoder_name,
                             encoder_optimizer, decoder_optimizer,
                             phase["train"]["losses"], phase["train"]["bleu"],
                             phase["train"]["voc"], embedding, DROPOUT, CLIP,
                             WD)
                print("Model has been saved successfully.")
            except Exception as error:
                print("Saving the model has caused an exception:", error)

        write_results("loss", "train", encoder, encoder_name, decoder_name,
                      DROPOUT, CLIP, WD, phase["train"]["losses"])
        write_results("bleu", "train", encoder, encoder_name, decoder_name,
                      DROPOUT, CLIP, WD, phase["train"]["bleu"])

    else:
        # Loading basic objects needed for all 3 of validation, testing and chatting.
        # map_location remaps stored tensors onto the configured device, so a
        # checkpoint written on a GPU can still be restored on a CPU-only host.
        checkpoint = torch.load(args.get('model_path'), map_location=device)
        embedding = load_embedding(checkpoint, HIDDEN_SIZE)
        encoder = load_encoder(checkpoint, EncoderRNN, HIDDEN_SIZE, embedding,
                               ENCODER_N_LAYERS, DROPOUT, encoder_name,
                               BIDIRECTION)
        voc = load_voc(checkpoint)
        decoder = load_decoder(checkpoint, LuongAttnDecoderRNN, attn_model,
                               embedding, HIDDEN_SIZE, voc.num_words,
                               DECODER_N_LAYERS, DROPOUT, decoder_name)
        encoder = encoder.to(device)
        decoder = decoder.to(device)

        if run_mode == "test":
            phase["train"]["pairs"].extend(
                _read_tsv_pairs(datafiles["qr_train"]))
            phase["test"]["pairs"].extend(
                _read_tsv_pairs(datafiles["qr_test"]))
            phase["train"]["voc"] = _load_pickled_voc(
                os.path.join(split_path, 'voc.pickle'))
            # No optimizers are passed: this call only evaluates the models.
            _ = run(encoder,
                    decoder,
                    None,
                    None,
                    EPOCH_NUM,
                    BATCH_SIZE,
                    CLIP,
                    phase,
                    evaluation=True)
            write_results("loss", "train", encoder, encoder_name, decoder_name,
                          DROPOUT, CLIP, WD, phase["train"]["losses"])
            write_results("bleu", "train", encoder, encoder_name, decoder_name,
                          DROPOUT, CLIP, WD, phase["train"]["bleu"])

            write_results("loss", "test", encoder, encoder_name, decoder_name,
                          DROPOUT, CLIP, WD, phase["test"]["losses"])
            write_results("bleu", "test", encoder, encoder_name, decoder_name,
                          DROPOUT, CLIP, WD, phase["test"]["bleu"])

        elif run_mode == "chat":
            # Initialize search module
            searcher = GreedySearchDecoder(encoder, decoder)
            chat(searcher, voc)

        else:
            raise ValueError(
                "Wrong run_mode has been given, options: ['train', 'test', 'chat']"
            )
Example #14
0
# Build the word-embedding table: one row per vocabulary word, each of
# dimension config.hidden_size.
embedding = nn.Embedding(voc.num_words, config.hidden_size)

# When resuming from a checkpoint, restore the embedding weights before the
# models are constructed around the embedding.
# NOTE(review): embedding_sd is defined outside this excerpt - presumably
# taken from the loaded checkpoint; confirm.
if config.loadFilename:
    embedding.load_state_dict(embedding_sd)

# The encoder and the decoder are both given the same embedding object.
encoder = EncoderRNN(config.hidden_size, embedding, config.encoder_n_layers,
                     config.dropout)
decoder = LuongAttnDecoderRNN(config.attn_model, embedding, config.hidden_size,
                              voc.num_words, config.decoder_n_layers,
                              config.dropout)
# Restore the model weights when a checkpoint was given.
# NOTE(review): encoder_sd / decoder_sd are defined outside this excerpt.
if config.loadFilename:
    encoder.load_state_dict(encoder_sd)
    decoder.load_state_dict(decoder_sd)

# Move both models to the configured device.
encoder = encoder.to(config.device)
decoder = decoder.to(config.device)
print('Models built and ready to go!')

if config.training:

    print('Building optimizers ...')
    encoder_optimizer = optim.Adam(encoder.parameters(),
                                   lr=config.learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(),
                                   lr=config.learning_rate *
                                   config.decoder_learning_ratio)
    if config.loadFilename:
        encoder_optimizer.load_state_dict(encoder_optimizer_sd)
        decoder_optimizer.load_state_dict(decoder_optimizer_sd)

    if config.USE_CUDA: