def test_sampling():
    # Create Theano variables
    sampling_input = theano.tensor.lmatrix("input")

    # Construct model
    encoder = BidirectionalEncoder(vocab_size=10, embedding_dim=5,
                                   state_dim=8)
    decoder = Decoder(vocab_size=12, embedding_dim=6, state_dim=8,
                      representation_dim=16, theano_seed=1234)
    sampling_representation = encoder.apply(
        sampling_input, theano.tensor.ones(sampling_input.shape))
    generateds = decoder.generate(sampling_input, sampling_representation)
    model = Model(generateds[1])

    # Initialize model
    encoder.weights_init = decoder.weights_init = IsotropicGaussian(0.01)
    encoder.biases_init = decoder.biases_init = Constant(0)
    encoder.push_initialization_config()
    decoder.push_initialization_config()
    encoder.bidir.prototype.weights_init = Orthogonal()
    decoder.transition.weights_init = Orthogonal()
    encoder.initialize()
    decoder.initialize()

    # Compile a function for the generated samples
    sampling_fn = model.get_theano_function()

    # Create literal variables
    numpy.random.seed(1234)
    x = numpy.random.randint(0, 10, size=(1, 2))

    # Call function and check result
    generated_step = sampling_fn(x)
    assert len(generated_step[0].flatten()) == 4
def test_search_model():
    # Create Theano variables
    floatX = theano.config.floatX
    source_sentence = theano.tensor.lmatrix("source")
    source_sentence_mask = theano.tensor.matrix("source_mask", dtype=floatX)
    target_sentence = theano.tensor.lmatrix("target")
    target_sentence_mask = theano.tensor.matrix("target_mask", dtype=floatX)

    # Construct model
    encoder = BidirectionalEncoder(vocab_size=10, embedding_dim=5,
                                   state_dim=8)
    decoder = Decoder(vocab_size=12, embedding_dim=6, state_dim=8,
                      representation_dim=16)
    cost = decoder.cost(
        encoder.apply(source_sentence, source_sentence_mask),
        source_sentence_mask,
        target_sentence,
        target_sentence_mask,
    )

    # Compile a function for the cost
    f_cost = theano.function(
        inputs=[source_sentence, source_sentence_mask, target_sentence,
                target_sentence_mask],
        outputs=cost)

    # Create literal variables
    numpy.random.seed(1234)
    x = numpy.random.randint(0, 10, size=(22, 4))
    y = numpy.random.randint(0, 12, size=(22, 6))
    x_mask = numpy.ones_like(x).astype(floatX)
    y_mask = numpy.ones_like(y).astype(floatX)

    # Initialize model
    encoder.weights_init = decoder.weights_init = IsotropicGaussian(0.01)
    encoder.biases_init = decoder.biases_init = Constant(0)
    encoder.push_initialization_config()
    decoder.push_initialization_config()
    encoder.bidir.prototype.weights_init = Orthogonal()
    decoder.transition.weights_init = Orthogonal()
    encoder.initialize()
    decoder.initialize()

    cost_ = f_cost(x, x_mask, y, y_mask)
    assert_allclose(cost_, 14.90944)
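# The two tests above assume imports that the excerpt does not show; a
# probable minimal set (Blocks 0.x / Theano). BidirectionalEncoder and
# Decoder come from the project's own model module, whose path is not
# shown here.
import numpy
import theano
from numpy.testing import assert_allclose
from blocks.initialization import Constant, IsotropicGaussian, Orthogonal
from blocks.model import Model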
def main(args):
    """Train and validate the model."""
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    logger.debug("DEVICE: {}".format(device))

    # load vocabulary
    with open(args.vocab_path, "rb") as f:
        vocab = pickle.load(f)

    # encoder model setting
    encoder = EncoderResNet()
    encoder_optimizer = torch.optim.Adam(
        params=filter(lambda p: p.requires_grad, encoder.parameters()),
        lr=args.encoder_lr) if args.fine_tune_encoder else None

    # decoder model setting
    decoder = Decoder(vis_dim=args.vis_dim,
                      vis_num=args.vis_num,
                      embed_dim=args.embed_dim,
                      hidden_dim=args.hidden_dim,
                      vocab_size=args.vocab_size,
                      num_layers=args.num_layers,
                      dropout_ratio=args.dropout_ratio)
    decoder_optimizer = torch.optim.Adam(
        params=filter(lambda p: p.requires_grad, decoder.parameters()),
        lr=args.decoder_lr)

    # move to GPU
    encoder = nn.DataParallel(encoder).to(device)
    decoder = nn.DataParallel(decoder).to(device)

    # loss function
    criterion = nn.CrossEntropyLoss()

    # data loader
    transform = set_transform(args.resize, args.crop_size,
                              horizontal_flip=True, normalize=True)
    train_img_dirc = os.path.join(args.root_img_dirc, "train2014")
    train_loader = get_image_loader(train_img_dirc, args.train_data_path,
                                    vocab, transform, args.batch_size,
                                    args.shuffle, args.num_workers)
    val_img_dirc = os.path.join(args.root_img_dirc, "val2014")
    val_loader = get_image_loader(val_img_dirc, args.val_data_path, vocab,
                                  transform, 1, args.shuffle,
                                  args.num_workers)

    # initialization
    best_bleu_score = -100
    not_improved_cnt = 0

    for epoch in range(1, args.num_epochs):
        # training
        train(encoder, decoder, encoder_optimizer, decoder_optimizer,
              train_loader, criterion, epoch)

        # validation
        pred_df = validation(encoder, decoder, val_loader, criterion, epoch,
                             vocab, args.beam_size)

        # calculate BLEU-4 score
        pred_cap_lst = decode_caption(pred_df["pred"], vocab.idx2word)
        ans_cap_lst = decode_caption(pred_df["ans"], vocab.idx2word)
        assert len(pred_cap_lst) == len(ans_cap_lst)
        bleu_score_lst = []
        for i in range(len(pred_cap_lst)):
            bleu_score_lst.append(
                bleu(pred_cap_lst[i], ans_cap_lst[i], mode="4-gram"))
        bleu_score = np.mean(bleu_score_lst)

        # early stopping
        if bleu_score < best_bleu_score:
            not_improved_cnt += 1
        else:
            # learning is going well
            best_bleu_score = bleu_score
            not_improved_cnt = 0
            # save best params model
            torch.save(encoder.state_dict(), args.save_encoder_path)
            torch.save(decoder.state_dict(), args.save_decoder_path)

        # logging status
        logger.debug(
            "\n************************ VAL ************************\n"
            "EPOCH          : [{0}/{1}]\n"
            "BLEU-4         : {2}\n"
            "EARLY STOPPING : [{3}/{4}]\n"
            "*****************************************************\n".format(
                epoch, args.num_epochs, bleu_score, not_improved_cnt,
                args.stop_count))

        if not_improved_cnt == args.stop_count:
            logger.debug("Early Stopping")
            break

        # decay learning rate if there is no improvement for 10 consecutive epochs
        if not_improved_cnt > 0 and not_improved_cnt % 10 == 0:
            if args.fine_tune_encoder:
                adjust_learning_rate(encoder_optimizer, 0.8)
            adjust_learning_rate(decoder_optimizer, 0.8)
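# adjust_learning_rate is used above but not shown in this excerpt; a
# minimal sketch under the usual convention of scaling every parameter
# group's lr by a shrink factor (the signature is an assumption).
def adjust_learning_rate(optimizer, shrink_factor):
    for param_group in optimizer.param_groups:
        param_group["lr"] = param_group["lr"] * shrink_factor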
train_iter, valid_iter, test_iter = BucketIterator.splits(
    (train_data, valid_data, test_data), batch_size=BATCH_SIZE,
    device=device)

IN_DIM = len(SRC.vocab)
OUT_DIM = len(TRG.vocab)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HID_DIM = 512
N_LAYER = 2
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

enc = Encoder(IN_DIM, ENC_EMB_DIM, HID_DIM, N_LAYER, ENC_DROPOUT)
dec = Decoder(OUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYER, DEC_DROPOUT)
model = Seq2Seq(enc, dec, device).to(device)
model.apply(init_weights)
print(f"Model has {count_parameters(model):,} trainable parameters")

# TRG.pad_token = <pad>
# TRG_PAD_IDX = 1
TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]
criterion = nn.CrossEntropyLoss(ignore_index=TRG_PAD_IDX)

mode = 'train'
# mode = 'eval'
model.load_state_dict(torch.load('tut1-model.pt'))
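# init_weights and count_parameters are used above but not defined in this
# excerpt; a minimal sketch of the common tutorial versions (uniform init in
# [-0.08, 0.08] and a trainable-parameter count); assumptions, not confirmed
# project code.
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)


def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)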
parser.add_argument('--lr', type=float, default=1e-4)
parser.add_argument('--lr_decay', type=float, default=5e-5)
parser.add_argument('--max_iter', type=int, default=600000)
parser.add_argument('--batch_size', type=int, default=5)
parser.add_argument('--style_weight', type=float, default=3.0)
parser.add_argument('--content_weight', type=float, default=1.0)
parser.add_argument('--temporal_weight', type=float, default=2.0)
parser.add_argument('--v_weight', type=float, default=20.0)
parser.add_argument('--n_threads', type=int, default=16)
parser.add_argument('--save_model_interval', type=int, default=10000)
parser.add_argument('--start_iter', type=float, default=500000)
args = parser.parse_args('')

device = torch.device('cuda')

decoder = Decoder('Decoder')
vgg = VGG('VGG19')
vgg.features.load_state_dict(torch.load(args.vgg))
vgg = nn.Sequential(*list(vgg.features.children())[:44])

network = Net(vgg, decoder, args.start_iter)
network.train()
network.to(device)

optimizer = torch.optim.Adam([
    {'params': network.decoder.parameters()},
    {'params': network.transform.parameters()}], lr=args.lr)

style_tf = train_transform()
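# train_transform is called above but not defined in this excerpt; a sketch
# of a typical version for style-transfer training (resize, random crop,
# tensor conversion); the sizes are assumptions, not the project's
# confirmed values.
def train_transform():
    transform_list = [
        transforms.Resize(size=(512, 512)),
        transforms.RandomCrop(256),
        transforms.ToTensor()
    ]
    return transforms.Compose(transform_list)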
# convert labels to indices
indexed_target_test = prepare_data.label_to_idx(target_test, char2idx)
indexed_target_word_test = prepare_data.word_to_idx(target_test, embeddings)
test_data = prepare_data.combine_data(features_test, indexed_target_test)

# initialize the Encoder
encoder = Encoder(features_test[0].size(1), encoder_hidden_size,
                  encoder_layers, len(char2idx_ctc), batch_size,
                  device).to(device)

# initialize the Decoder
decoder = Decoder(embedding_dim_chars, encoder_hidden_size,
                  attention_hidden_size, num_filters, len(char2idx) + 1,
                  decoder_layers, encoder_layers, batch_size, attention_type,
                  device).to(device)

# load the model
checkpoint = torch.load('weights/libri/state_dict_10.pt',
                        map_location=torch.device('cpu'))
encoder.load_state_dict(checkpoint['encoder'])
decoder.load_state_dict(checkpoint['decoder'])

# evaluate
batch_size = 1
pairs_batch_train = DataLoader(dataset=test_data,
                               batch_size=batch_size,
                               shuffle=False,
                               collate_fn=prepare_data.collate,
batch_size = 16
enc_lr = 0.0001
dec_lr = 0.0005
emb_lr = 0.0001
# -----------------------------------
text_embedding = nn.Embedding(num_embeddings=len(_symbol_to_id),
                              embedding_dim=512).to(device)
pos_embedding = nn.Embedding.from_pretrained(positional_encoding(512, 512),
                                             freeze=True).to(device)
pos_embedding_ = nn.Embedding.from_pretrained(positional_encoding(256, 512),
                                              freeze=True).to(device)
encoder = Encoder(emb_channels=512).to(device)
decoder = Decoder(mel_channels=80, enc_channels=512,
                  emb_channels=512).to(device)

optimizer = torch.optim.Adam([{
    'params': text_embedding.parameters(),
    'lr': emb_lr
}, {
    'params': encoder.parameters(),
    'lr': enc_lr
}, {
    'params': decoder.parameters(),
    'lr': dec_lr
}], lr=0.001)
# -----------------------------------
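# positional_encoding is not defined in this excerpt. A sketch assuming the
# standard sinusoidal table from "Attention Is All You Need", returning a
# (max_len, dim) tensor, which matches how nn.Embedding.from_pretrained is
# fed above; the project's actual helper may differ.
import math

import torch


def positional_encoding(max_len, dim):
    pe = torch.zeros(max_len, dim)
    position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
    div_term = torch.exp(
        torch.arange(0, dim, 2).float() * (-math.log(10000.0) / dim))
    pe[:, 0::2] = torch.sin(position * div_term)  # even indices: sine
    pe[:, 1::2] = torch.cos(position * div_term)  # odd indices: cosine
    return pe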
if args.resume_snapshot:
    model = torch.load(args.resume_snapshot,
                       map_location=lambda storage, loc: storage)
    print('load model from %s' % args.resume_snapshot)
else:
    word_embedding = torch.FloatTensor(vocab.word_embedding)
    char_embedding = torch.FloatTensor(vocab.char_embedding)
    args.word_embed_size = word_embedding.size(1)
    args.char_embed_size = char_embedding.size(1)
    print("[word_vocab]:%d [char_vocab]:%d" %
          (args.word_size, args.char_size))
    print("[!] Instantiating models...")
    encoder = Encoder(args, word_embedding)
    decoder = Decoder(args, char_embedding)
    model = Seq2Seq(encoder, decoder)

# set model dir
model_folder, model_prefix = utils.get_folder_prefix(args, model)
log_file = model_prefix + '.log'

# setup logger
log = logging.getLogger(__name__)
log.setLevel(logging.DEBUG)
fh = logging.FileHandler(log_file)
fh.setLevel(logging.DEBUG)
ch = logging.StreamHandler(sys.stdout)
ch.setLevel(logging.INFO)
formatter = logging.Formatter(fmt='%(asctime)s %(message)s',
                              datefmt='%m/%d/%Y %I:%M:%S')
def train_dynamics(env, args, writer=None):
    """
    Trains the Dynamics module. Supervised.

    Arguments:
        env: the initialized environment (rllab/gym)
        args: input arguments
        writer: initialized summary writer for tensorboard
    """
    args.action_space = env.action_space

    # Initialize models
    enc = Encoder(env.observation_space.shape[0], args.dim,
                  use_conv=args.use_conv)
    dec = Decoder(env.observation_space.shape[0], args.dim,
                  use_conv=args.use_conv)
    d_module = D_Module(env.action_space.shape[0], args.dim, args.discrete)

    if args.from_checkpoint is not None:
        results_dict = torch.load(args.from_checkpoint)
        enc.load_state_dict(results_dict['enc'])
        dec.load_state_dict(results_dict['dec'])
        d_module.load_state_dict(results_dict['d_module'])

    all_params = chain(enc.parameters(), dec.parameters(),
                       d_module.parameters())

    if args.transfer:
        for p in enc.parameters():
            p.requires_grad = False
        for p in dec.parameters():
            p.requires_grad = False
        all_params = d_module.parameters()

    optimizer = torch.optim.Adam(all_params, lr=args.lr,
                                 weight_decay=args.weight_decay)

    if args.gpu:
        enc = enc.cuda()
        dec = dec.cuda()
        d_module = d_module.cuda()

    # Initialize datasets
    val_loader = None
    train_dataset = DynamicsDataset(args.train_set, args.train_size,
                                    batch=args.train_batch,
                                    rollout=args.rollout)
    val_dataset = DynamicsDataset(args.test_set, 5000, batch=args.test_batch,
                                  rollout=args.rollout)
    val_loader = torch.utils.data.DataLoader(dataset=val_dataset,
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             num_workers=args.num_workers)
    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=args.num_workers)

    results_dict = {
        'dec_losses': [],
        'forward_losses': [],
        'inverse_losses': [],
        'total_losses': [],
        'enc': None,
        'dec': None,
        'd_module': None,
        'd_init': None,
        'args': args
    }

    total_action_taken = 0
    correct_predicted_a_hat = 0

    # create the mask here for re-weighting
    dec_mask = None
    if args.dec_mask is not None:
        dec_mask = torch.ones(9)
        game_vocab = dict([
            (b, a)
            for a, b in enumerate(sorted(env.game.all_possible_features()))
        ])
        dec_mask[game_vocab['Agent']] = args.dec_mask
        dec_mask[game_vocab['Goal']] = args.dec_mask
        dec_mask = dec_mask.expand(args.batch_size, args.maze_length,
                                   args.maze_length, 9).contiguous().view(-1)
        dec_mask = Variable(dec_mask, requires_grad=False)
        if args.gpu:
            dec_mask = dec_mask.cuda()

    for epoch in range(1, args.num_epochs + 1):
        enc.train()
        dec.train()
        d_module.train()
        if args.framework == "mazebase":
            # NOTE: d_init comes from the surrounding module in the original code
            d_init.train()

        # for measuring the accuracy
        train_acc = 0
        current_epoch_actions = 0
        current_epoch_predicted_a_hat = 0

        start = time.time()
        for i, (states, target_actions) in enumerate(train_loader):
            optimizer.zero_grad()

            if args.framework != "mazebase":
                forward_loss, inv_loss, dec_loss, recon_loss, model_loss, _, _ = \
                    forward_planning(i, states, target_actions, enc, dec,
                                     d_module, args)
            else:
                forward_loss, inv_loss, dec_loss, recon_loss, model_loss, \
                    current_epoch_predicted_a_hat, current_epoch_actions = \
                    multiple_forward(i, states, target_actions, enc, dec,
                                     d_module, args, d_init, dec_mask)

            loss = forward_loss + args.inv_loss_coef * inv_loss + \
                args.dec_loss_coef * dec_loss

            if i % args.log_interval == 0:
                log(
                    'Epoch [{}/{}]\tIter [{}/{}]\t'.format(
                        epoch, args.num_epochs, i + 1,
                        len(train_dataset) // args.batch_size) +
                    'Time: {:.2f}\t'.format(time.time() - start) +
                    'Decoder Loss: {:.2f}\t'.format(dec_loss.data[0]) +
                    'Forward Loss: {:.2f}\t'.format(forward_loss.data[0]) +
                    'Inverse Loss: {:.2f}\t'.format(inv_loss.data[0]) +
                    'Loss: {:.2f}\t'.format(loss.data[0]))

                results_dict['dec_losses'].append(dec_loss.data[0])
                results_dict['forward_losses'].append(forward_loss.data[0])
                results_dict['inverse_losses'].append(inv_loss.data[0])
                results_dict['total_losses'].append(loss.data[0])

                # write the summaries here
                if writer:
                    writer.add_scalar('dynamics/total_loss', loss.data[0],
                                      epoch)
                    writer.add_scalar('dynamics/decoder', dec_loss.data[0],
                                      epoch)
                    writer.add_scalar('dynamics/reconstruction_loss',
                                      recon_loss.data[0], epoch)
                    writer.add_scalar('dynamics/next_state_prediction_loss',
                                      model_loss.data[0], epoch)
                    writer.add_scalar('dynamics/inv_loss', inv_loss.data[0],
                                      epoch)
                    writer.add_scalar('dynamics/forward_loss',
                                      forward_loss.data[0], epoch)
                    writer.add_scalars(
                        'dynamics/all_losses', {
                            "total_loss": loss.data[0],
                            "reconstruction_loss": recon_loss.data[0],
                            "next_state_prediction_loss": model_loss.data[0],
                            "decoder_loss": dec_loss.data[0],
                            "inv_loss": inv_loss.data[0],
                            "forward_loss": forward_loss.data[0],
                        }, epoch)

            loss.backward()

            correct_predicted_a_hat += current_epoch_predicted_a_hat
            total_action_taken += current_epoch_actions

            # does it not work at all without grad clipping ?
            torch.nn.utils.clip_grad_norm(all_params, args.max_grad_norm)
            optimizer.step()

            # maybe add the generated image to add the logs
            # writer.add_image()

        # Run validation
        if val_loader is not None:
            enc.eval()
            dec.eval()
            d_module.eval()
            forward_loss, inv_loss, dec_loss = 0, 0, 0
            for i, (states, target_actions) in enumerate(val_loader):
                f_loss, i_loss, d_loss, _, _, _, _ = forward_planning(
                    i, states, target_actions, enc, dec, d_module, args)
                forward_loss += f_loss
                inv_loss += i_loss
                dec_loss += d_loss

            loss = forward_loss + args.inv_loss_coef * inv_loss + \
                args.dec_loss_coef * dec_loss

            if writer:
                writer.add_scalar('val/forward_loss',
                                  forward_loss.data[0] / i, epoch)
                writer.add_scalar('val/inverse_loss', inv_loss.data[0] / i,
                                  epoch)
                writer.add_scalar('val/decoder_loss', dec_loss.data[0] / i,
                                  epoch)
            log(
                '[Validation]\t' +
                'Decoder Loss: {:.2f}\t'.format(dec_loss.data[0] / i) +
                'Forward Loss: {:.2f}\t'.format(forward_loss.data[0] / i) +
                'Inverse Loss: {:.2f}\t'.format(inv_loss.data[0] / i) +
                'Loss: {:.2f}\t'.format(loss.data[0] / i))

        if epoch % args.checkpoint == 0:
            results_dict['enc'] = enc.state_dict()
            results_dict['dec'] = dec.state_dict()
            results_dict['d_module'] = d_module.state_dict()
            if args.framework == "mazebase":
                results_dict['d_init'] = d_init.state_dict()
            torch.save(
                results_dict,
                os.path.join(args.out, 'dynamics_module_epoch%s.pt' % epoch))
            log('Saved model %s' % epoch)

    results_dict['enc'] = enc.state_dict()
    results_dict['dec'] = dec.state_dict()
    results_dict['d_module'] = d_module.state_dict()
    torch.save(results_dict,
               os.path.join(args.out, 'dynamics_module_epoch%s.pt' % epoch))
    print(os.path.join(args.out, 'dynamics_module_epoch%s.pt' % epoch))
def main():
    make_deterministic()

    # region Prepare data
    with Timer('\nData preparation time: %s\n'):
        ru_lang = Language()
        en_lang = Language()

        yandex = Yandex(
            'datasets/yandex/corpus.en_ru.1m.ru',
            'datasets/yandex/corpus.en_ru.1m.en',
            ru_lang,
            en_lang,
            data_slice=H.dataset_slice,
        )
        paracrawl = ParaCrawl(
            'datasets/paracrawl/en-ru.txt',
            ru_lang,
            en_lang,
            data_slice=slice(0),
        )

        low = ru_lang.lower_than(H.ru_word_count_minimum)
        infrequent_words_n = max(
            ceil(ru_lang.words_n * H.infrequent_words_percent), len(low))
        if infrequent_words_n > 0:
            ru_lang.drop_words(ru_lang.lowk(infrequent_words_n))
            print(f'{infrequent_words_n:,} infrequent Russian words are dropped')

        low = en_lang.lower_than(H.en_word_count_minimum)
        if len(low) > 0:
            en_lang.drop_words(*low)
            print(f'{len(low):,} infrequent English words are dropped')

        print(f'Russian language: {ru_lang.words_n:,} words, '
              f'{ru_lang.sentence_length:,} words in a sentence')
        print(f'English language: {en_lang.words_n:,} words, '
              f'{en_lang.sentence_length:,} words in a sentence')

        batch = H.batch_size
        dataset = ConcatDataset((yandex, paracrawl))
        loader = DataLoader(dataset, batch, shuffle=True)
    # endregion

    # region Models and optimizers
    model = Seq2Seq(
        Encoder(ru_lang.words_n, H.encoder_embed_dim, H.encoder_hidden_dim,
                H.encoder_bi, H.decoder_hd),
        Attention(H.encoder_hd, H.decoder_hd),
        Decoder(en_lang.words_n, H.decoder_embed_dim, H.decoder_hidden_dim,
                H.encoder_hd),
    ).to(Device).train()

    optimizer = Adam(model.parameters(), lr=H.learning_rate)
    criterion = CrossEntropyLoss(ignore_index=Token_PAD, reduction='sum')
    # endregion

    # region Training
    teaching_percent = H.teaching_percent
    total = len(dataset)
    log_interval = max(5, round(total / batch / 1000))

    for epoch in range(1, H.epochs + 1):
        with Printer() as printer:
            printer.print(f'Train epoch {epoch}: starting...')

            for i, ((ru, ru_l), en_sos, en_eos) in enumerate(loader, 1):
                # Zero the parameter gradients
                optimizer.zero_grad()
                # Run data through model
                predictions = model(ru, ru_l, en_sos, teaching_percent)
                # Calculate loss
                loss = criterion(predictions, en_eos)
                # Back propagate and perform optimization
                loss.backward()
                clip_grad_norm_(model.parameters(), H.gradient_norm_clip)
                optimizer.step()

                # Print log
                if i % log_interval == 0:
                    printer.print(
                        f'Train epoch {epoch}: {i * batch / total:.1%} '
                        f'[{i * batch:,}/{total:,}]')

            printer.print(f'Train epoch {epoch}: completed')
    # endregion

    torch.save(
        (
            ru_lang.__getnewargs__(),
            en_lang.__getnewargs__(),
            model.cpu().eval().data,
        ),
        'data/data.pt',
    )

    evaluate(model.to(Device), ru_lang, en_lang,
             'datasets/yandex/corpus.en_ru.1m.ru',
             slice(H.dataset_slice.stop + 1, H.dataset_slice.stop + 1 + 100))
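# make_deterministic is project code not shown here; a sketch of the usual
# seeding recipe such a helper wraps (the exact seed and cudnn flags are
# assumptions).
import random

import numpy as np
import torch


def make_deterministic(seed=0):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False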
def random_batch(batch_size, data):
    batched_data = []
    # assumes numpy-style random.choice(n, size); the stdlib random.choice
    # takes a single sequence and would not accept these arguments
    for i in np.random.choice(len(data), batch_size):
        batched_data.append(data[i])
    return batched_data


data = load_data(data_dir, "weibo_pair_train_Q_after")
vocab_list, word2index, index2word, embedding, question_words = build_vocab(
    data_dir, data)
keywords_list, keywords_index, PMI = load_PMI()
key_to_vocab = [0] * len(keywords_list)
for i in range(len(keywords_list)):
    if keywords_list[i] in word2index:
        key_to_vocab[i] = word2index[keywords_list[i]]

encoder = Encoder(len(word2index), embedding_dim, hidden_dim)
decoder = Decoder(len(word2index), embedding_dim, hidden_dim)
model = Seq2Seq(encoder, decoder, device).to(device)
optimizer = torch.optim.SGD(model.parameters(),
                            lr=0.01)  # SGD requires an lr; 0.01 is an assumed placeholder
criterion = nn.CrossEntropyLoss(ignore_index=0).to(device)


def train():
    model.train()
    for batch_id in range(0, 100):
        batched_data = random_batch(batch_size, data)
        posts_index, questions_index, keyword_tensor, word_type = batch_data(
            batched_data, question_words, keywords_index, key_to_vocab,
            word2index)
        optimizer.zero_grad()
        output = model(posts_index, questions_index)
        loss = criterion(output.view(-1, output.shape[2]),
                         questions_index.view(-1))
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
def main():
    max_len = 50
    n_vocab = params.n_vocab
    n_layer = params.n_layer
    n_hidden = params.n_hidden
    n_embed = params.n_embed
    temperature = params.temperature
    assert torch.cuda.is_available()

    if os.path.exists("data/vocab.json"):
        vocab = Vocabulary()
        with open('data/vocab.json', 'r') as fp:
            vocab.stoi = json.load(fp)
        for key, value in vocab.stoi.items():
            vocab.itos.append(key)
    else:
        print("vocabulary doesn't exist!")
        return

    print("loading model...")
    encoder = Encoder(n_vocab, n_embed, n_hidden, n_layer).cuda()
    Kencoder = KnowledgeEncoder(n_vocab, n_embed, n_hidden, n_layer).cuda()
    manager = Manager(n_hidden, n_vocab, temperature).cuda()
    decoder = Decoder(n_vocab, n_embed, n_hidden, n_layer).cuda()

    encoder = init_model(encoder, restore=params.encoder_restore)
    Kencoder = init_model(Kencoder, restore=params.Kencoder_restore)
    manager = init_model(manager, restore=params.manager_restore)
    decoder = init_model(decoder, restore=params.decoder_restore)
    print("successfully loaded!\n")

    utterance = ""
    while True:
        if utterance == "exit":
            break

        k1 = input("Type first Knowledge: ").lower()
        while not k1:
            print("Please type first Knowledge.\n")
            k1 = input("Type first Knowledge: ").lower()

        k2 = input("Type second Knowledge: ").lower()
        while not k2:
            print("Please type second Knowledge.\n")
            k2 = input("Type second Knowledge: ").lower()

        k3 = input("Type third Knowledge: ").lower()
        while not k3:
            print("Please type third Knowledge.\n")
            k3 = input("Type third Knowledge: ").lower()

        K = [k1, k2, k3]
        K = knowledgeToIndex(K, vocab)
        K = Kencoder(K)
        print()

        while True:
            utterance = input("you: ").lower()
            while not utterance:
                print("Please type utterance.\n")
                utterance = input("you: ")
            if utterance == "change knowledge" or utterance == "exit":
                print()
                break

            X = []
            tokens = nltk.word_tokenize(utterance)
            for word in tokens:
                if word in vocab.stoi:
                    X.append(vocab.stoi[word])
                else:
                    X.append(vocab.stoi["<UNK>"])
            X = torch.LongTensor(X).unsqueeze(0).cuda()  # X: [1, x_seq_len]

            encoder_outputs, hidden, x = encoder(X)
            k_i = manager(x, None, K)
            outputs = torch.zeros(max_len, 1,
                                  n_vocab).cuda()  # outputs: [max_len, 1, n_vocab]
            hidden = hidden[decoder.n_layer:]
            output = torch.LongTensor([params.SOS]).cuda()

            for t in range(max_len):
                output, hidden, attn_weights = decoder(output, k_i, hidden,
                                                       encoder_outputs)
                outputs[t] = output
                output = output.data.max(1)[1]

            outputs = outputs.max(2)[1]

            answer = ""
            for idx in outputs:
                if idx == params.EOS:
                    break
                answer += vocab.itos[idx] + " "

            print("bot:", answer[:-1], "\n")
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm

from data import PersianLexicon, collate_fn
from model import Encoder, Decoder
from config import DataConfig, ModelConfig, TrainConfig

# data prep
ds = PersianLexicon(DataConfig.graphemes_path, DataConfig.phonemes_path,
                    DataConfig.lexicon_path)
dl = DataLoader(ds, collate_fn=collate_fn, batch_size=TrainConfig.batch_size)

# models
encoder_model = Encoder(ModelConfig.graphemes_size,
                        ModelConfig.hidden_size).to(TrainConfig.device)
decoder_model = Decoder(ModelConfig.phonemes_size,
                        ModelConfig.hidden_size).to(TrainConfig.device)

# log
log = SummaryWriter(TrainConfig.log_path)

# loss
criterion = nn.CrossEntropyLoss()

# optimizer
optimizer = torch.optim.Adam(list(encoder_model.parameters()) +
                             list(decoder_model.parameters()),
                             lr=TrainConfig.lr)

# training loop
counter = 0
for e in range(TrainConfig.epochs):
def convert(cfg):
    dataset_path = Path(utils.to_absolute_path("datasets")) / cfg.dataset.path
    with open(dataset_path / "speakers.json") as file:
        speakers = sorted(json.load(file))

    synthesis_list_path = Path(utils.to_absolute_path(cfg.synthesis_list))
    with open(synthesis_list_path) as file:
        synthesis_list = json.load(file)

    in_dir = Path(utils.to_absolute_path(cfg.in_dir))
    out_dir = Path(utils.to_absolute_path(cfg.out_dir))
    out_dir.mkdir(exist_ok=True, parents=True)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    encoder = Encoder(**cfg.model.encoder)
    decoder = Decoder(**cfg.model.decoder)
    encoder.to(device)
    decoder.to(device)

    print("Load checkpoint from: {}:".format(cfg.checkpoint))
    checkpoint_path = utils.to_absolute_path(cfg.checkpoint)
    checkpoint = torch.load(checkpoint_path,
                            map_location=lambda storage, loc: storage)
    encoder.load_state_dict(checkpoint["encoder"])
    decoder.load_state_dict(checkpoint["decoder"])

    encoder.eval()
    decoder.eval()

    meter = pyloudnorm.Meter(cfg.preprocessing.sr)

    for wav_path, speaker_id, out_filename in tqdm(synthesis_list):
        wav_path = in_dir / wav_path
        wav, _ = librosa.load(wav_path.with_suffix(".wav"),
                              sr=cfg.preprocessing.sr)
        ref_loudness = meter.integrated_loudness(wav)
        wav = wav / np.abs(wav).max() * 0.999

        mel = librosa.feature.melspectrogram(
            preemphasis(wav, cfg.preprocessing.preemph),
            sr=cfg.preprocessing.sr,
            n_fft=cfg.preprocessing.n_fft,
            n_mels=cfg.preprocessing.n_mels,
            hop_length=cfg.preprocessing.hop_length,
            win_length=cfg.preprocessing.win_length,
            fmin=cfg.preprocessing.fmin,
            power=1)
        logmel = librosa.amplitude_to_db(mel, top_db=cfg.preprocessing.top_db)
        logmel = logmel / cfg.preprocessing.top_db + 1

        mel = torch.FloatTensor(logmel).unsqueeze(0).to(device)
        speaker = torch.LongTensor([speakers.index(speaker_id)]).to(device)
        with torch.no_grad():
            z, _ = encoder.encode(mel)
            output = decoder.generate(z, speaker)

        output_loudness = meter.integrated_loudness(output)
        output = pyloudnorm.normalize.loudness(output, output_loudness,
                                               ref_loudness)
        path = out_dir / out_filename
        librosa.output.write_wav(path.with_suffix(".wav"),
                                 output.astype(np.float32),
                                 sr=cfg.preprocessing.sr)
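# preemphasis is referenced above but not shown. A sketch of the standard
# pre-emphasis filter y[t] = x[t] - coef * x[t - 1], consistent with how it
# is called with cfg.preprocessing.preemph as the coefficient.
from scipy.signal import lfilter


def preemphasis(x, coef=0.97):
    return lfilter([1, -coef], [1], x)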
def main(mode, config, use_bokeh=False):
    # Construct model
    logger.info('Building RNN encoder-decoder')
    encoder = BidirectionalEncoder(config['src_vocab_size'],
                                   config['enc_embed'], config['enc_nhids'])
    decoder = Decoder(config['trg_vocab_size'], config['dec_embed'],
                      config['dec_nhids'], config['enc_nhids'] * 2,
                      config['topical_embedding_dim'])
    topical_transformer = topicalq_transformer(
        config['topical_vocab_size'], config['topical_embedding_dim'],
        config['enc_nhids'], config['topical_word_num'], config['batch_size'])

    if mode == "train":
        # Create Theano variables
        logger.info('Creating theano variables')
        source_sentence = tensor.lmatrix('source')
        source_sentence_mask = tensor.matrix('source_mask')
        target_sentence = tensor.lmatrix('target')
        target_sentence_mask = tensor.matrix('target_mask')
        sampling_input = tensor.lmatrix('input')
        source_topical_word = tensor.lmatrix('source_topical')
        source_topical_mask = tensor.matrix('source_topical_mask')

        # Get training and development set streams
        tr_stream = get_tr_stream_with_topicalq(**config)
        dev_stream = get_dev_stream_with_topicalq(**config)
        topic_embedding = topical_transformer.apply(source_topical_word)

        # Get cost of the model
        representation = encoder.apply(source_sentence, source_sentence_mask)
        tw_representation = topical_transformer.look_up.apply(
            source_topical_word.T)
        content_embedding = representation[0, :,
                                           (representation.shape[2] / 2):]
        cost = decoder.cost(representation, source_sentence_mask,
                            tw_representation, source_topical_mask,
                            target_sentence, target_sentence_mask,
                            topic_embedding, content_embedding)

        logger.info('Creating computational graph')
        cg = ComputationGraph(cost)

        # Initialize model
        logger.info('Initializing model')
        encoder.weights_init = decoder.weights_init = IsotropicGaussian(
            config['weight_scale'])
        encoder.biases_init = decoder.biases_init = Constant(0)
        encoder.push_initialization_config()
        decoder.push_initialization_config()
        encoder.bidir.prototype.weights_init = Orthogonal()
        decoder.transition.weights_init = Orthogonal()
        encoder.initialize()
        decoder.initialize()

        topical_transformer.weights_init = IsotropicGaussian(
            config['weight_scale'])
        topical_transformer.biases_init = Constant(0)
        topical_transformer.push_allocation_config()  # don't know whether the initialize is for
        topical_transformer.look_up.weights_init = Orthogonal()
        topical_transformer.transformer.weights_init = Orthogonal()
        topical_transformer.initialize()
        word_topical_embedding = cPickle.load(
            open(config['topical_embeddings'], 'rb'))
        np_word_topical_embedding = numpy.array(word_topical_embedding,
                                                dtype='float32')
        topical_transformer.look_up.W.set_value(np_word_topical_embedding)
        topical_transformer.look_up.W.tag.role = []

        # apply dropout for regularization
        if config['dropout'] < 1.0:
            # dropout is applied to the output of maxout in ghog
            logger.info('Applying dropout')
            dropout_inputs = [
                x for x in cg.intermediary_variables
                if x.name == 'maxout_apply_output'
            ]
            cg = apply_dropout(cg, dropout_inputs, config['dropout'])

        # Apply weight noise for regularization
        if config['weight_noise_ff'] > 0.0:
            logger.info('Applying weight noise to ff layers')
            enc_params = Selector(encoder.lookup).get_params().values()
            enc_params += Selector(encoder.fwd_fork).get_params().values()
            enc_params += Selector(encoder.back_fork).get_params().values()
            dec_params = Selector(
                decoder.sequence_generator.readout).get_params().values()
            dec_params += Selector(
                decoder.sequence_generator.fork).get_params().values()
            dec_params += Selector(decoder.state_init).get_params().values()
            cg = apply_noise(cg, enc_params + dec_params,
                             config['weight_noise_ff'])

        # Print shapes
        shapes = [param.get_value().shape for param in cg.parameters]
        logger.info("Parameter shapes: ")
        for shape, count in Counter(shapes).most_common():
            logger.info('    {:15}: {}'.format(shape, count))
        logger.info("Total number of parameters: {}".format(len(shapes)))

        # Print parameter names
        enc_dec_param_dict = merge(Selector(encoder).get_parameters(),
                                   Selector(decoder).get_parameters())
        logger.info("Parameter names: ")
        for name, value in enc_dec_param_dict.items():
            logger.info('    {:15}: {}'.format(value.get_value().shape, name))
        logger.info("Total number of parameters: {}".format(
            len(enc_dec_param_dict)))

        # Set up training model
        logger.info("Building model")
        training_model = Model(cost)

        # Set extensions
        logger.info("Initializing extensions")
        extensions = [
            FinishAfter(after_n_batches=config['finish_after']),
            TrainingDataMonitoring([cost], after_batch=True),
            Printing(after_batch=True),
            CheckpointNMT(config['saveto'],
                          every_n_batches=config['save_freq'])
        ]
        '''
        # Set up beam search and sampling computation graphs if necessary
        if config['hook_samples'] >= 1 or config['bleu_script'] is not None:
            logger.info("Building sampling model")
            sampling_representation = encoder.apply(
                sampling_input, tensor.ones(sampling_input.shape))
            generated = decoder.generate(sampling_input,
                                         sampling_representation)
            search_model = Model(generated)
            _, samples = VariableFilter(
                bricks=[decoder.sequence_generator], name="outputs")(
                    ComputationGraph(generated[1]))

        # Add sampling
        if config['hook_samples'] >= 1:
            logger.info("Building sampler")
            extensions.append(
                Sampler(model=search_model,
                        data_stream=tr_stream,
                        hook_samples=config['hook_samples'],
                        every_n_batches=config['sampling_freq'],
                        src_vocab_size=config['src_vocab_size']))

        # Add early stopping based on bleu
        if config['bleu_script'] is not None:
            logger.info("Building bleu validator")
            extensions.append(
                BleuValidator(sampling_input,
                              samples=samples,
                              config=config,
                              model=search_model,
                              data_stream=dev_stream,
                              normalize=config['normalized_bleu'],
                              every_n_batches=config['bleu_val_freq']))
        '''

        # Reload model if necessary
        if config['reload']:
            extensions.append(LoadNMT(config['saveto']))

        # Plot cost in bokeh if necessary
        if use_bokeh and BOKEH_AVAILABLE:
            extensions.append(
                Plot('Cs-En',
                     channels=[['decoder_cost_cost']],
                     after_batch=True))

        # Set up training algorithm
        logger.info("Initializing training algorithm")
        algorithm = GradientDescent(
            cost=cost,
            parameters=cg.parameters,
            on_unused_sources='warn',
            step_rule=CompositeRule([
                StepClipping(config['step_clipping']),
                eval(config['step_rule'])()
            ]))

        # Initialize main loop
        logger.info("Initializing main loop")
        main_loop = MainLoop(model=training_model,
                             algorithm=algorithm,
                             data_stream=tr_stream,
                             extensions=extensions)

        # Train!
        main_loop.run()

    elif mode == 'translate':
        # Create Theano variables
        logger.info('Creating theano variables')
        source_sentence = tensor.lmatrix('source')
        source_topical_word = tensor.lmatrix('source_topical')

        # Get test set stream
        test_stream = get_dev_stream_with_topicalq(
            config['test_set'], config['src_vocab'], config['src_vocab_size'],
            config['topical_test_set'], config['topical_vocab'],
            config['topical_vocab_size'], config['unk_id'])
        ftrans = open(config['test_set'] + '.trans.out', 'w')

        # Helper utilities
        sutils = SamplingBase()
        unk_idx = config['unk_id']
        src_eos_idx = config['src_vocab_size'] - 1
        trg_eos_idx = config['trg_vocab_size'] - 1

        # Get beam search
        logger.info("Building sampling model")
        topic_embedding = topical_transformer.apply(source_topical_word)
        representation = encoder.apply(source_sentence,
                                       tensor.ones(source_sentence.shape))
        tw_representation = topical_transformer.look_up.apply(
            source_topical_word.T)
        content_embedding = representation[0, :,
                                           (representation.shape[2] / 2):]
        generated = decoder.generate(source_sentence,
                                     representation,
                                     tw_representation,
                                     topical_embedding=topic_embedding,
                                     content_embedding=content_embedding)

        _, samples = VariableFilter(
            bricks=[decoder.sequence_generator], name="outputs")(
                ComputationGraph(generated[1]))  # generated[1] is next_outputs
        beam_search = BeamSearch(samples=samples)

        logger.info("Loading the model..")
        model = Model(generated)
        loader = LoadNMT(config['saveto'])
        loader.set_model_parameters(model, loader.load_parameters())

        # Get target vocabulary
        trg_vocab = _ensure_special_tokens(pickle.load(
            open(config['trg_vocab'], 'rb')),
                                           bos_idx=0,
                                           eos_idx=trg_eos_idx,
                                           unk_idx=unk_idx)
        trg_ivocab = {v: k for k, v in trg_vocab.items()}

        logger.info("Started translation: ")
        total_cost = 0.0

        for i, line in enumerate(test_stream.get_epoch_iterator()):
            seq = sutils._oov_to_unk(line[0], config['src_vocab_size'],
                                     unk_idx)
            seq2 = line[1]
            input_ = numpy.tile(seq, (config['beam_size'], 1))
            input_topical = numpy.tile(seq2, (config['beam_size'], 1))

            # draw sample, checking to ensure we don't get an empty string back
            trans, costs = beam_search.search(
                input_values={
                    source_sentence: input_,
                    source_topical_word: input_topical
                },
                max_length=10 * len(seq),
                eol_symbol=src_eos_idx,
                ignore_first_eol=True)
            '''
            # normalize costs according to the sequence lengths
            if config['normalized_bleu']:
                lengths = numpy.array([len(s) for s in trans])
                costs = costs / lengths
            '''
            # best = numpy.argsort(costs)[0]
            best = numpy.argsort(costs)[0:config['beam_size']]
            for b in best:
                try:
                    total_cost += costs[b]
                    trans_out = trans[b]
                    # convert idx to words
                    trans_out = sutils._idx_to_word(trans_out, trg_ivocab)
                except ValueError:
                    logger.info(
                        "Can NOT find a translation for line: {}".format(
                            i + 1))
                    trans_out = '<UNK>'
                print(trans_out, file=ftrans)

            if i != 0 and i % 100 == 0:
                logger.info("Translated {} lines of test set...".format(i))

        logger.info("Total cost of the test: {}".format(total_cost))
        ftrans.close()

    elif mode == 'rerank':
        # Create Theano variables
        ftrans = open(config['val_set'] + '.scores.out', 'w')
        logger.info('Creating theano variables')
        source_sentence = tensor.lmatrix('source')
        source_sentence_mask = tensor.matrix('source_mask')
        target_sentence = tensor.lmatrix('target')
        target_sentence_mask = tensor.matrix('target_mask')

        config['src_data'] = config['val_set']
        config['trg_data'] = config['val_set_grndtruth']
        config['batch_size'] = 1
        config['sort_k_batches'] = 1
        test_stream = get_tr_stream_unsorted(**config)

        logger.info("Building sampling model")
        representations = encoder.apply(source_sentence,
                                        source_sentence_mask)
        costs = decoder.cost(representations, source_sentence_mask,
                             target_sentence, target_sentence_mask)

        logger.info("Loading the model..")
        model = Model(costs)
        loader = LoadNMT(config['saveto'])
        loader.set_model_parameters(model, loader.load_parameters())

        costs_computer = function([
            source_sentence, source_sentence_mask, target_sentence,
            target_sentence_mask
        ], costs)
        iterator = test_stream.get_epoch_iterator()

        scores = []
        for i, (src, src_mask, trg, trg_mask) in enumerate(iterator):
            costs = costs_computer(*[src, src_mask, trg, trg_mask])
            cost = costs.sum()
            print(i, cost)
            scores.append(cost)
            ftrans.write(str(cost) + "\n")
        ftrans.close()
import mxnet as mx
import pickle
from model import Encoder, Decoder, beam_search_translate

with open("./data/in_vocab.pkl", "rb") as fp:
    in_vocab = pickle.load(fp)
with open("./data/out_vocab.pkl", "rb") as fp:
    out_vocab = pickle.load(fp)

embed_size, num_hiddens, num_layers, ctx = 200, 200, 3, mx.cpu()
attention_size, drop_prob = 20, 0.1

encoder = Encoder(len(in_vocab), embed_size, num_hiddens, num_layers,
                  drop_prob)
decoder = Decoder(len(out_vocab), embed_size, num_hiddens, num_layers,
                  attention_size, drop_prob)
encoder.load_parameters('./data/params_encoder_180')
decoder.load_parameters('./data/params_decoder_180')

# testing: should return 我无法做到
input_seq = "I can't do it ."
beam_search_translate(encoder, decoder, input_seq, 20, ctx, 3, in_vocab,
                      out_vocab)

# should return 他很穷
input_seq = "He is poor ."
beam_search_translate(encoder, decoder, input_seq, 20, ctx, 3, in_vocab,
                      out_vocab)
def train(resume=False):
    it = 0
    writer = SummaryWriter('../runs/' + hparams.exp_name)
    for k in hparams.__dict__.keys():
        writer.add_text(str(k), str(hparams.__dict__[k]))

    train_dataset = ChestData(
        data_csv=hparams.train_csv,
        data_dir=hparams.train_dir,
        transform=transforms.Compose([
            transforms.ToTensor(),
            # transforms.Normalize((0.485), (0.229))
        ]))
    validation_dataset = ChestData(
        data_csv=hparams.valid_csv,
        data_dir=hparams.valid_dir,
        transform=transforms.Compose([
            transforms.ToTensor(),
            # transforms.Normalize((0.485), (0.229))
        ]))

    train_loader = DataLoader(train_dataset,
                              batch_size=hparams.batch_size,
                              shuffle=True,
                              num_workers=0)
    validation_loader = DataLoader(validation_dataset,
                                   batch_size=hparams.batch_size,
                                   shuffle=True,
                                   num_workers=0)

    print('loaded train data of length : {}'.format(len(train_dataset)))

    Tensor = torch.cuda.FloatTensor if hparams.cuda else torch.FloatTensor

    def validation(encoder_, decoder_=None, send_stats=False, epoch=0):
        encoder_ = encoder_.eval()
        if decoder_:
            decoder_ = decoder_.eval()
        # print('Validating model on {0} examples. '.format(len(validation_loader)))
        with torch.no_grad():
            scores_list = []
            labels_list = []
            val_loss = 0
            for (img, labels, imgs_names) in validation_loader:
                img = Variable(img.float(), requires_grad=False)
                labels = Variable(labels.float(), requires_grad=False)
                scores = None
                if hparams.cuda:
                    img = img.cuda(hparams.gpu_device)
                    labels = labels.cuda(hparams.gpu_device)
                z = encoder_(img)
                if decoder_:
                    outputs = decoder_(z)
                    scores = torch.sum((outputs - img)**2,
                                       dim=tuple(range(1, outputs.dim())))
                    # rec_loss = rec_loss.view(outputs.shape[0], -1)
                    # rec_loss = torch.sum(torch.sum(rec_loss, dim=1))
                    val_loss += torch.sum(scores)
                    save_image(img, 'tmp/img_{}.png'.format(epoch),
                               normalize=True)
                    save_image(outputs,
                               'tmp/reconstructed_{}.png'.format(epoch),
                               normalize=True)
                else:
                    dist = torch.sum((z - encoder.center)**2, dim=1)
                    if hparams.objective == 'soft-boundary':
                        scores = dist - encoder.radius**2
                        val_loss += (1 / hparams.nu) * torch.sum(
                            torch.max(torch.zeros_like(scores), scores))
                    else:
                        scores = dist
                        val_loss += torch.sum(dist)
                scores_list.append(scores)
                labels_list.append(labels)

            scores = torch.cat(scores_list, dim=0)
            labels = torch.cat(labels_list, dim=0)

            val_loss /= len(validation_dataset)
            val_loss += encoder_.radius**2 if decoder_ and \
                hparams.objective == 'soft-boundary' else 0

            if hparams.cuda:
                labels = labels.cpu()
                scores = scores.cpu()
            labels = labels.view(-1).numpy()
            scores = scores.view(-1).detach().numpy()
            auc = roc_auc_score(labels, scores)
        return auc, val_loss
    # validation function ends.

    if hparams.cuda:
        encoder = Encoder().cuda(hparams.gpu_device)
        decoder = Decoder().cuda(hparams.gpu_device)
    else:
        encoder = Encoder()
        decoder = Decoder()

    params_count = 0
    for param in encoder.parameters():
        params_count += np.prod(param.size())
    for param in decoder.parameters():
        params_count += np.prod(param.size())
    print('Model has {0} trainable parameters'.format(params_count))

    if not hparams.load_model:
        encoder.apply(weights_init_normal)
        decoder.apply(weights_init_normal)

    optim_params = list(encoder.parameters())
    optimizer_train = optim.Adam(optim_params,
                                 lr=hparams.train_lr,
                                 weight_decay=hparams.weight_decay,
                                 amsgrad=hparams.optimizer == 'amsgrad')
    if hparams.pretrain:
        optim_params += list(decoder.parameters())
        optimizer_pre = optim.Adam(optim_params,
                                   lr=hparams.pretrain_lr,
                                   weight_decay=hparams.ae_weight_decay,
                                   amsgrad=hparams.optimizer == 'amsgrad')
        # scheduler_pre = ReduceLROnPlateau(optimizer_pre, mode='min', factor=0.5, patience=10, verbose=True, cooldown=20)
        scheduler_pre = MultiStepLR(optimizer_pre,
                                    milestones=hparams.lr_milestones,
                                    gamma=0.1)
    # scheduler_train = ReduceLROnPlateau(optimizer_train, mode='min', factor=0.5, patience=10, verbose=True, cooldown=20)
    scheduler_train = MultiStepLR(optimizer_train,
                                  milestones=hparams.lr_milestones,
                                  gamma=0.1)

    print('Starting training.. (log saved in:{})'.format(hparams.exp_name))
    start_time = time.time()

    mode = 'pretrain' if hparams.pretrain else 'train'
    best_valid_loss = 100000000000000000
    best_valid_auc = 0
    encoder = init_center(encoder, train_loader)
    # print(model)

    for epoch in range(hparams.num_epochs):
        if mode == 'pretrain' and epoch == hparams.pretrain_epoch:
            print('Pretraining done.')
            mode = 'train'
            best_valid_loss = 100000000000000000
            best_valid_auc = 0
            encoder = init_center(encoder, train_loader)

        for batch, (imgs, labels, _) in enumerate(train_loader):
            # imgs = Variable(imgs.float(), requires_grad=False)
            if hparams.cuda:
                imgs = imgs.cuda(hparams.gpu_device)

            if mode == 'pretrain':
                optimizer_pre.zero_grad()
                z = encoder(imgs)
                outputs = decoder(z)
                # print(torch.max(outputs), torch.mean(imgs), torch.min(outputs), torch.mean(imgs))
                scores = torch.sum((outputs - imgs)**2,
                                   dim=tuple(range(1, outputs.dim())))
                # print(scores)
                loss = torch.mean(scores)
                loss.backward()
                optimizer_pre.step()
                writer.add_scalar('pretrain_loss',
                                  loss.item(),
                                  global_step=batch +
                                  len(train_loader) * epoch)
            else:
                optimizer_train.zero_grad()
                z = encoder(imgs)
                dist = torch.sum((z - encoder.center)**2, dim=1)
                if hparams.objective == 'soft-boundary':
                    scores = dist - encoder.radius**2
                    loss = encoder.radius**2 + (1 / hparams.nu) * torch.mean(
                        torch.max(torch.zeros_like(scores), scores))
                else:
                    loss = torch.mean(dist)
                loss.backward()
                optimizer_train.step()

                if hparams.objective == 'soft-boundary' and \
                        epoch >= hparams.warmup_epochs:
                    R = np.quantile(np.sqrt(dist.clone().data.cpu().numpy()),
                                    1 - hparams.nu)
                    encoder.radius = torch.tensor(R)
                    if hparams.cuda:
                        encoder.radius = encoder.radius.cuda(
                            hparams.gpu_device)
                    writer.add_scalar('radius',
                                      encoder.radius.item(),
                                      global_step=batch +
                                      len(train_loader) * epoch)
                writer.add_scalar('train_loss',
                                  loss.item(),
                                  global_step=batch +
                                  len(train_loader) * epoch)
            # pred_labels = (scores >= hparams.thresh)
            # save_image(imgs, 'train_imgs.png')
            # save_image(noisy_imgs, 'train_noisy.png')
            # save_image(gen_imgs, 'train_z.png')

            if batch % hparams.print_interval == 0:
                print('[Epoch - {0:.1f}, batch - {1:.3f}, loss - {2:.6f}]'.
                      format(1.0 * epoch, 100.0 * batch / len(train_loader),
                             loss.item()))

        if mode == 'pretrain':
            val_auc, rec_loss = validation(copy.deepcopy(encoder),
                                           copy.deepcopy(decoder),
                                           epoch=epoch)
        else:
            val_auc, val_loss = validation(copy.deepcopy(encoder),
                                           epoch=epoch)
        writer.add_scalar('val_auc', val_auc, global_step=epoch)

        if mode == 'pretrain':
            best_valid_auc = max(best_valid_auc, val_auc)
            scheduler_pre.step()
            writer.add_scalar('rec_loss', rec_loss, global_step=epoch)
            writer.add_scalar('pretrain_lr',
                              optimizer_pre.param_groups[0]['lr'],
                              global_step=epoch)
            torch.save(
                {
                    'epoch': epoch,
                    'encoder_state_dict': encoder.state_dict(),
                    'decoder_state_dict': decoder.state_dict(),
                    'optimizer_pre_state_dict': optimizer_pre.state_dict(),
                }, hparams.model + '.pre')
            if best_valid_loss >= rec_loss:
                best_valid_loss = rec_loss
                torch.save(
                    {
                        'epoch': epoch,
                        'encoder_state_dict': encoder.state_dict(),
                        'decoder_state_dict': decoder.state_dict(),
                        'optimizer_pre_state_dict':
                        optimizer_pre.state_dict(),
                    }, hparams.model + '.pre.best')
                print('best model on validation set saved.')
            print('[Epoch - {0:.1f} ---> rec_loss - {1:.4f}, current_lr - {2:.6f}, val_auc - {3:.4f}, best_valid_auc - {4:.4f}] - time - {5:.1f}'
                  .format(1.0 * epoch, rec_loss,
                          optimizer_pre.param_groups[0]['lr'], val_auc,
                          best_valid_auc, time.time() - start_time))
        else:
            scheduler_train.step()
            writer.add_scalar('val_loss', val_loss, global_step=epoch)
            writer.add_scalar('train_lr',
                              optimizer_train.param_groups[0]['lr'],
                              global_step=epoch)
            torch.save(
                {
                    'epoch': epoch,
                    'encoder_state_dict': encoder.state_dict(),
                    'center': encoder.center,
                    'radius': encoder.radius,
                    'optimizer_train_state_dict':
                    optimizer_train.state_dict(),
                }, hparams.model + '.train')
            if best_valid_loss >= val_loss:
                best_valid_loss = val_loss
                torch.save(
                    {
                        'epoch': epoch,
                        'encoder_state_dict': encoder.state_dict(),
                        'center': encoder.center,
                        'radius': encoder.radius,
                        'optimizer_train_state_dict':
                        optimizer_train.state_dict(),
                    }, hparams.model + '.train.best')
                print('best model on validation set saved.')
            if best_valid_auc <= val_auc:
                best_valid_auc = val_auc
                torch.save(
                    {
                        'epoch': epoch,
                        'encoder_state_dict': encoder.state_dict(),
                        'center': encoder.center,
                        'radius': encoder.radius,
                        'optimizer_train_state_dict':
                        optimizer_train.state_dict(),
                    }, hparams.model + '.train.auc')
                print('best model on validation set saved.')
            print('[Epoch - {0:.1f} ---> val_loss - {1:.4f}, current_lr - {2:.6f}, val_auc - {3:.4f}, best_valid_auc - {4:.4f}] - time - {5:.1f}'
                  .format(1.0 * epoch, val_loss,
                          optimizer_train.param_groups[0]['lr'], val_auc,
                          best_valid_auc, time.time() - start_time))
        start_time = time.time()
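# init_center is called above but not defined in this excerpt. In Deep
# SVDD-style training it typically sets the hypersphere center to the mean
# of the initial embeddings; a sketch under that assumption (the eps
# clamping follows the Deep SVDD reference implementation).
def init_center(encoder, loader, eps=0.1):
    n_samples = 0
    center = None
    encoder.eval()
    with torch.no_grad():
        for imgs, _, _ in loader:
            if hparams.cuda:
                imgs = imgs.cuda(hparams.gpu_device)
            z = encoder(imgs)
            center = z.sum(dim=0) if center is None else center + z.sum(dim=0)
            n_samples += z.shape[0]
    center = center / n_samples
    # push coordinates that are too close to zero away from it,
    # so trivial solutions are avoided
    center[(abs(center) < eps) & (center < 0)] = -eps
    center[(abs(center) < eps) & (center > 0)] = eps
    encoder.center = center
    return encoder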
opts_loss = {
    'title': 'sequence loss',
    'xlabel': 'every 200 batch',
    'ylabel': 'loss',
    'showlegend': 'true'
}
opts_acc = {
    'title': 'Accuracy',
    'xlabel': 'every 200 batch',
    'ylabel': 'accuracy',
    'showlegend': 'true'
}

# create the encoder and decoder
encoder = Encoder().to(DEVICE)
decoder = Decoder().to(DEVICE)
viz.text("Seq2seq model built", win='summary')


# count the parameters
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


print("encoder params %d" % count_parameters(encoder))
print("decoder params %d" % count_parameters(decoder))

# load pretrained word embeddings
encoder.load_pretrained_vectors(EMBED_PATH_SRC)
decoder.load_pretrained_vectors(EMBED_PATH_TGT)
viz.text("Pretrained embeddings loaded", win='summary', append=True)
# (tail of the preprocessing function)
    print("Preprocessing complete!")
    return train_data, word2index, tag2index, intent2index


_, word2index, tag2index, intent2index = preprocessing(
    '../dataset/corpus/atis-2.train.w-intent.iob', 60)
index2tag = {v: k for k, v in tag2index.items()}
index2intent = {v: k for k, v in intent2index.items()}

encoder = Encoder(len(word2index), 64, 64)
decoder = Decoder(len(tag2index), len(intent2index), len(tag2index) // 3,
                  64 * 2)
encoder.load_state_dict(
    torch.load('models_006_60_64_64_1_16_16_0p01/jointnlu-encoder.pkl'))
decoder.load_state_dict(
    torch.load('models_006_60_64_64_1_16_16_0p01/jointnlu-decoder.pkl'))
if USE_CUDA:
    encoder = encoder.cuda()
    decoder = decoder.cuda()

test = open("./dataset/corpus/atis.test.w-intent.iob", "r").readlines()
test = [t[:-1] for t in test]
test = [[
    t.split("\t")[0].split(" "),
    t.split("\t")[1].split(" ")[:-1],
    t.split("\t")[1].split(" ")[-1]
] for t in test]
test = [[t[0][1:-1], t[1][1:], t[2]] for t in test]

# index = random.choice(range(len(test)))
error = 0
def main():
    options = parse_args()
    is_cuda = use_cuda and not options.no_cuda
    hardware = "cuda" if is_cuda else "cpu"
    device = torch.device(hardware)

    for dataset_name in options.dataset:
        results = {"best": {}, "mean": {}, "highest_prob": {}}
        for checkpoint_path in options.checkpoint:
            checkpoint_name, _ = os.path.splitext(
                os.path.basename(checkpoint_path))
            checkpoint = (load_checkpoint(checkpoint_path, cuda=is_cuda)
                          if checkpoint_path else default_checkpoint)
            encoder_checkpoint = checkpoint["model"].get("encoder")
            decoder_checkpoint = checkpoint["model"].get("decoder")

            test_set = test_sets[dataset_name]
            dataset = CrohmeDataset(
                test_set["groundtruth"],
                tokensfile,
                root=test_set["root"],
                transform=transformers,
            )
            data_loader = DataLoader(
                dataset,
                batch_size=options.batch_size,
                shuffle=False,
                num_workers=options.num_workers,
                collate_fn=collate_batch,
            )

            enc = Encoder(img_channels=3,
                          checkpoint=encoder_checkpoint).to(device)
            dec = Decoder(
                len(dataset.id_to_token),
                low_res_shape,
                high_res_shape,
                checkpoint=decoder_checkpoint,
                device=device,
            ).to(device)
            enc.eval()
            dec.eval()

            result = evaluate(
                enc,
                dec,
                data_loader=data_loader,
                device=device,
                checkpoint=checkpoint,
                beam_width=options.beam_width,
                prefix=options.prefix,
            )
            results["best"][checkpoint_name] = result["best"]
            results["mean"][checkpoint_name] = result["mean"]
            results["highest_prob"][checkpoint_name] = result["highest_prob"]

        highest_prob_err_table, highest_prob_correct_table = \
            create_markdown_tables(results["highest_prob"])
        best_err_table, best_correct_table = create_markdown_tables(
            results["best"])
        mean_err_table, mean_correct_table = create_markdown_tables(
            results["mean"])

        print(("\n# Dataset {name}\n\n"
               "Beam width: {beam_width}\n\n"
               "## Highest Probability\n\n{highest_prob_err_table}\n\n"
               "{highest_prob_correct_table}\n\n"
               "## Best\n\n{best_err_table}\n\n{best_correct_table}\n\n"
               "## Mean\n\n{mean_err_table}\n\n{mean_correct_table}").format(
                   name=dataset_name,
                   beam_width=options.beam_width,
                   highest_prob_err_table=highest_prob_err_table,
                   highest_prob_correct_table=highest_prob_correct_table,
                   best_err_table=best_err_table,
                   best_correct_table=best_correct_table,
                   mean_err_table=mean_err_table,
                   mean_correct_table=mean_correct_table,
               ))
def main():
    construct_vocab = False
    encode_images = False
    train = True

    # Read and process raw data
    data = CaptioningData()
    # Finding image files as data
    data.set_all_images(cfg.images_path)
    captions_dict = data.get_captions(cfg.token_file)
    caption_maxlen = data.get_caption_maxlen()

    # Construct vocabulary
    if construct_vocab:
        # get all captions to construct the vocab
        all_captions = data.get_all_captions()
        vocab = build_vocab(vocab_path=cfg.data_path,
                            vocab_name=cfg.vocab_name,
                            captions=all_captions,
                            threshold=2)
    else:
        vocab = load_vocab(vocab_path=cfg.data_path,
                           vocab_name=cfg.vocab_name)
    # print(vocab.word2idx)

    inception_encoding = Encoder()

    # train data
    if train:
        train_images = data.get_train_images(cfg.train_image_files)
        train_pairs = [
            ImgCaptionPair(img_id, captions_dict[img_id])
            for img_id in train_images
        ]

        # Image encoding
        if encode_images:
            train_img_encoding = inception_encoding.encode_images(
                file_path=cfg.images_path,
                image_list=train_images,
                encoding_file=cfg.train_img_encoding_file)
        else:
            train_img_encoding = inception_encoding.load_image_encoding(
                encoding_file=cfg.train_img_encoding_file)

        train_data_generator = data_generator(vocab,
                                              train_pairs,
                                              train_img_encoding,
                                              batch_size=1800,
                                              max_len=caption_maxlen)
        # next(g)

    # Decoder model
    decoder = Decoder(vocab_size=len(vocab),
                      embedding_size=300,
                      input_shape=2048,
                      caption_max_len=caption_maxlen)
    decoder_model = decoder.get_model()
    decoder_model.load_weights('best_weights.97-0.95.hdf5')

    if train:
        decoder_model.compile(loss='categorical_crossentropy',
                              optimizer=RMSprop(),
                              metrics=['accuracy'])
        ckpt = ModelCheckpoint('weights.{epoch:02d}-{loss:.2f}.hdf5',
                               monitor='loss',
                               verbose=0,
                               save_best_only=False,
                               save_weights_only=False,
                               mode='auto',
                               period=30)
        best_ckpt = ModelCheckpoint(
            'best_weights.{epoch:02d}-{loss:.2f}.hdf5',
            monitor='loss',
            verbose=0,
            save_best_only=True,
            save_weights_only=False,
            mode='auto',
            period=1)
        decoder_model.fit_generator(train_data_generator,
                                    steps_per_epoch=30,
                                    epochs=100,
                                    callbacks=[ckpt, best_ckpt])
        decoder_model.save('decoder_model.h5')

    img_ids = data.get_val_images(cfg.val_image_files)
    img_name = img_ids[9]
    enc_img = inception_encoding.encode_single_img(file_path=cfg.images_path,
                                                   img_name=img_name)

    caption = ["<start>"]
    while True:
        par_caps = [vocab(i) for i in caption]
        par_caps = sequence.pad_sequences([par_caps],
                                          maxlen=40,
                                          padding='post')
        preds = decoder_model.predict(
            [np.array([enc_img]), np.array(par_caps)])
        word_pred = vocab.idx2word[np.argmax(preds[0])]
        caption.append(word_pred)
        if word_pred == "<end>" or len(caption) > 40:
            break

    full_img_path = os.path.join(cfg.images_path, img_name)
    print(captions_dict[img_name])
    print(full_img_path)
    print(' '.join(caption[1:-1]))
def train():
    opt = parse_args()
    cuda = True if torch.cuda.is_available() else False
    input_shape = (opt.channels, opt.img_width, opt.img_height)
    FloatTensor = torch.cuda.FloatTensor if cuda else torch.FloatTensor

    transform = transforms.Compose([
        transforms.Resize(int(opt.img_height * 1.12), Image.BICUBIC),
        transforms.RandomCrop((opt.img_height, opt.img_width)),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])

    # Get dataloader
    train_loader = coco_loader(opt, mode='train', transform=transform)
    test_loader = coco_loader(opt, mode='test', transform=transform)

    # Get vgg
    vgg = VGGNet()

    # Initialize two generators and the discriminator
    shared_E = Encoder(opt.channels, opt.dim, opt.n_downsample)
    shared_D = Decoder(3, 256, opt.n_upsample)
    G_A = GeneratorA(opt.n_residual, 256, shared_E, shared_D)
    G_B = GeneratorB(opt.n_residual, 256, shared_E, shared_D)
    D_B = Discriminator(input_shape)

    # Initialize weights
    G_A.apply(weights_init_normal)
    G_B.apply(weights_init_normal)
    D_B.apply(weights_init_normal)

    # Losses
    criterion_GAN = torch.nn.MSELoss()
    criterion_pixel = torch.nn.L1Loss()

    if cuda:
        vgg = vgg.cuda().eval()
        G_A = G_A.cuda()
        G_B = G_B.cuda()
        D_B = D_B.cuda()
        criterion_GAN.cuda()
        criterion_pixel.cuda()

    optimizer_G = torch.optim.Adam(itertools.chain(G_A.parameters(),
                                                   G_B.parameters()),
                                   lr=opt.lr,
                                   betas=(0.5, 0.999))
    optimizer_D = torch.optim.Adam(D_B.parameters(),
                                   lr=opt.lr,
                                   betas=(0.5, 0.999))
    lr_scheduler_G = torch.optim.lr_scheduler.LambdaLR(
        optimizer_G,
        lr_lambda=LambdaLR(opt.n_epochs, opt.epoch, opt.decay_epoch).step)
    lr_scheduler_D = torch.optim.lr_scheduler.LambdaLR(
        optimizer_D,
        lr_lambda=LambdaLR(opt.n_epochs, opt.epoch, opt.decay_epoch).step)

    # Compute the style features in advance
    style_img = Variable(load_img(opt.style_img, transform).type(FloatTensor))
    style_feature = vgg(style_img)

    prev_time = time.time()
    for epoch in range(opt.epoch, opt.n_epochs):
        for batch_i, content_img in enumerate(train_loader):
            content_img = Variable(content_img.type(FloatTensor))
            valid = Variable(FloatTensor(
                np.ones((content_img.size(0), *D_B.output_shape))),
                             requires_grad=False)
            fake = Variable(FloatTensor(
                np.zeros((content_img.size(0), *D_B.output_shape))),
                            requires_grad=False)

            # ---------------------
            #  Train Generators
            # ---------------------
            optimizer_G.zero_grad()

            # The generated images are not de-normalized; keep the content,
            # style, generated images, and image preprocessing consistent!
            stylized_img = G_A(content_img)
            target_feature = vgg(stylized_img)
            content_feature = vgg(content_img)
            loss_st = opt.lambda_st * vgg.compute_st_loss(
                target_feature, content_feature, style_feature,
                opt.lambda_style)

            reconstructed_img = G_B(stylized_img)
            loss_adv = opt.lambda_adv * criterion_GAN(D_B(reconstructed_img),
                                                      valid)

            loss_G = loss_st + loss_adv
            loss_G.backward(retain_graph=True)
            optimizer_G.step()

            # ----------------------
            #  Train Discriminator
            # ----------------------
            optimizer_D.zero_grad()

            loss_D = criterion_GAN(D_B(content_img), valid) + criterion_GAN(
                D_B(reconstructed_img.detach()), fake)
            loss_D.backward()
            optimizer_D.step()

            # ------------------
            #  Log Information
            # ------------------
            batches_done = epoch * len(train_loader) + batch_i
            batches_left = opt.n_epochs * len(train_loader) - batches_done
            time_left = datetime.timedelta(seconds=batches_left *
                                           (time.time() - prev_time))
            prev_time = time.time()

            print(
                "[Epoch %d/%d] [Batch %d/%d] [D loss: %f] [G loss: %f] ETA: %s"
                % (epoch, opt.n_epochs, batch_i, len(train_loader),
                   loss_D.item(), loss_G.item(), time_left))

            if batches_done % opt.sample_interval == 0:
                save_sample(opt.style_name, test_loader, batches_done, G_A,
                            G_B, FloatTensor)

            if batches_done % opt.checkpoint_interval == 0:
                torch.save(
                    G_A.state_dict(),
                    "checkpoints/%s/G_A_%d.pth" % (opt.style_name, epoch))
                torch.save(
                    G_B.state_dict(),
                    "checkpoints/%s/G_B_%d.pth" % (opt.style_name, epoch))

        # Update learning rates
        lr_scheduler_G.step()
        lr_scheduler_D.step()

    torch.save(G_A.state_dict(),
               "checkpoints/%s/G_A_done.pth" % opt.style_name)
    torch.save(G_B.state_dict(),
               "checkpoints/%s/G_B_done.pth" % opt.style_name)
    print("Training process has been done!")
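# The LambdaLR object whose .step method is passed to
# torch.optim.lr_scheduler.LambdaLR above is not shown. A sketch of the
# common CycleGAN-style schedule (constant lr, then linear decay to zero
# starting at decay_epoch); the project's version may differ.
class LambdaLR:
    def __init__(self, n_epochs, offset, decay_start_epoch):
        self.n_epochs = n_epochs
        self.offset = offset
        self.decay_start_epoch = decay_start_epoch

    def step(self, epoch):
        return 1.0 - max(0, epoch + self.offset - self.decay_start_epoch) / (
            self.n_epochs - self.decay_start_epoch)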
def train(args, logger): task_time = time.strftime("%Y-%m-%d %H:%M", time.localtime()) Path("./saved_models/").mkdir(parents=True, exist_ok=True) Path("./pretrained_models/").mkdir(parents=True, exist_ok=True) MODEL_SAVE_PATH = './saved_models/' Pretrained_MODEL_PATH = './pretrained_models/' get_model_name = lambda part: f'{part}-{args.data}-{args.tasks}-{args.prefix}.pth' get_pretrain_model_name = lambda part: f'{part}-{args.data}-LP-{args.prefix}.pth' device_string = 'cuda:{}'.format( args.gpu) if torch.cuda.is_available() and args.gpu >= 0 else 'cpu' print('Model training with ' + device_string) device = torch.device(device_string) g = load_graphs(f"./data/{args.data}.bin")[0][0] print(g) efeat_dim = g.edata['feat'].shape[1] nfeat_dim = efeat_dim train_loader, val_loader, test_loader, num_val_samples, num_test_samples = dataloader( args, g) encoder = Encoder(args, nfeat_dim, n_head=args.n_head, dropout=args.dropout).to(device) decoder = Decoder(args, nfeat_dim).to(device) msg2mail = Msg2Mail(args, nfeat_dim) fraud_sampler = frauder_sampler(g) optimizer = torch.optim.Adam(list(encoder.parameters()) + list(decoder.parameters()), lr=args.lr, weight_decay=args.weight_decay) scheduler_lr = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=40) if args.warmup: scheduler_warmup = GradualWarmupScheduler(optimizer, multiplier=1, total_epoch=3, after_scheduler=scheduler_lr) optimizer.zero_grad() optimizer.step() loss_fcn = torch.nn.BCEWithLogitsLoss() loss_fcn = loss_fcn.to(device) early_stopper = EarlyStopMonitor(logger=logger, max_round=args.patience, higher_better=True) if args.pretrain: logger.info( 'Loading the link-prediction pretrained attention-based encoder model') encoder.load_state_dict( torch.load(Pretrained_MODEL_PATH + get_pretrain_model_name('Encoder'))) for epoch in range(args.n_epoch): # reset node state g.ndata['mail'] = torch.zeros( (g.num_nodes(), args.n_mail, nfeat_dim + 2), dtype=torch.float32) g.ndata['feat'] = torch.zeros( (g.num_nodes(), nfeat_dim), dtype=torch.float32 ) # initialized to zero; other initializations are possible.
g.ndata['last_update'] = torch.zeros((g.num_nodes()), dtype=torch.float32) encoder.train() decoder.train() start_epoch = time.time() m_loss = [] logger.info('start {} epoch, current optim lr is {}'.format( epoch, optimizer.param_groups[0]['lr'])) for batch_idx, (input_nodes, pos_graph, neg_graph, blocks, frontier, current_ts) in enumerate(train_loader): pos_graph = pos_graph.to(device) neg_graph = neg_graph.to(device) if neg_graph is not None else None if not args.no_time or not args.no_pos: current_ts, pos_ts, num_pos_nodes = get_current_ts( args, pos_graph, neg_graph) pos_graph.ndata['ts'] = current_ts else: current_ts, pos_ts, num_pos_nodes = None, None, None _ = dgl.add_reverse_edges( neg_graph) if neg_graph is not None else None emb, _ = encoder(dgl.add_reverse_edges(pos_graph), _, num_pos_nodes) if batch_idx != 0: if 'LP' not in args.tasks and args.balance: neg_graph = fraud_sampler.sample_fraud_event( g, args.bs // 5, current_ts.max().cpu()).to(device) logits, labels = decoder(emb, pos_graph, neg_graph) loss = loss_fcn(logits, labels) optimizer.zero_grad() loss.backward() optimizer.step() m_loss.append(loss.item()) # MSG Passing with torch.no_grad(): mail = msg2mail.gen_mail(args, emb, input_nodes, pos_graph, frontier, 'train') if not args.no_time: g.ndata['last_update'][pos_graph.ndata[dgl.NID] [:num_pos_nodes]] = pos_ts.to('cpu') g.ndata['feat'][pos_graph.ndata[dgl.NID]] = emb.to('cpu') g.ndata['mail'][input_nodes] = mail if batch_idx % 100 == 1: gpu_mem = torch.cuda.max_memory_allocated( ) / 1.074e9 if torch.cuda.is_available( ) and args.gpu >= 0 else 0 torch.cuda.empty_cache() mem_perc = psutil.virtual_memory().percent cpu_perc = psutil.cpu_percent(interval=None) output_string = f'Epoch {epoch} | Step {batch_idx}/{len(train_loader)} | CPU {cpu_perc:.1f}% | Sys Mem {mem_perc:.1f}% | GPU Mem {gpu_mem:.4f}GB ' output_string += f'| {args.tasks} Loss {np.mean(m_loss):.4f}' logger.info(output_string) total_epoch_time = time.time() - start_epoch logger.info(' training epoch: {} took {:.4f}s'.format( epoch, total_epoch_time)) val_ap, val_auc, val_acc, val_loss = eval_epoch( args, logger, g, val_loader, encoder, decoder, msg2mail, loss_fcn, device, num_val_samples) logger.info( 'Val {} Task | ap: {:.4f} | auc: {:.4f} | acc: {:.4f} | Loss: {:.4f}' .format(args.tasks, val_ap, val_auc, val_acc, val_loss)) if args.warmup: scheduler_warmup.step(epoch) else: scheduler_lr.step() early_stopper_metric = val_ap if 'LP' in args.tasks else val_auc if early_stopper.early_stop_check(early_stopper_metric): logger.info('No improvement over {} epochs, stop training'.format( early_stopper.max_round)) logger.info( f'Loading the best model at epoch {early_stopper.best_epoch}') encoder.load_state_dict( torch.load(MODEL_SAVE_PATH + get_model_name('Encoder'))) decoder.load_state_dict( torch.load(MODEL_SAVE_PATH + get_model_name('Decoder'))) test_result = [ early_stopper.best_ap, early_stopper.best_auc, early_stopper.best_acc, early_stopper.best_loss ] break test_ap, test_auc, test_acc, test_loss = eval_epoch( args, logger, g, test_loader, encoder, decoder, msg2mail, loss_fcn, device, num_test_samples) logger.info( 'Test {} Task | ap: {:.4f} | auc: {:.4f} | acc: {:.4f} | Loss: {:.4f}' .format(args.tasks, test_ap, test_auc, test_acc, test_loss)) test_result = [test_ap, test_auc, test_acc, test_loss] if early_stopper.best_epoch == epoch: early_stopper.best_ap = test_ap early_stopper.best_auc = test_auc early_stopper.best_acc = test_acc early_stopper.best_loss = test_loss logger.info( f'Saving the best model at 
epoch {early_stopper.best_epoch}') torch.save(encoder.state_dict(), MODEL_SAVE_PATH + get_model_name('Encoder')) torch.save(decoder.state_dict(), MODEL_SAVE_PATH + get_model_name('Decoder'))
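# The `EarlyStopMonitor` used above is not shown in this snippet; a minimal sketch of a class compatible
# with the calls made here (`early_stop_check`, `max_round`, `best_epoch`), assuming a higher metric is
# better. The real class evidently tracks more state (best_ap, best_auc, best_acc, best_loss):
class EarlyStopMonitor:
    def __init__(self, logger, max_round=5, higher_better=True):
        self.logger = logger
        self.max_round = max_round
        self.sign = 1.0 if higher_better else -1.0
        self.num_round = 0
        self.best_epoch = 0
        self.best_val = None
        self.epoch_count = 0

    def early_stop_check(self, curr_val):
        # Reset the counter on improvement, otherwise count one more stale round
        if self.best_val is None or self.sign * curr_val > self.sign * self.best_val:
            self.best_val = curr_val
            self.best_epoch = self.epoch_count
            self.num_round = 0
        else:
            self.num_round += 1
        self.epoch_count += 1
        return self.num_round >= self.max_round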
def main(): device = torch.device("cuda" if torch.cuda.is_available() else "cpu") logging.info("Device type is '%s'" % (device)) logging.info("Load %s" % (INDEX2WORD_PATH)) with open(INDEX2WORD_PATH) as f: i2w = json.load(f) with open(WORD2INDEX_PATH) as f: w2i = json.load(f) logging.info("Load word embedding") word_embedding = np.load(WORD_EMBEDDING_DATA) logging.info("Read validation dataset") valSet = buildS2SDatasetForTest.build(VAL_DATASET_PATH) logging.info("Build data loader for validation data") validation_generator = data.DataLoader(valSet, batch_size=BATCH_SIZE, shuffle=False, collate_fn=valSet.collate_fn) logging.info("Data loader loading is complete") loss_function = nn.CrossEntropyLoss(ignore_index=0) logging.info("Build encoder with hidden dimension %s" % (HIDDEN_DIMENSION)) encoder = Encoder(EMBEDDING_DIMENSION, HIDDEN_DIMENSION, word_embedding.shape[0], word_embedding, RNN_LAYER, DROPOUT, BIDIRECTION) logging.info("Build decoder with hidden dimension %s" % (HIDDEN_DIMENSION)) decoder = Decoder(EMBEDDING_DIMENSION, HIDDEN_DIMENSION, word_embedding.shape[0], word_embedding, RNN_LAYER, DROPOUT) logging.info("Build seq2seq model") model = Seq2Seq(encoder, decoder, device) del word_embedding model = torch.load(MODEL) model.to(device) check_model_performance = -1 #torch.set_printoptions(threshold=100000) model.eval() logging.info("Start validation") final = [] box = [] ID = [] with torch.no_grad(): for step, d in enumerate(validation_generator): if step % 50 == 0: logging.info("Valid step %s" % (step)) text = d['text'].to(device, dtype=torch.long) length = d['len_text'] mask = d['attention_mask'].to(device, dtype=torch.long) out, predict = model.predict(text, 1, 2, length, mask) box.append(predict) ID.append(d['id']) del text, mask, out, predict for predict, test_idx in zip(box, ID): pre = 3 for idx, ii in enumerate(predict): ans = [] for s, j in enumerate(ii): if j == pre or i2w[j] == "<unk>": continue if i2w[j] == "</s>" or s > 80: break ans.append(i2w[j]) pre = j sent = " ".join(ans) s_ans = {"id": test_idx[idx], "predict": sent} final.append(s_ans) logging.info("End of prediction") with open(OUTPUT_PATH, "w") as f: f.write("\n".join([json.dumps(p) for p in final]) + "\n")
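# The nested loop above decodes token ids to words while skipping immediate repeats and `<unk>`, and
# stops at `</s>` or position 80. The same logic as a standalone helper (hypothetical name, written here
# only to make the post-processing easier to follow; `i2w` is the index-to-word mapping loaded above):
def ids_to_sentence(ids, i2w, max_pos=80):
    words, prev = [], None
    for pos, idx in enumerate(ids):
        if idx == prev or i2w[idx] == "<unk>":
            continue  # drop repeats and unknown tokens
        if i2w[idx] == "</s>" or pos > max_pos:
            break  # end of sentence or length limit reached
        words.append(i2w[idx])
        prev = idx
    return " ".join(words)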
val_dataset = CelebA(args.ann_file, args.image_dir, eval_index_list, transform_val, transform_val, args.att_num) val_loader = DataLoader(val_dataset, shuffle=True, batch_size=args.batch_size, num_workers=args.nthreads) print("| Data Loaded: # training data: %d, # val data: %d" % (len(train_loader) * args.batch_size, len(val_loader) * args.batch_size)) ############################################################################### # Build the model ############################################################################### encoder = Encoder() decoder = Decoder(att_num=args.att_num) classifier = Classifier(args.att_num) optimizer = optim.Adam( [ { 'params': encoder.parameters() }, { 'params': decoder.parameters() }, # {'params': classifier.parameters()} ], lr=args.lr, weight_decay=args.weight_decay)
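# The classifier's parameters are commented out of the optimizer above. If they should be trained as
# well, torch.optim supports per-group options, so the classifier could get its own learning rate; a
# sketch (the 10x factor is purely illustrative):
optimizer = optim.Adam(
    [
        {'params': encoder.parameters()},
        {'params': decoder.parameters()},
        {'params': classifier.parameters(), 'lr': 10 * args.lr},  # separate LR for the classifier head
    ],
    lr=args.lr,
    weight_decay=args.weight_decay)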
def main(): options = parse_args() torch.manual_seed(options.seed) is_cuda = use_cuda and not options.no_cuda hardware = "cuda" if is_cuda else "cpu" device = torch.device(hardware) checkpoint = (load_checkpoint(options.checkpoint, cuda=is_cuda) if options.checkpoint else default_checkpoint) print("Running {} epochs on {}".format(options.num_epochs, hardware)) encoder_checkpoint = checkpoint["model"].get("encoder") decoder_checkpoint = checkpoint["model"].get("decoder") if encoder_checkpoint is not None: print(("Resuming from - Epoch {}: " "Train Accuracy = {train_accuracy:.5f}, " "Train Loss = {train_loss:.5f}, " "Validation Accuracy = {validation_accuracy:.5f}, " "Validation Loss = {validation_loss:.5f}, ").format( checkpoint["epoch"], train_accuracy=checkpoint["train_accuracy"][-1], train_loss=checkpoint["train_losses"][-1], validation_accuracy=checkpoint["validation_accuracy"][-1], validation_loss=checkpoint["validation_losses"][-1], )) train_dataset = CrohmeDataset(gt_train, tokensfile, root=root, crop=options.crop, transform=transformers) train_data_loader = DataLoader( train_dataset, batch_size=options.batch_size, shuffle=True, num_workers=options.num_workers, collate_fn=collate_batch, ) validation_dataset = CrohmeDataset(gt_validation, tokensfile, root=root, crop=options.crop, transform=transformers) validation_data_loader = DataLoader( validation_dataset, batch_size=options.batch_size, shuffle=True, num_workers=options.num_workers, collate_fn=collate_batch, ) criterion = nn.CrossEntropyLoss().to(device) enc = Encoder(img_channels=3, dropout_rate=options.dropout_rate, checkpoint=encoder_checkpoint).to(device) dec = Decoder( len(train_dataset.id_to_token), low_res_shape, high_res_shape, checkpoint=decoder_checkpoint, device=device, ).to(device) enc.train() dec.train() enc_params_to_optimise = [ param for param in enc.parameters() if param.requires_grad ] dec_params_to_optimise = [ param for param in dec.parameters() if param.requires_grad ] params_to_optimise = [*enc_params_to_optimise, *dec_params_to_optimise] optimiser = optim.Adadelta(params_to_optimise, lr=options.lr, weight_decay=options.weight_decay) optimiser_state = checkpoint.get("optimiser") if optimiser_state: optimiser.load_state_dict(optimiser_state) # Set the learning rate instead of using the previous state. # The scheduler somehow overwrites the LR to the initial LR after loading, # which would always reset it to the first used learning rate instead of # the one from the previous checkpoint. So might as well set it manually. for param_group in optimiser.param_groups: param_group["initial_lr"] = options.lr # Decay learning rate by a factor of lr_factor (default: 0.1) # every lr_epochs (default: 3) lr_scheduler = optim.lr_scheduler.StepLR(optimiser, step_size=options.lr_epochs, gamma=options.lr_factor) train( enc, dec, optimiser, criterion, train_data_loader, validation_data_loader, teacher_forcing_ratio=options.teacher_forcing, lr_scheduler=lr_scheduler, print_epochs=options.print_epochs, device=device, num_epochs=options.num_epochs, checkpoint=checkpoint, prefix=options.prefix, max_grad_norm=options.max_grad_norm, )
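# StepLR multiplies the learning rate by `gamma` after every `step_size` scheduler steps. A tiny
# self-contained demonstration of the schedule configured above (illustrative values):
import torch.nn as nn
import torch.optim as optim

model = nn.Linear(2, 2)
optimiser = optim.Adadelta(model.parameters(), lr=1.0)
scheduler = optim.lr_scheduler.StepLR(optimiser, step_size=3, gamma=0.1)
for epoch in range(9):
    optimiser.step()  # the real update would follow a backward pass
    scheduler.step()
    # The LR decays by 10x after every 3 scheduler steps
    print(epoch, optimiser.param_groups[0]['lr'])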
def __init__(self): current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") train_log_dir = './logs/gradient_tape/' + current_time + '/train' test_log_dir = './logs/gradient_tape/' + current_time + '/test' self.train_summary_writer = tf.summary.create_file_writer( train_log_dir) self.test_summary_writer = tf.summary.create_file_writer(test_log_dir) self.m = tf.keras.metrics.SparseCategoricalAccuracy() # self.recall = tf.keras.metrics.Recall() self.recall = [0] # self.F1Score = 2*self.m.result()*self.recall.result()/(self.recall.result()+self.m.result()) self.BATCH_SIZE = 128 self.embedding_dim = 24 self.units = 64 # Try experimenting with datasets of different sizes stop_word_dir = './stop_words.utf8' self.stop_words = self.get_stop_words(stop_word_dir) + [''] num_examples = 30000 QA_dir = './QA_data.txt' # QA_dir = 'C:/Users/Administrator/raw_chat_corpus/qingyun-11w/qinyun-11w.csv' self.input_tensor, self.target_tensor, self.inp_tokenizer, self.targ_tokenizer = self.load_dataset( QA_dir, num_examples) self.num_classes = len(self.targ_tokenizer.index_word) # number of target word classes # Initialize the confusion matrices (one for training, one for testing): self.train_confusion_matrix = tfa.metrics.MultiLabelConfusionMatrix( num_classes=self.num_classes) self.test_confusion_matrix = tfa.metrics.MultiLabelConfusionMatrix( num_classes=self.num_classes) self.F1Score = tfa.metrics.F1Score(num_classes=len( self.targ_tokenizer.index_word), average="micro") # self.F1Score = tfa.metrics.F1Score(num_classes=self.max_length_targ, average="micro") # input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split( # self.input_tensor, # self.target_tensor, # test_size=0.2) # self.load_split_dataset(input_tensor_train,target_tensor_train) self.vocab_inp_size = len(self.inp_tokenizer.word_index) + 1 self.vocab_tar_size = len(self.targ_tokenizer.word_index) + 1 # Encoder initialization self.encoder = Encoder(self.vocab_inp_size, self.embedding_dim, self.units, self.BATCH_SIZE) plot_model(self.encoder, to_file='encoder.png', show_shapes=True, show_layer_names=True, rankdir='TB', dpi=900, expand_nested=True) # Sample input # sample_hidden = self.encoder.initialize_hidden_state() # sample_output, sample_hidden = self.encoder.call(self.example_input_batch, sample_hidden) # print('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape)) # print('Encoder Hidden state shape: (batch size, units) {}'.format(sample_hidden.shape)) # Attention initialization attention_layer = BahdanauAttention(10) # attention_result, attention_weights = attention_layer(sample_hidden, sample_output) plot_model(attention_layer, to_file='attention_layer.png', show_shapes=True, show_layer_names=True, rankdir='TB', dpi=900, expand_nested=True) # print("Attention result shape: (batch size, units) {}".format(attention_result.shape)) # print("Attention weights shape: (batch_size, sequence_length, 1) {}".format(attention_weights.shape)) # Decoder initialization self.decoder = Decoder(self.vocab_tar_size, self.embedding_dim, self.units, self.BATCH_SIZE) plot_model(self.decoder, to_file='decoder.png', show_shapes=True, show_layer_names=True, rankdir='TB', dpi=900, expand_nested=True) # sample_decoder_output, _, _ = self.decoder(tf.random.uniform((self.BATCH_SIZE, 1)), # sample_hidden, sample_output) # # print('Decoder output shape: (batch_size, vocab size) {}'.format(sample_decoder_output.shape)) # Optimizer initialization self.optimizer = tf.keras.optimizers.Adam() self.loss_object = tf.keras.losses.SparseCategoricalCrossentropy( from_logits=True, reduction='none') # Checkpoint & save-model-as-object initialization self.checkpoint_dir = 
'./training_checkpoints' self.checkpoint_prefix = os.path.join(self.checkpoint_dir, "ckpt") self.checkpoint = tf.train.Checkpoint(optimizer=self.optimizer, encoder=self.encoder, decoder=self.decoder)
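# With the `tf.train.Checkpoint` built above, persisting and restoring training state uses the standard
# TF2 checkpoint API; a minimal sketch of a hypothetical method this class could expose:
def save_and_restore(self):
    # Write a numbered checkpoint under checkpoint_dir
    self.checkpoint.save(file_prefix=self.checkpoint_prefix)
    # Reload the most recent checkpoint (no-op if none exists yet)
    self.checkpoint.restore(tf.train.latest_checkpoint(self.checkpoint_dir))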
def main(mode, config, use_bokeh=False): # Construct model logger.info('Building RNN encoder-decoder') encoder = BidirectionalEncoder(config['src_vocab_size'], config['enc_embed'], config['enc_nhids']) decoder = Decoder(config['trg_vocab_size'], config['dec_embed'], config['dec_nhids'], config['enc_nhids'] * 2, config['topical_embedding_dim']) topical_transformer = topicalq_transformer(config['topical_vocab_size'], config['topical_embedding_dim'], config['enc_nhids'], config['topical_word_num'], config['batch_size']) if mode == "train": # Create Theano variables logger.info('Creating theano variables') source_sentence = tensor.lmatrix('source') source_sentence_mask = tensor.matrix('source_mask') target_sentence = tensor.lmatrix('target') target_sentence_mask = tensor.matrix('target_mask') sampling_input = tensor.lmatrix('input') source_topical_word = tensor.lmatrix('source_topical') source_topical_mask = tensor.matrix('source_topical_mask') # Get training and development set streams tr_stream = get_tr_stream_with_topicalq(**config) dev_stream = get_dev_stream_with_topicalq(**config) topic_embedding = topical_transformer.apply(source_topical_word) # Get cost of the model representation = encoder.apply(source_sentence, source_sentence_mask) tw_representation = topical_transformer.look_up.apply( source_topical_word.T) content_embedding = representation[0, :, (representation.shape[2] / 2):] cost = decoder.cost(representation, source_sentence_mask, tw_representation, source_topical_mask, target_sentence, target_sentence_mask, topic_embedding, content_embedding) logger.info('Creating computational graph') cg = ComputationGraph(cost) # Initialize model logger.info('Initializing model') encoder.weights_init = decoder.weights_init = IsotropicGaussian( config['weight_scale']) encoder.biases_init = decoder.biases_init = Constant(0) encoder.push_initialization_config() decoder.push_initialization_config() encoder.bidir.prototype.weights_init = Orthogonal() decoder.transition.weights_init = Orthogonal() encoder.initialize() decoder.initialize() topical_transformer.weights_init = IsotropicGaussian( config['weight_scale']) topical_transformer.biases_init = Constant(0) topical_transformer.push_allocation_config() #don't know whether the initialize is for topical_transformer.look_up.weights_init = Orthogonal() topical_transformer.transformer.weights_init = Orthogonal() topical_transformer.initialize() word_topical_embedding = cPickle.load( open(config['topical_embeddings'], 'rb')) np_word_topical_embedding = numpy.array(word_topical_embedding, dtype='float32') topical_transformer.look_up.W.set_value(np_word_topical_embedding) topical_transformer.look_up.W.tag.role = [] # apply dropout for regularization if config['dropout'] < 1.0: # dropout is applied to the output of maxout in ghog logger.info('Applying dropout') dropout_inputs = [ x for x in cg.intermediary_variables if x.name == 'maxout_apply_output' ] cg = apply_dropout(cg, dropout_inputs, config['dropout']) # Apply weight noise for regularization if config['weight_noise_ff'] > 0.0: logger.info('Applying weight noise to ff layers') enc_params = Selector(encoder.lookup).get_params().values() enc_params += Selector(encoder.fwd_fork).get_params().values() enc_params += Selector(encoder.back_fork).get_params().values() dec_params = Selector( decoder.sequence_generator.readout).get_params().values() dec_params += Selector( decoder.sequence_generator.fork).get_params().values() dec_params += Selector(decoder.state_init).get_params().values() cg = 
apply_noise(cg, enc_params + dec_params, config['weight_noise_ff']) # Print shapes shapes = [param.get_value().shape for param in cg.parameters] logger.info("Parameter shapes: ") for shape, count in Counter(shapes).most_common(): logger.info(' {:15}: {}'.format(str(shape), count)) logger.info("Total number of parameters: {}".format(len(shapes))) # Print parameter names enc_dec_param_dict = merge( Selector(encoder).get_parameters(), Selector(decoder).get_parameters()) logger.info("Parameter names: ") for name, value in enc_dec_param_dict.items(): logger.info(' {:15}: {}'.format(str(value.get_value().shape), name)) logger.info("Total number of parameters: {}".format( len(enc_dec_param_dict))) # Set up training model logger.info("Building model") training_model = Model(cost) # Set extensions logger.info("Initializing extensions") extensions = [ FinishAfter(after_n_batches=config['finish_after']), TrainingDataMonitoring([cost], after_batch=True), Printing(after_batch=True), CheckpointNMT(config['saveto'], every_n_batches=config['save_freq']) ] ''' # Set up beam search and sampling computation graphs if necessary if config['hook_samples'] >= 1 or config['bleu_script'] is not None: logger.info("Building sampling model") sampling_representation = encoder.apply( sampling_input, tensor.ones(sampling_input.shape)) generated = decoder.generate( sampling_input, sampling_representation) search_model = Model(generated) _, samples = VariableFilter( bricks=[decoder.sequence_generator], name="outputs")( ComputationGraph(generated[1])) # Add sampling if config['hook_samples'] >= 1: logger.info("Building sampler") extensions.append( Sampler(model=search_model, data_stream=tr_stream, hook_samples=config['hook_samples'], every_n_batches=config['sampling_freq'], src_vocab_size=config['src_vocab_size'])) # Add early stopping based on bleu if config['bleu_script'] is not None: logger.info("Building bleu validator") extensions.append( BleuValidator(sampling_input, samples=samples, config=config, model=search_model, data_stream=dev_stream, normalize=config['normalized_bleu'], every_n_batches=config['bleu_val_freq'])) ''' # Reload model if necessary if config['reload']: extensions.append(LoadNMT(config['saveto'])) # Plot cost in bokeh if necessary if use_bokeh and BOKEH_AVAILABLE: extensions.append( Plot('Cs-En', channels=[['decoder_cost_cost']], after_batch=True)) # Set up training algorithm logger.info("Initializing training algorithm") algorithm = GradientDescent(cost=cost, parameters=cg.parameters, on_unused_sources='warn', step_rule=CompositeRule([ StepClipping(config['step_clipping']), eval(config['step_rule'])() ])) # Initialize main loop logger.info("Initializing main loop") main_loop = MainLoop(model=training_model, algorithm=algorithm, data_stream=tr_stream, extensions=extensions) # Train! 
main_loop.run() elif mode == 'translate': # Create Theano variables logger.info('Creating theano variables') source_sentence = tensor.lmatrix('source') source_topical_word = tensor.lmatrix('source_topical') # Get test set stream test_stream = get_dev_stream_with_topicalq( config['test_set'], config['src_vocab'], config['src_vocab_size'], config['topical_test_set'], config['topical_vocab'], config['topical_vocab_size'], config['unk_id']) ftrans = open(config['test_set'] + '.trans.out', 'w') # Helper utilities sutils = SamplingBase() unk_idx = config['unk_id'] src_eos_idx = config['src_vocab_size'] - 1 trg_eos_idx = config['trg_vocab_size'] - 1 # Get beam search logger.info("Building sampling model") topic_embedding = topical_transformer.apply(source_topical_word) representation = encoder.apply(source_sentence, tensor.ones(source_sentence.shape)) tw_representation = topical_transformer.look_up.apply( source_topical_word.T) content_embedding = representation[0, :, (representation.shape[2] / 2):] generated = decoder.generate(source_sentence, representation, tw_representation, topical_embedding=topic_embedding, content_embedding=content_embedding) _, samples = VariableFilter( bricks=[decoder.sequence_generator], name="outputs")( ComputationGraph(generated[1])) # generated[1] is next_outputs beam_search = BeamSearch(samples=samples) logger.info("Loading the model..") model = Model(generated) loader = LoadNMT(config['saveto']) loader.set_model_parameters(model, loader.load_parameters()) # Get target vocabulary trg_vocab = _ensure_special_tokens(pickle.load( open(config['trg_vocab'], 'rb')), bos_idx=0, eos_idx=trg_eos_idx, unk_idx=unk_idx) trg_ivocab = {v: k for k, v in trg_vocab.items()} logger.info("Started translation: ") total_cost = 0.0 for i, line in enumerate(test_stream.get_epoch_iterator()): seq = sutils._oov_to_unk(line[0], config['src_vocab_size'], unk_idx) seq2 = line[1] input_ = numpy.tile(seq, (config['beam_size'], 1)) input_topical = numpy.tile(seq2, (config['beam_size'], 1)) # draw sample, checking to ensure we don't get an empty string back trans, costs = \ beam_search.search( input_values={source_sentence: input_,source_topical_word:input_topical}, max_length=10*len(seq), eol_symbol=src_eos_idx, ignore_first_eol=True) ''' # normalize costs according to the sequence lengths if config['normalized_bleu']: lengths = numpy.array([len(s) for s in trans]) costs = costs / lengths ''' #best = numpy.argsort(costs)[0] best = numpy.argsort(costs)[0:config['beam_size']] for b in best: try: total_cost += costs[b] trans_out = trans[b] # convert idx to words trans_out = sutils._idx_to_word(trans_out, trg_ivocab) except ValueError: logger.info( "Can NOT find a translation for line: {}".format(i + 1)) trans_out = '<UNK>' print(trans_out, file=ftrans) if i != 0 and i % 100 == 0: logger.info("Translated {} lines of test set...".format(i)) logger.info("Total cost of the test: {}".format(total_cost)) ftrans.close() elif mode == 'rerank': # Create Theano variables ftrans = open(config['val_set'] + '.scores.out', 'w') logger.info('Creating theano variables') source_sentence = tensor.lmatrix('source') source_sentence_mask = tensor.matrix('source_mask') target_sentence = tensor.lmatrix('target') target_sentence_mask = tensor.matrix('target_mask') config['src_data'] = config['val_set'] config['trg_data'] = config['val_set_grndtruth'] config['batch_size'] = 1 config['sort_k_batches'] = 1 test_stream = get_tr_stream_unsorted(**config) logger.info("Building sampling model") representations = 
encoder.apply(source_sentence, source_sentence_mask) costs = decoder.cost(representations, source_sentence_mask, target_sentence, target_sentence_mask) logger.info("Loading the model..") model = Model(costs) loader = LoadNMT(config['saveto']) loader.set_model_parameters(model, loader.load_parameters()) costs_computer = function([ source_sentence, source_sentence_mask, target_sentence, target_sentence_mask ], costs) iterator = test_stream.get_epoch_iterator() scores = [] for i, (src, src_mask, trg, trg_mask) in enumerate(iterator): costs = costs_computer(*[src, src_mask, trg, trg_mask]) cost = costs.sum() print(i, cost) scores.append(cost) ftrans.write(str(cost) + "\n") ftrans.close()
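# `SamplingBase._oov_to_unk`, used in the translate branch above, maps out-of-vocabulary token ids to
# the unknown-word id before decoding; a minimal sketch of the idea (the actual Blocks helper may differ
# in detail):
def oov_to_unk(seq, vocab_size, unk_idx):
    # Any id at or beyond the vocabulary size becomes <unk>
    return [idx if idx < vocab_size else unk_idx for idx in seq]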
train_x = torch.from_numpy(train_x).to(device) train_y = torch.from_numpy(train_y).to(device) test_x = torch.from_numpy(test_x).to(device) test_y = torch.from_numpy(test_y).to(device) # Prepare the model INPUT_DIM = 1 OUTPUT_DIM = 1 N_LAYERS = args.n_layers HID_DIM = args.hidden_dim DROPOUT_RATE = args.dropout_rate enc = Encoder(INPUT_DIM, HID_DIM, N_LAYERS, DROPOUT_RATE) dec = Decoder(OUTPUT_DIM, INPUT_DIM, HID_DIM, N_LAYERS, DROPOUT_RATE) model = Seq2Seq(enc, dec, device).to(device) # Initialize the model parameters def init_weights(m): for name, param in m.named_parameters(): nn.init.uniform_(param.data, -0.08, 0.08) print(model.apply(init_weights)) # Configure the optimizer if args.optimizer == "sgd": optimizer = optim.SGD(model.parameters(), lr=0.01) else:
def main(): parser = argparse.ArgumentParser(description='Style Swap by Pytorch') parser.add_argument('--batch_size', '-b', type=int, default=4, help='Number of images in each mini-batch') parser.add_argument('--epoch', '-e', type=int, default=3, help='Number of sweeps over the dataset to train') parser.add_argument('--patch_size', '-p', type=int, default=5, help='Size of extracted patches from style features') parser.add_argument('--gpu', '-g', type=int, default=0, help='GPU ID (negative value indicates CPU)') parser.add_argument('--learning_rate', '-lr', type=float, default=1e-4, help='learning rate for Adam') parser.add_argument('--tv_weight', type=float, default=1e-6, help='weight for total variation loss') parser.add_argument('--snapshot_interval', type=int, default=500, help='Interval of snapshot to generate image') parser.add_argument('--train_content_dir', type=str, default='/data/chen/content', help='content images directory for train') parser.add_argument('--train_style_dir', type=str, default='/data/chen/style', help='style images directory for train') parser.add_argument('--test_content_dir', type=str, default='/data/chen/content', help='content images directory for test') parser.add_argument('--test_style_dir', type=str, default='/data/chen/style', help='style images directory for test') parser.add_argument('--save_dir', type=str, default='result', help='save directory for result and loss') args = parser.parse_args() # create directory to save if not os.path.exists(args.save_dir): os.mkdir(args.save_dir) loss_dir = f'{args.save_dir}/loss' model_state_dir = f'{args.save_dir}/model_state' image_dir = f'{args.save_dir}/image' if not os.path.exists(loss_dir): os.mkdir(loss_dir) os.mkdir(model_state_dir) os.mkdir(image_dir) # set device on GPU if available, else CPU if torch.cuda.is_available() and args.gpu >= 0: device = torch.device(f'cuda:{args.gpu}') print(f'# CUDA available: {torch.cuda.get_device_name(0)}') else: device = 'cpu' print(f'# Minibatch-size: {args.batch_size}') print(f'# epoch: {args.epoch}') print('') # prepare dataset and dataLoader train_dataset = PreprocessDataset(args.train_content_dir, args.train_style_dir) test_dataset = PreprocessDataset(args.test_content_dir, args.test_style_dir) iters = len(train_dataset) print(f'Length of train image pairs: {iters}') train_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True) test_loader = DataLoader(test_dataset, batch_size=args.batch_size, shuffle=True) test_iter = iter(test_loader) # set model and optimizer encoder = VGGEncoder().to(device) decoder = Decoder().to(device) optimizer = Adam(decoder.parameters(), lr=args.learning_rate) # start training criterion = nn.MSELoss() loss_list = [] for e in range(1, args.epoch + 1): print(f'Start {e} epoch') for i, (content, style) in tqdm(enumerate(train_loader, 1)): content = content.to(device) style = style.to(device) content_feature = encoder(content) style_feature = encoder(style) style_swap_res = [] for b in range(content_feature.shape[0]): c = content_feature[b].unsqueeze(0) s = style_feature[b].unsqueeze(0) cs = style_swap(c, s, args.patch_size, 1) style_swap_res.append(cs) style_swap_res = torch.cat(style_swap_res, 0) out_style_swap = decoder(style_swap_res) out_content = decoder(content_feature) out_style = decoder(style_feature) out_style_swap_latent = encoder(out_style_swap) out_content_latent = encoder(out_content) out_style_latent = encoder(out_style) image_reconstruction_loss = criterion( content, out_content) + criterion(style, out_style) 
feature_reconstruction_loss = criterion(style_feature, out_style_latent) +\ criterion(content_feature, out_content_latent) +\ criterion(style_swap_res, out_style_swap_latent) tv_loss = TVloss(out_style_swap, args.tv_weight) + TVloss(out_content, args.tv_weight) \ + TVloss(out_style, args.tv_weight) loss = image_reconstruction_loss + feature_reconstruction_loss + tv_loss loss_list.append(loss.item()) optimizer.zero_grad() loss.backward() optimizer.step() print( f'[{e}/total {args.epoch} epoch],[{i} /' f'total {round(iters/args.batch_size)} iteration]: {loss.item()}' ) if i % args.snapshot_interval == 0: content, style = next(test_iter) content = content.to(device) style = style.to(device) with torch.no_grad(): content_feature = encoder(content) style_feature = encoder(style) style_swap_res = [] for b in range(content_feature.shape[0]): c = content_feature[b].unsqueeze(0) s = style_feature[b].unsqueeze(0) cs = style_swap(c, s, args.patch_size, 1) style_swap_res.append(cs) style_swap_res = torch.cat(style_swap_res, 0) out_style_swap = decoder(style_swap_res) out_content = decoder(content_feature) out_style = decoder(style_feature) content = denorm(content, device) style = denorm(style, device) out_style_swap = denorm(out_style_swap, device) out_content = denorm(out_content, device) out_style = denorm(out_style, device) res = torch.cat( [content, style, out_content, out_style, out_style_swap], dim=0) res = res.to('cpu') save_image(res, f'{image_dir}/{e}_epoch_{i}_iteration.png', nrow=content_feature.shape[0]) torch.save(decoder.state_dict(), f'{model_state_dir}/{e}_epoch.pth') plt.plot(range(len(loss_list)), loss_list) plt.xlabel('iteration') plt.ylabel('loss') plt.title('train loss') plt.savefig(f'{loss_dir}/train_loss.png') with open(f'{loss_dir}/loss_log.txt', 'w') as f: for l in loss_list: f.write(f'{l}\n') print(f'Loss saved in {loss_dir}')
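# `TVloss` above is a weighted total-variation penalty that suppresses high-frequency artifacts in the
# decoded images. A minimal sketch of one common formulation (an assumption; the repository's version
# may normalize differently):
import torch

def TVloss(img, weight):
    # Sum absolute differences of neighbouring pixels along height and width
    diff_h = torch.abs(img[:, :, 1:, :] - img[:, :, :-1, :]).sum()
    diff_w = torch.abs(img[:, :, :, 1:] - img[:, :, :, :-1]).sum()
    return weight * (diff_h + diff_w)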
print "NOT FINETUNING" if test_model is True: assert args.snapshot is not None else: if args.sort_by_freq is False: assert args.order_free in ["pla", "mla"] else: if args.order_free: raise ValueError( 'Sort by freq and order_free are mutually exclusive.') resume = 0 highest_f1 = 0 epochs_without_imp = 0 iterations = 0 encoder = Encoder(encoder_weights=args.encoder_weights) decoder = Decoder(args.hidden_size, args.embed_size, args.attention_size, args.dropout) encoder = encoder.to('cuda') decoder = decoder.to('cuda') snapshot = args.snapshot test_model = args.test_model train_from_scratch = args.train_from_scratch swa_params = eval(args.swa_params) finetune_encoder = args.finetune_encoder if not test_model: if finetune_encoder: encoder_optimizer = torch.optim.Adam(encoder.parameters(), lr=args.encoder_lr) decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=args.decoder_lr)
parser.add_argument('--torch_pretrained', default='ckpt/panofull_lay_pretrained.t7', help='path to load pretrained .t7 file') parser.add_argument('--encoder', default='ckpt/pre_encoder.pth', help='dump path. skip if not given') parser.add_argument('--edg_decoder', default='ckpt/pre_edg_decoder.pth', help='dump path. skip if not given') parser.add_argument('--cor_decoder', default='ckpt/pre_cor_decoder.pth', help='dump path. skip if not given') args = parser.parse_args() torch_pretrained = torchfile.load(args.torch_pretrained) if args.encoder: encoder = Encoder() if args.edg_decoder: edg_decoder = Decoder(skip_num=2, out_planes=3) if args.cor_decoder: cor_decoder = Decoder(skip_num=3, out_planes=1) # Check number of parameters print('torch parameters num:', torch_pretrained.shape[0]) total_parameter = 0 if args.encoder: for p in encoder.parameters(): total_parameter += np.prod(p.size()) if args.edg_decoder: for p in edg_decoder.parameters(): total_parameter += np.prod(p.size()) if args.cor_decoder: for p in cor_decoder.parameters():
def main(config, tr_stream, dev_stream): # Create Theano variables logger.info('Creating theano variables') source_char_seq = tensor.lmatrix('source_char_seq') source_sample_matrix = tensor.btensor3('source_sample_matrix') source_char_aux = tensor.bmatrix('source_char_aux') source_word_mask = tensor.bmatrix('source_word_mask') target_char_seq = tensor.lmatrix('target_char_seq') target_char_aux = tensor.bmatrix('target_char_aux') target_char_mask = tensor.bmatrix('target_char_mask') target_sample_matrix = tensor.btensor3('target_sample_matrix') target_word_mask = tensor.bmatrix('target_word_mask') target_resample_matrix = tensor.btensor3('target_resample_matrix') target_prev_char_seq = tensor.lmatrix('target_prev_char_seq') target_prev_char_aux = tensor.bmatrix('target_prev_char_aux') target_bos_idx = tr_stream.trg_bos target_space_idx = tr_stream.space_idx['target'] # Construct model logger.info('Building RNN encoder-decoder') encoder = BidirectionalEncoder(config['src_vocab_size'], config['enc_embed'], config['char_enc_nhids'], config['enc_nhids'], config['encoder_layers']) decoder = Decoder(config['trg_vocab_size'], config['dec_embed'], config['char_dec_nhids'], config['dec_nhids'], config['enc_nhids'] * 2, config['transition_layers'], target_space_idx, target_bos_idx) representation = encoder.apply(source_char_seq, source_sample_matrix, source_char_aux, source_word_mask) cost = decoder.cost(representation, source_word_mask, target_char_seq, target_sample_matrix, target_resample_matrix, target_char_aux, target_char_mask, target_word_mask, target_prev_char_seq, target_prev_char_aux) logger.info('Creating computational graph') cg = ComputationGraph(cost) # Initialize model logger.info('Initializing model') encoder.weights_init = decoder.weights_init = IsotropicGaussian( config['weight_scale']) encoder.biases_init = decoder.biases_init = Constant(0) encoder.push_initialization_config() decoder.push_initialization_config() for layer_n in range(config['encoder_layers']): encoder.decimator.dgru.transitions[layer_n].weights_init = Orthogonal() encoder.children[ 1 + layer_n].prototype.recurrent.weights_init = Orthogonal() decoder.interpolator.igru.weights_init = Orthogonal() decoder.interpolator.feedback_brick.dgru.transitions[ 0].weights_init = Orthogonal() for layer_n in range(config['transition_layers']): decoder.transition.transitions[layer_n].weights_init = Orthogonal() encoder.initialize() decoder.initialize() # Apply weight noise for regularization if config['weight_noise_ff'] > 0.0: logger.info('Applying weight noise to ff layers') enc_params = Selector(encoder.lookup).get_params().values() enc_params += Selector(encoder.fwd_fork).get_params().values() enc_params += Selector(encoder.back_fork).get_params().values() dec_params = Selector( decoder.sequence_generator.readout).get_params().values() dec_params += Selector( decoder.sequence_generator.fork).get_params().values() dec_params += Selector(decoder.state_init).get_params().values() cg = apply_noise(cg, enc_params + dec_params, config['weight_noise_ff']) # Print shapes shapes = [param.get_value().shape for param in cg.parameters] logger.info("Parameter shapes: ") for shape, count in Counter(shapes).most_common(): logger.info(' {:15}: {}'.format(str(shape), count)) logger.info("Total number of parameters: {}".format(len(shapes))) # Print parameter names enc_dec_param_dict = merge( Selector(encoder).get_parameters(), Selector(decoder).get_parameters()) logger.info("Parameter names: ") for name, value in enc_dec_param_dict.items(): 
logger.info(' {:15}: {}'.format(str(value.get_value().shape), name)) logger.info("Total number of parameters: {}".format( len(enc_dec_param_dict))) # Set up training model logger.info("Building model") training_model = Model(cost) # Set up training algorithm logger.info("Initializing training algorithm") algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=CompositeRule([ StepClipping(config['step_clipping']), eval(config['step_rule'])() ])) # Set extensions logger.info("Initializing extensions") # Extensions gradient_norm = aggregation.mean(algorithm.total_gradient_norm) step_norm = aggregation.mean(algorithm.total_step_norm) train_monitor = CostCurve([cost, gradient_norm, step_norm], config=config, after_batch=True, before_first_epoch=True, prefix='tra') extensions = [ train_monitor, Timing(), Printing(after_batch=True), FinishAfter(after_n_batches=config['finish_after']), CheckpointNMT(config['saveto'], every_n_batches=config['save_freq']) ] # Set up beam search and sampling computation graphs if necessary if config['hook_samples'] >= 1 or config['bleu_script'] is not None: logger.info("Building sampling model") generated = decoder.generate(representation, source_word_mask) search_model = Model(generated) _, samples = VariableFilter( bricks=[decoder.sequence_generator], name="outputs")( ComputationGraph(generated[config['transition_layers']]) ) # generated[transition_layers] is next_outputs # Add sampling if config['hook_samples'] >= 1: logger.info("Building sampler") extensions.append( Sampler(model=search_model, data_stream=tr_stream, hook_samples=config['hook_samples'], transition_layers=config['transition_layers'], every_n_batches=config['sampling_freq'], src_vocab_size=config['src_vocab_size'])) # Add early stopping based on bleu if config['bleu_script'] is not None: logger.info("Building bleu validator") extensions.append( BleuValidator(source_char_seq, source_sample_matrix, source_char_aux, source_word_mask, samples=samples, config=config, model=search_model, data_stream=dev_stream, normalize=config['normalized_bleu'], every_n_batches=config['bleu_val_freq'])) # Reload model if necessary if config['reload']: extensions.append(LoadNMT(config['saveto'])) # Initialize main loop logger.info("Initializing main loop") main_loop = MainLoop(model=training_model, algorithm=algorithm, data_stream=tr_stream, extensions=extensions) # Train! main_loop.run()
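# Both Blocks setups above build the optimizer via `eval(config['step_rule'])()`. A dictionary lookup
# keeps the config-driven choice without `eval`; a sketch restricted to step rules that exist in
# blocks.algorithms:
from blocks.algorithms import AdaDelta, Adam, RMSProp

STEP_RULES = {'AdaDelta': AdaDelta, 'Adam': Adam, 'RMSProp': RMSProp}
step_rule = STEP_RULES[config['step_rule']]()  # raises KeyError on an unknown rule instead of eval-ing it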
for i in range(1): logger.info('Creating theano variables') print("create theano variables") source_sentence = tensor.lmatrix('source') source_sentence_mask = tensor.matrix('source_mask') # the source mask flags the valid (non-padding) positions target_sentence = tensor.lmatrix('target') target_sentence_mask = tensor.matrix('target_mask') #sampling_input = tensor.lmatrix('input') # Construct model logger.info('Building RNN encoder-decoder') encoder = BidirectionalEncoder( config['src_vocab_size'], config['enc_embed'], config['enc_nhids']) decoder = Decoder( config['trg_vocab_size'], config['dec_embed'], config['dec_nhids'], config['enc_nhids'] * 2) #cost = decoder.cost(encoder.apply(source_sentence, source_sentence_mask), # is source_sentence_mask an embedding matrix here, or a free variable? # source_sentence_mask, target_sentence, target_sentence_mask) # define the cost function cost = decoder.cost(encoder.apply(source_sentence, tensor.ones(source_sentence.shape)),tensor.ones(source_sentence.shape), target_sentence, tensor.ones(target_sentence.shape)) logger.info('Creating computational graph') cg = ComputationGraph(cost) # construct the computational graph for gradient computation; it is used to optimize the model, and cg holds the weights of the entire computation # Initialize model logger.info('Initializing model') encoder.weights_init = decoder.weights_init = IsotropicGaussian( config['weight_scale']) encoder.biases_init = decoder.biases_init = Constant(0) encoder.push_initialization_config() # push_initialization_config is a method predefined in Initializable decoder.push_initialization_config()
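# Passing `tensor.ones(...)` as every mask above treats all positions as valid, which is only correct
# when no sentence in the batch is padded. A minimal numpy sketch of building a real (time, batch) mask
# from per-sentence lengths, matching the float masks used elsewhere in this document:
import numpy

def length_mask(lengths, max_len):
    # mask[t, b] is 1.0 while position t lies inside sentence b, else 0.0
    mask = numpy.zeros((max_len, len(lengths)), dtype='float32')
    for b, n in enumerate(lengths):
        mask[:n, b] = 1.0
    return mask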