def get_han2(sent_num, sent_length, embed_weight, mask_zero=False):
    """Build a hierarchical-attention (HAN-style) document classifier.

    Args:
        sent_num: number of sentences per document.
        sent_length: number of tokens per sentence.
        embed_weight: pre-trained embedding matrix, shape (vocab, dim).
        mask_zero: whether the embedding layer masks padding id 0.

    Returns:
        A compiled Keras Model mapping (sent_num, sent_length) int32 token
        ids to a 4-way softmax.
    """
    input = Input(shape=(sent_num, sent_length,), dtype="int32")
    # Frozen pre-trained embeddings. (Layer name "embeeding" kept as-is:
    # it may be referenced by saved weights.)
    embedding = Embedding(name="embeeding",
                          input_dim=embed_weight.shape[0],
                          weights=[embed_weight],
                          output_dim=embed_weight.shape[1],
                          mask_zero=mask_zero,
                          trainable=False)
    sent_embed = embedding(input)
    # NOTE(review): this fixed Reshape only matches when sent_num == 1;
    # for sent_num > 1 the element count differs — TODO confirm intended use.
    sent_embed = Reshape((1, sent_length, embed_weight.shape[1]))(sent_embed)
    # Word-level encoder: BiGRU(128) -> 256 features per token.
    word_bigru = Bidirectional(GRU(128, return_sequences=True))(sent_embed)
    word_bigru = Reshape((sent_length, 256))(word_bigru)
    word_attention = Attention(sent_length)(word_bigru)
    sent_encode = Reshape((-1, sent_num))(word_attention)
    # Sentence-level encoder over the per-sentence vectors.
    sent_bigru = Bidirectional(GRU(128, return_sequences=True))(sent_encode)
    doc_attention = Attention(sent_num)(sent_bigru)
    fc = Activation(activation="relu")(BatchNormalization()(
        Dense(256)(doc_attention)))
    output = Dense(4, activation='softmax')(fc)
    model = Model(input, output)
    model.compile(loss='categorical_crossentropy',
                  optimizer="adam",
                  metrics=['accuracy'])
    return model
def __init__(self, config, embed_size, padding_idx, label_size):
    """Wire up the contextualized classifier's submodules.

    A shared BiLSTM feeds three separate attention heads (sentence,
    left context, right context); their concatenation — three
    bidirectional outputs, i.e. 6 * hidden_dim features — is projected
    to the label space.
    """
    super(Contextualized, self).__init__()
    self.bilstm = Bilstm(config, embed_size)
    # One attention module per view: sentence / left / right.
    self.attention_s, self.attention_l, self.attention_r = (
        Attention(config) for _ in range(3))
    out_features = config.hidden_dim * 6
    self.linear_out = nn.Linear(out_features, label_size)
def get_word_char_hcnn(sent_num, sent_word_length, sent_char_length,
                       word_embed_weight, char_embed_weight, mask_zero=False):
    """Build a hierarchical CNN classifier over word AND char token ids.

    Word and char sentences are each encoded by a BiGRU + attention
    sub-model, applied per sentence via TimeDistributed, concatenated,
    and fed through a multi-width conv block.

    Args:
        sent_num: number of sentences per review.
        sent_word_length: word tokens per sentence.
        sent_char_length: char tokens per sentence.
        word_embed_weight / char_embed_weight: pre-trained matrices (vocab, dim).
        mask_zero: mask padding id 0 in the embedding layers.

    Returns:
        A compiled Keras Model taking [word_ids, char_ids] to a 4-way softmax.
    """
    # --- word branch: sentence -> attention-weighted vector ---
    sentence_word_input = Input(shape=(sent_word_length, ), dtype="int32")
    word_embedding = Embedding(name="word_embedding",
                               input_dim=word_embed_weight.shape[0],
                               weights=[word_embed_weight],
                               output_dim=word_embed_weight.shape[1],
                               mask_zero=mask_zero,
                               trainable=False)
    sent_word_embed = word_embedding(sentence_word_input)
    word_bigru = Bidirectional(GRU(128, return_sequences=True))(sent_word_embed)
    word_attention = Attention(sent_word_length)(word_bigru)
    sent_word_encode = Model(sentence_word_input, word_attention)
    # --- char branch: sentence -> attention-weighted vector ---
    sentence_char_input = Input(shape=(sent_char_length, ), dtype="int32")
    char_embedding = Embedding(
        name="char_embedding",
        input_dim=char_embed_weight.shape[0],
        weights=[char_embed_weight],
        output_dim=char_embed_weight.shape[1],
        mask_zero=mask_zero,
    )
    sent_char_embed = char_embedding(sentence_char_input)
    char_bigru = Bidirectional(GRU(64, return_sequences=True))(sent_char_embed)
    char_attention = Attention(sent_char_length)(char_bigru)
    sent_char_encode = Model(sentence_char_input, char_attention)
    # --- document level: encode every sentence, concat branches ---
    review_word_input = Input(shape=(sent_num, sent_word_length), dtype="int32")
    review_word_encode = TimeDistributed(sent_word_encode)(review_word_input)
    review_char_input = Input(shape=(sent_num, sent_char_length), dtype="int32")
    review_char_encode = TimeDistributed(sent_char_encode)(review_char_input)
    review_encode = concatenate([review_word_encode, review_char_encode])
    unvec = convs_block(review_encode, convs=[1, 2, 3, 4, 5], f=256)
    dropfeat = Dropout(0.2)(unvec)
    fc = Activation(activation='relu')(BatchNormalization()(
        Dense(256)(dropfeat)))
    output = Dense(4, activation="softmax")(fc)
    model = Model([review_word_input, review_char_input], output)
    # Fixed: metrics was misspelled 'accracy', which raises at compile time.
    model.compile(loss='categorical_crossentropy',
                  optimizer="adam",
                  metrics=['accuracy'])
    return model
def main():
    """CLI entry point: build vocabularies from a parallel corpus and train
    an attention seq2seq (EN -> ES) model.

    Flags: --path (corpus), --embedding_dim, --iterator (epochs), --lr,
    --decay (weight decay), --batch_size.
    """
    torch.manual_seed(777)
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    parser = argparse.ArgumentParser()
    parser.add_argument("--path", type=str)
    parser.add_argument("--embedding_dim", type=int, default=300)
    parser.add_argument("--iterator", type=int, default=10)
    parser.add_argument("--lr", type=float, default=1e-5)
    parser.add_argument("--decay", type=float, default=0.01)
    parser.add_argument("--batch_size", type=int, default=100)
    args = parser.parse_args()
    # Load and tokenize the parallel corpus, then build index dictionaries.
    trg, src = load_pair(args.path)
    src_token = eng_tokenize(src)
    trg_token = es_tokenize(trg)
    trg2idx, idx2_trg = make_dictionary(trg_token)
    src2idx, idx2src = make_dictionary(src_token)
    src_ix = make_src_idx(src_token, src2idx)
    trg_ix = make_trg_idx(trg_token, trg2idx)
    # (Removed a stray no-op statement `args.embedding_dim` that had no effect.)
    # Model declaration.
    encoder = EncoderGRU(emb_dim=args.embedding_dim,
                         bidirectional=True,
                         vocab_size=len(src2idx))
    attention = Attention(emb_dim=args.embedding_dim, padding_idx=0)
    decoder = DecoderGRU(emb_dim=args.embedding_dim,
                         attention=attention,
                         n_class=len(trg2idx))
    model = Seq2Seq_a(encoder, decoder, device, trg2idx)
    num_parameter(model)
    # Loss / optimizer setup; index 0 is treated as padding.
    loss_func = nn.CrossEntropyLoss(ignore_index=0)
    optimizer = optim.RMSprop(model.parameters(),
                              lr=args.lr,
                              weight_decay=args.decay)
    # Train/test split.
    train_loader, test_loader = prepare_data(src=src_ix,
                                             trg=trg_ix,
                                             test_size=0.2,
                                             batch_size=args.batch_size)
    train(model,
          iterator=args.iterator,
          optimizer=optimizer,
          criterion=loss_func,
          train_loader=train_loader,
          visual_path="ssibal",
          trg2idx=trg2idx,
          savepath="./seq2seq_model.pth")
def main(fpath):
    """Load the best checkpoint and translate every line of *fpath*.

    Args:
        fpath: path to a text file with one source sentence per line.
    """
    ENC_EMB_DIM = 256
    DEC_EMB_DIM = 256
    ENC_HID_DIM = 512
    DEC_HID_DIM = 512
    ENC_DROPOUT = 0.5
    DEC_DROPOUT = 0.5
    device = torch.device('cuda')
    dataset = Dataset()
    INPUT_DIM = len(dataset.SRC.vocab)
    OUTPUT_DIM = len(dataset.TRG.vocab)
    SRC_PAD_IDX = dataset.SRC.vocab.stoi[dataset.SRC.pad_token]
    encoder = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM,
                      ENC_DROPOUT)
    attention = Attention(ENC_HID_DIM, DEC_HID_DIM)
    decoder = Decoder(DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, OUTPUT_DIM,
                      DEC_DROPOUT, attention)
    model = Seq2Seq(encoder, decoder, SRC_PAD_IDX, device)
    # map_location avoids a hard failure when the checkpoint was saved on a
    # different device than the one we are loading on.
    model.load_state_dict(torch.load("best_model.pt", map_location=device))
    model.to(device)
    with open(fpath, "r") as f:
        sentences = f.readlines()
    translate_sentence(model, sentences, dataset.SRC, dataset.TRG, device)
def __init__(self, config, embed_size, padding_idx, label_size, embedding):
    """Assemble the vanilla classifier: BiLSTM encoder, a single
    attention head, and a linear projection to the label space.

    The output projection consumes 2 * hidden_dim features because the
    LSTM is bidirectional.
    """
    super(Vanilla, self).__init__()
    # Encoder over pre-built embeddings.
    self.bilstm = Bilstm(config, embed_size, embedding)
    # Single attention pooling head.
    self.attention = Attention(config)
    feature_dim = config.hidden_dim * 2
    self.linear_out = nn.Linear(feature_dim, label_size)
def main():
    """Train the attentional seq2seq translator, checkpointing the
    weights whenever validation loss improves."""
    # Optimization hyper-parameters.
    BATCH_SIZE = 32
    NUM_EPOCH = 12
    LR = 0.001
    CLIP = 1
    STEP_SIZE = 4
    GAMMA = 0.1
    # Architecture hyper-parameters.
    ENC_EMB_DIM = 256
    DEC_EMB_DIM = 256
    ENC_HID_DIM = 512
    DEC_HID_DIM = 512
    ENC_DROPOUT = 0.5
    DEC_DROPOUT = 0.5

    device = torch.device('cuda')

    # Data: bucketed iterators keep similarly-sized sources together.
    dataset = Dataset()
    train_data, valid_data, test_data = dataset.build_dataset()
    train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
        (train_data, valid_data, test_data),
        batch_size=BATCH_SIZE,
        sort_within_batch=True,
        sort_key=lambda x: len(x.src),
        device=device)

    INPUT_DIM = len(dataset.SRC.vocab)
    OUTPUT_DIM = len(dataset.TRG.vocab)
    SRC_PAD_IDX = dataset.SRC.vocab.stoi[dataset.SRC.pad_token]
    TRG_PAD_IDX = dataset.TRG.vocab.stoi[dataset.TRG.pad_token]

    # Model assembly: attention is shared into the decoder.
    attention = Attention(ENC_HID_DIM, DEC_HID_DIM)
    encoder = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM,
                      ENC_DROPOUT)
    decoder = Decoder(DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, OUTPUT_DIM,
                      DEC_DROPOUT, attention)
    model = Seq2Seq(encoder, decoder, SRC_PAD_IDX, device)
    model.apply(init_weight)
    model.to(device)

    optimizer = Adam(model.parameters(), lr=LR)
    # Padding positions contribute nothing to the loss.
    criterion = CrossEntropyLoss(ignore_index=TRG_PAD_IDX).to(device)
    # NOTE(review): the scheduler is created but never stepped below — confirm.
    scheduler = StepLR(optimizer, STEP_SIZE, GAMMA)

    min_valid_loss = 1e10
    for epoch in range(NUM_EPOCH):
        print("Epoch: {}".format(epoch + 1))
        train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
        print("Train loss: {}".format(train_loss))
        valid_loss = evaluate(model, valid_iterator, criterion)
        print("Valid loss: {}".format(valid_loss))
        # Keep only the best-so-far checkpoint.
        if valid_loss < min_valid_loss:
            torch.save(model.state_dict(), "best_model.pt")
            min_valid_loss = valid_loss
def main(_):
    """Dispatch `train` or `test` on the Attention model based on --action.

    The leading underscore argument exists for tf.app.run-style entry
    points, which pass leftover argv.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--action',
                        dest='action',
                        type=str,
                        default='train',
                        help='actions: train, or test')
    args = parser.parse_args()
    if args.action not in ['train', 'test']:
        print('invalid action: ', args.action)
        # Fixed grammar in the user-facing hint ("a action" -> "an action").
        print("Please input an action: train, test")
    else:
        # getattr dispatch: model.train() or model.test().
        model = Attention(tf.Session(), configure())
        getattr(model, args.action)()
def build_model(self, n_classes=1, embedding_dim=300):
    'Build bi-level bi-directional GRU model with attention over word embeddings'
    # Defaults (n_classes=1, embedding_dim=300) reproduce the previous
    # hard-coded behavior; both parameters are now actually honored.
    l2_reg = regularizers.l2(1e-8)
    sentence_in = Input(shape=(self.max_len, ), dtype="int32")
    masking_layer = Masking(mask_value=0)(sentence_in)
    # Vocabulary size is taken from the pre-trained weight matrix instead of
    # the previous hard-coded 10000, so the Embedding always matches it.
    embedded_word_seq = Embedding(self.embedding_weights.shape[0],
                                  embedding_dim,
                                  input_length=self.max_len,
                                  trainable=False,
                                  weights=[self.embedding_weights
                                           ])(masking_layer)
    # Word-level BiGRU encoder + dense transform feeding word attention.
    word_encoder = Bidirectional(
        GRU(50, return_sequences=True,
            kernel_regularizer=l2_reg))(embedded_word_seq)
    dense_transform_w = Dense(100,
                              activation="relu",
                              name="dense_transform_w",
                              kernel_regularizer=l2_reg)(word_encoder)
    # Sub-model: a sentence of token ids -> attention-weighted sentence vector.
    attn_weighted_sent = Model(
        sentence_in,
        Attention(name='word_attention', regularizer=l2_reg)(dense_transform_w))
    attn_weighted_sent.summary()
    # Document level: encode each sentence, then a sentence-level BiGRU.
    texts_in = Input(shape=(self.max_sentence, self.max_len), dtype='int32')
    attention_weighted_sentences = TimeDistributed(attn_weighted_sent)(
        texts_in)
    sentence_encoder = Bidirectional(
        GRU(50, return_sequences=True, kernel_regularizer=l2_reg),
        name="sentence_encoder")(attention_weighted_sentences)
    dense_transform_s = TimeDistributed(
        Dense(100,
              activation='relu',
              name='dense_transform_s',
              kernel_regularizer=l2_reg))(sentence_encoder)
    # Per-sentence prediction; n_classes was previously ignored (fixed 1).
    prediction = TimeDistributed(
        Dense(n_classes, activation="sigmoid"))(dense_transform_s)
    model = Model(texts_in, prediction)
    model.summary()
    model.compile(optimizer=Adam(lr=0.001),
                  loss="binary_crossentropy",
                  metrics=["acc"],
                  sample_weight_mode="temporal")
    return (model)
def init_model(
        with_attention=False,
        teaching_force_ratio=0.5,
        embedding_size=500,
        hidden_size=256,
):
    """
    Instantiates the model by creating the Encoder, the Decoder and the model
    itself which represents the seq2seq architecture.

    :param with_attention: if true then the model apply the attention mechanism
    :param teaching_force_ratio: used to alternate between generated word or
        gt-word during training.
    :param embedding_size: dimensionality of the token embeddings.
    :param hidden_size: dimensionality of the recurrent hidden state.
    :return encoder, decoder, model: the encoder, the decoder and the seq2seq model.
    """
    # len() over the module-level vocabulary; previously spelled
    # vocabulary.__len__() five times.
    vocab_size = len(vocabulary)
    if with_attention:
        encoder = EncoderAttention(embedding_size, hidden_size,
                                   vocab_size).to(device)
        attention = Attention(hidden_size).to(device)
        decoder = DecoderAttention(embedding_size,
                                   hidden_size,
                                   vocab_size,
                                   attention=attention).to(device)
    else:
        encoder = Encoder(embedding_size, hidden_size, vocab_size).to(device)
        decoder = Decoder(embedding_size, hidden_size, vocab_size).to(device)
    # The wrapper is identical in both branches apart from the flag.
    model = ChatbotModel(encoder,
                         decoder,
                         vocab_size,
                         with_attention=with_attention,
                         tf_ratio=teaching_force_ratio).to(device)
    return encoder, decoder, model
def get_hcnn(sent_num, sent_length, embed_weight, mask_zero=False):
    """Build a hierarchical CNN binary classifier.

    A per-sentence BiGRU + attention sub-model is applied to every
    sentence of a review via TimeDistributed; the resulting sentence
    vectors go through a conv block and a dense head.

    Args:
        sent_num: sentences per review.
        sent_length: tokens per sentence.
        embed_weight: pre-trained embedding matrix (vocab, dim).
        mask_zero: mask padding id 0 in the embedding layer.

    Returns:
        A compiled Keras Model with a 2-way softmax output.
    """
    vocab_size, embed_dim = embed_weight.shape
    # Sentence-level sub-model: token ids -> attention-pooled vector.
    sentence_input = Input(shape=(sent_length, ), dtype="int32")
    embed_layer = Embedding(input_dim=vocab_size,
                            weights=[embed_weight],
                            output_dim=embed_dim,
                            mask_zero=mask_zero,
                            trainable=False)
    encoded_words = Bidirectional(GRU(128, return_sequences=True))(
        embed_layer(sentence_input))
    pooled_sentence = Attention(sent_length)(encoded_words)
    sent_encode = Model(sentence_input, pooled_sentence)
    # Review-level model: encode each sentence, convolve, classify.
    review_input = Input(shape=(sent_num, sent_length), dtype="int32")
    review_encode = TimeDistributed(sent_encode)(review_input)
    dropfeat = Dropout(0.2)(convs_block(review_encode))
    dense_feat = Dense(256)(dropfeat)
    fc = Activation(activation="relu")(BatchNormalization()(dense_feat))
    output = Dense(2, activation="softmax")(fc)
    model = Model(review_input, output)
    model.compile(loss='categorical_crossentropy',
                  optimizer="adam",
                  metrics=['accuracy'])
    return model
# Script: data loading, model assembly and trainer setup for a
# de->en seq2seq translation experiment. All-caps names come from config.
import torch
from dataloader import prepare_data
from model import Encoder, Attention, Decoder, Seq2Seq, init_weights
from trainer import Trainer
from config import *
""" load data """
train_loader, val_loader, test_loader, m_dh = prepare_data(
    TRAIN_PATH, VAL_PATH, TEST_PATH, DH_PATH, LOAD_FROM_DUMP, BATCH_SIZE)
""" model setup """
# Vocabulary sizes come from the data handler returned by prepare_data.
INPUT_DIM, OUTPUT_DIM = len(m_dh.de_vocab), len(m_dh.en_vocab)
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, ENC_DROPOUT)
attn = Attention(ENC_HID_DIM, DEC_HID_DIM, ATTN_DIM)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DEC_DROPOUT,
              attn)
model = Seq2Seq(enc, dec)
model.apply(init_weights)
""" training setup """
optimizer = torch.optim.Adam(model.parameters(), lr=LR)
# ignore_index=1 — presumably the pad token id in the target vocab; TODO confirm.
criterion = torch.nn.CrossEntropyLoss(ignore_index=1)
trainer = Trainer(model,
                  optimizer,
                  criterion,
                  train_loader,
                  val_loader,
                  val_best_path=VAL_BEST_PATH)
# Resume from the best checkpoint before any further training/evaluation.
trainer.load('ckpts/best.pt')
# Data pipeline and model assembly for a BERT-encoder / attention-decoder
# seq2seq setup. Free names (bert_path, mb, device, ...) are defined earlier
# in the file, outside this chunk.
train_set = BertDataset(bert_path / bert_model / 'train')
valid_set = BertDataset(bert_path / bert_model / 'valid')
# pin_memory speeds up host->GPU transfers; only enabled under CUDA.
training_loader = DataLoader(
    train_set,
    batch_size=mb,
    shuffle=True,
    num_workers=dl_workers,
    pin_memory=True if device == 'cuda' else False)
valid_loader = DataLoader(valid_set,
                          batch_size=mb,
                          shuffle=True,
                          num_workers=dl_workers,
                          pin_memory=True if device == 'cuda' else False)
attention = Attention(bert_hidden_size, decoder_hidden_size,
                      attention_hidden_size)  # add attention_hidden_size
decoder = Decoder(bert_vocab_size, decoder_input_size, bert_hidden_size,
                  decoder_hidden_size, num_layers, dropout, attention, device)
encoder = BertModel.from_pretrained(model_path / stage / bert_model)
model = Seq2Seq(encoder, decoder, device, encoder_trained)
# Only the decoder's parameters are optimized; the BERT encoder is not
# passed to the optimizer here.
optimizer = optim.SGD(decoder.parameters(),
                      weight_decay=weight_decay,
                      lr=lr,
                      momentum=momentum)
# reduction='none' keeps per-token losses (masked/aggregated by the caller).
criterion = nn.CrossEntropyLoss(ignore_index=0, reduction='none')  # Pad Index
# NOTE(review): the body of this `if` lies beyond this chunk of the file.
if checkpoint is not None:
def main():
    """CLI entry point for the NMT parser model.

    Parses all arguments, builds (or loads via --model_init_path) the
    seq2seq model, then dispatches on --type: test / inference / debug /
    train (default).

    Fixes: many argparse help strings stated defaults that contradicted
    the actual `default=` values, and three corpora flags had a
    copy-pasted, wrong description.
    """
    parser = argparse.ArgumentParser(
        description='Train a neural machine translation model')
    # Training corpus
    corpora_group = parser.add_argument_group(
        'training corpora',
        'Corpora related arguments; specify either monolingual or parallel training corpora (or both)'
    )
    corpora_group.add_argument('--src_path',
                               help='the source language monolingual corpus')
    corpora_group.add_argument('--trg_path',
                               help='the target language monolingual corpus')
    corpora_group.add_argument(
        '--max_sentence_length',
        type=int,
        default=90,
        help='the maximum sentence length for training (defaults to 90)')
    # Embeddings/vocabulary
    embedding_group = parser.add_argument_group(
        'embeddings',
        'Embedding related arguments; either give pre-trained cross-lingual embeddings, or a vocabulary and embedding dimensionality to randomly initialize them'
    )
    embedding_group.add_argument('--src_vocabulary',
                                 help='the source language vocabulary')
    embedding_group.add_argument('--trg_vocabulary',
                                 help='the target language vocabulary')
    embedding_group.add_argument('--embedding_size',
                                 type=int,
                                 default=0,
                                 help='the word embedding size')
    # Architecture
    architecture_group = parser.add_argument_group(
        'architecture', 'Architecture related arguments')
    architecture_group.add_argument(
        '--layers',
        type=int,
        default=2,
        help='the number of encoder/decoder layers (defaults to 2)')
    architecture_group.add_argument(
        '--enc_hid_dim',
        type=int,
        default=512,
        help='the number of dimensions for the hidden layer (defaults to 512)')
    architecture_group.add_argument(
        '--dec_hid_dim',
        type=int,
        default=512,
        help='the number of dimensions for the hidden layer (defaults to 512)')
    # Optimization
    optimization_group = parser.add_argument_group(
        'optimization', 'Optimization related arguments')
    optimization_group.add_argument('--batch_size',
                                    type=int,
                                    default=128,
                                    help='the batch size (defaults to 128)')
    optimization_group.add_argument(
        '--learning_rate',
        type=float,
        default=0.0002,
        help='the global learning rate (defaults to 0.0002)')
    optimization_group.add_argument(
        '--dropout',
        metavar='PROB',
        type=float,
        default=0.4,
        help='dropout probability for the encoder/decoder (defaults to 0.4)')
    optimization_group.add_argument(
        '--param_init',
        metavar='RANGE',
        type=float,
        default=0.1,
        help=
        'uniform initialization in the specified range (defaults to 0.1, 0 for module specific default initialization)'
    )
    optimization_group.add_argument(
        '--iterations',
        type=int,
        default=50,
        help='the number of training iterations (defaults to 50)')
    # Model saving
    saving_group = parser.add_argument_group(
        'model saving', 'Arguments for saving the trained model')
    saving_group.add_argument('--save_path',
                              metavar='PREFIX',
                              help='save models with the given prefix')
    saving_group.add_argument('--save_interval',
                              type=int,
                              default=0,
                              help='save intermediate models at this interval')
    saving_group.add_argument('--model_init_path', help='model init path')
    # Logging/validation
    logging_group = parser.add_argument_group(
        'logging', 'Logging and validation arguments')
    logging_group.add_argument('--log_interval',
                               type=int,
                               default=1000,
                               help='log at this interval (defaults to 1000)')
    logging_group.add_argument('--validate_batch_size',
                               type=int,
                               default=1,
                               help='the validation batch size (defaults to 1)')
    corpora_group.add_argument('--inference_output',
                               help='path for writing inference output')
    corpora_group.add_argument('--validation_src_path',
                               help='the validation source corpus')
    corpora_group.add_argument('--validation_trg_path',
                               help='the validation target corpus')
    # Other
    parser.add_argument(
        '--encoding',
        default='utf-8',
        help='the character encoding for input/output (defaults to utf-8)')
    parser.add_argument('--cuda',
                        default=False,
                        action='store_true',
                        help='use cuda')
    parser.add_argument("--seed",
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument("--type",
                        type=str,
                        default='train',
                        help="type: train/inference/debug")
    args = parser.parse_args()
    print(args)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    src_dictionary = Dictionary(
        [word.strip() for word in open(args.src_vocabulary).readlines()])
    trg_dictionary = Dictionary(
        [word.strip() for word in open(args.trg_vocabulary).readlines()])

    def init_weights(m):
        # Normal(0, 0.01) for weight tensors, zeros for everything else.
        for name, param in m.named_parameters():
            if 'weight' in name:
                nn.init.normal_(param.data, mean=0, std=0.01)
            else:
                nn.init.constant_(param.data, 0)

    if not args.model_init_path:
        # Build the model from scratch and initialize its weights.
        attn = Attention(args.enc_hid_dim, args.dec_hid_dim)
        enc = Encoder(src_dictionary.size(), args.embedding_size,
                      args.enc_hid_dim, args.dec_hid_dim, args.dropout,
                      src_dictionary.PAD)
        dec = Decoder(trg_dictionary.size(), args.embedding_size,
                      args.enc_hid_dim, args.dec_hid_dim, args.dropout, attn)
        s2s = Seq2Seq(enc, dec, src_dictionary.PAD, device)
        parallel_model = Parser(src_dictionary, trg_dictionary, s2s, device)
        parallel_model.apply(init_weights)
    else:
        print(f"load init model from {args.model_init_path}")
        parallel_model = torch.load(args.model_init_path)
    parallel_model = parallel_model.to(device)

    if args.type == TEST:
        test_dataset = treeDataset(args.validation_src_path,
                                   args.validation_trg_path)
        test_dataloader = DataLoader(test_dataset,
                                     shuffle=False,
                                     batch_size=args.validate_batch_size,
                                     collate_fn=collate_fn)
        hit, total, acc = evaluate_iter_loss2(parallel_model, test_dataloader,
                                              src_dictionary, trg_dictionary,
                                              device)
        print(f'hit: {hit: d} | total: {total: d} | acc: {acc: f}', flush=True)
    elif args.type == INFERENCE:
        test_dataset = customDataset(args.validation_src_path,
                                     args.validation_trg_path)
        test_dataloader = DataLoader(test_dataset,
                                     shuffle=False,
                                     batch_size=args.validate_batch_size)
        hit, total, acc = evaluate_iter_acc(parallel_model, test_dataloader,
                                            src_dictionary, trg_dictionary,
                                            device, args.inference_output)
        print(f'hit: {hit: d} | total: {total: d} | acc: {acc: f}', flush=True)
    elif args.type == DEBUG:
        test_dataset = treeDataset(args.validation_src_path,
                                   args.validation_trg_path)
        test_dataloader = DataLoader(test_dataset,
                                     shuffle=False,
                                     batch_size=args.validate_batch_size,
                                     collate_fn=collate_fn)
        hit, total, acc = debug_iter(parallel_model, test_dataloader,
                                     src_dictionary, trg_dictionary, device)
        print(f'hit: {hit: d} | total: {total: d} | acc: {acc: f}', flush=True)
    else:
        train_dataset = treeDataset(args.src_path, args.trg_path)
        train_dataloader = DataLoader(train_dataset,
                                      shuffle=True,
                                      batch_size=args.batch_size,
                                      collate_fn=collate_fn)
        test_dataset = treeDataset(args.validation_src_path,
                                   args.validation_trg_path)
        test_dataloader = DataLoader(test_dataset,
                                     shuffle=False,
                                     batch_size=args.validate_batch_size,
                                     collate_fn=collate_fn)
        train(src_dictionary, trg_dictionary, train_dataloader,
              test_dataloader, parallel_model, device, args)
def train(self, src_emb, tgt_emb):
    """Adversarially train a generator mapping source embeddings into the
    target embedding space, against a discriminator (GAN-style bilingual
    lexicon induction), optionally with attention over kNN context and a
    rank predictor. Returns the trained generator.

    Args:
        src_emb: source-language embedding layer (has .weight).
        tgt_emb: target-language embedding layer (has .weight).
    """
    params = self.params
    suffix_str = params.suffix_str
    # Load data
    if not os.path.exists(params.data_dir):
        # NOTE(review): raising a plain str is a TypeError at runtime;
        # this should be e.g. `raise FileNotFoundError(...)`.
        raise "Data path doesn't exists: %s" % params.data_dir
    en = src_emb
    it = tgt_emb
    params = _get_eval_params(params)
    # `eval` here is a project module (shadows the builtin).
    evaluator = eval.Evaluator(params,
                               src_emb.weight.data,
                               tgt_emb.weight.data,
                               use_cuda=True)
    if params.context > 0:
        # Reuse a cached kNN list when available; otherwise compute it.
        try:
            knn_list = pickle.load(
                open('full_knn_list_' + suffix_str + '.pkl', 'rb'))
        except FileNotFoundError:
            knn_list = get_knn_embedding(params,
                                         src_emb,
                                         suffix_str,
                                         context=params.context,
                                         method='csls',
                                         use_cuda=True)
        self.knn_emb = convert_to_embeddings(knn_list, use_cuda=True)
    for _ in range(params.num_random_seeds):
        # Create models
        g = Generator(input_size=params.g_input_size,
                      hidden_size=params.g_hidden_size,
                      output_size=params.g_output_size,
                      hyperparams=get_hyperparams(params, disc=False))
        d = Discriminator(input_size=params.d_input_size,
                          hidden_size=params.d_hidden_size,
                          output_size=params.d_output_size,
                          hyperparams=get_hyperparams(params, disc=True))
        a = Attention(atype=params.atype,
                      input_size=2 * params.g_input_size,
                      hidden_size=params.a_hidden_size)
        # Rank predictor classifies the (log-scaled) frequency rank bucket.
        r_p = RankPredictor(
            input_size=params.g_output_size,
            output_size=int(
                np.floor(np.log(params.most_frequent_sampling_size)) + 1),
            hidden_size=params.d_hidden_size // 4,
            leaky_slope=params.leaky_slope)
        if params.initialize_prev_best == 1 and params.context in [0, 2]:
            # Warm-start the generator from a previously saved best model.
            prev_best_model_file_path = os.path.join(
                params.model_dir, params.prev_best_model_fname)
            g.load_state_dict(
                torch.load(prev_best_model_file_path, map_location='cpu'))
            print(g.map1.weight.data)
        if params.seed > 0:
            seed = params.seed
        else:
            seed = random.randint(0, 1000)
        # init_xavier(g)
        # init_xavier(d)
        self.initialize_exp(seed)
        # Define loss function and optimizers
        loss_fn = torch.nn.BCELoss()
        r_p_loss_fn = torch.nn.CrossEntropyLoss()
        d_optimizer = optim.SGD(d.parameters(), lr=params.d_learning_rate)
        g_optimizer = optim.SGD(g.parameters(), lr=params.g_learning_rate)
        # NOTE(review): r_p_optimizer optimizes g.parameters(), not
        # r_p.parameters() — looks like a copy-paste bug; confirm.
        r_p_optimizer = optim.SGD(g.parameters(), lr=params.g_learning_rate)
        if params.atype in ['mlp', 'bilinear']:
            a_optimizer = optim.SGD(a.parameters(),
                                    lr=params.g_learning_rate)
        if torch.cuda.is_available():
            # Move the network and the optimizer to the GPU
            g = g.cuda()
            d = d.cuda()
            a = a.cuda()
            r_p = r_p.cuda()
            loss_fn = loss_fn.cuda()
            r_p_loss_fn = r_p_loss_fn.cuda()
        # Regularization loss: L2 norm of the rows past index 300 of the
        # first generator parameter (only i == 0 is used).
        reg_loss = 0
        for i, p in enumerate(g.parameters()):
            if i > 0:
                break
            pred = p.transpose(0, 1)[300:, :]
            reg_loss += pred.norm(2)
        factor = 1e-2
        reg_loss = reg_loss.cuda()
        # true_dict = get_true_dict(params.data_dir)
        d_acc_epochs = []
        g_loss_epochs = []
        # logs for plotting later
        log_file = open(
            "log_{}_{}_{}.txt".format(self.params.src_lang,
                                      self.params.tgt_lang, seed),
            "w")  # Being overwritten in every loop, not really required
        log_file.write("epoch, dis_loss, dis_acc, g_loss, acc, acc_new\n")
        try:
            for epoch in range(params.num_epochs):
                d_losses = []
                g_losses = []
                rank_losses = []
                hit = 0
                total = 0
                start_time = timer()
                for mini_batch in range(
                        0, params.iters_in_epoch // params.mini_batch_size):
                    # W_orig = g.map1.weight.data
                    # print(W_orig)
                    # --- discriminator steps ---
                    for d_index in range(params.d_steps):
                        d_optimizer.zero_grad()  # Reset the gradients
                        d.train()
                        input, output = self.get_batch_data_fast(
                            en, it, g, a, detach=True)
                        pred = d(input)
                        d_loss = loss_fn(pred, output)
                        d_loss.backward(
                        )  # compute/store gradients, but don't change params
                        d_losses.append(d_loss.data.cpu().numpy())
                        # First half of the batch is fake (>= 0.5 correct),
                        # second half real (< 0.5 correct) — per this code's
                        # accounting convention.
                        discriminator_decision = pred.data.cpu().numpy()
                        hit += np.sum(
                            discriminator_decision[:params.mini_batch_size]
                            >= 0.5)
                        hit += np.sum(
                            discriminator_decision[params.mini_batch_size:]
                            < 0.5)
                        d_optimizer.step(
                        )  # Only optimizes D's parameters; changes based on stored gradients from backward()
                        # Clip weights
                        _clip(d, params.clip_value)
                        # NOTE(review): np.asscalar was removed in NumPy 1.23;
                        # use .item() instead.
                        sys.stdout.write(
                            "[%d/%d] :: Discriminator Loss: %f \r" %
                            (mini_batch,
                             params.iters_in_epoch // params.mini_batch_size,
                             np.asscalar(np.mean(d_losses))))
                        sys.stdout.flush()
                    total += 2 * params.mini_batch_size * params.d_steps
                    # --- generator steps ---
                    for g_index in range(params.g_steps):
                        # 2. Train G on D's response (but DO NOT train D on these labels)
                        g_optimizer.zero_grad()
                        d.eval()
                        if params.use_rank_predictor > 0:
                            input, output, true_ranks = self.get_batch_data_fast(
                                en,
                                it,
                                g,
                                a,
                                detach=False,
                                use_rank_predictor=True)
                        else:
                            input, output = self.get_batch_data_fast(
                                en, it, g, a, detach=False)
                        pred = d(input)
                        # Flip labels so G is rewarded for fooling D.
                        g_loss = loss_fn(pred, 1 - output)
                        g_loss += factor * reg_loss
                        # NOTE(review): both branches are identical — the
                        # conditional is redundant as written.
                        if params.use_rank_predictor > 0:
                            g_loss.backward(retain_graph=True)
                        else:
                            g_loss.backward(retain_graph=True)
                        g_optimizer.step()  # Only optimizes G's parameters
                        if params.atype in ['mlp', 'bilinear']:
                            a_optimizer.step()
                        g_losses.append(g_loss.data.cpu().numpy())
                        if params.use_rank_predictor > 0:
                            # First half of input are the transformed embeddings
                            fake_input = input[:len(input) // 2]
                            rank_predictions = r_p(fake_input)
                            rank_loss = r_p_loss_fn(rank_predictions,
                                                    true_ranks)
                            rank_loss.backward()
                            r_p_optimizer.step()
                            rank_losses.append(rank_loss.data.cpu().numpy())
                        # Orthogonalize
                        if params.context == 1:
                            pass
                            # for i, p in enumerate(g.parameters()):
                            # print("%d: " % i)
                            # print(p.shape)
                            # W_orig = g.map1.weight.data
                            # print(W_orig)
                            # print(W_orig)
                            # W_top = W_orig[:300, :300]
                            # W_bottom = W_orig[300:, 300:]
                            # print(W_top)
                            # print(W_bottom)
                            # self.orthogonalize(g.map2.weight.data)
                        else:
                            self.orthogonalize(g.map1.weight.data)
                        if params.use_rank_predictor > 0:
                            sys.stdout.write(
                                "[%d/%d] :: Generator Loss: %f , Rank Loss: %f \r"
                                % (mini_batch, params.iters_in_epoch //
                                   params.mini_batch_size,
                                   np.asscalar(np.mean(g_losses)),
                                   np.asscalar(np.mean(rank_losses))))
                        else:
                            sys.stdout.write(
                                "[%d/%d] :: Generator Loss: %f \r" %
                                (mini_batch, params.iters_in_epoch //
                                 params.mini_batch_size,
                                 np.asscalar(np.mean(g_losses))))
                        sys.stdout.flush()
                d_acc_epochs.append(hit / total)
                g_loss_epochs.append(np.asscalar(np.mean(g_losses)))
                print(
                    "Epoch {} : Discriminator Loss: {:.5f}, Discriminator Accuracy: {:.5f}, Generator Loss: {:.5f}, Time elapsed {:.2f} mins"
                    .format(epoch, np.asscalar(np.mean(d_losses)),
                            hit / total, np.asscalar(np.mean(g_losses)),
                            (timer() - start_time) / 60))
                # lr decay
                g_optim_state = g_optimizer.state_dict()
                old_lr = g_optim_state['param_groups'][0]['lr']
                g_optim_state['param_groups'][0]['lr'] = max(
                    old_lr * params.lr_decay, params.lr_min)
                g_optimizer.load_state_dict(g_optim_state)
                print("Changing the learning rate: {} -> {}".format(
                    old_lr, g_optim_state['param_groups'][0]['lr']))
                d_optim_state = d_optimizer.state_dict()
                d_optim_state['param_groups'][0]['lr'] = max(
                    d_optim_state['param_groups'][0]['lr'] * params.lr_decay,
                    params.lr_min)
                d_optimizer.load_state_dict(d_optim_state)
                if (epoch + 1) % params.print_every == 0:
                    # No need for discriminator weights
                    # torch.save(d.state_dict(), 'discriminator_weights_en_es_{}.t7'.format(epoch))
                    if params.context > 0:
                        indices = torch.arange(
                            params.top_frequent_words).type(torch.LongTensor)
                        indices = to_cuda(indices, use_cuda=True)
                        all_precisions = evaluator.get_all_precisions(
                            g(
                                construct_input(self.knn_emb,
                                                indices,
                                                en,
                                                a,
                                                atype=params.atype,
                                                context=params.context,
                                                use_cuda=True)).data)
                    else:
                        all_precisions = evaluator.get_all_precisions(
                            g(src_emb.weight).data)
                    #print(json.dumps(all_precisions))
                    p_1 = all_precisions['validation']['adv'][
                        'without-ref']['nn'][1]
                    p_1_new = all_precisions['validation-new']['adv'][
                        'without-ref']['nn'][1]
                    log_file.write(
                        "{},{:.5f},{:.5f},{:.5f},{:.5f},{:.5f}\n".format(
                            epoch + 1, np.asscalar(np.mean(d_losses)),
                            hit / total, np.asscalar(np.mean(g_losses)),
                            p_1, p_1_new))
                    #log_file.write(str(all_precisions) + "\n")
                    # Saving generator weights
                    torch.save(
                        g.state_dict(), 'generator_weights_' + suffix_str +
                        '_seed_{}_mf_{}_lr_{}_p@1_{:.3f}.t7'.format(
                            seed, epoch, params.g_learning_rate, p_1))
                    if params.atype in ['mlp', 'bilinear']:
                        # NOTE(review): same filename pattern as the generator
                        # save above — the attention checkpoint overwrites it.
                        torch.save(
                            a.state_dict(), 'generator_weights_' +
                            suffix_str +
                            '_seed_{}_mf_{}_lr_{}_p@1_{:.3f}.t7'.format(
                                seed, epoch, params.a_learning_rate, p_1))
            # Save the plot for discriminator accuracy and generator loss
            fig = plt.figure()
            plt.plot(range(0, params.num_epochs),
                     d_acc_epochs,
                     color='b',
                     label='discriminator')
            plt.plot(range(0, params.num_epochs),
                     g_loss_epochs,
                     color='r',
                     label='generator')
            plt.ylabel('accuracy/loss')
            plt.xlabel('epochs')
            plt.legend()
            fig.savefig('d_g.png')
        except KeyboardInterrupt:
            print("Interrupted.. saving model !!!")
            torch.save(g.state_dict(), 'g_model_interrupt.t7')
            torch.save(d.state_dict(), 'd_model_interrupt.t7')
            if params.atype in ['mlp', 'bilinear']:
                torch.save(a.state_dict(), 'a_model_interrupt.t7')
            log_file.close()
            exit()
        log_file.close()
    # NOTE(review): returns the generator from the last completed seed loop.
    return g
shuffle=True, **loader_kwargs) test_loader = data_utils.DataLoader(MnistBags( xor_numbers=[7, 9], mean_bag_length=args.mean_bag_length, var_bag_length=args.var_bag_length, num_bag=args.num_bags_test, seed=args.seed, train=False), batch_size=1, shuffle=False, **loader_kwargs) print('Init Model') model = Attention(args.self_att) if args.cuda: model.cuda() optimizer = optim.Adam(model.parameters(), lr=args.lr, betas=(0.9, 0.999), weight_decay=args.reg) def train(epoch, sw): model.train() train_loss = 0. train_error = 0. for batch_idx, (data, label) in enumerate(train_loader): bag_label = label[0]
def test(config_path):
    """Load the trained ranking model and evaluate it on the held-out
    MQ2007 fold configured in *config_path*.

    Fold f tests on split S{(f mod 5) ... }: 1->S5, 2->S1, 3->S2, 4->S3,
    5->S4. Previously all five splits were globbed up-front and an
    if/elif ladder picked one; now only the needed split is globbed.

    Raises:
        ValueError: if the configured fold is not in 1..5.
    """
    # Load the parameters
    param_dict, rep_param_dict = load_params(config_path)
    model_base_path = param_dict['model_base_path']
    model_name_str = param_dict['model_name_str']
    fold = param_dict["fold"]
    # Table-driven fold -> held-out split selection.
    split_by_fold = {1: 'S5', 2: 'S1', 3: 'S2', 4: 'S3', 5: 'S4'}
    if fold not in split_by_fold:
        raise ValueError("wrong fold num {}".format(fold))
    split = split_by_fold[fold]
    test_dir = '{}/MQ2007/{}/'.format(param_dict["data_base_path"], split)
    # glob returns a path list; only the single testdata0.pkl is used.
    test_files = glob.glob("{}/testdata0.pkl".format(test_dir))[0]
    rel_path = '{}/{}/tmp/test/{}.qrels'.format(model_base_path,
                                                model_name_str, split)
    test_data = load_dataset(test_files)
    # Model hyper-parameters from the config.
    q_len = param_dict['q_len']
    d_len = param_dict['d_len']
    emb_size = param_dict['emb_size']
    num_heads = param_dict['num_heads']
    kernel_size = rep_param_dict['kernel_size']
    filt_size = rep_param_dict['filt_size']
    vocab_size = param_dict['vocab_size']
    output_dim = rep_param_dict['output_dim']
    hidden_size = param_dict['hidden_size']
    batch_size = param_dict['batch_size']
    preemb = param_dict['preemb']
    emb_path = param_dict['emb_path']
    hinge_margin = param_dict['hinge_margin']
    model = Attention(emb_size=emb_size,
                      query_length=q_len,
                      doc_length=d_len,
                      num_heads=num_heads,
                      kernel_size=kernel_size,
                      filter_size=filt_size,
                      vocab_size=vocab_size,
                      dropout=0.0,
                      qrep_dim=output_dim,
                      hidden_size=hidden_size,
                      batch_size=batch_size,
                      preemb=preemb,
                      emb_path=emb_path).cuda()
    # Test
    # load model from file
    model_file = '{}/{}/saves/model_file'.format(model_base_path,
                                                 model_name_str)
    model.load_state_dict(torch.load(model_file))
    print("loaded model, and perform test now")
    MAP, NDCGs = evaluate(config_path, model, test_data, rel_path,
                          mode="test")
    print(MAP, NDCGs)
# Hyper-parameters and training loop for the baseline attention classifier.
# Free names (crit, trainds, valds, batch_size, device, ...) are defined
# earlier in the file, outside this chunk.
drop = 0.6
# NOTE(review): this string is overwritten by the Classifier instance below
# and is otherwise unused.
model = 'LSTM'
bi = True
criterion = crit
traindl, valdl = data.BucketIterator.splits(datasets=(trainds, valds),
                                            batch_size=batch_size,
                                            sort_key=lambda x: len(x.text),
                                            device=device,
                                            sort_within_batch=True,
                                            repeat=False)
train_iter, val_iter = traindl, valdl
TEXT, LABEL = text_field, label_field
# padding_idx=1 — presumably the field's pad token id; TODO confirm.
# max_norm=1 renormalizes embedding rows to unit norm.
embedding = nn.Embedding(ntokens, emsize, padding_idx=1, max_norm=1)
if vectors:
    embedding.weight.data.copy_(TEXT.vocab.vectors)
encoder = Encoder(emsize,
                  hidden,
                  nlayers=nlayers,
                  dropout=drop,
                  bidirectional=bi)
# A bidirectional encoder exposes 2*hidden features per time step.
attention_dim = hidden if not bi else 2*hidden
attention = Attention(attention_dim, attention_dim, attention_dim)
model = Classifier(embedding,
                   encoder,
                   attention,
                   attention_dim,
                   nlabels,
                   baseline=True).to(device)
# Re-assigned with the same value as above; harmless but redundant.
criterion = crit
optimizer = torch.optim.Adam(model.parameters(), lr, amsgrad=True)
for epoch in range(1, epochs + 1):
    m = train(epoch, model, train_iter, val_iter, optimizer, criterion)
def build_model(sentence_len, max_words, doc_embedding, sent_embedding):
    """Construct the multi-task extractor/classifier Keras model.

    Two sub-networks share a sentence-matcher connection:
      * extractor — hierarchical attention over a document of up to 100
        sentences, emitting a per-sentence sigmoid score;
      * classifier — attention over a single sentence, concatenated with
        the extractor's encoding of the sentence selected by ``class_ids``.

    Args:
        sentence_len: fixed token length of every sentence.
        max_words: vocabulary size for both embedding layers.
        doc_embedding: pretrained weight matrix for the extractor embedding.
        sent_embedding: pretrained weight matrix for the classifier embedding.

    Returns:
        Compiled ``Model`` with inputs [class_input, texts_in, class_ids]
        and outputs [classifier, extractor].

    NOTE(review): the compile() loss/metric keys 'dense_1' and
    'time_distributed_2' rely on Keras auto-generated layer names, so
    layer-creation order in this function must not change.
    NOTE(review): ``n_classes`` is read from module scope — confirm it is
    defined before this is called.
    """
    l2_reg = regularizers.l2(1e-6)
    l1_l2reg = regularizers.l1_l2(1e-5)
    ## word encoder - extractor
    sentence_in = Input(shape=(sentence_len, ), dtype="int32")
    embedded_word_seq = Embedding(max_words,
                                  300,
                                  input_length=sentence_len,
                                  trainable=False,
                                  weights=[doc_embedding])(sentence_in)
    word_encoder = Bidirectional(
        GRU(50, return_sequences=True,
            kernel_regularizer=l2_reg))(embedded_word_seq)
    dense_transform_w = Dense(100,
                              activation="relu",
                              name="dense_transform_w",
                              kernel_regularizer=l2_reg)(word_encoder)
    # Sentence encoder re-used per sentence via TimeDistributed below.
    attn_weighted_sent = Model(
        sentence_in,
        Attention(name='word_attention', regularizer=l2_reg)(dense_transform_w))
    attn_weighted_sent.summary()
    # Inputs - sentence encoder - extractor
    class_input = Input(shape=(sentence_len, ), dtype="int32", name="CL_input")
    class_ids = Input(shape=(1, ), dtype="int32", name="CL_IDs")
    texts_in = Input(shape=(100, sentence_len), dtype='int32')
    # sentence encoder - extractor
    attention_weighted_sentences = TimeDistributed(
        attn_weighted_sent, name="EX_sent_attn")(texts_in)
    sentence_encoder = Bidirectional(
        GRU(50, return_sequences=True,
            name="sentence_encoder"))(attention_weighted_sentences)
    # Select the encoding of the sentence indexed by class_ids; it is fed to
    # the classifier branch as extra context.
    sentence_matcher = Lambda(lambda x: x[:, tf.squeeze(class_ids), :],
                              output_shape=(100, ))(sentence_encoder)
    dense_transform_s = TimeDistributed(
        Dense(100,
              activation='relu',
              name='EX_sent_dense',
              kernel_regularizer=l1_l2reg))(sentence_encoder)
    dropout_extractor = Dropout(0.5)(dense_transform_s)
    output_extractor = TimeDistributed(
        Dense(1, activation="sigmoid", name="EX_out"))(dropout_extractor)
    # sentence classifier
    embedded_words = Embedding(max_words,
                               300,
                               input_length=sentence_len,
                               trainable=False,
                               name="CL_embed",
                               weights=[sent_embedding])(class_input)
    rnn = Bidirectional(
        GRU(50, return_sequences=True, name="CL_RNN",
            kernel_regularizer=l2_reg))(embedded_words)
    dense_w = TimeDistributed(
        Dense(100, kernel_regularizer=l2_reg, name="CL_dense"))(rnn)
    attn = AttentionWithContext(name="CL_attn")(dense_w)
    merge_layer = concatenate([attn, sentence_matcher], name="CL_merging")
    output_classifier = Dense(n_classes, activation="sigmoid")(merge_layer)
    model = Model(inputs=[class_input, texts_in, class_ids],
                  outputs=[output_classifier, output_extractor])
    model.summary()
    model.compile(optimizer=Adam(lr=0.0002),
                  loss={
                      'dense_1': 'binary_crossentropy',
                      'time_distributed_2': 'binary_crossentropy'
                  },
                  metrics={
                      'dense_1': [top_1_accuracy, top_3_accuracy],
                      'time_distributed_2': ['acc']
                  })
    return (model)
def main(reader, params):
    """Run attention-MIL co-localization evaluation over a problem stream.

    For every problem emitted by ``reader.get_data()`` this trains a fresh
    ``Attention`` multiple-instance model on the problem's bags, scores the
    first ``params.k`` positive bags with the learned attention weights,
    post-processes them against the ground-truth boxes, and finally prints
    aggregate metrics.

    Args:
        reader: object whose ``get_data()`` yields per-problem tensor lists.
        params: config namespace (``k``, ``neg``, ``dataset``, ``cuda``,
            ``lr``, ``reg``, ``eval_num``).

    NOTE(review): the epoch loop reads ``args.epochs`` (module global), not
    ``params.epochs`` — confirm that is intentional.  ``loader_kwargs`` and
    ``add_results`` are also module-level names.
    """
    k_shot = params.k
    num_negative_bags = params.neg
    total_bags = k_shot + num_negative_bags
    result_lists = {}
    # Feature dimensionality depends on the backbone used per dataset.
    input_dim = 256 if params.dataset == 'omniglot' else 640
    for i, tensor_data in enumerate(reader.get_data()):
        if (i + 1) % 100 == 0:
            print('Evaluating problem number %d/%d' % (i + 1, params.eval_num))
        # First six entries are fixed; the rest are per-bag box/class lists.
        [feas, fea_boxes, fea_target_classes, fea_classes, imgs,
         target_class] = tensor_data[0:6]
        boxes_list = tensor_data[6:6 + total_bags]
        class_list = tensor_data[6 + total_bags:]
        bags = np.squeeze(feas)
        input_labels = fea_target_classes.astype(np.int64)
        train_loader = data_utils.DataLoader(ImageBags(bags=bags,
                                                       labels=input_labels),
                                             batch_size=1,
                                             shuffle=True,
                                             **loader_kwargs)
        test_loader = data_utils.DataLoader(ImageBags(bags=bags,
                                                      labels=input_labels),
                                            batch_size=1,
                                            shuffle=False,
                                            **loader_kwargs)
        # A fresh model per problem: this is per-problem optimization, not
        # cross-problem learning.
        model = Attention(input_dim=input_dim)
        if params.cuda:
            model.cuda()
        optimizer = optim.Adam(model.parameters(),
                               lr=params.lr,
                               betas=(0.9, 0.999),
                               weight_decay=params.reg)

        def train(epoch):
            """One optimization pass over all bags of the current problem."""
            model.train()
            train_loss = 0.
            for batch_idx, (data, label) in enumerate(train_loader):
                bag_label = label[0]
                if params.cuda:
                    data, bag_label = data.cuda(), bag_label.cuda()
                data, bag_label = Variable(data), Variable(bag_label)
                # reset gradients
                optimizer.zero_grad()
                # calculate loss and metrics
                loss, _ = model.calculate_objective(data, bag_label)
                # BUG FIX: loss.data[0] was removed from PyTorch (>=0.5);
                # .item() is the supported way to read a 0-dim tensor.
                train_loss += loss.item()
                # backward pass
                loss.backward()
                # step
                optimizer.step()
            train_loss /= len(train_loader)

        def test():
            """Score the first k bags with attention weights; build result dict."""
            model.eval()
            test_loss = 0.
            scores = np.zeros_like(fea_classes[:params.k])
            for batch_idx, (data, label) in enumerate(test_loader):
                bag_label = label[0]
                if params.cuda:
                    data, bag_label = data.cuda(), bag_label.cuda()
                data, bag_label = Variable(data), Variable(bag_label)
                loss, attention_weights = model.calculate_objective(
                    data, bag_label)
                test_loss += loss.item()  # BUG FIX: was loss.data[0]
                # Only the first k (positive) bags contribute scores.
                if batch_idx < params.k:
                    scores[batch_idx] = attention_weights.cpu().data.numpy()[0]
            res = {
                'boxes': fea_boxes[:params.k],
                'classes': np.ones_like(fea_classes[:params.k]),
                'scores': scores,
                'class_agnostic': True
            }
            return res

        gt = {}
        gt['boxes'] = boxes_list[:params.k]
        gt['classes'] = class_list[:params.k]
        gt['target_class'] = target_class
        # NOTE(review): args (module global), not params — see docstring.
        for epoch in range(1, args.epochs + 1):
            train(epoch)
        res = test()
        result_dict = {'groundtruth': gt, 'atnmil': res}
        from rcnn_attention import evaluator
        evaluator._postprocess_result_dict(result_dict)
        result_dict.pop('groundtruth')
        add_results(result_dict, result_lists)
        if i + 1 == params.eval_num:
            break
    metrics = {}
    from rcnn_attention import eval_util
    for method, result_list in result_lists.items():
        m = eval_util.evaluate_coloc_results(result_list, None)
        metrics[method] = m
    for k, v in metrics.items():
        print('{}: {}'.format(k, v))
        # Tail of an evaluation/training loop whose `def` begins before this
        # chunk — indentation reconstructed; verify against the full file.
        epoch_loss += loss.item()
    # Mean loss over all batches of the iterator.
    return epoch_loss / len(iterator)


def epoch_time(start_time, end_time):
    """Split an elapsed interval (seconds) into whole minutes and seconds.

    Args:
        start_time: epoch start, e.g. from ``time.time()``.
        end_time: epoch end, same clock.

    Returns:
        Tuple ``(elapsed_mins, elapsed_secs)`` of ints.
    """
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs


if __name__ == "__main__":
    # Assemble an attention seq2seq model from project config and data.
    dataset = Dataset()
    train_data, valid_data, test_data, INPUT_DIM, OUTPUT_DIM = dataset.get_data()
    attention = Attention(config.ENC_HID_DIM, config.DEC_HID_DIM)
    encoder = Encoder(INPUT_DIM, config.ENC_EMB_DIM, config.ENC_HID_DIM,
                      config.DEC_HID_DIM, config.N_LAYERS, config.ENC_DROPOUT)
    decoder = Decoder(OUTPUT_DIM, config.DEC_EMB_DIM, config.ENC_HID_DIM,
                      config.DEC_HID_DIM, config.N_LAYERS, config.DEC_DROPOUT,
                      attention)
    seq2seq = Seq2Seq(encoder, decoder, config.device).to(config.device)
    print(seq2seq)
    optimizer = optim.Adam(seq2seq.parameters())
    # Ignore padding positions in the loss.
    PAD_IDX = config.target.vocab.stoi['<pad>']
    criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
    train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
        (train_data, valid_data, test_data),
        batch_size=config.BATCH_SIZE,
        device=config.device)
    N_EPOCHS = 10
    best_valid_loss = float('inf')
# Demo script: inspect scaled dot-product attention on tiny hand-built
# tensors (one-hot-ish queries/keys make the attention pattern obvious).
import torch
from torch import nn
import numpy as np
from model import Attention, Encoder

mha = Attention(d_model=512, num_heads=8, p=0)
encoder = Encoder(d_model=512, num_heads=8, conv_hidden_dim=128)


def print_out(Q, K, V):
    """Run mha's scaled dot-product attention on (Q, K, V) and print both
    the attention weights and the attended output."""
    temp_out, temp_attn = mha.scaled_dot_product_attention(Q, K, V)
    print('Attention weights are:', temp_attn.squeeze())
    print('Output is:', temp_out.squeeze())


# [None, None] adds batch and head dims: shape (1, 1, seq, d).
test_K = torch.tensor([[10, 0, 0], [0, 10, 0], [0, 0, 10],
                       [0, 0, 10]]).float()[None, None]
test_V = torch.tensor([[1, 0, 0], [10, 0, 0], [100, 5, 0],
                       [1000, 6, 0]]).float()[None, None]
test_Q = torch.tensor([[0, 0, 10], [0, 10, 0], [10, 10, 0]]).float()[None, None]
print_out(test_Q, test_K, test_V)
def train(): with tf.device('/gpu:0'): global checkpoint_dir train_sent1_word_index, train_sent1_dist_index, train_trigger1_word_index, train_trigger1_dist_index, train_sent2_word_index, train_sent2_dist_index, train_trigger2_word_index, train_trigger2_dist_index, train_label, train_trigger_common, train_time_diff, train_test_label = get_data(train_sents) # test_sent1_word_index, test_sent1_dist_index, test_trigger1_word_index, test_trigger1_dist_index, test_sent2_word_index, test_sent2_dist_index, test_trigger2_word_index, test_trigger2_dist_index, test_label, test_trigger_common, test_time_diff = get_data(test_sents) vocab_count = embedding_matrix.shape[0] print(vocab_count) ##------------------------------------------------------------------------------------------------ ## PADDING DATA for i in range(len(train_sent1_word_index)): train_sent1_word_index[i] = np.pad(train_sent1_word_index[i], pad_width=(0,FLAGS.max_sentence_length - len(train_sent1_word_index[i])), mode='constant', constant_values=(0, FLAGS.max_sentence_length)) for i in range(len(train_sent1_dist_index)): train_sent1_dist_index[i] = np.pad(train_sent1_dist_index[i], pad_width=(0,FLAGS.max_sentence_length - len(train_sent1_dist_index[i])), mode='constant', constant_values=(0, FLAGS.max_sentence_length)) for i in range(len(train_trigger1_word_index)): train_trigger1_word_index[i] = np.pad(train_trigger1_word_index[i], pad_width=(0,FLAGS.max_trigger_length - len(train_trigger1_word_index[i])), mode='constant', constant_values=(0, FLAGS.max_trigger_length)) for i in range(len(train_trigger1_word_index)): train_trigger1_dist_index[i] = np.pad(train_trigger1_dist_index[i], pad_width=(0,FLAGS.max_trigger_length - len(train_trigger1_dist_index[i])), mode='constant', constant_values=(0, FLAGS.max_trigger_length)) for i in range(len(train_sent2_word_index)): train_sent2_word_index[i] = np.pad(train_sent2_word_index[i], pad_width=(0,FLAGS.max_sentence_length - len(train_sent2_word_index[i])), 
mode='constant', constant_values=(0, FLAGS.max_sentence_length)) for i in range(len(train_sent2_dist_index)): train_sent2_dist_index[i] = np.pad(train_sent2_dist_index[i], pad_width=(0,FLAGS.max_sentence_length - len(train_sent2_dist_index[i])), mode='constant', constant_values=(0, FLAGS.max_sentence_length)) for i in range(len(train_trigger2_word_index)): train_trigger2_word_index[i] = np.pad(train_trigger2_word_index[i], pad_width=(0,FLAGS.max_trigger_length - len(train_trigger2_word_index[i])), mode='constant', constant_values=(0, FLAGS.max_trigger_length)) for i in range(len(train_trigger2_word_index)): train_trigger2_dist_index[i] = np.pad(train_trigger2_dist_index[i], pad_width=(0,FLAGS.max_trigger_length - len(train_trigger2_dist_index[i])), mode='constant', constant_values=(0, FLAGS.max_trigger_length)) print("doneee") ##----------------------------------------------------------------------------------- # TRAINING DATA with tf.Graph().as_default(): session_conf = tf.ConfigProto() sess = tf.Session(config=session_conf) with sess.as_default(): # sequence_length, trigger_length, num_classes, vocab_size, word_embedding_size ,dist_embedding_size, hidden_size, attention_size, coref_size, decay_rate ##--------- CREATE MODEL---------- model = Attention(sequence_length= FLAGS.max_sentence_length, trigger_length = FLAGS.max_trigger_length, num_classes = 2, vocab_size = vocab_count, word_embedding_size = 100, dist_embedding_size = 14, hidden_size = FLAGS.hidden_size, attention_size = 128, co_ref_size = 128, ) global_step = tf.Variable(0, name="global_step", trainable=False) train_op = tf.train.AdamOptimizer(FLAGS.learning_rate).minimize(model.loss, global_step=global_step) timestamp = str(int(time.time())) out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs")) print("Writing to {}\n".format(out_dir)) loss_summary = tf.summary.scalar("loss", model.loss) acc_summary = tf.summary.scalar("accuracy", model.accuracy) # Train Summaries train_summary_op = 
tf.summary.merge([loss_summary, acc_summary]) train_summary_dir = os.path.join(out_dir, "summaries", "train") train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph) checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints")) checkpoint_prefix = os.path.join(checkpoint_dir, "model") if not os.path.exists(checkpoint_dir): os.makedirs(checkpoint_dir) saver = tf.train.Saver(tf.global_variables()) print("-----------------------------------------------------------------------------------------------") print(checkpoint_dir) sess.run(tf.global_variables_initializer()) sess.run(model.W_em1.assign(embedding_matrix)) #sess.run(model.embedding_init, feed_dict = {model.embedding_placeholder: embedding_matrix}) batches = make_batches(train_sent1_word_index, train_sent1_dist_index, train_trigger1_word_index, train_trigger1_dist_index, train_sent2_word_index, train_sent2_dist_index, train_trigger2_word_index, train_trigger2_dist_index, train_label,train_trigger_common,train_time_diff, train_test_label) print(len(batches)) ##------------ TRAIN BATCHES -------------- for i in range(0,1): print("Epoch number: " + str(i)) for batch in batches: # print(len(batches)) #print(batch[9]) feed_dict = { model.input1_text1: batch[0], model.input1_text2: batch[1], model.trigger1_text1: batch[2], model.trigger1_text2: batch[3], model.input2_text1: batch[4], model.input2_text2: batch[5], model.trigger2_text1: batch[6], model.trigger2_text2: batch[7], model.labels: batch[8], model.V_w: batch[10], model.V_d: batch[11], model.bsz_size: len(batch[0]) } _, step, summaries, loss, accuracy = sess.run([train_op, global_step, train_summary_op,model.loss, model.accuracy], feed_dict) #print(W_em1[0]) train_summary_writer.add_summary(summaries, step) if step % FLAGS.display_every == 0: time_str = datetime.datetime.now().isoformat() print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy)) print(step) if step % 100 == 0: path = saver.save(sess, 
checkpoint_prefix, global_step=step) print("Saved model checkpoint to {}\n".format(path)) #break # if step % FLAGS.evaluate_every == 0: # print("\nEvaluation:") path = saver.save(sess, checkpoint_prefix, global_step=step) print("Saved model checkpoint to {}\n".format(path))
def run(args):
    """Train/evaluate the MIL Attention model on MNIST bags with resume.

    Builds train/test ``MnistBags`` loaders, restores the latest checkpoint
    if one exists, wraps the model in ``DataParallel`` on multi-GPU hosts,
    and runs ``n_epoch`` further epochs, checkpointing after each and
    logging to TensorBoard under ``logs/<args.logname>``.

    Args:
        args: parsed CLI namespace (epochs, seed, bag parameters,
            batchsize, lr, reg, logname, ...).
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_worker = 2
    n_epoch = args.epochs
    # Seed both CPU and GPU RNGs for reproducibility.
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    print('Load Train and Test Set')
    train_loader = DataLoader(MnistBags(target_number=args.target_number,
                                        min_target_count=args.min_target_count,
                                        mean_bag_length=args.mean_bag_length,
                                        var_bag_length=args.var_bag_length,
                                        scale=args.scale,
                                        num_bag=args.num_bags_train,
                                        seed=args.seed,
                                        train=True),
                              batch_size=args.batchsize,
                              shuffle=True,
                              num_workers=n_worker,
                              pin_memory=torch.cuda.is_available())
    test_loader = DataLoader(MnistBags(target_number=args.target_number,
                                       min_target_count=args.min_target_count,
                                       mean_bag_length=args.mean_bag_length,
                                       var_bag_length=args.var_bag_length,
                                       scale=args.scale,
                                       num_bag=args.num_bags_test,
                                       seed=args.seed,
                                       train=False),
                             batch_size=args.batchsize,
                             shuffle=False,
                             num_workers=n_worker,
                             pin_memory=torch.cuda.is_available())
    # resume checkpoint
    checkpoint = load_ckpt()
    if checkpoint:
        print('Resume training ...')
        start_epoch = checkpoint.epoch
        model = checkpoint.model
    else:
        print('Grand new training ...')
        start_epoch = 0
        model = Attention()
    # put model to multiple GPUs if available
    if torch.cuda.device_count() > 1:
        print("Let's use ", torch.cuda.device_count(), " GPUs!")
        model = nn.DataParallel(model)
    model = model.to(device)
    optimizer = optim.Adam(model.parameters(),
                           lr=args.lr,
                           betas=(0.9, 0.999),
                           weight_decay=args.reg)
    if checkpoint:
        # Best-effort restore of optimizer state: a mismatch (e.g. changed
        # hyper-parameters) should not abort training.
        # BUG FIX: was a bare `except:`, which also swallows
        # KeyboardInterrupt/SystemExit.
        try:
            optimizer.load_state_dict(checkpoint.optimizer)
        except Exception:
            print(
                '[WARNING] optimizer not restored from last checkpoint, continue without previous state'
            )
    # free checkpoint reference
    del checkpoint
    log_dir = os.path.join('logs', args.logname)
    n_cv_epoch = 1  # validate every epoch
    with SummaryWriter(log_dir) as writer:
        print('\nTraining started ...')
        for epoch in range(start_epoch + 1, n_epoch + start_epoch + 1):
            train(model, optimizer, train_loader, epoch, writer)
            if epoch % n_cv_epoch == 0:
                with torch.no_grad():
                    test(model, optimizer, test_loader, epoch, writer)
            save_ckpt(model, optimizer, epoch)
    print('\nTraining finished ...')
    # Tail of a train_model-style function whose `def` begins before this
    # chunk — indentation reconstructed; verify against the full file.
    # Log validation metrics of the last epoch to Visdom.
    plotter.plot('attention_accuracy', 'val', 'Attention Accuracy', epoch,
                 val_acc)
    plotter.plot('attention_auc', 'val', 'Attention AUC', epoch, val_auc)
    plotter.plot('attention_f1', 'val', 'Attention F1', epoch, val_f1)
    plotter.save(['Tutorial Plots Attention'])
    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    return model


# In[24]:

if __name__ == "__main__":
    # Build the classifier, its weighted loss, optimizer and LR schedule.
    model_ft = Attention(path="model34")
    model_ft = model_ft.to(device)
    # Class weights are inverse class counts (165 vs 122 samples —
    # presumably; verify against the dataset).
    criterion = nn.CrossEntropyLoss(
        weight=torch.Tensor([1.0 / 165.0, 1.0 / 122.0]).to(device))
    optimizer_ft = optim.Adam(model_ft.parameters(), lr=0.0001)
    scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=10, gamma=0.1)
    # Visdom plotter shared with the training function via the global.
    global plotter
    plotter = utils.VisdomLinePlotter(env_name='Tutorial Plots Resnet')

# In[ ]:
# Script fragment: Barley-bags MIL training setup.  Relies on an `args`
# namespace defined earlier in the file.
print('Load Train and Test Set')
# DataLoader workers/pinning only make sense with CUDA.
loader_kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {}
train_loader = data_utils.DataLoader(BarleyBatches(train=True),
                                     batch_size=1,
                                     shuffle=True,
                                     **loader_kwargs)
test_loader = data_utils.DataLoader(BarleyBatches(train=False),
                                    batch_size=1,
                                    shuffle=False,
                                    **loader_kwargs)
print('Init Model')
# NOTE(review): if args.model is neither value, `model` is left undefined
# and the next use raises NameError.
if args.model == 'attention':
    model = Attention()
elif args.model == 'gated_attention':
    model = GatedAttention()
if args.cuda:
    model.cuda()
optimizer = optim.Adam(model.parameters(),
                       lr=args.lr,
                       betas=(0.9, 0.999),
                       weight_decay=args.reg)
writer = SummaryWriter()


def train(epoch):
    # Definition continues past the end of this chunk (truncated here).
    model.train()
    train_loss = 0.
    # Continuation of a DataLoader(...) call whose opening lines are before
    # this chunk (truncated at both ends).
    **loader_kwargs)
# XOR-bag test set: bags are positive w.r.t. the digit pair [7, 9].
test_loader = data_utils.DataLoader(MnistBags(
    xor_numbers=[7, 9],
    mean_bag_length=args.mean_bag_length,
    var_bag_length=args.var_bag_length,
    num_bag=args.num_bags_test,
    seed=args.seed,
    train=False,
    mode=args.mode),
                                    batch_size=1,
                                    shuffle=False,
                                    **loader_kwargs)
print('Init Model')
model = Attention(args)
if args.cuda:
    model.cuda()
optimizer = optim.Adam(model.parameters(),
                       lr=args.lr,
                       betas=(0.9, 0.999),
                       weight_decay=args.reg)


def train(epoch, sw):
    # Definition continues past the end of this chunk (truncated here).
    model.train()
    train_loss = 0.
    train_error = 0.
    for batch_idx, (data, label) in enumerate(train_loader):
        bag_label = label[0]
def main():
    """Train a Russian→English attention seq2seq model end to end.

    Pipeline: build vocabularies from the Yandex + ParaCrawl corpora, drop
    infrequent words, train ``Seq2Seq`` with teacher forcing for
    ``H.epochs`` epochs, save (vocabularies, model data) to data/data.pt,
    and run a final evaluation on a held-out slice of the Yandex corpus.
    """
    make_deterministic()
    # region Prepare data
    with Timer('\nData preparation time: %s\n'):
        ru_lang = Language()
        en_lang = Language()
        yandex = Yandex(
            'datasets/yandex/corpus.en_ru.1m.ru',
            'datasets/yandex/corpus.en_ru.1m.en',
            ru_lang,
            en_lang,
            data_slice=H.dataset_slice,
        )
        # slice(0): ParaCrawl contributes no sentences here, but its
        # vocabulary object is still constructed for the ConcatDataset.
        paracrawl = ParaCrawl(
            'datasets/paracrawl/en-ru.txt',
            ru_lang,
            en_lang,
            data_slice=slice(0),
        )
        # Drop the rarest Russian words: at least every word below the count
        # minimum, or a fixed percentage of the vocabulary if that is larger.
        low = ru_lang.lower_than(H.ru_word_count_minimum)
        infrequent_words_n = max(
            ceil(ru_lang.words_n * H.infrequent_words_percent), len(low))
        if infrequent_words_n > 0:
            ru_lang.drop_words(ru_lang.lowk(infrequent_words_n))
            print(
                f'{infrequent_words_n:,} infrequent Russian words are dropped')
        # English side: drop only the below-minimum words.
        # NOTE(review): called with *low here but with a single list argument
        # for Russian above — confirm drop_words accepts both forms.
        low = en_lang.lower_than(H.en_word_count_minimum)
        if len(low) > 0:
            en_lang.drop_words(*low)
            print(f'{len(low):,} infrequent English words are dropped')
        print(
            f'Russian language: {ru_lang.words_n:,} words, {ru_lang.sentence_length:,} words in a sentence'
        )
        print(
            f'English language: {en_lang.words_n:,} words, {en_lang.sentence_length:,} words in a sentence'
        )
        batch = H.batch_size
        dataset = ConcatDataset((yandex, paracrawl))
        loader = DataLoader(dataset, batch, shuffle=True)
    # endregion
    # region Models and optimizers
    model = Seq2Seq(
        Encoder(ru_lang.words_n, H.encoder_embed_dim, H.encoder_hidden_dim,
                H.encoder_bi, H.decoder_hd),
        Attention(H.encoder_hd, H.decoder_hd),
        Decoder(en_lang.words_n, H.decoder_embed_dim, H.decoder_hidden_dim,
                H.encoder_hd),
    ).to(Device).train()
    optimizer = Adam(model.parameters(), lr=H.learning_rate)
    # Sum (not mean) over tokens; padding positions are excluded.
    criterion = CrossEntropyLoss(ignore_index=Token_PAD, reduction='sum')
    # endregion
    # region Training
    teaching_percent = H.teaching_percent
    total = len(dataset)
    # Aim for roughly 1000 progress lines per epoch, but at least every 5
    # batches.
    log_interval = max(5, round(total / batch / 1000))
    for epoch in range(1, H.epochs + 1):
        with Printer() as printer:
            printer.print(f'Train epoch {epoch}: starting...')
            for i, ((ru, ru_l), en_sos, en_eos) in enumerate(loader, 1):
                # Zero the parameter gradients
                optimizer.zero_grad()
                # Run data through model
                predictions = model(ru, ru_l, en_sos, teaching_percent)
                # Calculate loss
                loss = criterion(predictions, en_eos)
                # Back propagate and perform optimization
                loss.backward()
                clip_grad_norm_(model.parameters(), H.gradient_norm_clip)
                optimizer.step()
                # Print log
                if i % log_interval == 0:
                    printer.print(
                        f'Train epoch {epoch}: {i * batch / total:.1%} [{i * batch:,}/{total:,}]'
                    )
            printer.print(f'Train epoch {epoch}: completed')
    # endregion
    # Persist vocabularies and model payload.
    # NOTE(review): `.data` on the module is a project-specific attribute of
    # Seq2Seq — confirm it holds the serializable weights.
    torch.save(
        (
            ru_lang.__getnewargs__(),
            en_lang.__getnewargs__(),
            model.cpu().eval().data,
        ),
        'data/data.pt',
    )
    # Evaluate on the 100 sentences immediately following the training slice.
    evaluate(model.to(Device), ru_lang, en_lang,
             'datasets/yandex/corpus.en_ru.1m.ru',
             slice(H.dataset_slice.stop + 1, H.dataset_slice.stop + 1 + 100))
def train(config_path, resume=True):
    """Train the Attention ranker on MQ2007 with 5-fold rotation.

    For the configured fold, trains on three splits and validates on a
    fourth; periodically evaluates MAP/NDCG and checkpoints the best model
    (by MAP) together with the best-MAP value for later resumption.

    Args:
        config_path: path to the config file read by ``load_params``.
        resume: when True, restore model weights and best MAP from the save
            directory and append to the existing logs.
    """
    # Load the parameters
    param_dict, rep_param_dict = load_params(config_path)
    # use cuda flag
    use_cuda = True
    """ the tranining directory """
    # load data: the five MQ2007 splits serve as both train and test pools
    TRAIN_DIR01 = "{}/MQ2007/S1/".format(param_dict["data_base_path"])
    TRAIN_DIR02 = "{}/MQ2007/S2/".format(param_dict["data_base_path"])
    TRAIN_DIR03 = "{}/MQ2007/S3/".format(param_dict["data_base_path"])
    TRAIN_DIR04 = "{}/MQ2007/S4/".format(param_dict["data_base_path"])
    TRAIN_DIR05 = "{}/MQ2007/S5/".format(param_dict["data_base_path"])
    TEST_DIR01 = '{}/MQ2007/S1/'.format(param_dict["data_base_path"])
    TEST_DIR02 = '{}/MQ2007/S2/'.format(param_dict["data_base_path"])
    TEST_DIR03 = '{}/MQ2007/S3/'.format(param_dict["data_base_path"])
    TEST_DIR04 = '{}/MQ2007/S4/'.format(param_dict["data_base_path"])
    TEST_DIR05 = '{}/MQ2007/S5/'.format(param_dict["data_base_path"])
    train_files01 = glob.glob("{}/data0.pkl".format(TRAIN_DIR01))
    train_files02 = glob.glob("{}/data0.pkl".format(TRAIN_DIR02))
    train_files03 = glob.glob("{}/data0.pkl".format(TRAIN_DIR03))
    train_files04 = glob.glob("{}/data0.pkl".format(TRAIN_DIR04))
    train_files05 = glob.glob("{}/data0.pkl".format(TRAIN_DIR05))
    test_files01 = glob.glob("{}/testdata0.pkl".format(TEST_DIR01))
    test_files02 = glob.glob("{}/testdata0.pkl".format(TEST_DIR02))
    test_files03 = glob.glob("{}/testdata0.pkl".format(TEST_DIR03))
    test_files04 = glob.glob("{}/testdata0.pkl".format(TEST_DIR04))
    test_files05 = glob.glob("{}/testdata0.pkl".format(TEST_DIR05))
    fold = param_dict["fold"]
    model_base_path = param_dict['model_base_path']
    model_name_str = param_dict['model_name_str']
    q_len = param_dict["q_len"]
    d_len = param_dict["d_len"]
    # Fold rotation: train on three consecutive splits, validate on the next.
    if fold == 1:
        train_files = train_files01 + train_files02 + train_files03
        test_files = test_files04[0]  # a path list ['/...'] only take the str
        rel_path = '{}/{}/tmp/test/S4.qrels'.format(model_base_path,
                                                    model_name_str)
    elif fold == 2:
        train_files = train_files02 + train_files03 + train_files04
        test_files = test_files05[0]
        rel_path = '{}/{}/tmp/test/S5.qrels'.format(model_base_path,
                                                    model_name_str)
    elif fold == 3:
        train_files = train_files03 + train_files04 + train_files05
        test_files = test_files01[0]
        rel_path = '{}/{}/tmp/test/S1.qrels'.format(model_base_path,
                                                    model_name_str)
    elif fold == 4:
        train_files = train_files04 + train_files05 + train_files01
        test_files = test_files02[0]
        rel_path = '{}/{}/tmp/test/S2.qrels'.format(model_base_path,
                                                    model_name_str)
    elif fold == 5:
        train_files = train_files05 + train_files01 + train_files02
        test_files = test_files03[0]
        rel_path = '{}/{}/tmp/test/S3.qrels'.format(model_base_path,
                                                    model_name_str)
    else:
        raise ValueError("wrong fold num {}".format(fold))
    """ Build the model """
    emb_size = param_dict['emb_size']
    num_heads = param_dict['num_heads']
    kernel_size = rep_param_dict['kernel_size']
    filt_size = rep_param_dict['filt_size']
    vocab_size = param_dict['vocab_size']
    output_dim = rep_param_dict['output_dim']
    hidden_size = param_dict['hidden_size']
    batch_size = param_dict['batch_size']
    preemb = param_dict['preemb']
    emb_path = param_dict['emb_path']
    hinge_margin = param_dict['hinge_margin']
    model = Attention(emb_size=emb_size,
                      query_length=q_len,
                      doc_length=d_len,
                      num_heads=num_heads,
                      kernel_size=kernel_size,
                      filter_size=filt_size,
                      vocab_size=vocab_size,
                      dropout=0.0,
                      qrep_dim=output_dim,
                      hidden_size=hidden_size,
                      batch_size=batch_size,
                      preemb=preemb,
                      emb_path=emb_path)
    if use_cuda:
        model.cuda()
    # optimizer
    optimizer = optim.Adam(model.parameters(),
                           lr=param_dict['learning_rate'],
                           betas=(param_dict['beta1'], param_dict['beta2']),
                           weight_decay=param_dict['alpha'])
    # loss func
    # NOTE(review): this criterion (and hinge_margin) is never used — the
    # training loop calls hinge_loss(..., 1.0) with a hard-coded margin.
    loss = nn.MarginRankingLoss(margin=hinge_margin, size_average=True)
    # experiment
    print("Experiment")
    # Fresh logs when starting from scratch; append (and line-buffer) when
    # resuming.
    if resume == False:
        f_log = open(
            '{}/{}/logs/training_log.txt'.format(model_base_path,
                                                 model_name_str), 'w+', 1)
        valid_log = open(
            '{}/{}/logs/valid_log.txt'.format(model_base_path,
                                              model_name_str), 'w+', 1)
    else:
        f_log = open(
            '{}/{}/logs/training_log.txt'.format(model_base_path,
                                                 model_name_str), 'a+', 1)
        valid_log = open(
            '{}/{}/logs/valid_log.txt'.format(model_base_path,
                                              model_name_str), 'a+', 1)
    # model_file
    model_file = '{}/{}/saves/model_file'.format(model_base_path,
                                                 model_name_str)
    """ TRAINING """
    # define the parameters
    n_epoch = param_dict['n_epoch']
    # init best validation MAP value
    best_MAP = 0.0
    best_NDCG1 = 0.0
    batch_count_tr = 0
    # restore saved parameter if resume_training is true
    if resume == True:
        model_file = '{}/{}/saves/model_file'.format(model_base_path,
                                                     model_name_str)
        model.load_state_dict(torch.load(model_file))
        with open(
                '{}/{}/saves/best_MAP.pkl'.format(model_base_path,
                                                  model_name_str),
                'rb') as f_MAP:
            best_MAP = pickle.load(f_MAP)
        print("loaded model, and resume training now")
    for epoch in range(1, n_epoch + 1):
        '''load_data'''
        for f in train_files:
            data = load_dataset(f)
            print("loaded {}".format(f))
            '''prepare_data'''
            # Pairwise samples: query, relevant doc, non-relevant doc, label.
            [Q, D_pos, D_neg, L] = pair_data_generator(data, q_len)
            valid_data = load_dataset(test_files)
            ''' shuffle data'''
            train_data = list_shuffle(Q, D_pos, D_neg, L)
            '''training func'''
            num_batch = len(train_data[0]) // batch_size
            for batch_count in range(num_batch):
                Q = train_data[0][batch_size * batch_count:batch_size *
                                  (batch_count + 1)]
                D_pos = train_data[1][batch_size * batch_count:batch_size *
                                      (batch_count + 1)]
                D_neg = train_data[2][batch_size * batch_count:batch_size *
                                      (batch_count + 1)]
                L = train_data[3][batch_size * batch_count:batch_size *
                                  (batch_count + 1)]
                # Pad each batch to fixed query/doc lengths and move to the
                # chosen device.
                if use_cuda:
                    Q = Variable(torch.LongTensor(
                        pad_batch_list(Q, max_len=q_len, padding_id=0)),
                                 requires_grad=False).cuda()
                    D_pos = Variable(torch.LongTensor(
                        pad_batch_list(D_pos, max_len=d_len, padding_id=0)),
                                     requires_grad=False).cuda()
                    D_neg = Variable(torch.LongTensor(
                        pad_batch_list(D_neg, max_len=d_len, padding_id=0)),
                                     requires_grad=False).cuda()
                    L = Variable(torch.FloatTensor(L),
                                 requires_grad=False).cuda()
                else:
                    Q = Variable(torch.LongTensor(
                        pad_batch_list(Q, max_len=q_len, padding_id=0)),
                                 requires_grad=False)
                    D_pos = Variable(torch.LongTensor(
                        pad_batch_list(D_pos, max_len=d_len, padding_id=0)),
                                     requires_grad=False)
                    D_neg = Variable(torch.LongTensor(
                        pad_batch_list(D_neg, max_len=d_len, padding_id=0)),
                                     requires_grad=False)
                    L = Variable(torch.FloatTensor(L), requires_grad=False)
                # run on this batch
                optimizer.zero_grad()
                t1 = time.time()
                q_mask, d_pos_mask, d_neg_mask = model.generate_mask(
                    Q, D_pos, D_neg)
                """ need to do the modification i the model.py """
                S_pos, S_neg = model(Q, D_pos, D_neg, q_mask, d_pos_mask,
                                     d_neg_mask)
                # NOTE(review): hard-coded margin 1.0, ignoring hinge_margin.
                Loss = hinge_loss(S_pos, S_neg, 1.0)
                Loss.backward()
                optimizer.step()
                t2 = time.time()
                batch_count_tr += 1
                # NOTE(review): Loss.data[0] is removed in PyTorch >= 0.5;
                # Loss.item() is the modern equivalent.
                print("epoch {} batch {} training cost: {} using {}s" \
                    .format(epoch, batch_count+1, Loss.data[0], t2-t1))
                f_log.write("epoch {} batch {} training cost: {}, using {}s".
                            format(epoch, batch_count + 1, Loss.data[0],
                                   t2 - t1) + '\n')
                """ evaluate part """
                # Validate every 20 training batches (global counter).
                if batch_count_tr % 20 == 0:
                    if valid_data is not None:
                        MAP, NDCGs = evaluate(config_path,
                                              model,
                                              valid_data,
                                              rel_path,
                                              mode="valid")
                        print(MAP, NDCGs)
                        valid_log.write(
                            "epoch {}, batch {}, MAP: {}, NDCGs: {} {} {} {}".
                            format(epoch + 1, batch_count + 1, MAP,
                                   NDCGs[1][0], NDCGs[1][1], NDCGs[1][2],
                                   NDCGs[1][3]))
                        if MAP > best_MAP:
                            # save this best model
                            best_MAP = MAP
                            with open(
                                    '{}/{}/saves/best_MAP.pkl'.format(
                                        model_base_path, model_name_str),
                                    'wb') as f_MAP:
                                pickle.dump(best_MAP, f_MAP)
                            # save model params after several epoch
                            model_file = '{}/{}/saves/model_file'.format(
                                model_base_path, model_name_str)
                            torch.save(model.state_dict(), model_file)
                            print("successfully saved model to the path {}".
                                  format(model_file))
                        valid_log.write("{} {} {} {}".format(
                            NDCGs[1][0], NDCGs[1][1], NDCGs[1][2],
                            NDCGs[1][3]))
                        valid_log.write(" MAP: {}".format(MAP))
                        valid_log.write('\n')
    f_log.close()
    valid_log.close()