def get_han2(sent_num, sent_length, embed_weight, mask_zero=False):
    input = Input(shape=(sent_num, sent_length,), dtype="int32")
    embedding = Embedding(name="embedding",
                          input_dim=embed_weight.shape[0],
                          weights=[embed_weight],
                          output_dim=embed_weight.shape[1],
                          mask_zero=mask_zero,
                          trainable=False)
    sent_embed = embedding(input)
    # print(np.shape(sent_embed))
    sent_embed = Reshape((1, sent_length, embed_weight.shape[1]))(sent_embed)
    print(np.shape(sent_embed))
    word_bigru = Bidirectional(GRU(128, return_sequences=True))(sent_embed)
    word_bigru = Reshape((sent_length, 256))(word_bigru)
    # print(np.shape(word_bigru))
    word_attention = Attention(sent_length)(word_bigru)
    sent_encode = Reshape((-1, sent_num))(word_attention)
    # sent_encode = Model(sentence_input, word_attention)
    # doc_input = Input(shape=(sent_num, sent_length), dtype="int32")
    # doc_encode = TimeDistributed(sent_encode)(doc_input)
    sent_bigru = Bidirectional(GRU(128, return_sequences=True))(sent_encode)
    doc_attention = Attention(sent_num)(sent_bigru)
    fc = Activation(activation="relu")(BatchNormalization()(Dense(256)(doc_attention)))
    output = Dense(4, activation='softmax')(fc)
    model = Model(input, output)
    model.compile(loss='categorical_crossentropy',
                  optimizer="adam",
                  metrics=['accuracy'])
    return model
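# Note: the Keras snippets in this collection call a custom `Attention` layer whose
# definition is not shown (and whose constructor varies between repositories:
# `Attention(step_dim)` here, `Attention(name=..., regularizer=...)` elsewhere).
# The class below is only a minimal sketch of the common pattern -- a learned
# scoring vector over timesteps followed by a weighted sum -- not the authors' code.
import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Layer


class Attention(Layer):
    def __init__(self, step_dim, **kwargs):
        self.step_dim = step_dim  # number of timesteps (assumed, kept only for bookkeeping)
        super().__init__(**kwargs)

    def build(self, input_shape):
        # one score weight per feature dimension
        self.W = self.add_weight(name="att_weight",
                                 shape=(input_shape[-1],),
                                 initializer="glorot_uniform",
                                 trainable=True)
        super().build(input_shape)

    def call(self, x):
        # x: (batch, timesteps, features) -> scores: (batch, timesteps)
        scores = K.squeeze(K.tanh(K.dot(x, K.expand_dims(self.W))), axis=-1)
        weights = K.softmax(scores)
        # weighted sum over timesteps -> (batch, features)
        return K.sum(x * K.expand_dims(weights), axis=1)

    def compute_output_shape(self, input_shape):
        return (input_shape[0], input_shape[-1])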
def __init__(self, config, embed_size, padding_idx, label_size):
    super(Contextualized, self).__init__()
    self.bilstm = Bilstm(config, embed_size)
    self.attention_s = Attention(config)
    self.attention_l = Attention(config)
    self.attention_r = Attention(config)
    self.linear_out = nn.Linear(config.hidden_dim * 6, label_size)
def get_word_char_hcnn(sent_num, sent_word_length, sent_char_length,
                       word_embed_weight, char_embed_weight, mask_zero=False):
    sentence_word_input = Input(shape=(sent_word_length,), dtype="int32")
    word_embedding = Embedding(name="word_embedding",
                               input_dim=word_embed_weight.shape[0],
                               weights=[word_embed_weight],
                               output_dim=word_embed_weight.shape[1],
                               mask_zero=mask_zero,
                               trainable=False)
    sent_word_embed = word_embedding(sentence_word_input)
    word_bigru = Bidirectional(GRU(128, return_sequences=True))(sent_word_embed)
    word_attention = Attention(sent_word_length)(word_bigru)
    sent_word_encode = Model(sentence_word_input, word_attention)

    sentence_char_input = Input(shape=(sent_char_length,), dtype="int32")
    char_embedding = Embedding(name="char_embedding",
                               input_dim=char_embed_weight.shape[0],
                               weights=[char_embed_weight],
                               output_dim=char_embed_weight.shape[1],
                               mask_zero=mask_zero)
    sent_char_embed = char_embedding(sentence_char_input)
    char_bigru = Bidirectional(GRU(64, return_sequences=True))(sent_char_embed)
    char_attention = Attention(sent_char_length)(char_bigru)
    sent_char_encode = Model(sentence_char_input, char_attention)

    review_word_input = Input(shape=(sent_num, sent_word_length), dtype="int32")
    review_word_encode = TimeDistributed(sent_word_encode)(review_word_input)
    review_char_input = Input(shape=(sent_num, sent_char_length), dtype="int32")
    review_char_encode = TimeDistributed(sent_char_encode)(review_char_input)

    review_encode = concatenate([review_word_encode, review_char_encode])
    unvec = convs_block(review_encode, convs=[1, 2, 3, 4, 5], f=256)
    dropfeat = Dropout(0.2)(unvec)
    fc = Activation(activation='relu')(BatchNormalization()(Dense(256)(dropfeat)))
    output = Dense(4, activation="softmax")(fc)
    model = Model([review_word_input, review_char_input], output)
    model.compile(loss='categorical_crossentropy',
                  optimizer="adam",
                  metrics=['accuracy'])
    return model
def main():
    torch.manual_seed(777)
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    parser = argparse.ArgumentParser()
    parser.add_argument("--path", type=str)
    parser.add_argument("--embedding_dim", type=int, default=300)
    parser.add_argument("--iterator", type=int, default=10)
    parser.add_argument("--lr", type=float, default=1e-5)
    parser.add_argument("--decay", type=float, default=0.01)
    parser.add_argument("--batch_size", type=int, default=100)
    args = parser.parse_args()

    trg, src = load_pair(args.path)
    src_token = eng_tokenize(src)
    trg_token = es_tokenize(trg)
    trg2idx, idx2_trg = make_dictionary(trg_token)
    src2idx, idx2src = make_dictionary(src_token)
    src_ix = make_src_idx(src_token, src2idx)
    trg_ix = make_trg_idx(trg_token, trg2idx)

    # model definition
    encoder = EncoderGRU(emb_dim=args.embedding_dim,
                         bidirectional=True,
                         vocab_size=len(src2idx))
    attention = Attention(emb_dim=args.embedding_dim, padding_idx=0)
    decoder = DecoderGRU(emb_dim=args.embedding_dim,
                         attention=attention,
                         n_class=len(trg2idx))
    model = Seq2Seq_a(encoder, decoder, device, trg2idx)
    num_parameter(model)

    # loss and optimizer setup
    loss_func = nn.CrossEntropyLoss(ignore_index=0)
    optimizer = optim.RMSprop(model.parameters(), lr=args.lr, weight_decay=args.decay)

    # split the data
    train_loader, test_loader = prepare_data(src=src_ix,
                                             trg=trg_ix,
                                             test_size=0.2,
                                             batch_size=args.batch_size)
    train(model,
          iterator=args.iterator,
          optimizer=optimizer,
          criterion=loss_func,
          train_loader=train_loader,
          visual_path="ssibal",
          trg2idx=trg2idx,
          savepath="./seq2seq_model.pth")
def main(fpath):
    ENC_EMB_DIM = 256
    DEC_EMB_DIM = 256
    ENC_HID_DIM = 512
    DEC_HID_DIM = 512
    ENC_DROPOUT = 0.5
    DEC_DROPOUT = 0.5

    device = torch.device('cuda')

    dataset = Dataset()
    INPUT_DIM = len(dataset.SRC.vocab)
    OUTPUT_DIM = len(dataset.TRG.vocab)
    SRC_PAD_IDX = dataset.SRC.vocab.stoi[dataset.SRC.pad_token]

    encoder = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, ENC_DROPOUT)
    attention = Attention(ENC_HID_DIM, DEC_HID_DIM)
    decoder = Decoder(DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, OUTPUT_DIM, DEC_DROPOUT, attention)
    model = Seq2Seq(encoder, decoder, SRC_PAD_IDX, device)
    model.load_state_dict(torch.load("best_model.pt"))
    model.to(device)

    with open(fpath, "r") as f:
        sentences = f.readlines()
    translate_sentence(model, sentences, dataset.SRC, dataset.TRG, device)
def __init__(self, config, embed_size, padding_idx, label_size, embedding):
    super(Vanilla, self).__init__()
    self.bilstm = Bilstm(config, embed_size, embedding)
    self.attention = Attention(config)
    self.linear_out = nn.Linear(config.hidden_dim * 2, label_size)
def main():
    BATCH_SIZE = 32
    NUM_EPOCH = 12
    LR = 0.001
    CLIP = 1
    STEP_SIZE = 4
    GAMMA = 0.1
    ENC_EMB_DIM = 256
    DEC_EMB_DIM = 256
    ENC_HID_DIM = 512
    DEC_HID_DIM = 512
    ENC_DROPOUT = 0.5
    DEC_DROPOUT = 0.5

    device = torch.device('cuda')

    dataset = Dataset()
    train_data, valid_data, test_data = dataset.build_dataset()
    train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
        (train_data, valid_data, test_data),
        batch_size=BATCH_SIZE,
        sort_within_batch=True,
        sort_key=lambda x: len(x.src),
        device=device)

    INPUT_DIM = len(dataset.SRC.vocab)
    OUTPUT_DIM = len(dataset.TRG.vocab)
    SRC_PAD_IDX = dataset.SRC.vocab.stoi[dataset.SRC.pad_token]
    TRG_PAD_IDX = dataset.TRG.vocab.stoi[dataset.TRG.pad_token]

    encoder = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, ENC_DROPOUT)
    attention = Attention(ENC_HID_DIM, DEC_HID_DIM)
    decoder = Decoder(DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, OUTPUT_DIM, DEC_DROPOUT, attention)
    model = Seq2Seq(encoder, decoder, SRC_PAD_IDX, device)
    model.apply(init_weight)
    model.to(device)

    optimizer = Adam(model.parameters(), lr=LR)
    criterion = CrossEntropyLoss(ignore_index=TRG_PAD_IDX).to(device)
    scheduler = StepLR(optimizer, STEP_SIZE, GAMMA)

    min_valid_loss = 1e10
    for e in range(NUM_EPOCH):
        print("Epoch: {}".format(e + 1))
        train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
        print("Train loss: {}".format(train_loss))
        valid_loss = evaluate(model, valid_iterator, criterion)
        print("Valid loss: {}".format(valid_loss))
        if valid_loss < min_valid_loss:
            torch.save(model.state_dict(), "best_model.pt")
            min_valid_loss = valid_loss
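# The seq2seq snippets above and below build their attention module as
# `Attention(ENC_HID_DIM, DEC_HID_DIM)` without showing its definition. Below is a
# minimal sketch of an additive (Bahdanau-style) attention module with that
# constructor signature, assuming a bidirectional encoder whose outputs have
# 2 * enc_hid_dim features; the actual implementations in these repositories may differ.
import torch
import torch.nn as nn
import torch.nn.functional as F


class AdditiveAttention(nn.Module):
    def __init__(self, enc_hid_dim, dec_hid_dim):
        super().__init__()
        self.attn = nn.Linear(enc_hid_dim * 2 + dec_hid_dim, dec_hid_dim)
        self.v = nn.Linear(dec_hid_dim, 1, bias=False)

    def forward(self, hidden, encoder_outputs, mask=None):
        # hidden: (batch, dec_hid_dim); encoder_outputs: (batch, src_len, enc_hid_dim * 2)
        src_len = encoder_outputs.shape[1]
        hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)
        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim=2)))
        scores = self.v(energy).squeeze(2)                  # (batch, src_len)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e10)   # ignore padding positions
        return F.softmax(scores, dim=1)                     # attention weights over source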
def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
    super(Decoder, self).__init__()
    self.batch_sz = batch_sz
    self.dec_units = dec_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(self.dec_units,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')
    self.fc = tf.keras.layers.Dense(vocab_size)
    self.attention = Attention.BahdanauAttention(1024)
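# `Attention.BahdanauAttention(1024)` above refers to a module that is not shown here.
# A minimal sketch of a Bahdanau attention layer in tf.keras with the usual shape
# convention (decoder state as query, encoder outputs as values); the original
# `Attention` module may be implemented differently.
import tensorflow as tf


class BahdanauAttention(tf.keras.layers.Layer):
    def __init__(self, units):
        super().__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
        # query: (batch, hidden); values: (batch, src_len, hidden)
        query_with_time_axis = tf.expand_dims(query, 1)
        score = self.V(tf.nn.tanh(self.W1(query_with_time_axis) + self.W2(values)))
        attention_weights = tf.nn.softmax(score, axis=1)               # (batch, src_len, 1)
        context_vector = tf.reduce_sum(attention_weights * values, axis=1)
        return context_vector, attention_weights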
def main(_):
    parser = argparse.ArgumentParser()
    parser.add_argument('--action',
                        dest='action',
                        type=str,
                        default='train',
                        help='actions: train, or test')
    args = parser.parse_args()
    if args.action not in ['train', 'test']:
        print('invalid action: ', args.action)
        print("Please input an action: train or test")
    else:
        model = Attention(tf.Session(), configure())
        getattr(model, args.action)()
def build_model(self, n_classes=1, embedding_dim=300):
    'Build bi-level bi-directional GRU model with attention over word embeddings'
    l2_reg = regularizers.l2(1e-8)

    sentence_in = Input(shape=(self.max_len,), dtype="int32")
    masking_layer = Masking(mask_value=0)(sentence_in)
    embedded_word_seq = Embedding(10000,
                                  300,
                                  input_length=self.max_len,
                                  trainable=False,
                                  weights=[self.embedding_weights])(masking_layer)
    word_encoder = Bidirectional(
        GRU(50, return_sequences=True, kernel_regularizer=l2_reg))(embedded_word_seq)
    dense_transform_w = Dense(100,
                              activation="relu",
                              name="dense_transform_w",
                              kernel_regularizer=l2_reg)(word_encoder)
    attn_weighted_sent = Model(
        sentence_in,
        Attention(name='word_attention', regularizer=l2_reg)(dense_transform_w))
    attn_weighted_sent.summary()

    texts_in = Input(shape=(self.max_sentence, self.max_len), dtype='int32')
    attention_weighted_sentences = TimeDistributed(attn_weighted_sent)(texts_in)
    sentence_encoder = Bidirectional(
        GRU(50, return_sequences=True, kernel_regularizer=l2_reg),
        name="sentence_encoder")(attention_weighted_sentences)
    dense_transform_s = TimeDistributed(
        Dense(100,
              activation='relu',
              name='dense_transform_s',
              kernel_regularizer=l2_reg))(sentence_encoder)
    prediction = TimeDistributed(Dense(1, activation="sigmoid"))(dense_transform_s)

    model = Model(texts_in, prediction)
    model.summary()
    model.compile(optimizer=Adam(lr=0.001),
                  loss="binary_crossentropy",
                  metrics=["acc"],
                  sample_weight_mode="temporal")
    return model
def init_model(
    with_attention=False,
    teaching_force_ratio=0.5,
    embedding_size=500,
    hidden_size=256,
):
    """
    Instantiates the model by creating the Encoder, the Decoder and the model itself,
    which represents the seq2seq architecture.

    :param with_attention: if True, the model applies the attention mechanism
    :param teaching_force_ratio: used to alternate between the generated word and the
        ground-truth word during training.
    :return encoder, decoder, model: the encoder, the decoder and the seq2seq model.
    """
    if with_attention:
        # init with attention
        encoder = EncoderAttention(embedding_size, hidden_size, vocabulary.__len__()).to(device)
        attention = Attention(hidden_size).to(device)
        decoder = DecoderAttention(embedding_size,
                                   hidden_size,
                                   vocabulary.__len__(),
                                   attention=attention).to(device)
        model = ChatbotModel(encoder,
                             decoder,
                             vocabulary.__len__(),
                             with_attention=True,
                             tf_ratio=teaching_force_ratio).to(device)
        return encoder, decoder, model
    else:
        # init without attention
        encoder = Encoder(embedding_size, hidden_size, vocabulary.__len__()).to(device)
        decoder = Decoder(embedding_size, hidden_size, vocabulary.__len__()).to(device)
        model = ChatbotModel(encoder,
                             decoder,
                             vocabulary.__len__(),
                             with_attention=False,
                             tf_ratio=teaching_force_ratio).to(device)
        return encoder, decoder, model
def get_hcnn(sent_num, sent_length, embed_weight, mask_zero=False):
    sentence_input = Input(shape=(sent_length,), dtype="int32")
    embedding = Embedding(input_dim=embed_weight.shape[0],
                          weights=[embed_weight],
                          output_dim=embed_weight.shape[1],
                          mask_zero=mask_zero,
                          trainable=False)
    sent_embed = embedding(sentence_input)
    word_bigru = Bidirectional(GRU(128, return_sequences=True))(sent_embed)
    word_attention = Attention(sent_length)(word_bigru)
    sent_encode = Model(sentence_input, word_attention)

    review_input = Input(shape=(sent_num, sent_length), dtype="int32")
    review_encode = TimeDistributed(sent_encode)(review_input)
    feat = convs_block(review_encode)
    dropfeat = Dropout(0.2)(feat)
    fc = Activation(activation="relu")(BatchNormalization()(Dense(256)(dropfeat)))
    output = Dense(2, activation="softmax")(fc)
    model = Model(review_input, output)
    model.compile(loss='categorical_crossentropy',
                  optimizer="adam",
                  metrics=['accuracy'])
    return model
import torch

from dataloader import prepare_data
from model import Encoder, Attention, Decoder, Seq2Seq, init_weights
from trainer import Trainer
from config import *

""" load data """
train_loader, val_loader, test_loader, m_dh = prepare_data(
    TRAIN_PATH, VAL_PATH, TEST_PATH, DH_PATH, LOAD_FROM_DUMP, BATCH_SIZE)

""" model setup """
INPUT_DIM, OUTPUT_DIM = len(m_dh.de_vocab), len(m_dh.en_vocab)
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, ENC_DROPOUT)
attn = Attention(ENC_HID_DIM, DEC_HID_DIM, ATTN_DIM)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DEC_DROPOUT, attn)
model = Seq2Seq(enc, dec)
model.apply(init_weights)

""" training setup """
optimizer = torch.optim.Adam(model.parameters(), lr=LR)
criterion = torch.nn.CrossEntropyLoss(ignore_index=1)
trainer = Trainer(model,
                  optimizer,
                  criterion,
                  train_loader,
                  val_loader,
                  val_best_path=VAL_BEST_PATH)
trainer.load('ckpts/best.pt')
train_set = BertDataset(bert_path / bert_model / 'train')
valid_set = BertDataset(bert_path / bert_model / 'valid')

training_loader = DataLoader(train_set,
                             batch_size=mb,
                             shuffle=True,
                             num_workers=dl_workers,
                             pin_memory=True if device == 'cuda' else False)
valid_loader = DataLoader(valid_set,
                          batch_size=mb,
                          shuffle=True,
                          num_workers=dl_workers,
                          pin_memory=True if device == 'cuda' else False)

attention = Attention(bert_hidden_size, decoder_hidden_size,
                      attention_hidden_size)  # add attention_hidden_size
decoder = Decoder(bert_vocab_size, decoder_input_size, bert_hidden_size,
                  decoder_hidden_size, num_layers, dropout, attention, device)
encoder = BertModel.from_pretrained(model_path / stage / bert_model)
model = Seq2Seq(encoder, decoder, device, encoder_trained)

optimizer = optim.SGD(decoder.parameters(),
                      weight_decay=weight_decay,
                      lr=lr,
                      momentum=momentum)
criterion = nn.CrossEntropyLoss(ignore_index=0, reduction='none')  # pad index

if checkpoint is not None:
def run(args):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_worker = 2
    n_epoch = args.epochs

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    print('Load Train and Test Set')
    train_loader = DataLoader(MnistBags(target_number=args.target_number,
                                        min_target_count=args.min_target_count,
                                        mean_bag_length=args.mean_bag_length,
                                        var_bag_length=args.var_bag_length,
                                        scale=args.scale,
                                        num_bag=args.num_bags_train,
                                        seed=args.seed,
                                        train=True),
                              batch_size=args.batchsize,
                              shuffle=True,
                              num_workers=n_worker,
                              pin_memory=torch.cuda.is_available())
    test_loader = DataLoader(MnistBags(target_number=args.target_number,
                                       min_target_count=args.min_target_count,
                                       mean_bag_length=args.mean_bag_length,
                                       var_bag_length=args.var_bag_length,
                                       scale=args.scale,
                                       num_bag=args.num_bags_test,
                                       seed=args.seed,
                                       train=False),
                             batch_size=args.batchsize,
                             shuffle=False,
                             num_workers=n_worker,
                             pin_memory=torch.cuda.is_available())

    # resume checkpoint
    checkpoint = load_ckpt()
    if checkpoint:
        print('Resume training ...')
        start_epoch = checkpoint.epoch
        model = checkpoint.model
    else:
        print('Brand new training ...')
        start_epoch = 0
        model = Attention()

    # put model on multiple GPUs if available
    if torch.cuda.device_count() > 1:
        print("Let's use ", torch.cuda.device_count(), " GPUs!")
        model = nn.DataParallel(model)
    model = model.to(device)

    optimizer = optim.Adam(model.parameters(),
                           lr=args.lr,
                           betas=(0.9, 0.999),
                           weight_decay=args.reg)
    if checkpoint:
        try:
            optimizer.load_state_dict(checkpoint.optimizer)
        except:
            print('[WARNING] optimizer not restored from last checkpoint, continue without previous state')
    # free checkpoint reference
    del checkpoint

    log_dir = os.path.join('logs', args.logname)
    n_cv_epoch = 1  # 2
    with SummaryWriter(log_dir) as writer:
        print('\nTraining started ...')
        for epoch in range(start_epoch + 1, n_epoch + start_epoch + 1):  # 1-based epochs
            train(model, optimizer, train_loader, epoch, writer)
            if epoch % n_cv_epoch == 0:
                with torch.no_grad():
                    test(model, optimizer, test_loader, epoch, writer)
            save_ckpt(model, optimizer, epoch)
        print('\nTraining finished ...')
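# The `Attention()` model used in these MNIST-bags snippets is an attention-based
# multiple-instance-learning classifier (in the style of Ilse et al., 2018) whose
# definition is not included here. The sketch below shows only the attention-pooling
# idea with hypothetical layer sizes; the repositories' actual architectures
# (including their CNN feature extractors) may differ.
import torch
import torch.nn as nn


class MILAttentionPooling(nn.Module):
    def __init__(self, feat_dim=500, attn_dim=128):
        super().__init__()
        self.attention = nn.Sequential(
            nn.Linear(feat_dim, attn_dim),
            nn.Tanh(),
            nn.Linear(attn_dim, 1),
        )
        self.classifier = nn.Sequential(nn.Linear(feat_dim, 1), nn.Sigmoid())

    def forward(self, h):
        # h: (num_instances, feat_dim) instance embeddings for a single bag
        a = torch.softmax(self.attention(h), dim=0)   # attention weight per instance
        z = torch.sum(a * h, dim=0)                   # weighted bag representation
        y_prob = self.classifier(z)                   # bag-level probability
        return y_prob, a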
def build_model(sentence_len, max_words, doc_embedding, sent_embedding):
    'Constructs the multi-task training extractor/classifier model'
    l2_reg = regularizers.l2(1e-6)
    l1_l2reg = regularizers.l1_l2(1e-5)

    ## word encoder - extractor
    sentence_in = Input(shape=(sentence_len,), dtype="int32")
    embedded_word_seq = Embedding(max_words,
                                  300,
                                  input_length=sentence_len,
                                  trainable=False,
                                  weights=[doc_embedding])(sentence_in)
    # embedded_word_seq_learn = Embedding(10000, 300, input_length=30, trainable=True)(sentence_in)
    # embedding_concat = concatenate([embedded_word_seq, embedded_word_seq_learn])
    word_encoder = Bidirectional(
        GRU(50, return_sequences=True, kernel_regularizer=l2_reg))(embedded_word_seq)
    dense_transform_w = Dense(100,
                              activation="relu",
                              name="dense_transform_w",
                              kernel_regularizer=l2_reg)(word_encoder)
    attn_weighted_sent = Model(
        sentence_in,
        Attention(name='word_attention', regularizer=l2_reg)(dense_transform_w))
    attn_weighted_sent.summary()

    # Inputs - sentence encoder - extractor
    class_input = Input(shape=(sentence_len,), dtype="int32", name="CL_input")
    class_ids = Input(shape=(1,), dtype="int32", name="CL_IDs")
    texts_in = Input(shape=(100, sentence_len), dtype='int32')

    # sentence encoder - extractor
    attention_weighted_sentences = TimeDistributed(
        attn_weighted_sent, name="EX_sent_attn")(texts_in)
    sentence_encoder = Bidirectional(
        GRU(50, return_sequences=True,
            name="sentence_encoder"))(attention_weighted_sentences)
    sentence_matcher = Lambda(lambda x: x[:, tf.squeeze(class_ids), :],
                              output_shape=(100,))(sentence_encoder)
    dense_transform_s = TimeDistributed(
        Dense(100,
              activation='relu',
              name='EX_sent_dense',
              kernel_regularizer=l1_l2reg))(sentence_encoder)
    dropout_extractor = Dropout(0.5)(dense_transform_s)
    output_extractor = TimeDistributed(
        Dense(1, activation="sigmoid", name="EX_out"))(dropout_extractor)

    # sentence classifier
    embedded_words = Embedding(max_words,
                               300,
                               input_length=sentence_len,
                               trainable=False,
                               name="CL_embed",
                               weights=[sent_embedding])(class_input)
    rnn = Bidirectional(
        GRU(50, return_sequences=True, name="CL_RNN",
            kernel_regularizer=l2_reg))(embedded_words)
    dense_w = TimeDistributed(
        Dense(100, kernel_regularizer=l2_reg, name="CL_dense"))(rnn)
    attn = AttentionWithContext(name="CL_attn")(dense_w)
    merge_layer = concatenate([attn, sentence_matcher], name="CL_merging")
    output_classifier = Dense(n_classes, activation="sigmoid")(merge_layer)

    model = Model(inputs=[class_input, texts_in, class_ids],
                  outputs=[output_classifier, output_extractor])
    model.summary()
    model.compile(optimizer=Adam(lr=0.0002),
                  loss={
                      'dense_1': 'binary_crossentropy',
                      'time_distributed_2': 'binary_crossentropy'
                  },
                  metrics={
                      'dense_1': [top_1_accuracy, top_3_accuracy],
                      'time_distributed_2': ['acc']
                  })
    return model
                                     shuffle=True,
                                     **loader_kwargs)
test_loader = data_utils.DataLoader(MnistBags(xor_numbers=[7, 9],
                                              mean_bag_length=args.mean_bag_length,
                                              var_bag_length=args.var_bag_length,
                                              num_bag=args.num_bags_test,
                                              seed=args.seed,
                                              train=False),
                                    batch_size=1,
                                    shuffle=False,
                                    **loader_kwargs)

print('Init Model')
model = Attention(args.self_att)
if args.cuda:
    model.cuda()

optimizer = optim.Adam(model.parameters(),
                       lr=args.lr,
                       betas=(0.9, 0.999),
                       weight_decay=args.reg)


def train(epoch, sw):
    model.train()
    train_loss = 0.
    train_error = 0.
    for batch_idx, (data, label) in enumerate(train_loader):
        bag_label = label[0]
def test(config_path):
    # Load the parameters
    param_dict, rep_param_dict = load_params(config_path)

    # load data
    TEST_DIR01 = '{}/MQ2007/S1/'.format(param_dict["data_base_path"])
    TEST_DIR02 = '{}/MQ2007/S2/'.format(param_dict["data_base_path"])
    TEST_DIR03 = '{}/MQ2007/S3/'.format(param_dict["data_base_path"])
    TEST_DIR04 = '{}/MQ2007/S4/'.format(param_dict["data_base_path"])
    TEST_DIR05 = '{}/MQ2007/S5/'.format(param_dict["data_base_path"])
    test_files01 = glob.glob("{}/testdata0.pkl".format(TEST_DIR01))
    test_files02 = glob.glob("{}/testdata0.pkl".format(TEST_DIR02))
    test_files03 = glob.glob("{}/testdata0.pkl".format(TEST_DIR03))
    test_files04 = glob.glob("{}/testdata0.pkl".format(TEST_DIR04))
    test_files05 = glob.glob("{}/testdata0.pkl".format(TEST_DIR05))

    fold = param_dict["fold"]
    model_base_path = param_dict['model_base_path']
    model_name_str = param_dict['model_name_str']

    if fold == 1:
        test_files = test_files05[0]  # a path list ['/...'], only take the str
        rel_path = '{}/{}/tmp/test/S5.qrels'.format(model_base_path, model_name_str)
    elif fold == 2:
        test_files = test_files01[0]
        rel_path = '{}/{}/tmp/test/S1.qrels'.format(model_base_path, model_name_str)
    elif fold == 3:
        test_files = test_files02[0]
        rel_path = '{}/{}/tmp/test/S2.qrels'.format(model_base_path, model_name_str)
    elif fold == 4:
        test_files = test_files03[0]
        rel_path = '{}/{}/tmp/test/S3.qrels'.format(model_base_path, model_name_str)
    elif fold == 5:
        test_files = test_files04[0]
        rel_path = '{}/{}/tmp/test/S4.qrels'.format(model_base_path, model_name_str)
    else:
        raise ValueError("wrong fold num {}".format(fold))

    test_data = load_dataset(test_files)

    q_len = param_dict['q_len']
    d_len = param_dict['d_len']
    emb_size = param_dict['emb_size']
    num_heads = param_dict['num_heads']
    kernel_size = rep_param_dict['kernel_size']
    filt_size = rep_param_dict['filt_size']
    vocab_size = param_dict['vocab_size']
    output_dim = rep_param_dict['output_dim']
    hidden_size = param_dict['hidden_size']
    batch_size = param_dict['batch_size']
    preemb = param_dict['preemb']
    emb_path = param_dict['emb_path']
    hinge_margin = param_dict['hinge_margin']

    model = Attention(emb_size=emb_size,
                      query_length=q_len,
                      doc_length=d_len,
                      num_heads=num_heads,
                      kernel_size=kernel_size,
                      filter_size=filt_size,
                      vocab_size=vocab_size,
                      dropout=0.0,
                      qrep_dim=output_dim,
                      hidden_size=hidden_size,
                      batch_size=batch_size,
                      preemb=preemb,
                      emb_path=emb_path).cuda()

    # Test: load model from file
    model_file = '{}/{}/saves/model_file'.format(model_base_path, model_name_str)
    model.load_state_dict(torch.load(model_file))
    print("loaded model, and perform test now")
    MAP, NDCGs = evaluate(config_path, model, test_data, rel_path, mode="test")
    print(MAP, NDCGs)
drop = 0.6
model = 'LSTM'
bi = True
criterion = crit

traindl, valdl = data.BucketIterator.splits(datasets=(trainds, valds),
                                            batch_size=batch_size,
                                            sort_key=lambda x: len(x.text),
                                            device=device,
                                            sort_within_batch=True,
                                            repeat=False)
train_iter, val_iter = traindl, valdl
TEXT, LABEL = text_field, label_field

embedding = nn.Embedding(ntokens, emsize, padding_idx=1, max_norm=1)
if vectors:
    embedding.weight.data.copy_(TEXT.vocab.vectors)
encoder = Encoder(emsize,
                  hidden,
                  nlayers=nlayers,
                  dropout=drop,
                  bidirectional=bi)

attention_dim = hidden if not bi else 2 * hidden
attention = Attention(attention_dim, attention_dim, attention_dim)

model = Classifier(embedding, encoder, attention, attention_dim,
                   nlabels, baseline=True).to(device)

criterion = crit
optimizer = torch.optim.Adam(model.parameters(), lr, amsgrad=True)

for epoch in range(1, epochs + 1):
    m = train(epoch, model, train_iter, val_iter, optimizer, criterion)
parser.add_argument('--model',
                    type=str,
                    default='attention',
                    help='Choose b/w attention and gated_attention')
args = parser.parse_args()
args.cuda = not args.no_cuda and torch.cuda.is_available()

torch.manual_seed(args.seed)
if args.cuda:
    torch.cuda.manual_seed(args.seed)
    print('\nGPU is ON!')

print('Load Train and Test Set')
loader_kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {}

test_loader = data_utils.DataLoader(TorchDataset(filename="test_labels.txt",
                                                 data_dir="dataset/val"),
                                    batch_size=1,
                                    shuffle=True,
                                    **loader_kwargs)

if args.model == 'attention':
    model = Attention()
elif args.model == 'gated_attention':
    model = GatedAttention()
if args.cuda:
    model.cuda()


def test():
    model = torch.load('model40.pth')
    model.eval()
    test_loss = 0.
    y = []
    y1 = []
    y2 = []
    for batch_idx, (data, label) in enumerate(tqdm(test_loader)):
        bag_label = label
def main(reader, params):
    # from rcnn_attention import evaluator
    k_shot = params.k
    num_negative_bags = params.neg
    total_bags = k_shot + num_negative_bags
    result_lists = {}
    input_dim = 256 if params.dataset == 'omniglot' else 640

    for i, tensor_data in enumerate(reader.get_data()):
        if (i + 1) % 100 == 0:
            print('Evaluating problem number %d/%d' % (i + 1, params.eval_num))
        [feas, fea_boxes, fea_target_classes, fea_classes,
         imgs, target_class] = tensor_data[0:6]
        boxes_list = tensor_data[6:6 + total_bags]
        class_list = tensor_data[6 + total_bags:]

        bags = np.squeeze(feas)
        bag_labels = np.max(fea_target_classes, axis=1)
        input_labels = fea_target_classes.astype(np.int64)

        train_loader = data_utils.DataLoader(ImageBags(bags=bags, labels=input_labels),
                                             batch_size=1,
                                             shuffle=True,
                                             **loader_kwargs)
        test_loader = data_utils.DataLoader(ImageBags(bags=bags, labels=input_labels),
                                            batch_size=1,
                                            shuffle=False,
                                            **loader_kwargs)

        model = Attention(input_dim=input_dim)
        if params.cuda:
            model.cuda()

        optimizer = optim.Adam(model.parameters(),
                               lr=params.lr,
                               betas=(0.9, 0.999),
                               weight_decay=params.reg)

        def train(epoch):
            model.train()
            train_loss = 0.
            train_error = 0.
            for batch_idx, (data, label) in enumerate(train_loader):
                bag_label = label[0]
                if params.cuda:
                    data, bag_label = data.cuda(), bag_label.cuda()
                data, bag_label = Variable(data), Variable(bag_label)
                # reset gradients
                optimizer.zero_grad()
                # calculate loss and metrics
                loss, _ = model.calculate_objective(data, bag_label)
                train_loss += loss.data[0]
                # error, _ = model.calculate_classification_error(data, bag_label)
                # train_error += error
                # backward pass
                loss.backward()
                # step
                optimizer.step()
            train_loss /= len(train_loader)
            # print('epoch: {}, loss: {}'.format(epoch, train_loss))
            # train_error /= len(train_loader)

        def test():
            model.eval()
            test_loss = 0.
            test_error = 0.
            num_success = 0
            scores = np.zeros_like(fea_classes[:params.k])
            for batch_idx, (data, label) in enumerate(test_loader):
                bag_label = label[0]
                instance_labels = label[1]
                if params.cuda:
                    data, bag_label = data.cuda(), bag_label.cuda()
                data, bag_label = Variable(data), Variable(bag_label)
                loss, attention_weights = model.calculate_objective(data, bag_label)
                test_loss += loss.data[0]
                # error, predicted_label = model.calculate_classification_error(data, bag_label)
                # test_error += error
                if batch_idx < params.k:
                    scores[batch_idx] = attention_weights.cpu().data.numpy()[0]
                # argmax_pred = np.argmax(attention_weights.cpu().data.numpy()[0])
                # val = instance_labels.numpy()[0].tolist()[argmax_pred]
                # num_success += val
                # print('batch idx: {}, val: {}'.format(batch_idx, val))
            # print('scores: ', scores)
            res = {
                'boxes': fea_boxes[:params.k],
                'classes': np.ones_like(fea_classes[:params.k]),
                'scores': scores,
                'class_agnostic': True
            }
            return res

        gt = {}
        gt['boxes'] = boxes_list[:params.k]
        gt['classes'] = class_list[:params.k]
        gt['target_class'] = target_class

        for epoch in range(1, args.epochs + 1):
            train(epoch)
        res = test()

        result_dict = {'groundtruth': gt, 'atnmil': res}
        from rcnn_attention import evaluator
        evaluator._postprocess_result_dict(result_dict)
        result_dict.pop('groundtruth')
        add_results(result_dict, result_lists)

        if i + 1 == params.eval_num:
            break

    metrics = {}
    from rcnn_attention import eval_util
    for method, result_list in result_lists.items():
        m = eval_util.evaluate_coloc_results(result_list, None)
        metrics[method] = m
    for k, v in metrics.items():
        print('{}: {}'.format(k, v))
        epoch_loss += loss.item()
    return epoch_loss / len(iterator)


def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs


if __name__ == "__main__":
    dataset = Dataset()
    train_data, valid_data, test_data, INPUT_DIM, OUTPUT_DIM = dataset.get_data()

    attention = Attention(config.ENC_HID_DIM, config.DEC_HID_DIM)
    encoder = Encoder(INPUT_DIM, config.ENC_EMB_DIM, config.ENC_HID_DIM,
                      config.DEC_HID_DIM, config.N_LAYERS, config.ENC_DROPOUT)
    decoder = Decoder(OUTPUT_DIM, config.DEC_EMB_DIM, config.ENC_HID_DIM,
                      config.DEC_HID_DIM, config.N_LAYERS, config.DEC_DROPOUT,
                      attention)
    seq2seq = Seq2Seq(encoder, decoder, config.device).to(config.device)
    print(seq2seq)

    optimizer = optim.Adam(seq2seq.parameters())
    PAD_IDX = config.target.vocab.stoi['<pad>']
    criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

    train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
        (train_data, valid_data, test_data),
        batch_size=config.BATCH_SIZE,
        device=config.device)

    N_EPOCHS = 10
    best_valid_loss = float('inf')
import torch
from torch import nn
import numpy as np

from model import Attention, Encoder

mha = Attention(d_model=512, num_heads=8, p=0)
encoder = Encoder(d_model=512, num_heads=8, conv_hidden_dim=128)


def print_out(Q, K, V):
    temp_out, temp_attn = mha.scaled_dot_product_attention(Q, K, V)
    print('Attention weights are:', temp_attn.squeeze())
    print('Output is:', temp_out.squeeze())


test_K = torch.tensor([[10, 0, 0],
                       [0, 10, 0],
                       [0, 0, 10],
                       [0, 0, 10]]).float()[None, None]

test_V = torch.tensor([[1, 0, 0],
                       [10, 0, 0],
                       [100, 5, 0],
                       [1000, 6, 0]]).float()[None, None]

test_Q = torch.tensor([[0, 0, 10],
                       [0, 10, 0],
                       [10, 10, 0]]).float()[None, None]

print_out(test_Q, test_K, test_V)
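# The snippet above calls `mha.scaled_dot_product_attention`, whose body is not shown.
# A minimal sketch of that computation, softmax(Q K^T / sqrt(d_k)) V, without masking;
# the `Attention` class in the original `model` module may implement it differently.
import math
import torch
import torch.nn.functional as F


def scaled_dot_product_attention(q, k, v):
    # q: (..., len_q, d_k), k: (..., len_k, d_k), v: (..., len_k, d_v)
    d_k = q.size(-1)
    scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k)
    attn = F.softmax(scores, dim=-1)   # attention weights
    return torch.matmul(attn, v), attn


# With the test tensors above, the first query matches the last two keys almost
# exclusively, so its output is roughly the average of their value rows.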
def train():
    with tf.device('/gpu:0'):
        global checkpoint_dir

        (train_sent1_word_index, train_sent1_dist_index, train_trigger1_word_index,
         train_trigger1_dist_index, train_sent2_word_index, train_sent2_dist_index,
         train_trigger2_word_index, train_trigger2_dist_index, train_label,
         train_trigger_common, train_time_diff, train_test_label) = get_data(train_sents)
        # test_sent1_word_index, test_sent1_dist_index, test_trigger1_word_index, test_trigger1_dist_index, test_sent2_word_index, test_sent2_dist_index, test_trigger2_word_index, test_trigger2_dist_index, test_label, test_trigger_common, test_time_diff = get_data(test_sents)

        vocab_count = embedding_matrix.shape[0]
        print(vocab_count)

        ## ------------------------------------------------------------------
        ## PADDING DATA
        for i in range(len(train_sent1_word_index)):
            train_sent1_word_index[i] = np.pad(
                train_sent1_word_index[i],
                pad_width=(0, FLAGS.max_sentence_length - len(train_sent1_word_index[i])),
                mode='constant',
                constant_values=(0, FLAGS.max_sentence_length))
        for i in range(len(train_sent1_dist_index)):
            train_sent1_dist_index[i] = np.pad(
                train_sent1_dist_index[i],
                pad_width=(0, FLAGS.max_sentence_length - len(train_sent1_dist_index[i])),
                mode='constant',
                constant_values=(0, FLAGS.max_sentence_length))
        for i in range(len(train_trigger1_word_index)):
            train_trigger1_word_index[i] = np.pad(
                train_trigger1_word_index[i],
                pad_width=(0, FLAGS.max_trigger_length - len(train_trigger1_word_index[i])),
                mode='constant',
                constant_values=(0, FLAGS.max_trigger_length))
        for i in range(len(train_trigger1_word_index)):
            train_trigger1_dist_index[i] = np.pad(
                train_trigger1_dist_index[i],
                pad_width=(0, FLAGS.max_trigger_length - len(train_trigger1_dist_index[i])),
                mode='constant',
                constant_values=(0, FLAGS.max_trigger_length))
        for i in range(len(train_sent2_word_index)):
            train_sent2_word_index[i] = np.pad(
                train_sent2_word_index[i],
                pad_width=(0, FLAGS.max_sentence_length - len(train_sent2_word_index[i])),
                mode='constant',
                constant_values=(0, FLAGS.max_sentence_length))
        for i in range(len(train_sent2_dist_index)):
            train_sent2_dist_index[i] = np.pad(
                train_sent2_dist_index[i],
                pad_width=(0, FLAGS.max_sentence_length - len(train_sent2_dist_index[i])),
                mode='constant',
                constant_values=(0, FLAGS.max_sentence_length))
        for i in range(len(train_trigger2_word_index)):
            train_trigger2_word_index[i] = np.pad(
                train_trigger2_word_index[i],
                pad_width=(0, FLAGS.max_trigger_length - len(train_trigger2_word_index[i])),
                mode='constant',
                constant_values=(0, FLAGS.max_trigger_length))
        for i in range(len(train_trigger2_word_index)):
            train_trigger2_dist_index[i] = np.pad(
                train_trigger2_dist_index[i],
                pad_width=(0, FLAGS.max_trigger_length - len(train_trigger2_dist_index[i])),
                mode='constant',
                constant_values=(0, FLAGS.max_trigger_length))
        print("doneee")

        ## ------------------------------------------------------------------
        ## TRAINING DATA
        with tf.Graph().as_default():
            session_conf = tf.ConfigProto()
            sess = tf.Session(config=session_conf)
            with sess.as_default():
                # sequence_length, trigger_length, num_classes, vocab_size, word_embedding_size,
                # dist_embedding_size, hidden_size, attention_size, coref_size, decay_rate
                ## --------- CREATE MODEL ----------
                model = Attention(sequence_length=FLAGS.max_sentence_length,
                                  trigger_length=FLAGS.max_trigger_length,
                                  num_classes=2,
                                  vocab_size=vocab_count,
                                  word_embedding_size=100,
                                  dist_embedding_size=14,
                                  hidden_size=FLAGS.hidden_size,
                                  attention_size=128,
                                  co_ref_size=128)
                global_step = tf.Variable(0,
name="global_step", trainable=False) train_op = tf.train.AdamOptimizer(FLAGS.learning_rate).minimize(model.loss, global_step=global_step) timestamp = str(int(time.time())) out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs")) print("Writing to {}\n".format(out_dir)) loss_summary = tf.summary.scalar("loss", model.loss) acc_summary = tf.summary.scalar("accuracy", model.accuracy) # Train Summaries train_summary_op = tf.summary.merge([loss_summary, acc_summary]) train_summary_dir = os.path.join(out_dir, "summaries", "train") train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph) checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints")) checkpoint_prefix = os.path.join(checkpoint_dir, "model") if not os.path.exists(checkpoint_dir): os.makedirs(checkpoint_dir) saver = tf.train.Saver(tf.global_variables()) print("-----------------------------------------------------------------------------------------------") print(checkpoint_dir) sess.run(tf.global_variables_initializer()) sess.run(model.W_em1.assign(embedding_matrix)) #sess.run(model.embedding_init, feed_dict = {model.embedding_placeholder: embedding_matrix}) batches = make_batches(train_sent1_word_index, train_sent1_dist_index, train_trigger1_word_index, train_trigger1_dist_index, train_sent2_word_index, train_sent2_dist_index, train_trigger2_word_index, train_trigger2_dist_index, train_label,train_trigger_common,train_time_diff, train_test_label) print(len(batches)) ##------------ TRAIN BATCHES -------------- for i in range(0,1): print("Epoch number: " + str(i)) for batch in batches: # print(len(batches)) #print(batch[9]) feed_dict = { model.input1_text1: batch[0], model.input1_text2: batch[1], model.trigger1_text1: batch[2], model.trigger1_text2: batch[3], model.input2_text1: batch[4], model.input2_text2: batch[5], model.trigger2_text1: batch[6], model.trigger2_text2: batch[7], model.labels: batch[8], model.V_w: batch[10], model.V_d: batch[11], model.bsz_size: len(batch[0]) } _, step, summaries, loss, accuracy = sess.run([train_op, global_step, train_summary_op,model.loss, model.accuracy], feed_dict) #print(W_em1[0]) train_summary_writer.add_summary(summaries, step) if step % FLAGS.display_every == 0: time_str = datetime.datetime.now().isoformat() print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy)) print(step) if step % 100 == 0: path = saver.save(sess, checkpoint_prefix, global_step=step) print("Saved model checkpoint to {}\n".format(path)) #break # if step % FLAGS.evaluate_every == 0: # print("\nEvaluation:") path = saver.save(sess, checkpoint_prefix, global_step=step) print("Saved model checkpoint to {}\n".format(path))
print("Accuracy {}/{} ({:.2f}%), TPR {:.2f}%, TNR {:.2f}%".format( correct, test_num, correct / test_num * 100, 100 * TPR, 100 * TNR)) return res if __name__ == "__main__": utilData = UtilData("./data/image.txt") train_names = utilData.train_names test_names = utilData.test_names img_info = utilData.img_info resList = [] for i in range(5): model = Attention.Attention(args) if args.cuda: model.cuda() optimizer = optim.Adam(model.parameters(), lr=args.lr, betas=(0.9, 0.999), weight_decay=args.reg) criteration = nn.CrossEntropyLoss() print('cross-validation: {}'.format(i)) train_loader = data_utils.DataLoader( dataLoader( train_name=train_names[i], test_name=test_names[i], img_info=img_info,
        plotter.plot('attention_accuracy', 'val', 'Attention Accuracy', epoch, val_acc)
        plotter.plot('attention_auc', 'val', 'Attention AUC', epoch, val_auc)
        plotter.plot('attention_f1', 'val', 'Attention F1', epoch, val_f1)
        plotter.save(['Tutorial Plots Attention'])

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    return model


# In[24]:

if __name__ == "__main__":
    # model_ft = Resnet_Classifier()
    model_ft = Attention(path="model34")
    model_ft = model_ft.to(device)
    # for param in model_ft.parameters():
    #     print(param.requires_grad)
    #     print(param.size())
    criterion = nn.CrossEntropyLoss(
        weight=torch.Tensor([1.0 / 165.0, 1.0 / 122.0]).to(device))
    optimizer_ft = optim.Adam(model_ft.parameters(), lr=0.0001)
    scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=10, gamma=0.1)
    # model_ft = train_model(model_ft, criterion, optimizer_ft, scheduler, num_epochs=200)

    global plotter
    plotter = utils.VisdomLinePlotter(env_name='Tutorial Plots Resnet')

# In[ ]:
print('Load Train and Test Set')
loader_kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {}

train_loader = data_utils.DataLoader(BarleyBatches(train=True),
                                     batch_size=1,
                                     shuffle=True,
                                     **loader_kwargs)
test_loader = data_utils.DataLoader(BarleyBatches(train=False),
                                    batch_size=1,
                                    shuffle=False,
                                    **loader_kwargs)

print('Init Model')
if args.model == 'attention':
    model = Attention()
elif args.model == 'gated_attention':
    model = GatedAttention()
if args.cuda:
    model.cuda()

optimizer = optim.Adam(model.parameters(),
                       lr=args.lr,
                       betas=(0.9, 0.999),
                       weight_decay=args.reg)
writer = SummaryWriter()


def train(epoch):
    model.train()
    train_loss = 0.
                                      shuffle=True,
                                      **loader_kwargs)
test_loader = data_utils.DataLoader(MnistBags(target_number=args.target_number,
                                              mean_bag_length=args.mean_bag_length,
                                              var_bag_length=args.var_bag_length,
                                              num_bag=args.num_bags_test,
                                              seed=args.seed,
                                              train=False),
                                    batch_size=1,
                                    shuffle=False,
                                    **loader_kwargs)

print('Init Model')
if args.model == 'attention':
    model = Attention()
elif args.model == 'gated_attention':
    model = GatedAttention()
if args.cuda:
    model.cuda()

optimizer = optim.Adam(model.parameters(),
                       lr=args.lr,
                       betas=(0.9, 0.999),
                       weight_decay=args.reg)


def train(epoch):
    model.train()
    train_loss = 0.
    train_error = 0.
    for batch_idx, (data, label) in enumerate(train_loader):
        bag_label = label[0]
        if args.cuda:
def main():
    make_deterministic()

    # region Prepare data
    with Timer('\nData preparation time: %s\n'):
        ru_lang = Language()
        en_lang = Language()

        yandex = Yandex(
            'datasets/yandex/corpus.en_ru.1m.ru',
            'datasets/yandex/corpus.en_ru.1m.en',
            ru_lang,
            en_lang,
            data_slice=H.dataset_slice,
        )
        paracrawl = ParaCrawl(
            'datasets/paracrawl/en-ru.txt',
            ru_lang,
            en_lang,
            data_slice=slice(0),
        )

        low = ru_lang.lower_than(H.ru_word_count_minimum)
        infrequent_words_n = max(ceil(ru_lang.words_n * H.infrequent_words_percent), len(low))
        if infrequent_words_n > 0:
            ru_lang.drop_words(ru_lang.lowk(infrequent_words_n))
            print(f'{infrequent_words_n:,} infrequent Russian words are dropped')

        low = en_lang.lower_than(H.en_word_count_minimum)
        if len(low) > 0:
            en_lang.drop_words(*low)
            print(f'{len(low):,} infrequent English words are dropped')

        print(f'Russian language: {ru_lang.words_n:,} words, '
              f'{ru_lang.sentence_length:,} words in a sentence')
        print(f'English language: {en_lang.words_n:,} words, '
              f'{en_lang.sentence_length:,} words in a sentence')

        batch = H.batch_size
        dataset = ConcatDataset((yandex, paracrawl))
        loader = DataLoader(dataset, batch, shuffle=True)
    # endregion

    # region Models and optimizers
    model = Seq2Seq(
        Encoder(ru_lang.words_n, H.encoder_embed_dim, H.encoder_hidden_dim,
                H.encoder_bi, H.decoder_hd),
        Attention(H.encoder_hd, H.decoder_hd),
        Decoder(en_lang.words_n, H.decoder_embed_dim, H.decoder_hidden_dim, H.encoder_hd),
    ).to(Device).train()

    optimizer = Adam(model.parameters(), lr=H.learning_rate)
    criterion = CrossEntropyLoss(ignore_index=Token_PAD, reduction='sum')
    # endregion

    # region Training
    teaching_percent = H.teaching_percent
    total = len(dataset)
    log_interval = max(5, round(total / batch / 1000))

    for epoch in range(1, H.epochs + 1):
        with Printer() as printer:
            printer.print(f'Train epoch {epoch}: starting...')
            for i, ((ru, ru_l), en_sos, en_eos) in enumerate(loader, 1):
                # Zero the parameter gradients
                optimizer.zero_grad()
                # Run data through model
                predictions = model(ru, ru_l, en_sos, teaching_percent)
                # Calculate loss
                loss = criterion(predictions, en_eos)
                # Back propagate and perform optimization
                loss.backward()
                clip_grad_norm_(model.parameters(), H.gradient_norm_clip)
                optimizer.step()
                # Print log
                if i % log_interval == 0:
                    printer.print(f'Train epoch {epoch}: {i * batch / total:.1%} '
                                  f'[{i * batch:,}/{total:,}]')
            printer.print(f'Train epoch {epoch}: completed')
    # endregion

    torch.save(
        (
            ru_lang.__getnewargs__(),
            en_lang.__getnewargs__(),
            model.cpu().eval().data,
        ),
        'data/data.pt',
    )

    evaluate(model.to(Device), ru_lang, en_lang,
             'datasets/yandex/corpus.en_ru.1m.ru',
             slice(H.dataset_slice.stop + 1, H.dataset_slice.stop + 1 + 100))
def train(config_path, resume=True):
    # Load the parameters
    param_dict, rep_param_dict = load_params(config_path)
    # use cuda flag
    use_cuda = True

    """ the training directory """
    # load data
    TRAIN_DIR01 = "{}/MQ2007/S1/".format(param_dict["data_base_path"])
    TRAIN_DIR02 = "{}/MQ2007/S2/".format(param_dict["data_base_path"])
    TRAIN_DIR03 = "{}/MQ2007/S3/".format(param_dict["data_base_path"])
    TRAIN_DIR04 = "{}/MQ2007/S4/".format(param_dict["data_base_path"])
    TRAIN_DIR05 = "{}/MQ2007/S5/".format(param_dict["data_base_path"])
    TEST_DIR01 = '{}/MQ2007/S1/'.format(param_dict["data_base_path"])
    TEST_DIR02 = '{}/MQ2007/S2/'.format(param_dict["data_base_path"])
    TEST_DIR03 = '{}/MQ2007/S3/'.format(param_dict["data_base_path"])
    TEST_DIR04 = '{}/MQ2007/S4/'.format(param_dict["data_base_path"])
    TEST_DIR05 = '{}/MQ2007/S5/'.format(param_dict["data_base_path"])

    train_files01 = glob.glob("{}/data0.pkl".format(TRAIN_DIR01))
    train_files02 = glob.glob("{}/data0.pkl".format(TRAIN_DIR02))
    train_files03 = glob.glob("{}/data0.pkl".format(TRAIN_DIR03))
    train_files04 = glob.glob("{}/data0.pkl".format(TRAIN_DIR04))
    train_files05 = glob.glob("{}/data0.pkl".format(TRAIN_DIR05))
    test_files01 = glob.glob("{}/testdata0.pkl".format(TEST_DIR01))
    test_files02 = glob.glob("{}/testdata0.pkl".format(TEST_DIR02))
    test_files03 = glob.glob("{}/testdata0.pkl".format(TEST_DIR03))
    test_files04 = glob.glob("{}/testdata0.pkl".format(TEST_DIR04))
    test_files05 = glob.glob("{}/testdata0.pkl".format(TEST_DIR05))

    fold = param_dict["fold"]
    model_base_path = param_dict['model_base_path']
    model_name_str = param_dict['model_name_str']
    q_len = param_dict["q_len"]
    d_len = param_dict["d_len"]

    if fold == 1:
        train_files = train_files01 + train_files02 + train_files03
        test_files = test_files04[0]  # a path list ['/...'], only take the str
        rel_path = '{}/{}/tmp/test/S4.qrels'.format(model_base_path, model_name_str)
    elif fold == 2:
        train_files = train_files02 + train_files03 + train_files04
        test_files = test_files05[0]
        rel_path = '{}/{}/tmp/test/S5.qrels'.format(model_base_path, model_name_str)
    elif fold == 3:
        train_files = train_files03 + train_files04 + train_files05
        test_files = test_files01[0]
        rel_path = '{}/{}/tmp/test/S1.qrels'.format(model_base_path, model_name_str)
    elif fold == 4:
        train_files = train_files04 + train_files05 + train_files01
        test_files = test_files02[0]
        rel_path = '{}/{}/tmp/test/S2.qrels'.format(model_base_path, model_name_str)
    elif fold == 5:
        train_files = train_files05 + train_files01 + train_files02
        test_files = test_files03[0]
        rel_path = '{}/{}/tmp/test/S3.qrels'.format(model_base_path, model_name_str)
    else:
        raise ValueError("wrong fold num {}".format(fold))

    """ Build the model """
    emb_size = param_dict['emb_size']
    num_heads = param_dict['num_heads']
    kernel_size = rep_param_dict['kernel_size']
    filt_size = rep_param_dict['filt_size']
    vocab_size = param_dict['vocab_size']
    output_dim = rep_param_dict['output_dim']
    hidden_size = param_dict['hidden_size']
    batch_size = param_dict['batch_size']
    preemb = param_dict['preemb']
    emb_path = param_dict['emb_path']
    hinge_margin = param_dict['hinge_margin']

    model = Attention(emb_size=emb_size,
                      query_length=q_len,
                      doc_length=d_len,
                      num_heads=num_heads,
                      kernel_size=kernel_size,
                      filter_size=filt_size,
                      vocab_size=vocab_size,
                      dropout=0.0,
                      qrep_dim=output_dim,
                      hidden_size=hidden_size,
                      batch_size=batch_size,
                      preemb=preemb,
                      emb_path=emb_path)
    if use_cuda:
        model.cuda()

    # optimizer
    optimizer = optim.Adam(model.parameters(),
                           lr=param_dict['learning_rate'],
                           betas=(param_dict['beta1'], param_dict['beta2']),
                           weight_decay=param_dict['alpha'])
    # loss func
    loss = nn.MarginRankingLoss(margin=hinge_margin, size_average=True)

    # experiment
    print("Experiment")
    if resume == False:
        f_log = open('{}/{}/logs/training_log.txt'.format(model_base_path, model_name_str), 'w+', 1)
        valid_log = open('{}/{}/logs/valid_log.txt'.format(model_base_path, model_name_str), 'w+', 1)
    else:
        f_log = open('{}/{}/logs/training_log.txt'.format(model_base_path, model_name_str), 'a+', 1)
        valid_log = open('{}/{}/logs/valid_log.txt'.format(model_base_path, model_name_str), 'a+', 1)

    # model_file
    model_file = '{}/{}/saves/model_file'.format(model_base_path, model_name_str)

    """ TRAINING """
    # define the parameters
    n_epoch = param_dict['n_epoch']
    # init best validation MAP value
    best_MAP = 0.0
    best_NDCG1 = 0.0
    batch_count_tr = 0

    # restore saved parameters if resume is True
    if resume == True:
        model_file = '{}/{}/saves/model_file'.format(model_base_path, model_name_str)
        model.load_state_dict(torch.load(model_file))
        with open('{}/{}/saves/best_MAP.pkl'.format(model_base_path, model_name_str), 'rb') as f_MAP:
            best_MAP = pickle.load(f_MAP)
        print("loaded model, and resume training now")

    for epoch in range(1, n_epoch + 1):
        '''load_data'''
        for f in train_files:
            data = load_dataset(f)
            print("loaded {}".format(f))
            '''prepare_data'''
            [Q, D_pos, D_neg, L] = pair_data_generator(data, q_len)
            valid_data = load_dataset(test_files)
            ''' shuffle data'''
            train_data = list_shuffle(Q, D_pos, D_neg, L)
            '''training func'''
            num_batch = len(train_data[0]) // batch_size
            for batch_count in range(num_batch):
                Q = train_data[0][batch_size * batch_count:batch_size * (batch_count + 1)]
                D_pos = train_data[1][batch_size * batch_count:batch_size * (batch_count + 1)]
                D_neg = train_data[2][batch_size * batch_count:batch_size * (batch_count + 1)]
                L = train_data[3][batch_size * batch_count:batch_size * (batch_count + 1)]
                if use_cuda:
                    Q = Variable(torch.LongTensor(pad_batch_list(Q, max_len=q_len, padding_id=0)),
                                 requires_grad=False).cuda()
                    D_pos = Variable(torch.LongTensor(pad_batch_list(D_pos, max_len=d_len, padding_id=0)),
                                     requires_grad=False).cuda()
                    D_neg = Variable(torch.LongTensor(pad_batch_list(D_neg, max_len=d_len, padding_id=0)),
                                     requires_grad=False).cuda()
                    L = Variable(torch.FloatTensor(L), requires_grad=False).cuda()
                else:
                    Q = Variable(torch.LongTensor(pad_batch_list(Q, max_len=q_len, padding_id=0)),
                                 requires_grad=False)
                    D_pos = Variable(torch.LongTensor(pad_batch_list(D_pos, max_len=d_len, padding_id=0)),
                                     requires_grad=False)
                    D_neg = Variable(torch.LongTensor(pad_batch_list(D_neg, max_len=d_len, padding_id=0)),
                                     requires_grad=False)
                    L = Variable(torch.FloatTensor(L), requires_grad=False)

                # run on this batch
                optimizer.zero_grad()
                t1 = time.time()
                q_mask, d_pos_mask, d_neg_mask = model.generate_mask(Q, D_pos, D_neg)
                """ need to do the modification in model.py """
                S_pos, S_neg = model(Q, D_pos, D_neg, q_mask, d_pos_mask, d_neg_mask)
                Loss = hinge_loss(S_pos, S_neg, 1.0)
                Loss.backward()
                optimizer.step()
                t2 = time.time()
                batch_count_tr += 1
                print("epoch {} batch {} training cost: {} using {}s"
                      .format(epoch, batch_count + 1, Loss.data[0], t2 - t1))
                f_log.write("epoch {} batch {} training cost: {}, using {}s"
                            .format(epoch, batch_count + 1, Loss.data[0], t2 - t1) + '\n')

                """ evaluate part """
                if batch_count_tr % 20 == 0:
                    if valid_data is not None:
                        MAP, NDCGs = evaluate(config_path, model, valid_data, rel_path, mode="valid")
                        print(MAP, NDCGs)
                        valid_log.write(
                            "epoch {}, batch {}, MAP: {}, NDCGs: {} {} {} {}".
                            format(epoch + 1, batch_count + 1, MAP,
                                   NDCGs[1][0], NDCGs[1][1], NDCGs[1][2], NDCGs[1][3]))
                        if MAP > best_MAP:
                            # save this best model
                            best_MAP = MAP
                            with open('{}/{}/saves/best_MAP.pkl'.format(
                                    model_base_path, model_name_str), 'wb') as f_MAP:
                                pickle.dump(best_MAP, f_MAP)
                            # save model params after several epochs
                            model_file = '{}/{}/saves/model_file'.format(
                                model_base_path, model_name_str)
                            torch.save(model.state_dict(), model_file)
                            print("successfully saved model to the path {}".format(model_file))
                            valid_log.write("{} {} {} {}".format(
                                NDCGs[1][0], NDCGs[1][1], NDCGs[1][2], NDCGs[1][3]))
                            valid_log.write(" MAP: {}".format(MAP))
                            valid_log.write('\n')
    f_log.close()
    valid_log.close()