def __prepare_train_data(self, X_r: list, y_r: list, source2index: list, target2index: list):
    """Index parallel source/target token lists into paired training tensors.

    Each sequence is wrapped in <s> ... </s> boundary markers, mapped to
    indices via the given vocabulary, and shaped as a 1 x T batch.
    Returns a list of (source_tensor, target_tensor) pairs.
    """
    def _encode(tokens, vocab):
        # Add sentence markers, index, and add a leading batch dimension.
        return prepare_sequence(['<s>'] + tokens + ['</s>'], vocab).view(1, -1)

    return [
        (_encode(source, source2index), _encode(target, target2index))
        for source, target in zip(X_r, y_r)
    ]
def train(epoch):
    """Run one training epoch of the attention generator over train_dataset.

    For every (image, captions) pair, sums an image-reconstruction loss and a
    text loss over all captions, backprops the combined loss, and logs running
    averages every LOG_FREQ steps.
    """
    print("Training Epoch:", epoch)
    att_g.train()
    total_loss = 0
    total_img_loss = 0
    total_text_loss = 0
    for batch_no, (img, caps, info) in enumerate(train_dataset):
        # Add a batch dimension and move the image to the target device.
        img = img[None,].to(device)
        optimizer.zero_grad()
        img_loss = 0
        text_loss = 0
        for caption in caps:
            seq = prepare_sequence(caption, w_map, device, tag=True)
            r_img, r_text, attn = att_g(img, seq)
            img_loss = img_loss + img_criterion(r_img, img)
            text_loss = text_loss + text_criterion(r_text, seq)
        loss = img_loss + text_loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        total_img_loss += img_loss.item()
        total_text_loss += text_loss.item()
        step = batch_no + 1
        if step % LOG_FREQ == 0:
            print("Step: %d, Loss: %.2f, ImgLoss: %.2f, TextLoss: %.2f"
                  % (step, total_loss / step,
                     total_img_loss / step, total_text_loss / step))
def forward(self, word_input, anneal=False):
    """Compute log-softmax tag predictions from word indices plus a
    character-level encoding of each token.

    :param word_input: LongTensor of word indices, shape (batch, seq_len).
    :param anneal: if True, divide logits by temperature self.T before
        log-softmax (temperature annealing).
    :return: dict with "last_preds": log-probabilities of shape
        (batch, seq_len, n_tags).
    """
    # Build per-token character index lists: every token is exactly 20 chars —
    # short tokens are right-padded with <pad>; long tokens are truncated to
    # their first 13 + last 7 characters.
    char_idx = []
    for sentence in word_input:
        sentence_chars = []
        for token_idx in sentence:
            token_chars = []
            token = self.vocab.idx2key[token_idx]
            if len(token) <= 20:
                token_chars.append(
                    prepare_sequence(token, self.char_set)
                    + [self.char_set["<pad>"]] * (20 - len(token)))
            else:
                token_chars.append(
                    prepare_sequence(token[0:13] + token[-7:], self.char_set))
            sentence_chars.append(token_chars[0].copy())
        char_idx.append(sentence_chars.copy())
    # Pack then unpack to obtain a (batch, seq_len, 20) padded char tensor.
    char_input, _ = pad_packed_sequence(
        pack_sequence([torch.LongTensor(_) for _ in char_idx],
                      enforce_sorted=False),
        batch_first=True,
        padding_value=self.char_set["<pad>"],
    )
    char_input = char_input.to(self.device)
    batch_size = word_input.size(0)
    seq_len = word_input.size(1)
    # Encode all tokens' characters in one flat batch, then restore shape.
    char_output = self.char_encoder(
        char_input.reshape(-1, char_input.size(2))).reshape(
            batch_size, seq_len, -1)
    word_output = self.word_encoder(word_input, char_output)
    y = self.decoder(word_output)
    if anneal:
        # Temperature-scaled logits for annealed (softer/sharper) predictions.
        preds = F.log_softmax(y / self.T, dim=2)
    else:
        preds = F.log_softmax(y, dim=2)
    return {"last_preds": preds}  # , "embeddings": word_output}
def train(target_dir, embedding_dim, hidden_dim, glove_file):
    """Train an LSTMTagger on pickled CoNLL data found in *target_dir*.

    :param target_dir: directory containing CoNLL_train.pkl / CoNLL_test.pkl.
    :param embedding_dim: word embedding dimensionality.
    :param hidden_dim: LSTM hidden-state dimensionality.
    :param glove_file: path to pretrained GloVe embeddings for the tagger.

    Fixes vs. the previous revision:
    - the epoch loss accumulator no longer shares the name of (and gets
      clobbered by) the per-batch loss tensor, so the reported LOSS is now
      the true sum of batch losses over the epoch;
    - the pickle files are opened via context managers so the file handles
      are closed deterministically.
    """
    torch.manual_seed(1)
    with open(target_dir + "CoNLL_train.pkl", "rb") as fin:
        (train_word_to_ix, train_tag_to_ix,
         train_sents_idx, train_labels_idx) = pickle.load(fin)
    with open(target_dir + "CoNLL_test.pkl", "rb") as fin:
        (test_word_to_ix, test_tag_to_ix,
         test_sents_idx, test_labels_idx) = pickle.load(fin)

    model = LSTMTagger(embedding_dim, hidden_dim, len(train_word_to_ix),
                       len(train_tag_to_ix), target_dir, glove_file)
    criterion = nn.NLLLoss()
    optimizer = optim.RMSprop(model.parameters())
    EPOCHS = 2
    for epoch in range(EPOCHS):
        total_loss = 0.0  # sum of per-batch losses over this epoch
        for i, (sentence, tags) in tqdm(
                enumerate(zip(train_sents_idx, train_labels_idx))):
            model.zero_grad()
            # Reset the LSTM hidden state so sentences don't share history.
            model.hidden = model.init_hidden()
            # Convert word indices to a tensor.
            sentence_in = utils.prepare_sequence(sentence)
            # Convert tag indices to a tensor.
            targets = utils.prepare_sequence(tags)
            tag_scores = model(sentence_in)
            loss = criterion(tag_scores, targets)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        f1_score_train_sents_avg = inference.evaluate(
            model,
            train_sents_idx[:len(test_sents_idx)],
            train_labels_idx[:len(test_sents_idx)])
        f1_score_test_sents_avg = inference.evaluate(
            model, test_sents_idx, test_labels_idx)
        print("[{}] EPOCH {} - LOSS: {:.8f} TRAIN_DATA_F1_SCORE: {} TEST_DATA_F1_SCORE: {}".
              format(datetime.datetime.today(), epoch + 1, total_loss,
                     f1_score_train_sents_avg, f1_score_test_sents_avg))
def __predict_sentence(self, src_batch):
    """
    predict sentence
    :param src_batch: get the source sentence
    :return:
    """
    dm = self.data_model
    # Index the marker-wrapped source sentence as a 1 x T batch.
    inputs = prepare_sequence(['<s>'] + src_batch + ['</s>'],
                              dm.source2index).view(1, -1)
    # Start token repeated to the input length seeds the decoder.
    start_decode = Variable(
        LongTensor([[dm.target2index['<s>']] * inputs.size(1)]))
    show_preds = self.qrnn(inputs, [inputs.size(1)], start_decode)
    outputs = torch.max(show_preds, dim=1)[1].view(len(inputs), -1)
    # Map every predicted index back to its target token and concatenate.
    pieces = [dm.index2target[idx]
              for row in outputs.data.tolist()
              for idx in row]
    hyp_batch = ''.join(pieces)
    # Strip the sentence boundary markers from the hypothesis.
    return hyp_batch.replace('<s>', '').replace('</s>', '')
def generate_seq(self, hidden_f, seq):
    """Greedily decode a token sequence from an image/feature hidden state.

    :param hidden_f: encoder feature tensor used to initialise the decoder.
    :param seq: reference sequence; only its length bounds the decode loop.
    :return: tensor of decoder outputs, shape (len(seq), vocab_size).
    """
    batch_size = 1
    outputs = torch.zeros(seq.size(0), batch_size,
                          self.text_encoder.vocab_size).to(self.device)
    # Collapse the feature map and average it into a (2, 1, hidden/2) state
    # used for both h0 and c0 of the (bidirectional-shaped) decoder LSTM.
    hidden_f = hidden_f.view(hidden_f.size(1), -1)
    hidden = hidden_f.mean(1).view(2, 1, self.text_decoder.hidden_dim // 2)
    hidden = (hidden, hidden)
    # Seed decoding with the start-of-sequence tag.
    input = prepare_sequence([START_TAG], self.w_map, self.device)
    for t in range(0, len(seq)):
        output, hidden = self.text_decoder(input, hidden)
        outputs[t] = output
        # Greedy choice: feed the argmax token back in at the next step.
        top1 = output.max(1)[1]
        if top1.item() == END_TAG:
            break
        input = top1
    # Drop the batch dimension: (T, 1, V) -> (T, V).
    outputs = outputs.view(outputs.size(0), -1)
    return outputs
def test(fert_model, out_path):
    """Predict a fertility value per token for every test sentence and write
    the predictions (one space-separated line per sentence) to *out_path*.

    :param fert_model: trained fertility model (regressor or classifier).
    :param out_path: output file path for the predicted fertilities.
    """
    toks = 0
    all_out_ferts = []
    print("Starting evaluation on test set... (%d sentences)" % len(test_data))
    for sentence in test_data:
        fert_model.zero_grad()
        # Reset LSTM state so sentences are decoded independently.
        fert_model.hidden = fert_model.init_hidden()
        sentence_in = utils.prepare_sequence(sentence, word_to_ix,
                                             gpu=args.gpu)
        fert_scores = fert_model(sentence_in.view(1, -1))
        if args.model_type == 'regression':
            # Regressor emits fertility values directly.
            out_ferts = fert_scores.cpu().data.numpy().flatten()
        else:
            # NOTE(review): `expected` is hard-coded True, so the argmax
            # branch below is dead code — kept as-is.
            expected = True
            if expected:
                # Expected fertility: softmax over classes, then the
                # probability-weighted sum of class values 1..K.
                ss = fert_scores.cpu().data.numpy()
                probs = np.exp(ss) / np.tile(
                    np.exp(ss).sum(2)[:, :, None], (1, 1, ss.shape[2]))
                out_ferts = (probs[0, :, :] * np.tile(
                    (1. + np.arange(ss.shape[2])),
                    (ss.shape[1], 1))).sum(1)
            else:
                # Argmax fertility (classes are 0-based, values 1-based).
                values, indices = torch.max(fert_scores, 2)
                out_ferts = indices.cpu().data.numpy().flatten() + 1
        toks += out_ferts.shape[0]
        all_out_ferts.append(out_ferts.tolist())
    print("Writing predicted fertility values..")
    # Write fertility values to file
    with open(out_path, 'w') as f:
        for ferts in all_out_ferts:
            for fert in ferts:
                f.write("%s " % fert)
            f.write("\n")
def infer(model, sent_idx):
    """Return the argmax tag index for each token of an indexed sentence."""
    with torch.no_grad():
        # Index tensor -> per-token tag scores -> argmax along the tag axis.
        tag_scores = model.forward(utils.prepare_sequence(sent_idx))
        _, pred_tag = torch.max(tag_scores.data, 1)
        return pred_tag
def eval(fert_model, curEpoch=None):
    """Evaluate the fertility model on the dev set in pre-batched order.

    Reports token accuracy and precision/recall/F1 computed only over
    non-trivial fertilities (value != 1), and returns the F1 score.

    :param fert_model: trained fertility model (regressor or classifier).
    :param curEpoch: unused here; kept for call-site compatibility.
    :return: F1 over tokens with fertility != 1.
    """
    correct = 0
    toks = 0
    num_matches = num_pred = num_gold = 0
    all_out_ferts = []
    # all_targets = np.array([])
    print("Starting evaluation on dev set... (%d sentences)" % len(dev_data))
    for start_idx, end_idx in dev_order:
        dev_sents = dev_data[start_idx:end_idx + 1]
        target_ferts = dev_ferts[start_idx:end_idx + 1]
        fert_model.zero_grad()
        # Fresh LSTM state sized to the current batch.
        fert_model.hidden = fert_model.init_hidden(len(dev_sents))
        batch_sents = torch.stack([
            utils.prepare_sequence(sentence, word_to_ix, gpu=args.gpu)
            for sentence in dev_sents
        ])
        batch_ferts = torch.stack([
            utils.prepare_sequence(ferts, gpu=args.gpu)
            for ferts in target_ferts
        ])
        fert_scores = fert_model(batch_sents)
        if args.model_type == 'regression':
            # Round regressor outputs to the nearest integer fertility.
            out_ferts = fert_scores.cpu().data.numpy().flatten()
            out_ferts = np.round(out_ferts)
            gold_ferts = batch_ferts.cpu().data.numpy().flatten()
            correct += np.count_nonzero(out_ferts == gold_ferts)
            toks += out_ferts.shape[0]
            # P/R/F1 counts ignore the default fertility value 1.
            num_matches += np.count_nonzero(
                np.logical_and(out_ferts == gold_ferts, gold_ferts != 1))
            num_pred += np.count_nonzero(out_ferts != 1)
            num_gold += np.count_nonzero(gold_ferts != 1)
        else:
            # Classifier: argmax class index + 1 gives the fertility value.
            values, indices = torch.max(fert_scores, 2)
            out_ferts = indices.cpu().data.numpy().flatten() + 1
            gold_ferts = batch_ferts.cpu().data.numpy().flatten()
            correct += np.count_nonzero(out_ferts == gold_ferts)
            toks += out_ferts.shape[0]
            num_matches += np.count_nonzero(
                np.logical_and(out_ferts == gold_ferts, gold_ferts != 1))
            num_pred += np.count_nonzero(out_ferts != 1)
            num_gold += np.count_nonzero(gold_ferts != 1)
        all_out_ferts.append(out_ferts.tolist())
        # all_targets = np.append(all_targets, batch_ferts)
    precision = num_matches / num_pred
    recall = num_matches / num_gold
    f1 = 2. * num_matches / (num_pred + num_gold)
    avg_tok_accuracy = correct / toks
    print("Dev Set Accuracy: %f" % avg_tok_accuracy)
    print("Dev Set Precision: %f" % precision)
    print("Dev Set Recall: %f" % recall)
    print("Dev Set F1: %f" % f1)
    return f1  #avg_tok_accuracy
def main():
    """Train (or load) the fertility model, with early stopping on dev
    accuracy, then optionally run test-set prediction.

    Uses module-level state: args, word_to_ix, train_order, training_data,
    training_ferts. Python 2 era code (`xrange`, `loss.cpu().data[0]`).
    """
    ############################################################################################
    if not os.path.isfile(args.model_name):
        # No saved model: build a fresh one of the configured type.
        if args.model_type == 'regression':
            fert_model = models.BiLSTMRegressor(args.emb_dim, args.hidden_dim,
                                                len(word_to_ix), args.max_fert,
                                                args.n_layers, args.dropout,
                                                args.gpu)
        else:
            fert_model = models.BiLSTMTagger(args.emb_dim, args.hidden_dim,
                                             len(word_to_ix), args.max_fert,
                                             args.n_layers, args.mlp_dim,
                                             args.dropout, args.gpu)
        # Down-weight fertility class 0 (the most frequent class).
        custom_weight = torch.ones(args.max_fert)
        custom_weight[0] = 0.5  #0.6 # TODO: set this as a hyperparameter?
        if args.gpu:
            fert_model = fert_model.cuda()
            custom_weight = custom_weight.cuda()
        if args.model_type == 'regression':
            loss_function = nn.MSELoss()
        else:
            loss_function = nn.NLLLoss(weight=custom_weight)
        #optimizer = optim.SGD(fert_model.parameters(), lr=0.1)
        #optimizer = optim.Adam(fert_model.parameters(), lr=0.001)
        #optimizer = optim.Adam(fert_model.parameters(), lr=0.001, weight_decay=0.001)
        optimizer = optim.Adam(fert_model.parameters(), lr=0.001)
        print("Training fertility predictor model...")
        patience_counter = 0
        prev_avg_tok_accuracy = 0
        best_avg_tok_accuracy = 0
        random.shuffle(train_order)
        for epoch in xrange(args.epochs):
            accuracies = []
            sent = 0
            tokens = 0
            cum_loss = 0
            batch_idx = 1
            num_matches = num_pred = num_gold = 0
            print("Starting epoch %d .." % epoch)
            for start_idx, end_idx in train_order:
                train_sents = training_data[start_idx:end_idx + 1]
                target_ferts = training_ferts[start_idx:end_idx + 1]
                sent += end_idx - start_idx + 1
                tokens += sum([len(sentence) for sentence in train_sents])
                metric = "MSE" if args.model_type == 'regression' else "Average Accuracy"
                # Periodic progress logging.
                if batch_idx % 100 == 0:
                    print("[Epoch %d] \ Sentence %d/%d, \ Tokens %d \ Cum_Loss: %f \ %s: %f"
                          % (epoch, sent, len(training_data), tokens,
                             cum_loss / tokens, metric,
                             sum(accuracies) / len(accuracies)))
                # Step 1. Remember that Pytorch accumulates gradients.
                # We need to clear them out before each instance.
                fert_model.zero_grad()
                # Also, we need to clear out the hidden state of the LSTM,
                # detaching it from its history on the last instance.
                fert_model.hidden = fert_model.init_hidden(len(train_sents))
                # Step 2. Get our inputs ready for the network, that is,
                # turn them into Variables of word indices.
                batch_sents = torch.stack([
                    utils.prepare_sequence(sentence, word_to_ix, gpu=args.gpu)
                    for sentence in train_sents
                ])
                batch_ferts = torch.stack([
                    utils.prepare_sequence(ferts, gpu=args.gpu)
                    for ferts in target_ferts
                ])
                # Step 3. Run our forward pass.
                fert_scores = fert_model(batch_sents)
                if args.model_type == 'regression':
                    out_ferts = fert_scores.cpu().data.numpy().flatten()
                    err = out_ferts - batch_ferts.float().cpu().data.numpy(
                    ).flatten()
                    sent_acc = sum(err**2 / out_ferts.shape[0])
                    accuracies.append(sent_acc)  # This is actually MSE.
                    out_ferts = np.round(out_ferts)
                    gold_ferts = batch_ferts.cpu().data.numpy().flatten()
                    #sent_acc = np.count_nonzero(out_ferts==gold_ferts) / out_ferts.shape[0]
                    #accuracies.append(sent_acc)
                    # P/R counts over non-default fertilities (value != 1).
                    num_matches += np.count_nonzero(
                        np.logical_and(out_ferts == gold_ferts,
                                       gold_ferts != 1))
                    num_pred += np.count_nonzero(out_ferts != 1)
                    num_gold += np.count_nonzero(gold_ferts != 1)
                    # Step 4. Compute the loss, gradients, and update the parameters
                    loss = loss_function(fert_scores, batch_ferts.float())
                else:
                    # Classifier: argmax class + 1 is the fertility value.
                    values, indices = torch.max(fert_scores, 2)
                    out_ferts = indices.cpu().data.numpy().flatten() + 1
                    gold_ferts = batch_ferts.cpu().data.numpy().flatten()
                    sent_acc = np.count_nonzero(
                        out_ferts == gold_ferts) / out_ferts.shape[0]
                    accuracies.append(sent_acc)
                    num_matches += np.count_nonzero(
                        np.logical_and(out_ferts == gold_ferts,
                                       gold_ferts != 1))
                    num_pred += np.count_nonzero(out_ferts != 1)
                    num_gold += np.count_nonzero(gold_ferts != 1)
                    # Step 4. Compute the loss, gradients, and update the parameters
                    # (targets shifted to 0-based class indices).
                    loss = loss_function(
                        fert_scores.view(
                            len(train_sents) * len(train_sents[0]), -1),
                        batch_ferts.view(-1) - 1)
                cum_loss += loss.cpu().data[0]
                loss.backward()
                optimizer.step()
                batch_idx += 1
            precision = num_matches / num_pred
            recall = num_matches / num_gold
            f1 = 2. * num_matches / (num_pred + num_gold)
            print("Loss: %f" % loss.cpu().data.numpy())
            print("Accuracy: %f" % np.mean(accuracies))
            print("Precision: %f" % precision)
            print("Recall: %f" % recall)
            print("F1: %f" % f1)
            print("Evaluating on dev set...")
            avg_tok_accuracy = eval(fert_model, epoch)
            # Keep only the best-on-dev checkpoint.
            if avg_tok_accuracy > best_avg_tok_accuracy:
                best_avg_tok_accuracy = avg_tok_accuracy
                print("Saving model..")
                torch.save(fert_model, args.model_name)
            # Early Stopping
            if avg_tok_accuracy <= prev_avg_tok_accuracy:
                patience_counter += 1
                if patience_counter == args.patience:
                    print(
                        "Model hasn't improved on dev set for %d epochs. Stopping Training."
                        % patience_counter)
                    break
            prev_avg_tok_accuracy = avg_tok_accuracy
    else:
        # A saved model exists: load it instead of training.
        print("Loading tagger model from " + args.model_name + "...")
        fert_model = torch.load(args.model_name)
        if args.gpu:
            fert_model = fert_model.cuda()
    if args.test:
        out_path = args.write_fertilities if args.write_fertilities else args.test_source_path + ".fert.predicted"
        test(fert_model, out_path)
def seg(self, text, user_dict=None):
    """Segment *text* into words with the CRF segmentor model, optionally
    biased by a user dictionary.

    Dictionary matches are injected as per-position tag scores (weight x 10)
    before decoding; predicted tag ids 3/4 close a word (presumably the
    END/SINGLE tags of utils.tag_to_ix — confirm against that mapping).

    :param text: input string.
    :param user_dict: optional dictionary words; when given, a fresh Trie is
        built from them instead of using self.userdict.
    :return: list of segmented words.
    """
    text = text.strip()
    if len(text) == 0:
        return []
    results = []
    if user_dict is None:
        new_dict = self.userdict
    else:
        new_dict = Trie()
        new_dict.add_dict_word(user_dict)
    # word_list = utils.get_word(text)
    # Segment character by character.
    word_list = []
    for i in text:
        word_list.append(i)
    # print(word_list)
    sentence_tensor = utils.prepare_sequence(word_list, self.char2index)
    # print((sentence_tensor))
    with torch.no_grad():
        sentence, text, mask = utils.collate_fn_without_label([
            (sentence_tensor, text)
        ])
        batch_size, seq_len = sentence.shape
        nb_labels = len(utils.tag_to_ix)
        # Score tensor that biases the CRF toward dictionary matches.
        text_score = torch.zeros(batch_size, seq_len, nb_labels).float()
        for i in range(batch_size):
            matchs = new_dict.cut(text[i])
            matchs.extend(process_eng(text[i]))
            # print(matchs)
            for m in matchs:
                # m = (start, end, matched_word); weight scaled by 10.
                weight = new_dict.get_weight(m[2]) * 10.0
                if len(m[2]) == 1:
                    text_score[i, m[0], utils.tag_to_ix[SINGLE_TAG]] = weight
                elif len(m[2]) == 2:
                    text_score[i, m[0], utils.tag_to_ix[BEGIN_TAG]] = weight
                    text_score[i, m[0] + 1, utils.tag_to_ix[END_TAG]] = weight
                else:
                    text_score[i, m[0], utils.tag_to_ix[BEGIN_TAG]] = weight
                    text_score[i, m[1] - 1, utils.tag_to_ix[END_TAG]] = weight
                    text_score[i, m[0] + 1:m[1] - 1,
                               utils.tag_to_ix[MIDDLE_TAG]] = weight
        masks = mask.to(self.device)
        sen = sentence.to(self.device)
        temp_pred = self.segmentor_model(sen, mask=masks, text=text_score)[1]
        # Rebuild words: flush the buffer whenever a word-closing tag appears.
        for i in range(batch_size):
            result = ''
            for j in range(len(temp_pred[i])):
                # if text[i][j] == ' ':
                #     continue
                result += text[i][j]
                if temp_pred[i][j] == 4 or temp_pred[i][j] == 3:
                    results.append(result)
                    result = ''
    return results
def preprocess(self, bin_dataframe):
    """Convert a dataframe of binned tweets into per-document training items.

    Rows alternate between header rows (column 1 is None: column 0 holds an
    info dict with doc/tags/event metadata) and tweet rows (column 1 holds a
    tweet token list). Tweets are accumulated per bin; when a bin ends, one
    item [stacked tweets, tag_id, ec_id, event_duration_idx,
    independent_event_idx, event_type, event_id, tweet_lengths] is appended
    to the current document's `match` list inside self.matches.
    """
    # bin_np = bin_dataframe.as_matrix()
    bin_np = bin_dataframe.to_numpy()
    docNr = -1
    bin_tweets = []
    bin_tweet_lengths = []
    bin_tweets_text = []
    previous_match = ""
    match = []
    for i in range(bin_np.shape[0]):
        if bin_np[i][1] == None or i == bin_np.shape[0] - 1:
            # append all docs including the last one
            if (i == bin_np.shape[0] - 1):
                # append last line
                tweet_text = utils.lstToString(
                    utils.strToLst(bin_np[i][1])).split()
                tweet, tweet_length = utils.prepare_sequence(
                    tweet_text, self.word_to_ix, pad_length=self.pad_length)
                bin_tweets.append(tweet)
                bin_tweet_lengths.append(tweet_length)
                bin_tweets_text.append(tweet_text)
            if (docNr != -1):
                #bin_tweets = np.asarray(bin_tweets)
                # Map the bin's tag string to indices; unknown tags fall
                # back to the "Other" entity class.
                try:
                    tag_id = self.tag_to_ix[target]
                    if target.startswith("B-") or target.startswith("I-"):
                        ec_id = self.ec_to_ix[target[2:]]
                    else:
                        ec_id = self.ec_to_ix[target]
                except:
                    # print(target)
                    if target.startswith("B-"):
                        tag_id = self.tag_to_ix["B-Other"]
                    elif target.startswith("I-"):
                        tag_id = self.tag_to_ix["I-Other"]
                    ec_id = self.ec_to_ix["Other"]
                # "O" bins are non-events for the duration target.
                if target == "O":
                    event_duration_idx = self.event_to_ix["non-event"]
                else:
                    event_duration_idx = self.event_to_ix["event"]
                # event_id == -1 marks a bin with no independent event.
                if event_id == -1:
                    independent_event_idx = self.event_to_ix["non-event"]
                else:
                    independent_event_idx = self.event_to_ix["event"]
                #print (len(bin_tweets))
                #print (torch.stack(bin_tweets))
                match.append([torch.stack(bin_tweets), tag_id, ec_id,
                              event_duration_idx, independent_event_idx,
                              event_type, event_id, bin_tweet_lengths])
                #print (utils.getDictionaryKeyByIdx(self.tag_to_ix,tag_id),utils.getDictionaryKeyByIdx(self.ec_to_ix,ec_id),utils.getDictionaryKeyByIdx(self.event_to_ix,event_id))
                # match=np.append(match,bin_tokens)
                # match['match_bins'].append(bin)
            docNr += 1
            if i != bin_np.shape[0] - 1:
                infoDict = utils.strToLst(bin_np[i][0])
                # print('infoDict', infoDict)
                if previous_match != infoDict['doc']:
                    # New document: start a fresh match list and register it.
                    # print (infoDict['doc'])
                    # match = {'match_bins': np.empty((0)),"match_name": infoDict['doc']}
                    previous_match = infoDict['doc']
                    # below two lines should be interchanged i think
                    match = []
                    self.matches.append(match)
                # Reset per-bin accumulators and remember the bin metadata.
                bin_tweets = []
                bin_tweet_lengths = []
                bin_tweets_text = []
                target = infoDict['corrected_tags']
                event_type = infoDict['event_type']
                event_id = infoDict['event_id']
                match_name = infoDict['doc']
                # {'bin': infoDict['bin'],'targets': infoDict['corrected_tags'],'tweets':[],'timestamps':[],'tokens':""}
        else:
            # bin['tweets'].append(strToLst(bin_np[i][1]))
            # bin_tokens+=" "+lstToString(strToLst(bin_np[i][1]))
            # bin['timestamps'].append(int(bin_np[i][0]))
            # print ((lstToString(strToLst(bin_np[i][1])).split()))
            #print (bin_tokens)
            # Regular tweet row: index/pad it and add it to the current bin.
            tweet_text = utils.lstToString(utils.strToLst(bin_np[i][1])).split()
            tweet, tweet_length = utils.prepare_sequence(
                tweet_text, self.word_to_ix, pad_length=self.pad_length)
            bin_tweets.append(tweet)
            bin_tweet_lengths.append(tweet_length)
            bin_tweets_text.append(tweet_text)
def eval(tagger_model, k, dev_or_test="dev"):
    """Tag every sentence of a dev/test (or jackknife fold) set and write the
    hypotheses out in UniMorph format.

    :param tagger_model: trained character/word tagger.
    :param k: jackknife fold index, or -1 to use the plain dev/test set.
    :param dev_or_test: which full split to use when k == -1.
    :return: avg_tok_accuracy — NOTE(review): `correct` is never incremented
        (all updates are commented out), so this is always 0.0.
    """
    if k == -1:
        eval_data = dev_data if dev_or_test == "dev" else test_data
    else:
        eval_data = dev_datasets[k]
    correct = 0
    toks = 0
    hypTags = []
    goldTags = []
    all_out_tags = np.array([])
    all_targets = np.array([])
    logProbs = []
    print("Starting evaluation on %s set... (%d sentences)"
          % (dev_or_test, len(eval_data)))
    # Universal models get a language marker wrapped around each word.
    lang_id = []
    if args.model_type == "universal":
        lang_id = [lang]
    sentCount = 0
    for sentence, morph in eval_data:
        tagger_model.zero_grad()
        tagger_model.char_hidden = tagger_model.init_hidden()
        tagger_model.hidden = tagger_model.init_hidden()
        sent_in = []
        sentCount += 1
        # Each word becomes a character-index sequence (plus language marks).
        for word in sentence:
            s_appended_word = lang_id + [c for c in word] + lang_id
            word_in = utils.prepare_sequence(s_appended_word, char_to_ix,
                                             args.gpu)
            sent_in.append(word_in)
        #targets = utils.prepare_sequence(morph, labels_to_ix, args.gpu)
        if args.sum_word_char:
            word_seq = utils.prepare_sequence(sentence, word_to_ix, args.gpu)
        else:
            word_seq = None
        if args.model_type == "specific":
            tag_scores = tagger_model(sent_in, word_idxs=word_seq,
                                      lang=langs[-1], test=True)
        else:
            tag_scores = tagger_model(sent_in, word_idxs=word_seq, test=True)
        # Drop the last column of scores before taking the argmax.
        tag_scores = tag_scores[:, :-1]
        #values, indices = torch.topk(tag_scores, k=100, dim=1)
        values, indices = torch.max(tag_scores, 1)
        out_tags = indices.cpu().data.numpy()
        #for i in range(out_tags.shape[0]):
        #    hypTags.append([utils.unfreeze_dict(ix_to_labels[idx]) for idx in out_tags[i]])
        hypTags.append([ix_to_labels[idx] for idx in out_tags])
        scores = values.cpu().data.numpy()
        #logProbs += [list(scores[i]) for i in range(scores.shape[0])]
        #all_out_tags = np.append(all_out_tags, out_tags)
        goldTags.append(morph)
        #targets = targets.cpu().data.numpy()
        #correct += np.count_nonzero(out_tags==targets)
        #print(out_tags)
        #correct += np.count_nonzero(np.array([ix_to_labels[idx] for idx in out_tags])==np.array(morph))
        toks += len(sentence)
    avg_tok_accuracy = correct / toks
    # Build the output file prefix from the model configuration.
    prefix = args.model_type + "_"
    if args.sum_word_char:
        prefix += "_wc-sum"
    prefix += "-".join([l for l in langs]) + "_" + dev_or_test
    if args.sent_attn:
        prefix += "-sent_attn"
    if args.tgt_size:
        prefix += "_" + str(args.tgt_size)
    write = True
    folds = 10
    # Jackknifing splits the training data into `folds` dev-sized chunks.
    dev_size = (int)(len(training_data)/folds) if args.jackknife else None
    if write:
        utils.write_unimorph(args.treebank_path, hypTags, logProbs, sentCount,
                             k, dev_or_test=dev_or_test, dev_size=dev_size)
    return avg_tok_accuracy
def train(k, training_data_jack, dev_data_jack):
    """Train (or load) the character/word tagger on one jackknife fold.

    Trains a new model when no saved model exists (or continues training
    when args.continue_train), saves after every epoch, and optionally
    evaluates on the held-out fold when args.test is set.

    :param k: fold index (only used indirectly via args.fold at eval time).
    :param training_data_jack: (sentence, morph-tags) training pairs.
    :param dev_data_jack: unused here; kept for call-site compatibility.
    :return: the trained or loaded tagger model.
    """
    if not os.path.isfile(args.model_name) or args.continue_train:
        if args.continue_train:
            print("Loading tagger model from " + args.model_name + "...")
            tagger_model = torch.load(args.model_name,
                                      map_location=lambda storage, loc: storage)
            if args.gpu:
                tagger_model = tagger_model.cuda()
        else:
            tagger_model = models.BiLSTMTagger(args, word_freq, langs,
                                               len(char_to_ix),
                                               len(word_to_ix),
                                               len(labels_to_ix))
            if args.gpu:
                tagger_model = tagger_model.cuda()
        loss_function = nn.NLLLoss()
        # Optimizer selected by CLI flag.
        if args.optim == "sgd":
            optimizer = optim.SGD(tagger_model.parameters(), lr=0.1)
        elif args.optim == "adam":
            optimizer = optim.Adam(tagger_model.parameters())
        elif args.optim == "adagrad":
            optimizer = optim.Adagrad(tagger_model.parameters())
        elif args.optim == "rmsprop":
            optimizer = optim.RMSprop(tagger_model.parameters())
        print("Training tagger model...")
        patience_counter = 0
        prev_avg_tok_accuracy = 0
        for epoch in range(args.epochs):
            accuracies = []
            sent = 0
            tokens = 0
            cum_loss = 0
            correct = 0
            print("Starting epoch %d .." % epoch)
            for lang in langs:
                # Universal models wrap each word in a language marker.
                lang_id = []
                if args.model_type == "universal":
                    lang_id = [lang]
                for sentence, morph in training_data_jack:
                    sent += 1
                    if sent % 100 == 0:
                        print("[Epoch %d] \ Sentence %d/%d, \ Tokens %d \ Cum_Loss: %f \ Average Accuracy: %f"
                              % (epoch, sent, len(training_data_jack), tokens,
                                 cum_loss / tokens, correct / tokens))
                    tagger_model.zero_grad()
                    sent_in = []
                    tokens += len(sentence)
                    # Character-index each word of the sentence.
                    for word in sentence:
                        s_appended_word = lang_id + [c for c in word] + lang_id
                        word_in = utils.prepare_sequence(s_appended_word,
                                                         char_to_ix, args.gpu)
                        # targets = utils.prepare_sequence(s_appended_word[1:], char_to_ix, args.gpu)
                        sent_in.append(word_in)
                    # sent_in = torch.stack(sent_in)
                    # Reset both char- and word-level LSTM states per sentence.
                    tagger_model.char_hidden = tagger_model.init_hidden()
                    tagger_model.hidden = tagger_model.init_hidden()
                    targets = utils.prepare_sequence(morph, labels_to_ix,
                                                     args.gpu)
                    if args.sum_word_char:
                        word_seq = utils.prepare_sequence(sentence, word_to_ix,
                                                          args.gpu)
                    else:
                        word_seq = None
                    if args.model_type == "specific" or args.model_type == "joint":
                        tag_scores = tagger_model(sent_in, word_idxs=word_seq,
                                                  lang=lang)
                    else:
                        tag_scores = tagger_model(sent_in, word_idxs=word_seq)
                    values, indices = torch.max(tag_scores, 1)
                    out_tags = indices.cpu().data.numpy().flatten()
                    correct += np.count_nonzero(
                        out_tags == targets.cpu().data.numpy())
                    loss = loss_function(tag_scores, targets)
                    cum_loss += loss.cpu().data[0]
                    loss.backward()
                    optimizer.step()
            print("Loss: %f" % loss.cpu().data.numpy())
            print("Accuracy: %f" % (correct/tokens))
            # Model is saved unconditionally after each epoch.
            print("Saving model..")
            torch.save(tagger_model, args.model_name)
            #print("Evaluating on dev set...")
            #avg_tok_accuracy = eval(tagger_model, curEpoch=epoch)
            # Early Stopping
            #if avg_tok_accuracy <= prev_avg_tok_accuracy:
            #    patience_counter += 1
            #    if patience_counter==args.patience:
            #        print("Model hasn't improved on dev set for %d epochs. Stopping Training." % patience_counter)
            #        break
            #prev_avg_tok_accuracy = avg_tok_accuracy
    else:
        # Saved model exists and no continue flag: just load it.
        print("Loading tagger model from " + args.model_name + "...")
        tagger_model = torch.load(args.model_name,
                                  map_location=lambda storage, loc: storage)
        if args.gpu:
            tagger_model = tagger_model.cuda()
    if args.test:
        avg_tok_accuracy = eval(tagger_model, args.fold,
                                dev_or_test=dev_or_test)
    return tagger_model
test_data.append((test_sentence[i], test_pos[i], test_labels[i])) for i in range(len(val_sentence)): val_data.append((val_sentence[i], val_pos[i], val_labels[i])) word_to_ix = {} label_to_ix = {} pos_to_ix = {} utils.append_to_vocab(train_data, word_to_ix, label_to_ix, pos_to_ix) utils.append_to_vocab(test_data, word_to_ix, label_to_ix, pos_to_ix) utils.append_to_vocab(val_data, word_to_ix, label_to_ix, pos_to_ix) idx_train_data = [] for sentence, pos, tags in train_data: idx_sentences = utils.prepare_sequence(sentence, word_to_ix) idx_labels = utils.prepare_sequence(tags, label_to_ix) idx_pos = utils.prepare_sequence(pos, pos_to_ix) idx_train_data.append((idx_sentences, idx_pos, idx_labels)) idx_test_data = [] for sentence, pos, tags in test_data: idx_sentences = utils.prepare_sequence(sentence, word_to_ix) idx_labels = utils.prepare_sequence(tags, label_to_ix) idx_pos = utils.prepare_sequence(pos, pos_to_ix) idx_test_data.append((idx_sentences, idx_pos, idx_labels)) idx_val_data = [] for sentence, pos, tags in val_data: idx_sentences = utils.prepare_sequence(sentence, word_to_ix) idx_labels = utils.prepare_sequence(tags, label_to_ix)
def eval(tagger_model, curEpoch=None, dev_or_test="dev"):
    """Evaluate the tagger on the dev or test split.

    Fixes vs. the previous revision: predicted indices were looked up in
    `labels_to_ix` (the label -> index map); they must go through the inverse
    map `ix_to_labels` (index -> label) so `hypTags` holds label values that
    are comparable with the gold tags passed to utils.computeF1 — the sibling
    jackknife eval in this codebase already uses `ix_to_labels`.

    :param tagger_model: trained character/word tagger.
    :param curEpoch: epoch number, embedded in the dev output file prefix.
    :param dev_or_test: which split to evaluate on.
    :return: (avg_tok_accuracy, macro F1).
    """
    eval_data = dev_data if dev_or_test == "dev" else test_data
    correct = 0
    toks = 0
    hypTags = []
    goldTags = []
    all_out_tags = np.array([])
    all_targets = np.array([])
    print("Starting evaluation on %s set... (%d sentences)"
          % (dev_or_test, len(eval_data)))
    # Universal models wrap each word in a language marker.
    lang_id = []
    if args.model_type == "universal":
        lang_id = [lang]
    s = 0
    for sentence, morph in eval_data:
        tagger_model.zero_grad()
        tagger_model.char_hidden = tagger_model.init_hidden()
        tagger_model.hidden = tagger_model.init_hidden()
        sent_in = []
        # Character-index each word of the sentence.
        for word in sentence:
            s_appended_word = lang_id + [c for c in word] + lang_id
            word_in = utils.prepare_sequence(s_appended_word, char_to_ix,
                                             args.gpu)
            sent_in.append(word_in)
        targets = utils.prepare_sequence(morph, labels_to_ix, args.gpu)
        if args.sum_word_char:
            word_seq = utils.prepare_sequence(sentence, word_to_ix, args.gpu)
        else:
            word_seq = None
        if args.model_type == "specific":
            tag_scores = tagger_model(sent_in, word_idxs=word_seq,
                                      lang=langs[-1], test=True)
        else:
            tag_scores = tagger_model(sent_in, word_idxs=word_seq, test=True)
        values, indices = torch.max(tag_scores, 1)
        out_tags = indices.cpu().data.numpy().flatten()
        # FIX: map predicted indices back to labels with the inverse map.
        hypTags += [ix_to_labels[idx] for idx in out_tags]
        goldTags.append(morph)
        targets = targets.cpu().data.numpy()
        correct += np.count_nonzero(out_tags == targets)
        toks += len(sentence)
    avg_tok_accuracy = correct / toks
    # Build the results-file prefix from the model configuration.
    prefix = args.model_type + "_"
    if args.sum_word_char:
        prefix += "_wc-sum"
    if dev_or_test == "dev":
        prefix += "-".join([l for l in langs
                            ]) + "_" + dev_or_test + "_" + str(curEpoch)
    else:
        prefix += "-".join([l for l in langs]) + "_" + dev_or_test
    if args.sent_attn:
        prefix += "-sent_attn"
    if args.tgt_size:
        prefix += "_" + str(args.tgt_size)
    # Flatten the per-sentence gold tags for F1 computation.
    finalTgts = []
    for tags in goldTags:
        for tag in tags:
            finalTgts.append(tag)
    f1_score, f1_micro_score = utils.computeF1(hypTags, finalTgts, prefix,
                                               labels_to_ix, baseline=True,
                                               write_results=True)
    print("Test Set Accuracy: %f" % avg_tok_accuracy)
    print("Test Set Avg F1 Score (Macro): %f" % f1_score)
    print("Test Set Avg F1 Score (Micro): %f" % f1_micro_score)
    with open(prefix + '_results_f1.txt', 'a') as file:
        file.write("\nAccuracy: " + str(avg_tok_accuracy) + "\n")
    return avg_tok_accuracy, f1_score
# Train a BiLSTM-CRF word segmenter for 10 epochs, save it, then tag the
# PKU test set and write B/I/O tags (one per character) to a results file.
model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM)
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)
model.cuda()
time_epoch = time.time()
print("训练集句子数:", training_data_sentence_num)
print("训练集字数:", training_data_character_num)
for epoch in range(10):
    for sentence, tags in training_data:
        model.zero_grad()
        # Index the characters and gold tags for this sentence.
        sentence_in = prepare_sequence(sentence, word_to_ix)
        targets = torch.LongTensor([tag_to_ix[t] for t in tags])
        # CRF training objective: negative log likelihood of the gold path.
        neg_log_likelihood = model.neg_log_likelihood(sentence_in, targets)
        neg_log_likelihood.backward()
        optimizer.step()
    # Log per-epoch wall-clock time (seconds and minutes).
    print("第", epoch, "轮:", time.time() - time_epoch, "秒,",
          (time.time() - time_epoch)/60, "分钟")
    time_epoch = time.time()
torch.save(model, 'word_segment_model.pkl')  # save the entire network (structure + weights)

# Load all test-set data.
test_data, test_data_sentence_num, test_data_character_num = utils.generate_test_data(
    "./icwb2-data/testing/pku_test_gold_BIO.utf8")
f = open("./icwb2-data/testing/test_model_10epoch.txt", 'a')
# Map predicted tag indices back to tag letters (trailing slots unused).
tag_ind = ['B', 'I', 'O', '', '']
print("测试集句子数:", test_data_sentence_num)
print("测试集字数:", test_data_character_num)
for item in test_data:
    # model(...) returns (score, best_path); write one tag per character.
    prediction = model(utils.prepare_sequence(item, word_to_ix))
    for tag_index in prediction[1]:
        f.write(tag_ind[tag_index] + '\n')
f.close()
def eval_on_dev(tagger_model, curEpoch=None, dev_or_test="dev"):
    """Batch-evaluate the factorial-CRF tagger on the dev or test split.

    Decodes each sentence's best tag sequence from the inference graph,
    accumulates token accuracy, and computes macro/micro F1; test-split
    results (plus all target/hypothesis pairs) are appended to a file.

    :param tagger_model: trained tagger exposing getBestSequence(graph, k).
    :param curEpoch: unused here; kept for call-site compatibility.
    :param dev_or_test: which split (and batch order) to use.
    :return: (avg_tok_accuracy, macro F1).
    """
    correct = 0
    toks = 0
    all_out_tags = np.array([])
    all_targets = np.array([])
    eval_order = dev_order if dev_or_test == "dev" else test_order
    eval_data = dev_data if dev_or_test == "dev" else test_data
    print("Starting evaluation on %s set... (%d sentences)"
          % (dev_or_test, len(eval_data)))
    # Universal models wrap each word in a language marker.
    lang_id = []
    if args.model_type == "universal":
        lang_id = [langs[-1]]
    for start_idx, end_idx in eval_order:
        cur_eval_data = eval_data[start_idx:end_idx + 1]
        eval_sents = [elem[0] for elem in cur_eval_data]
        morph_sents = [elem[1] for elem in cur_eval_data]
        # Character-index every word of every sentence in the batch.
        sents_in = []
        for i, sentence in enumerate(eval_sents):
            sent_in = []
            for word in sentence:
                s_appended_word = lang_id + [c for c in word] + lang_id
                word_in = utils.prepare_sequence(s_appended_word, char_to_ix,
                                                 args.gpu)
                # targets = utils.prepare_sequence(s_appended_word[1:], char_to_ix, args.gpu)
                sent_in.append(word_in)
            sents_in.append(sent_in)
        tagger_model.zero_grad()
        tagger_model.char_hidden = tagger_model.init_hidden()
        tagger_model.hidden = tagger_model.init_hidden()
        all_word_seq = []
        for sentence in eval_sents:
            word_seq = utils.prepare_sequence(sentence, word_to_ix, args.gpu)
            all_word_seq.append(word_seq)
        if args.model_type == "specific" or args.model_type == "joint":
            lstm_feats, graph, maxVal = tagger_model(
                sents_in, morph_sents, word_idxs=all_word_seq,
                langs=[langs[-1]] * len(sents_in), test=True)
        else:
            lstm_feats, graph, maxVal = tagger_model(
                sents_in, morph_sents, word_idxs=all_word_seq, test=True)
        # Decode and score each sentence of the batch.
        for k in range(len(eval_sents)):
            hypSeq = tagger_model.getBestSequence(graph, k)
            targets = [utils.unfreeze_dict(tags) for tags in morph_sents[k]]
            correct += utils.getCorrectCount(targets, hypSeq)
            toks += len(eval_sents[k])
            all_out_tags = np.append(all_out_tags, hypSeq)
            all_targets = np.append(all_targets, targets)
    avg_tok_accuracy = correct / toks
    # Build the results-file prefix from the model configuration.
    prefix = args.model_name
    prefix += "_" + dev_or_test
    if args.sent_attn:
        prefix += "sent_attn"
    if args.tgt_size:
        prefix += "_" + str(args.tgt_size)
    write = True if dev_or_test == "test" else False
    f1_score, f1_micro_score = utils.computeF1(all_out_tags, all_targets,
                                               prefix, write_results=write)
    print("Test Set Accuracy: %f" % avg_tok_accuracy)
    print("Test Set Avg F1 Score (Macro): %f" % f1_score)
    print("Test Set Avg F1 Score (Micro): %f" % f1_micro_score)
    if write:
        # NOTE(review): mode 'ab' with str writes only works on Python 2;
        # under Python 3 this raises TypeError — confirm target interpreter.
        with open(prefix + '_results_f1.txt', 'ab') as file:
            file.write("\nAccuracy: " + str(avg_tok_accuracy) + "\n")
            for target, hyp in zip(all_targets, all_out_tags):
                file.write(str(target) + "\n")
                file.write(str(hyp) + "\n")
    return avg_tok_accuracy, f1_score
def main():
    """Train (or load) the FCRF-LSTM tagger, then optionally visualize/test.

    Training runs only when no saved model exists or --continue_train is set;
    otherwise the saved model is loaded from args.model_name. After that,
    heatmap plots and/or a test-set evaluation run depending on flags.
    NOTE(review): uses xrange and loss.cpu().data[0] — this is Python 2 /
    pre-0.4 PyTorch code; confirm runtime before modernizing.
    """
    if not os.path.isfile(args.model_name) or args.continue_train:
        if args.continue_train:
            # Resume from a checkpoint; map_location keeps tensors on CPU
            # at load time, then .cuda() moves the model if requested.
            print("Loading tagger model from " + args.model_name + "...")
            tagger_model = torch.load(
                args.model_name, map_location=lambda storage, loc: storage)
            if args.gpu:
                tagger_model = tagger_model.cuda()
        else:
            print("Creating new model...")
            tagger_model = factorial_crf_tagger.DynamicCRF(args, word_freq, langs, len(char_to_ix), \
                                                           len(word_to_ix), unique_tags)
            if args.gpu:
                tagger_model = tagger_model.cuda()
        if args.unit_test:
            # Belief-propagation sanity checks on the first training sentence.
            tests = unit.TestBP()
            labelSum = sum([tag.size() for tag in tagger_model.uniqueTags])
            # Create dummy LSTM features
            lstm_feats = utils.get_var(
                torch.Tensor(torch.randn(len(training_data[0][0]), labelSum)),
                args.gpu)
            tests.setUp(tagger_model, training_data[0][1],
                        len(training_data[0][0]), lstm_feats)
        loss_function = nn.NLLLoss()  # Provide (N,C) log probability values as input
        # loss_function = nn.CrossEntropyLoss()
        if args.optim == "sgd":
            optimizer = optim.SGD(tagger_model.parameters(), lr=1.0)
        elif args.optim == "adam":
            optimizer = optim.Adam(tagger_model.parameters())
        elif args.optim == "adagrad":
            optimizer = optim.Adagrad(tagger_model.parameters())
        print("Training FCRF-LSTM model...")
        patience_counter = 0
        prev_avg_tok_accuracy = 0
        for epoch in xrange(args.epochs):
            accuracies = []
            sent = 0
            batch_idx = 0
            tokens = 0
            cum_loss = 0
            correct = 0
            # New batch order every epoch.
            random.shuffle(train_order)
            print("Starting epoch %d .." % epoch)
            start_time = time.time()
            for start_idx, end_idx in train_order:
                train_data = training_data[start_idx:end_idx + 1]
                train_sents = [elem[0] for elem in train_data]
                morph_sents = [elem[1] for elem in train_data]
                lang_ids = train_lang_ids[start_idx:end_idx + 1]
                sent += end_idx - start_idx + 1
                tokens += sum([len(sentence) for sentence in train_sents])
                batch_idx += 1
                if batch_idx % 5 == 0:
                    # Periodic progress log (every 5 batches).
                    print("[Epoch %d] \
                    Sentence %d/%d, \
                    Tokens %d \
                    Cum_Loss: %f \
                    Time: %f \
                    Tokens/Sec: %d"  # Average Accuracy: %f"
                          % (epoch, sent, len(training_data), tokens,
                             cum_loss / tokens, time.time() - start_time,
                             tokens / (time.time() - start_time)))
                    # , correct/tokens))
                tagger_model.zero_grad()
                # Build char-level inputs; universal models bracket each word
                # with its language-id token.
                sents_in = []
                for i, sentence in enumerate(train_sents):
                    sent_in = []
                    lang_id = []
                    if args.model_type == "universal":
                        lang_id = [lang_ids[i]]
                    for word in sentence:
                        s_appended_word = lang_id + [c for c in word] + lang_id
                        word_in = utils.prepare_sequence(
                            s_appended_word, char_to_ix, args.gpu)
                        # targets = utils.prepare_sequence(s_appended_word[1:], char_to_ix, args.gpu)
                        sent_in.append(word_in)
                    sents_in.append(sent_in)
                # sents_in = torch.stack(sent_in)
                # Reset recurrent state between batches.
                tagger_model.char_hidden = tagger_model.init_hidden()
                tagger_model.hidden = tagger_model.init_hidden()
                if args.sum_word_char:
                    # Word-level indices, summed with char encodings downstream.
                    all_word_seq = []
                    for sentence in train_sents:
                        word_seq = utils.prepare_sequence(
                            sentence, word_to_ix, args.gpu)
                        all_word_seq.append(word_seq)
                else:
                    all_word_seq = None
                if args.model_type == "specific" or args.model_type == "joint":
                    lstm_feat_sents, graph, maxVal = tagger_model(
                        sents_in, morph_sents, word_idxs=all_word_seq,
                        langs=lang_ids)
                else:
                    lstm_feat_sents, graph, maxVal = tagger_model(
                        sents_in, morph_sents, word_idxs=all_word_seq)
                # Skip parameter updates if marginals are not within a threshold
                if maxVal > 10.00:
                    print("Skipping parameter updates...")
                    continue
                # Compute the loss, gradients, and update the parameters
                all_factors_batch = []
                for k in range(len(train_sents)):
                    all_factors = tagger_model.get_scores(
                        graph, morph_sents[k], lstm_feat_sents[k], k)
                    all_factors_batch.append(all_factors)
                loss = tagger_model.compute_loss(all_factors_batch,
                                                 loss_function)
                # print("Loss:", loss)
                cum_loss += loss.cpu().data[0]
                loss.backward()
                # tagger_model.gradient_check(all_factors_batch[0])
                optimizer.step()
            # Loss of the last batch of the epoch (not the epoch average).
            print("Loss: %f" % loss.cpu().data.numpy())
            print("Saving model..")
            torch.save(tagger_model, args.model_name)
            if (epoch + 1) % 4 == 0:
                # Evaluate on dev every 4 epochs.
                print("Evaluating on dev set...")
                avg_tok_accuracy, f1_score = eval_on_dev(tagger_model,
                                                         curEpoch=epoch)
                # Early Stopping
                if avg_tok_accuracy <= prev_avg_tok_accuracy:
                    patience_counter += 1
                    if patience_counter == args.patience:
                        print(
                            "Model hasn't improved on dev set for %d epochs. Stopping Training."
                            % patience_counter)
                        break
                prev_avg_tok_accuracy = avg_tok_accuracy
    else:
        # A saved model exists and we are not continuing training: just load it.
        print("Loading tagger model from " + args.model_name + "...")
        tagger_model = torch.load(args.model_name,
                                  map_location=lambda storage, loc: storage)
        if args.gpu:
            tagger_model = tagger_model.cuda()
        else:
            tagger_model.gpu = False
    if args.visualize:
        print("[Visualization Mode]")
        utils.plot_heatmap(unique_tags, tagger_model.pairwise_weights, "pair")
        #utils.plot_heatmap(unique_tags, tagger_model.transition_weights, "trans")
        #utils.plot_heatmap(unique_tags, tagger_model.lang_pairwise_weights, "pair", lang_idx=1)
        print("Stored plots in figures/ directory!")
    if args.test:
        avg_tok_accuracy, f1_score = eval_on_dev(tagger_model,
                                                 dev_or_test="test")
# Top-level training loop for a CRF-style sequence tagger.
# NOTE(review): this is a script fragment — model, training_data, optimizer,
# word_to_ix, tag_to_ix, start_epoch, train_mode, AverageMeter and
# prepare_sequence are all defined elsewhere in the file.
F_value_best = 0  # best F-score so far; presumably updated by later eval code — verify
if GPU_available:
    model.cuda()
    print('moved model to GPU!!!')
end = time.time()
eps = 0.00000000001  # small constant; use site not visible here — confirm
for epoch in range(start_epoch, 100):
    losses = AverageMeter()  # running average of per-sentence loss
    for i, (sentence, tags) in enumerate(training_data):
        # Clear gradients accumulated by the previous sentence.
        model.zero_grad()
        # prepare torch.Tensor
        sentence_in = prepare_sequence(sentence, word_to_ix)
        targets = torch.tensor([tag_to_ix[t] for t in tags],
                               dtype=torch.long)
        if GPU_available:
            sentence_in = sentence_in.cuda()
            targets = targets.cuda()
        loss = model.neg_log_likelihood(sentence_in, targets)
        losses.update(loss.item(), 1)
        loss.backward()
        optimizer.step()
    # NOTE(review): original formatting was collapsed; this epoch summary is
    # assumed to run once per epoch, after the inner loop — confirm placement.
    if train_mode:
        print('Epoch: [{0}]\t'
              'Loss: {losses.avg:.4f}\t'
              'Time: {epoch_time:.2f}'.format(epoch, losses=losses,
                                              epoch_time=time.time()-end))
def prepare_data(path, with_pos=False):
    """Prepare data before training/evaluating.

    Loads the train/test/val splits from *path* (each split directory holds
    sentences.txt, labels.txt and optionally pos.txt), extends the module
    vocabularies (word_to_ix, label_to_ix, pos_to_ix), and fills the module
    lists idx_train_data / idx_test_data / idx_val_data with tuples of index
    sequences: (sentence, labels) or (sentence, pos, labels) when with_pos.

    Args:
        path: root directory containing train/, test/ and val/ subdirectories.
        with_pos: if True, also load and index part-of-speech tags.
    """
    print("------- prep data --------")

    def _load_split(split):
        # Load one split; returns a list of (sentence, labels) tuples,
        # or (sentence, pos, labels) when with_pos is set.
        sentences, labels, pos = [], [], []
        utils.load_data(os.path.join(path, split, 'sentences.txt'), sentences)
        utils.load_data(os.path.join(path, split, 'labels.txt'), labels)
        if with_pos:
            utils.load_data(os.path.join(path, split, 'pos.txt'), pos)
            return list(zip(sentences, pos, labels))
        return list(zip(sentences, labels))

    def _index_split(data, out):
        # Convert one split's tokens to index tensors/sequences and append
        # them to the corresponding module-level output list.
        if with_pos:
            for sentence, pos, tags in data:
                out.append((utils.prepare_sequence(sentence, word_to_ix),
                            utils.prepare_sequence(pos, pos_to_ix),
                            utils.prepare_sequence(tags, label_to_ix)))
        else:
            for sentence, tags in data:
                out.append((utils.prepare_sequence(sentence, word_to_ix),
                            utils.prepare_sequence(tags, label_to_ix)))

    train_data = _load_split('train')
    test_data = _load_split('test')
    val_data = _load_split('val')

    # Grow the vocabularies over every split before indexing, so all splits
    # share the same index space.
    for data in (train_data, test_data, val_data):
        utils.append_to_vocab(data, word_to_ix, label_to_ix, pos_to_ix,
                              with_pos)

    _index_split(train_data, idx_train_data)
    _index_split(test_data, idx_test_data)
    _index_split(val_data, idx_val_data)
    print("------- prep data done ------\n")