def main(argv):
    argv = docopt.docopt(__doc__, argv=argv)
    random_seed = int(argv['--random_seed'])
    np.random.seed(random_seed)
    random.seed(random_seed)
    mini_batch_size = int(argv['--mini_batch_size'])

    def read_ids(file):
        ids = []
        with open(file, 'r') as fp:
            for row in fp:
                ids.append(row.strip())
        return ids

    test_ids = read_ids(argv['<test_ids>'])

    with open(argv['--model'], 'rb') as fp:
        tmp = pickle.load(fp)
    ld = tmp['token']
    mod = BiLSTM(ld.embs, ld.pos, ld.pospeech, ld.chunk, nc=5, nh=2048,
                 de=ld.embs.shape[1])
    mod.__setstate__(tmp['model_params'])

    (pairs_idx, chunk_idx, pos_idx, pos_e1_idx, pos_e2_idx, _, subj_y, pred_y,
     obj_y, idents, e1_ids, e2_ids) = ld.transform(argv['--dataset'], test_ids)

    test_idxs = list(range(len(pairs_idx)))
    all_test_preds = []
    for start, end in zip(
            range(0, len(test_idxs), mini_batch_size),
            range(mini_batch_size, len(test_idxs) + mini_batch_size,
                  mini_batch_size)):
        if len(test_idxs[start:end]) == 0:
            continue
        tpairs = ld.pad_data([pairs_idx[i] for i in test_idxs[start:end]])
        te1 = ld.pad_data([pos_e1_idx[i] for i in test_idxs[start:end]])
        te2 = ld.pad_data([pos_e2_idx[i] for i in test_idxs[start:end]])
        preds = mod.predict_proba(tpairs, te1, te2, np.float32(1.))
        for x in preds:
            all_test_preds.append(1 if x > 0.5 else 0)

    # NOTE: the original snippet scores against an undefined `y`; here we assume
    # the gold labels are the binary predicate labels returned by transform().
    y = pred_y
    test_f1 = f1_score(y, all_test_preds, average='binary')
    print("test_f1: %.4f" % test_f1)
    sys.stdout.flush()
def __init__(self, vocab_size, out_size, crf=True):
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    self.embedding_size = LSTMConfig.embedding_size
    self.hidden_size = LSTMConfig.hidden_size

    self.crf = crf  # whether to stack a CRF (conditional random field) layer on top of the BiLSTM
    if not crf:
        self.model = BiLSTM(vocab_size, self.embedding_size,
                            self.hidden_size, out_size).to(self.device)
        self.cal_loss_func = cal_loss
    else:
        self.model = BiLSTM_CRF(vocab_size, self.embedding_size,
                                self.hidden_size, out_size).to(self.device)
        self.cal_loss_func = cal_lstm_crf_loss

    self.epoches = TrainingConfig.epoches
    self.print_step = TrainingConfig.print_step
    self.lr = TrainingConfig.lr
    self.batch_size = TrainingConfig.batch_size

    self.optimizer = optim.Adam(self.model.parameters(), lr=self.lr)

    self.step = 0
    # best validation loss so far, initialized to a very large value
    self._best_val_loss = 1e18
    self.best_model = None
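# The wrapper above switches between cal_loss (plain BiLSTM) and
# cal_lstm_crf_loss (BiLSTM-CRF). As a point of reference, here is a minimal
# sketch -- not the repository's actual implementation -- of what the plain
# token-level loss could look like: cross-entropy with padded positions masked
# out. PAD_ID is a hypothetical padding tag id.
import torch
import torch.nn.functional as F

PAD_ID = 0  # assumed id of the padding tag

def cal_loss_sketch(logits, targets):
    """logits: [batch, seq_len, out_size]; targets: [batch, seq_len]"""
    mask = (targets != PAD_ID)      # ignore padded positions
    logits = logits[mask]           # [num_real_tokens, out_size]
    targets = targets[mask]         # [num_real_tokens]
    return F.cross_entropy(logits, targets)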
def get_model(config, embedding):
    model = None
    print(config['model_name'])
    try:
        if config['model_name'] == "BiLSTM":
            model = BiLSTM(config, embedding)
    except Exception as e:
        logging.error("load model Exception: %s", e)
        exit()
    return model
def __init__(self, vocab_size, emb_size, hidden_size, out_size):
    """Initialization parameters:
        vocab_size: size of the vocabulary
        emb_size: dimensionality of the word embeddings
        hidden_size: dimensionality of the hidden vectors
        out_size: number of tag types
    """
    super(BiLSTM_CRF, self).__init__()
    self.bilstm = BiLSTM(vocab_size, emb_size, hidden_size, out_size)
    # The CRF essentially just learns one extra transition matrix of shape
    # [out_size, out_size], initialized here to a uniform distribution.
    self.transition = nn.Parameter(
        torch.ones(out_size, out_size) * 1 / out_size)
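# A minimal sketch (assumptions, not the repository's code) of how the learned
# transition matrix is typically combined with the BiLSTM emission scores:
# the CRF score for tagging position t with tag j after tag i is
# transition[i, j] + emissions[b, t, j].
import torch

def crf_scores_sketch(emissions, transition):
    """emissions: [batch, seq_len, out_size]; transition: [out_size, out_size].
    Returns a [batch, seq_len, out_size, out_size] tensor whose entry
    [b, t, i, j] equals transition[i, j] + emissions[b, t, j]."""
    return transition.unsqueeze(0).unsqueeze(0) + emissions.unsqueeze(2)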
def __init__(self, num_outputs, hidden_dim=256, device=torch.device("cuda" if torch.cuda.is_available() else "cpu"), word_embedding_path: str = ROOT_DIR+'/resources/word_embeddings/combined-320.tar/320/', max_seq_len=None): """ :param num_outputs: integer specifying the number of outputs of the model, when unknown in advance, this can be retrieved by using the 'get_num_labels_from_file' method :param hidden_dim: integer specifying the hidden dimension of the Bidirectional model :param device: torch.device specifying the device on which the inputs and the model should be put. By default the model will be put on the GPU if one is available :param word_embedding_path: string specifying the path of the word embedding text and pt files :param max_seq_len: the maximum length to which sentences are clipped, this can be used when some sentence are very long, which can cause memory issues when using larger batch sizes. """ # Load in the vectors when they are not already present in the package if not embeddings_available(): download_word_embeddings_nl() print("--- Constructing the Pytorch embedding matrix file ---") torchtext.vocab.Vectors('combined-320.txt', cache=word_embedding_path) vocab_data = torch.load(word_embedding_path+"combined-320.txt.pt") self.device = device self._words, self._embed_dict, self._embeddings, self._embed_dim = vocab_data self.model = BiLSTM(vocab=torch.zeros(size=(1, 1)), hidden_dim=hidden_dim, output_dim=num_outputs, device=device) self._TEXT = Field(lower=True, tokenize="spacy", tokenizer_language="nl_core_news_sm", include_lengths=True, batch_first=True, fix_length=max_seq_len) self.num_outputs = num_outputs self._criterion = None self._label_names = None self.has_trained = False
from sklearn import metrics

# Read the model configuration
json_config = {}
with open("config_file", 'r', encoding='utf-8') as f:
    json_config = json.load(f)

label_id2name = {'0': 'negative sentiment', '1': 'positive sentiment'}
# print(json_config['word_to_id'])
print(label_id2name)

# Note: tag_to_id is e.g. {'1': 0, '0': 1}; the keys come from the raw text
# data and the values are the mapping built by torchtext.
tag_to_id = json_config['tag_to_id']
id_to_tag = dict([(v, k) for k, v in tag_to_id.items()])
print('tag_to_id=', tag_to_id)  # keys: labels in the raw text data
print('id_to_tag=', id_to_tag)  # keys: ids assigned by torchtext

model = BiLSTM(json_config['vocab_size'], json_config['embedding_dim'],
               json_config['hidden_size'], json_config['num_layers'],
               json_config['pad_idx'], json_config['unk_idx'])
model.load_state_dict(torch.load(json_config['ckpts'], map_location='cpu'))  # runs standalone on CPU
model.eval()

import jieba

# Define a tokenizer
def chi_tokenizer(sentence):
    return [word for word in jieba.cut(sentence)]

def transform_data(record, word_to_id, tag_to_id, batch_size):
    tokens = chi_tokenizer(record['data'])
    res = []
    for token in tokens:
        res.append(word_to_id.get(token, 0))
    PAD_IX = [1] * (batch_size - len(res))
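# The snippet above cuts off inside transform_data, so here is a minimal,
# self-contained sketch (assumptions: pad index 1, unknown index 0, and a model
# whose forward takes a [batch, seq_len] LongTensor plus lengths) of how a
# single sentence could be scored and mapped back to a human-readable label.
import torch

def predict_sentiment_sketch(model, sentence, word_to_id, id_to_tag,
                             label_id2name, max_len=64):
    ids = [word_to_id.get(tok, 0) for tok in chi_tokenizer(sentence)][:max_len]
    lengths = torch.tensor([len(ids)])
    ids = ids + [1] * (max_len - len(ids))          # pad with the assumed pad index
    batch = torch.tensor([ids], dtype=torch.long)   # shape [1, max_len]
    with torch.no_grad():
        logits = model(batch, lengths)              # assumed forward signature
    pred_id = logits.argmax(dim=-1).item()
    return label_id2name[id_to_tag[pred_id]]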
class BiLSTMClassifier: """ This class implements a Bidirectional LSTM classifier based on the version from PyTorch It deals with the various aspects of the training, such as converting the data into the appropriate format and logging the training process via TensorBoard Attributes ---------- device: torch.device torch.device indicating on which device the model and the inputs should be, either on the GPU or the CPU. The default behaviour is to put the model and the inputs on the GPU when available. model: nn.Module The main model used for classification, in this case the Bidirectional LSTM model num_outputs: int Integer specifying the number of outputs of the model. This should be set to the number of unique classes in the dataset. (the 'get_num_labels_from_file' method can be used to retrieve this from the csv file when this is not known) has_trained: bool Boolean specifying whether the model has already been trained. This is used to ensure that the evaluaton or scoring is not accidentally run on an untrained model. _TEXT: torchtext.data.Field torchtext.data.Field instance specifying several parameters of the reading of the data such as whether or not to convert all text to lowercase and the type and language of the tokenizer used. _words: list list with all the words present in the Dutch embedding file _embed_dict: dict dictionary mapping words in the embeddings file to indices into the embedding matrix _embeddings: torch.Tensor torch.Tensor of size [num_words, embedding_dim] containing the word embeddings _criterion nn.optim.Criterion criterion used for the training and evaluation of the model. This is saved in the train methods for later use in the evaluation methods _embed_dim: int Integer specifying the dimension of the embeddings used in the embedding file _label_names: list list containing the names of the unique labels in the dataset, this is used for converting the integer representation used in training back to the original labels for easier interpretation """ def __init__(self, num_outputs, hidden_dim=256, device=torch.device("cuda" if torch.cuda.is_available() else "cpu"), word_embedding_path: str = ROOT_DIR+'/resources/word_embeddings/combined-320.tar/320/', max_seq_len=None): """ :param num_outputs: integer specifying the number of outputs of the model, when unknown in advance, this can be retrieved by using the 'get_num_labels_from_file' method :param hidden_dim: integer specifying the hidden dimension of the Bidirectional model :param device: torch.device specifying the device on which the inputs and the model should be put. By default the model will be put on the GPU if one is available :param word_embedding_path: string specifying the path of the word embedding text and pt files :param max_seq_len: the maximum length to which sentences are clipped, this can be used when some sentence are very long, which can cause memory issues when using larger batch sizes. 
""" # Load in the vectors when they are not already present in the package if not embeddings_available(): download_word_embeddings_nl() print("--- Constructing the Pytorch embedding matrix file ---") torchtext.vocab.Vectors('combined-320.txt', cache=word_embedding_path) vocab_data = torch.load(word_embedding_path+"combined-320.txt.pt") self.device = device self._words, self._embed_dict, self._embeddings, self._embed_dim = vocab_data self.model = BiLSTM(vocab=torch.zeros(size=(1, 1)), hidden_dim=hidden_dim, output_dim=num_outputs, device=device) self._TEXT = Field(lower=True, tokenize="spacy", tokenizer_language="nl_core_news_sm", include_lengths=True, batch_first=True, fix_length=max_seq_len) self.num_outputs = num_outputs self._criterion = None self._label_names = None self.has_trained = False def train_from_file(self, file_name: str, batch_size: int, num_epochs: int, delimiter: str = ",", quotechar: str = '"', text_col_name: str = 'text', label_col_name='label', learning_rate=1.0, logging_dir: str = ROOT_DIR+'/runs/') -> None: """ The main method of this class, implementing a training procedure for the model and handling the proper loading of the dataset :param file_name: string specifying the location and name of the file that contains the training dat :param batch_size: integer specifying the batch size, this will affect the size of the batches fed into the \ model this can be set lower if memory issues occur :param num_epochs: integer specifying the number of epochs for which the model is trained. The right amount of \ epochs can differ for different datasets and it is recommended to inspect the produced TensorBoard logs \ to see if the model has converged :param delimiter: string specifying the delimiter used in the training csv file :param quotechar: string specifying the quotechar used in the training csv file :param text_col_name: string specifying the name of the column containing the mails in the csv file :param label_col_name: string specifying the name of the column containing the labels of the mails in \ the csv file :param learning_rate: float specifying the learning rate of the model, this can affect the speed of \ convergence of the model :param logging_dir: directory to which the Tensorboard logging files are saved """ print("--- Starting with reading in the dataset ---") dataset_loader = CSVDataset(text_field=self._TEXT, file_name=file_name) dataset = dataset_loader.load(delimiter=delimiter, quotechar=quotechar, text_col_name=text_col_name, label_col_name=label_col_name) print("--- Finished with reading in the dataset ---") dloader = CustomDataLoader(dataset) data_iterator = dloader.construct_iterators(batch_size=batch_size, text_col_name=text_col_name, label_col_name=label_col_name) self._TEXT.vocab.set_vectors(self._embed_dict, self._embeddings, self._embed_dim) self.model.set_new_embedding_matrix(self._TEXT.vocab.vectors) self._label_names = dataset.fields[label_col_name].vocab.itos weights = single_task_class_weighting(data_iterator) criterion = nn.CrossEntropyLoss(weight=weights.to(self.device)) self._criterion = criterion optimizer = optim.SGD(self.model.parameters(), lr=learning_rate) scheduler = StepLR(optimizer, step_size=50, gamma=0.9) generic_training(self.model, criterion, optimizer, scheduler, data_iterator, device=self.device, tensorboard_dir=logging_dir, n_epochs=num_epochs, clip_val=0.0) self.has_trained = True return None def classify_from_file(self, file_name, delimiter: str = ",", quotechar: str = '"', text_col_name: str = "text", batch_size: int = 64) 
-> list: """ method used for classifying a set of examples for a file with a trained classifier This method reads in a file, parses it into the correct format and classifies the contents of the file. Throws an error when the model is not trained. :param file_name: string specifying the location and name of the file that contains the training dat :param delimiter: string specifying the delimiter used in the training csv file :param quotechar: string specifying the quotechar used in the training csv file :param text_col_name: string specifying the name of the column containing the mails in the csv file :param batch_size: integer specifying the batch size, this will affect the size of the batches fed into \ the model this can be set lower if memory issues occur :return: returns a list of results, where the result indices from the model have been converted back \ to the original class names from the file """ assert self.has_trained strings = pd.read_csv(file_name, sep=delimiter, quotechar=quotechar)[text_col_name].tolist() if isinstance(strings, str): strings = [strings] if isinstance(strings, list): strings = [[string] for string in strings] fields = [('text', self._TEXT)] list_of_examples = [Example.fromlist(string, fields) for string in strings] dataset = torchtext.data.Dataset(list_of_examples, fields) data = Iterator(dataset, batch_size=batch_size, device=torch.device("cpu"), sort=False, sort_within_batch=False, repeat=False, shuffle=False) predictions = [] for item in data: x = item.text self.model.to(self.device) self.model = self.model.eval() outputs = self.model([x[0].to(self.device), x[1].to(self.device)]) predictions.extend(outputs.detach().cpu().argmax(1).tolist()) results = [self._label_names[i] for i in predictions] return results def classify_from_strings(self, strings: Union[List[str], str]) -> list: """ method that can be used for classifying one or multiple examples with a trained classifier :param strings: a single string or a list of strings representing the pieces of text that should be classified :return: list containing the predictions of the models for the inputted pieces of text """ assert self.has_trained if isinstance(strings, str): strings = [strings] if isinstance(strings, list): strings = [[string] for string in strings] fields = [('text', self._TEXT)] list_of_examples = [Example.fromlist(string, fields) for string in strings] dataset = torchtext.data.Dataset(list_of_examples, fields) data = Iterator(dataset, batch_size=1, device=torch.device("cpu"), sort=False, sort_within_batch=False, repeat=False, shuffle=False) predictions = [] for item in data: x = item.text self.model.to(self.device) self.model = self.model.eval() outputs = self.model([x[0].to(self.device), x[1].to(self.device)]) predictions.extend(outputs.detach().cpu().argmax(1).tolist()) results = [self._label_names[i] for i in predictions] return results def score(self, file_name: str, delimiter: str = ",", quotechar='"', text_col_name: str = 'text', label_col_name: str = 'label', batch_size: int = 64) -> None: """ method that can be used score that model on an unseen test file :param file_name: string specifying the location and name of the file that contains the training dat :param delimiter: string specifying the delimiter used in the training csv file :param quotechar: string specifying the quotechar used in the training csv file :param text_col_name: string specifying the name of the column containing the mails in the csv file :param label_col_name: string specifying the name of the column 
containing the labels of the mails \ in the csv file :param batch_size: integer specifying the batch size, this will affect the size of the batches fed into \ the model this can be set lower if memory issues occur """ assert self.has_trained print("Evaluating model") print("--- Starting with reading in the dataset ---") dataset_loader = CSVDataset(text_field=self._TEXT, file_name=file_name) dataset = dataset_loader.load(delimiter=delimiter, quotechar=quotechar, text_col_name=text_col_name, label_col_name=label_col_name) print("--- Finished with reading in the dataset ---") dloader = CustomDataLoader(dataset) data_iterator = dloader.construct_iterators(batch_size=batch_size, text_col_name=text_col_name, label_col_name=label_col_name, is_test_set=True) generic_evaluation(self.model, data_iterator, self._criterion, device=self.device) return None def save_model(self, filename: str) -> None: """ method that can be used to save a (trained) classifier :param filename: string specifying the location and name of the destination of the saved model """ assert filename.split(".")[-1] == "pt" torch.save(self.model.state_dict(), filename) return None def load_model(self, filename: str) -> None: """ method that can be used to load a classifier saved in the .pt format :param filename: string specifying the name and location of the saved model to be loaded """ assert filename.split(".")[-1] == "pt" self.model.load_state_dict(torch.load(filename)) return None
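# Minimal usage sketch for the BiLSTMClassifier defined above; 'mails.csv',
# 'mails_test.csv' and the column names are hypothetical, and num_outputs must
# match the number of classes present in the training file.
clf = BiLSTMClassifier(num_outputs=2)
clf.train_from_file('mails.csv', batch_size=32, num_epochs=10,
                    text_col_name='text', label_col_name='label')
print(clf.classify_from_strings('Geachte heer, ik heb een vraag over mijn factuur.'))
clf.score('mails_test.csv')               # evaluate on a held-out file
clf.save_model('bilstm_classifier.pt')    # filename must end in .pt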
def create_model(args, num_classes, embedding_vector): nl_str = args.nonlin.lower() if nl_str == 'relu': nonlin = nn.ReLU elif nl_str == 'threshrelu': nonlin = ThresholdReLU elif nl_str == 'sign11': nonlin = partial(Sign11, targetprop_rule=args.tp_rule) elif nl_str == 'qrelu': nonlin = partial(qReLU, targetprop_rule=args.tp_rule, nsteps=3) else: raise NotImplementedError( 'no other non-linearities currently supported') # input size if args.ds == 'sentiment140' or args.ds == 'tsad': input_shape, target_shape = (1, 60, 50), None elif args.ds == 'semeval': input_shape, target_shape = (1, 60, 100), (1, 6, 100) else: raise NotImplementedError('no other datasets currently supported') # create a model with the specified architecture if args.arch == 'cnn': model = CNN(input_shape, num_classes, embedding_vector, nonlin=nonlin) elif args.arch == 'lstm': model = LSTM(input_shape, num_classes, embedding_vector) elif args.arch == 'cnn-lstm': model = CNN_LSTM(input_shape, num_classes, embedding_vector, nonlin=nonlin) elif args.arch == 'lstm-cnn': model = LSTM_CNN(input_shape, num_classes, embedding_vector, nonlin=nonlin) elif args.arch == 'textcnn': model = TextCNN(input_shape, num_classes, embedding_vector, nonlin=nonlin) elif args.arch == 'bilstm': model = BiLSTM(input_shape, target_shape, num_classes, embedding_vector, nonlin=nonlin) else: raise NotImplementedError('other models not yet supported') logging.info("{} model has {} parameters and non-linearity={} ({})".format( args.arch, sum([p.data.nelement() for p in model.parameters()]), nl_str, args.tp_rule.name)) if len(args.gpus) > 1: model = nn.DataParallel(model) if args.cuda: model.cuda() return model
def main(_): tf.logging.set_verbosity(tf.logging.INFO) data_loader = TextLoader(True, FLAGS.train_path, FLAGS.map_file_path, FLAGS.batch_size, FLAGS.seq_length, None, None, None, 'utf8', False) valid_data_loader = TextLoader(False, FLAGS.valid_path, FLAGS.map_file_path, FLAGS.batch_size, FLAGS.seq_length, data_loader.vocab, data_loader.labels, data_loader.std_label_map, 'utf8', False) tf.logging.info("vocab_size: " + str(data_loader.vocab_size)) FLAGS.vocab_size = data_loader.vocab_size tf.logging.info("label_size: " + str(data_loader.label_size)) FLAGS.label_size = data_loader.label_size bilstm = BiLSTM(FLAGS) init = tf.global_variables_initializer() config = tf.ConfigProto(allow_soft_placement=True) config.gpu_options.allow_growth = True with tf.Session(config=config) as sess: sess.run(init) idx = 0 test_best_acc = 0 for epcho in range(FLAGS.num_epcho): # for each epoch data_loader.reset_batch_pointer() for train_batch_num in range( data_loader.num_batches): # for each batch input_x, input_y, x_len, _ = data_loader.next_batch() feed = { bilstm.input_x: input_x, bilstm.input_y: input_y, bilstm.x_len: x_len, bilstm.dropout_keep_prob: FLAGS.dropout_keep_prob } _, global_step_op, train_loss, train_acc = sess.run( [ bilstm.train_step, bilstm.global_step, bilstm.loss, bilstm.acc ], feed_dict=feed) tf.logging.info( "training...........global_step = {}, epoch = {}, current_batch = {}, " "train_loss = {:.4f}, accuracy = {:.4f}".format( global_step_op, epcho, train_batch_num, train_loss, train_acc)) idx += 1 if idx % FLAGS.check_every == 0: all_num = 0 acc_num = 0 valid_data_loader.reset_batch_pointer() write_result = [] for _ in range(valid_data_loader.num_batches): input_x_valid, input_y_valid, x_len_valid, _ = valid_data_loader.next_batch( ) feed = { bilstm.input_x: input_x_valid, bilstm.input_y: input_y_valid, bilstm.x_len: x_len_valid, bilstm.dropout_keep_prob: 1.0 } prediction, arg_index = sess.run( [bilstm.prediction, bilstm.arg_index], feed_dict=feed) all_num = all_num + len(input_y_valid) # write_str = "" for i, indexs in enumerate(arg_index): pre_label_id = indexs[0] real_label_id = input_y_valid[i] if pre_label_id == real_label_id: acc_num = acc_num + 1 # if real_label_id in valid_data_loader.id_2_label: # write_str = valid_data_loader.id_2_label.get(real_label_id) # else: # write_str = "__label__unknown" # for index in indexs: # cur_label = valid_data_loader.id_2_label.get(index) # cur_score = prediction[i][index] # write_str = write_str + " " + cur_label + ":" + str(cur_score) # write_str = write_str + "\n" # write_result.append(write_str) test_acc = acc_num * 1.0 / all_num tf.logging.info( "testing...........global_step = {}, epoch = {}, accuracy = {:.4f}, cur_best_acc = {}" .format(global_step_op, epcho, test_acc, test_best_acc)) if test_best_acc < test_acc: test_best_acc = test_acc # save_model if not os.path.exists(FLAGS.model_path): os.makedirs(FLAGS.model_path) checkpoint_path = os.path.join(FLAGS.model_path, 'lstm_model') bilstm.saver.save(sess, checkpoint_path, global_step=global_step_op) # export model export_path = os.path.join(FLAGS.model_path, 'lstm_tf_serving') if os.path.isdir(export_path): shutil.rmtree(export_path) bilstm.export_model(export_path, sess) # resultfile = open(FLAGS.result_file, 'w', encoding='utf-8') # for pre_sen in write_result: # resultfile.write(pre_sen) tf.logging.info( "has saved model and write.result..................................................................." 
) # resultfile.close() # save label and vocab vocabfile = open(FLAGS.vocab_file, 'w', encoding='utf-8') for key, value in data_loader.vocab.items(): vocabfile.write( str(key) + "\t" + str(value) + '\n') vocabfile.close() labelfile = open(FLAGS.label_file, 'w', encoding='utf-8') for key, value in data_loader.labels.items(): labelfile.write( str(key) + "\t" + str(value) + '\n') labelfile.close()
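# A small companion sketch (not part of the original script) showing how the
# vocab and label files written above, one "key<TAB>value" pair per line, could
# be read back into dictionaries for inference; integer ids are assumed.
def load_tsv_map(path, value_type=int):
    mapping = {}
    with open(path, 'r', encoding='utf-8') as fp:
        for line in fp:
            key, value = line.rstrip('\n').split('\t')
            mapping[key] = value_type(value)
    return mapping

vocab = load_tsv_map(FLAGS.vocab_file)    # token -> id
labels = load_tsv_map(FLAGS.label_file)   # label -> id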
print('vocab_size = ', vocab_size)
print('embedding_dim = ', embedding_dim)
# Print the parameter information above

# Save the parameters
word_to_id = dict(TEXT.vocab.stoi)
tag_to_id = dict(LABEL.vocab.stoi)
param_config = config_model(vocab_size, embedding_dim, pad_idx, unk_idx,
                            word_to_id, tag_to_id)
save_config(param_config, 'config_file')

# Initialize the model
pre_trained_embedding = TEXT.vocab.vectors
model = BiLSTM(vocab_size, embedding_dim, config.hidden_size,
               config.num_layers, pad_idx, unk_idx,
               pre_trained_embedding=pre_trained_embedding)
optimizer = optim.Adam(model.parameters(), lr=args.lr)
loss_func = nn.CrossEntropyLoss()

# Model training: train/validate and keep the best model
print('=>model training ...<=')
best_val_loss = float('inf')
N_EPOCH = args.epoches
for epoch in range(N_EPOCH):
    t1 = time.time()
    # train
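# The snippet is cut off inside the epoch loop; the following is a minimal
# sketch of what one epoch of training, validation, and best-model
# checkpointing could look like. `train_iter` and `valid_iter` are assumed
# torchtext iterators yielding batches with `.text` (tokens, lengths) and
# `.label` fields, and 'best_model.pt' is a hypothetical checkpoint path.
for batch in train_iter:
    model.train()
    optimizer.zero_grad()
    tokens, lengths = batch.text
    logits = model(tokens, lengths)          # assumed forward signature
    loss = loss_func(logits, batch.label)
    loss.backward()
    optimizer.step()

model.eval()
val_loss = 0.0
with torch.no_grad():
    for batch in valid_iter:
        tokens, lengths = batch.text
        val_loss += loss_func(model(tokens, lengths), batch.label).item()
if val_loss < best_val_loss:
    best_val_loss = val_loss
    torch.save(model.state_dict(), 'best_model.pt')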
def main(): parser = argparse.ArgumentParser(description='Train Neural Network.') parser.add_argument('--num_epochs', type=int, default=25, help='Number of updates to make.') parser.add_argument('--num_models', type=int, default=5, help='Number of updates to make.') parser.add_argument('--lstm_hidden_state', type=int, default=128, help='LSTM hidden state size.') parser.add_argument('--word_vectors', default=None, help='Word vecotors filepath.') parser.add_argument('--checkpoint_dir', default='./experiments/exp1/checkpoints/', help='Checkpoint directory.') parser.add_argument('--checkpoint_name', default='checkpoint', help='Checkpoint File Name.') parser.add_argument('--hidden_state', type=int, default=2048, help='hidden layer size.') parser.add_argument('--learn_embeddings', type=bool, default=True, help='Learn Embedding Parameters.') parser.add_argument('--min_df', type=int, default=5, help='Min word count.') parser.add_argument('--lr', type=float, default=0.001, help='Learning Rate.') parser.add_argument('--penalty', type=float, default=0.0, help='Regularization Parameter.') parser.add_argument('--p_penalty', type=float, default=0.0, help='Self-Regularization Parameter.') parser.add_argument('--dropout', type=float, default=0.5, help='Dropout Value.') parser.add_argument('--lstm_dropout', type=float, default=0.5, help='LSTM Dropout Value.') parser.add_argument('--lr_decay', type=float, default=1e-6, help='Learning Rate Decay.') parser.add_argument('--minibatch_size', type=int, default=50, help='Mini-batch Size.') parser.add_argument('--val_minibatch_size', type=int, default=256, help='Val Mini-batch Size.') parser.add_argument('--model_type', help='Neural Net Architecutre.') parser.add_argument('--train_data_X', help='Training Data.') parser.add_argument('--train_data_Y', help='Training Labels.') parser.add_argument('--val_data_X', help='Validation Data.') parser.add_argument('--val_data_Y', help='Validation Labels.') parser.add_argument('--seed', default=42, type=int, help='Random Seed.') parser.add_argument('--grad_clip', type=float, default=None, help='Gradient Clip Value.') parser.add_argument('--cnn_conv_size', nargs='+', type=int, default=[4, 3, 2, 1], help='CNN Covolution Sizes (widths)') parser.add_argument('--num_feat_maps', default=300, type=int, help='Number of CNN Feature Maps.') parser.add_argument('--num_att', default=30, type=int, help='Number of Attention Vectors.') args = parser.parse_args() np.random.seed(args.seed) random.seed(args.seed) # Load & Process Data train_txt, train_Y = load_data_file(args.train_data_X, args.train_data_Y) val_txt, val_Y = load_data_file(args.val_data_X, args.val_data_Y) data_processor = ProcessData(args.word_vectors, lower=True, min_df=args.min_df) X_train = data_processor.fit_transform(train_txt) X_val = data_processor.transform(val_txt) ml_vec = CustomLabelBinarizer() ml_vec.fit(train_Y) Y_train = ml_vec.transform(train_Y) Y_val = ml_vec.transform(val_Y) print("Init Model") sys.stdout.flush() # Init Model if args.model_type == 'bilstm': from models.bilstm import BiLSTM clf = BiLSTM(data_processor.embs, nc=Y_train.shape[1], nh=args.lstm_hidden_state, de=data_processor.embs.shape[1], lr=args.lr, train_emb=args.learn_embeddings, p_lstm_drop=args.lstm_dropout, p_drop=args.dropout, penalty=args.penalty, lr_decay=args.lr_decay, clip=args.grad_clip) elif args.model_type == 'cnn': from models.cnn import CNN clf = CNN(data_processor.embs, nc=Y_train.shape[1], de=data_processor.embs.shape[1], lr=args.lr, p_drop=args.dropout, decay=args.lr_decay, 
clip=args.grad_clip, fs=args.cnn_conv_size, penalty=args.penalty, train_emb=args.learn_embeddings) print( "CNN: hidden_state: %d word_vec_size: %d lr: %.5f decay: %.6f learn_emb: %s dropout: %.3f num_feat_maps: %d penalty: %.5f conv_widths: %s" % (args.hidden_state, data_processor.embs.shape[1], args.lr, args.lr_decay, args.learn_embeddings, args.dropout, args.num_feat_maps, args.penalty, args.cnn_conv_size)) elif args.model_type == 'att_cnn': from models.att_cnn import CNN clf = CNN(data_processor.embs, nc=Y_train.shape[1], de=data_processor.embs.shape[1], lr=args.lr, p_drop=args.dropout, decay=args.lr_decay, clip=args.grad_clip, fs=args.cnn_conv_size, penalty=args.penalty, train_emb=args.learn_embeddings) print( "ATT_CNN: hidden_state: %d word_vec_size: %d lr: %.5f decay: %.6f learn_emb: %s dropout: %.3f num_feat_maps: %d penalty: %.5f conv_widths: %s" % (args.hidden_state, data_processor.embs.shape[1], args.lr, args.lr_decay, args.learn_embeddings, args.dropout, args.num_feat_maps, args.penalty, args.cnn_conv_size)) elif args.model_type == 'cnn_att_word': from models.cnn_att_word_reg import CNN clf = CNN(data_processor.embs, nc=Y_train.shape[1], de=data_processor.embs.shape[1], lr=args.lr, p_drop=args.dropout, decay=args.lr_decay, clip=args.grad_clip, fs=args.cnn_conv_size, penalty=args.penalty, train_emb=args.learn_embeddings) print( "ATT_CNN: hidden_state: %d word_vec_size: %d lr: %.5f decay: %.6f learn_emb: %s dropout: %.3f num_feat_maps: %d penalty: %.5f conv_widths: %s" % (args.hidden_state, data_processor.embs.shape[1], args.lr, args.lr_decay, args.learn_embeddings, args.dropout, args.num_feat_maps, args.penalty, args.cnn_conv_size)) elif args.model_type == 'bow': from models.bow import BoW clf = BoW(data_processor.embs, nc=Y_train.shape[1], nh=args.hidden_state, de=data_processor.embs.shape[1], lr=args.lr, decay=args.lr_decay, clip=args.grad_clip, train_emb=args.learn_embeddings, penalty=args.penalty, p_drop=args.dropout) print( "BoW: hidden_state: %d word_vec_size: %d lr: %.5f decay: %.6f learn_emb: %s dropout: %.3f penalty: %.5f" % (args.hidden_state, data_processor.embs.shape[1], args.lr, args.lr_decay, args.learn_embeddings, args.dropout, args.penalty)) elif args.model_type == 'att': from models.att import AttBoW clf = AttBoW(data_processor.embs, nc=Y_train.shape[1], nh=args.hidden_state, de=data_processor.embs.shape[1], lr=args.lr, decay=args.lr_decay, clip=args.grad_clip, train_emb=args.learn_embeddings, penalty=args.penalty, p_drop=args.dropout, na=args.num_att, penalty_p=args.p_penalty) print( "AttBoW: hidden_state: %d word_vec_size: %d lr: %.5f decay: %.6f learn_emb: %s dropout: %.3f num_att: %d penalty: %.5f penalty_p: %.5f" % (args.hidden_state, data_processor.embs.shape[1], args.lr, args.lr_decay, args.learn_embeddings, args.dropout, args.num_att, args.penalty, args.p_penalty)) else: raise ValueError('Incorrect Model Specified') print("Training Model") sys.stdout.flush() train_idxs = list(range(len(X_train))) val_idxs = list(range(len(X_val))) # Train Model best_val_f1 = 0 for epoch in range(1, args.num_epochs + 1): mean_loss = [] mean_f1 = [] random.shuffle(train_idxs) epoch_t0 = time() for start, end in zip( range(0, len(train_idxs), args.minibatch_size), range(args.minibatch_size, len(train_idxs) + args.minibatch_size, args.minibatch_size)): if len(train_idxs[start:end]) == 0: continue mini_batch_sample = data_processor.pad_data( [X_train[i] for i in train_idxs[start:end]]) cost, preds = clf.train_batch( mini_batch_sample, 
Y_train[train_idxs[start:end]].astype('int32'), np.float32(0.)) f1 = f1_score(Y_train[train_idxs[start:end]].argmax(axis=1), preds, average='macro', labels=[0, 1]) mean_f1.append(f1) mean_loss.append(cost) sys.stdout.write( "Epoch: %d train_avg_loss: %.4f train_avg_f1: %.4f\r" % (epoch, np.mean(mean_loss), np.mean(mean_f1))) sys.stdout.flush() # Validate Model final_preds = [] val_loss = [] for start, end in zip( range(0, len(val_idxs), args.val_minibatch_size), range(args.val_minibatch_size, len(train_idxs) + args.val_minibatch_size, args.val_minibatch_size)): if len(train_idxs[start:end]) == 0: continue mini_batch_sample = data_processor.pad_data( [X_val[i] for i in val_idxs[start:end]]) preds, cost = clf.predict_loss(mini_batch_sample, Y_val[val_idxs[start:end]], np.float32(1.)) final_preds += list(preds.flatten()) val_loss.append(cost) f1 = f1_score(Y_val.argmax(axis=1), final_preds, average='macro', labels=[0, 1]) sys.stdout.write( "epoch: %d val_loss %.4f val_f1: %.4f train_avg_loss: %.4f train_avg_f1: %.4f time: %.1f\n" % (epoch, np.mean(val_loss), f1, np.mean(mean_loss), np.mean(mean_f1), time() - epoch_t0)) sys.stdout.flush() # Checkpoint Model if f1 > best_val_f1: best_val_f1 = f1 with open( os.path.abspath(args.checkpoint_dir) + '/' + args.checkpoint_name + '.pkl', 'wb') as out_file: pickle.dump( { 'model_params': clf.__getstate__(), 'token': data_processor, 'ml_bin': ml_vec, 'args': args, 'last_train_avg_loss': np.mean(mean_loss), 'last_train_avg_f1': np.mean(mean_f1), 'val_f1': f1 }, out_file, pickle.HIGHEST_PROTOCOL)
def main(): parser = argparse.ArgumentParser(description='Test Neural Network.') parser.add_argument('--checkpoint_model', help='Checkpoint Model.') parser.add_argument('--data_X', help='Test/Validation Data.') parser.add_argument('--data_Y', help='Test/Validation Labels.') parser.add_argument('--scoring', default='macro', help='Evaluation Measure.') parser.add_argument('--minibatch_size', type=int, default=256, help='Mini-batch Size.') parser.add_argument('--name_count', default='cnn_1', help='count which run') args = parser.parse_args() if args.scoring not in ['binary', 'micro', 'macro', 'prf']: raise ValueError('Incorrect Evaluation Measure Specified') # Load Checkpoint Model with open(args.checkpoint_model, 'rb') as out_file: chk_pt = pickle.load(out_file) # Load & Process Data test_txt, test_Y = load_data_file(args.data_X, args.data_Y) X = chk_pt['token'].transform(test_txt) Y = chk_pt['ml_bin'].transform(test_Y) data_processor = chk_pt['token'] print("Init Model") # Init Model if chk_pt['args'].model_type == 'bilstm': from models.bilstm import BiLSTM clf = BiLSTM(data_processor.embs, nc=Y.shape[1], nh=chk_pt['args'].lstm_hidden_state, de=data_processor.embs.shape[1], lr=chk_pt['args'].lr, train_emb=chk_pt['args'].learn_embeddings, p_lstm_drop=chk_pt['args'].lstm_dropout, p_drop=chk_pt['args'].dropout, penalty=chk_pt['args'].penalty, lr_decay=chk_pt['args'].lr_decay, clip=chk_pt['args'].grad_clip) clf.__setstate__(chk_pt['model_params']) elif chk_pt['args'].model_type == 'cnn': from models.cnn import CNN clf = CNN(data_processor.embs, nc=Y.shape[1], de=data_processor.embs.shape[1], lr=chk_pt['args'].lr, p_drop=chk_pt['args'].dropout, decay=chk_pt['args'].lr_decay, clip=chk_pt['args'].grad_clip, fs=chk_pt['args'].cnn_conv_size, penalty=chk_pt['args'].penalty, train_emb=chk_pt['args'].learn_embeddings) clf.__setstate__(chk_pt['model_params']) print( "CNN: hidden_state: %d word_vec_size: %d lr: %.5f decay: %.6f learn_emb: %s dropout: %.3f num_feat_maps: %d penalty: %.5f conv_widths: %s" % (chk_pt['args'].hidden_state, data_processor.embs.shape[1], chk_pt['args'].lr, chk_pt['args'].lr_decay, chk_pt['args'].learn_embeddings, chk_pt['args'].dropout, chk_pt['args'].num_feat_maps, chk_pt['args'].penalty, chk_pt['args'].cnn_conv_size)) elif chk_pt['args'].model_type == 'cnn_att_word': from models.cnn_att_word_reg import CNN clf = CNN(data_processor.embs, nc=Y.shape[1], de=data_processor.embs.shape[1], lr=chk_pt['args'].lr, p_drop=chk_pt['args'].dropout, decay=chk_pt['args'].lr_decay, clip=chk_pt['args'].grad_clip, fs=chk_pt['args'].cnn_conv_size, penalty=chk_pt['args'].penalty, train_emb=chk_pt['args'].learn_embeddings) clf.__setstate__(chk_pt['model_params']) print( "CNN: hidden_state: %d word_vec_size: %d lr: %.5f decay: %.6f learn_emb: %s dropout: %.3f num_feat_maps: %d penalty: %.5f conv_widths: %s" % (chk_pt['args'].hidden_state, data_processor.embs.shape[1], chk_pt['args'].lr, chk_pt['args'].lr_decay, chk_pt['args'].learn_embeddings, chk_pt['args'].dropout, chk_pt['args'].num_feat_maps, chk_pt['args'].penalty, chk_pt['args'].cnn_conv_size)) elif chk_pt['args'].model_type == 'bow': from models.bow import BoW clf = BoW(data_processor.embs, nc=Y.shape[1], nh=chk_pt['args'].hidden_state, de=data_processor.embs.shape[1], lr=chk_pt['args'].lr, decay=chk_pt['args'].lr_decay, clip=chk_pt['args'].grad_clip, train_emb=chk_pt['args'].learn_embeddings, penalty=chk_pt['args'].penalty, p_drop=chk_pt['args'].dropout) clf.__setstate__(chk_pt['model_params']) print( "BoW: hidden_state: %d word_vec_size: 
%d lr: %.5f decay: %.6f learn_emb: %s dropout: %.3f penalty: %.5f" % (chk_pt['args'].hidden_state, data_processor.embs.shape[1], chk_pt['args'].lr, chk_pt['args'].lr_decay, chk_pt['args'].learn_embeddings, chk_pt['args'].dropout, chk_pt['args'].penalty)) elif chk_pt['args'].model_type == 'att': from models.att import AttBoW clf = AttBoW(data_processor.embs, nc=Y.shape[1], nh=chk_pt['args'].hidden_state, de=data_processor.embs.shape[1], lr=chk_pt['args'].lr, decay=chk_pt['args'].lr_decay, clip=chk_pt['args'].grad_clip, train_emb=chk_pt['args'].learn_embeddings, penalty=chk_pt['args'].penalty, p_drop=chk_pt['args'].dropout, na=chk_pt['args'].num_att, penalty_p=chk_pt['args'].p_penalty) clf.__setstate__(chk_pt['model_params']) print( "AttBoW: hidden_state: %d word_vec_size: %d lr: %.5f decay: %.6f learn_emb: %s dropout: %.3f num_att: %d penalty: %.5f penalty_p: %.5f" % (chk_pt['args'].hidden_state, data_processor.embs.shape[1], chk_pt['args'].lr, chk_pt['args'].lr_decay, chk_pt['args'].learn_embeddings, chk_pt['args'].dropout, chk_pt['args'].num_att, chk_pt['args'].penalty, chk_pt['args'].p_penalty)) else: raise ValueError('Incorrect Model Specified') # Get Predictions idxs = list(range(len(X))) all_preds = [] all_proba = np.array([]) for start, end in zip( range(0, len(idxs), args.minibatch_size), range(args.minibatch_size, len(idxs) + args.minibatch_size, args.minibatch_size)): if len(idxs[start:end]) == 0: continue mini_batch_sample = data_processor.pad_data( [X[i] for i in idxs[start:end]]) preds = clf.predict(mini_batch_sample, np.float32(1.)) proba = clf.predict_proba(mini_batch_sample, np.float32(1.)) if len(all_proba) < 1: all_proba = proba else: all_proba = np.concatenate((all_proba, proba)) all_preds += list(preds.flatten()) # Evaluate prf1 = None filename = '/home/sehan2/Amia/task2/nn_model/probas_full1/' + str( args.name_count) + '.npy' all_proba.dump(filename) '''
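# The evaluation block is cut off above (the trailing ''' opens an omitted
# section); a minimal sketch of how the collected predictions could be scored
# with scikit-learn for the supported --scoring options, assuming Y is the
# one-hot label matrix produced by ml_bin.
from sklearn.metrics import f1_score, precision_recall_fscore_support

y_true = Y.argmax(axis=1)
if args.scoring == 'prf':
    prf1 = precision_recall_fscore_support(y_true, all_preds, average='binary')
else:
    prf1 = f1_score(y_true, all_preds, average=args.scoring)
print(prf1)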
def run_train(): ''' This method creates the TensorFlow graph and session, running the training loop :return: ''' # load preprocessed token, label, shape, char maps labels_str_id_map, labels_id_str_map, vocab_str_id_map, vocab_id_str_map, \ shape_str_id_map, shape_id_str_map, char_str_id_map, char_id_str_map = load_intmaps(FLAGS.train_dir) # create intmaps for label types and bio (used later for evaluation, calculating F1 scores, etc.) # TODO right now these aren't used type_int_int_map, bilou_int_int_map, type_set, bilou_set = create_type_maps( labels_str_id_map) # load the embeddings embeddings = load_embeddings(vocab_str_id_map) labels_size = len(labels_str_id_map) char_domain_size = len(char_id_str_map) vocab_size = len(vocab_str_id_map) shape_domain_size = len(shape_id_str_map) # create TF graph with tf.Graph().as_default(): # create batchers train_batcher = Batcher( FLAGS.train_dir, FLAGS.batch_size) if FLAGS.memmap_train else SeqBatcher( FLAGS.train_dir, FLAGS.batch_size) dev_batcher = SeqBatcher(FLAGS.dev_dir, FLAGS.batch_size, num_buckets=0, num_epochs=1) train_eval_batcher = SeqBatcher(FLAGS.train_dir, FLAGS.batch_size, num_buckets=0, num_epochs=1) # create character embedding model if FLAGS.char_dim > 0 and FLAGS.char_model == "lstm": print("creating and training character embeddings") char_embedding_model = BiLSTMChar(char_domain_size, FLAGS.char_dim, int(FLAGS.char_tok_dim / 2)) # elif FLAGS.char_dim > 0 and FLAGS.char_model == "cnn": # char_embedding_model = CNNChar(char_domain_size, FLAGS.char_dim, FLAGS.char_tok_dim, layers_map[0][1]['width']) else: char_embedding_model = None char_embeddings = char_embedding_model.outputs if char_embedding_model is not None else None # create BiLSTM model if FLAGS.model == 'bilstm': model = BiLSTM( num_classes=labels_size, vocab_size=vocab_size, shape_domain_size=shape_domain_size, char_domain_size=char_domain_size, char_size=FLAGS.char_dim, embedding_size=FLAGS.embed_dim, shape_size=FLAGS.shape_dim, lex_size=FLAGS.lex_dim, nonlinearity=FLAGS.nonlinearity, viterbi=False, #viterbi=FLAGS.viterbi, hidden_dim=FLAGS.lstm_dim, char_embeddings=char_embeddings, embeddings=embeddings, use_geometric_feats=FLAGS.use_geometric_feats, use_lexicons=FLAGS.use_lexicons) # elif FLAGS.model == 'lstm': # model = LSTM( # num_classes=labels_size, # vocab_size=vocab_size, # shape_domain_size=shape_domain_size, # char_domain_size=char_domain_size, # char_size=FLAGS.char_dim, # embedding_size=FLAGS.embed_dim, # shape_size=FLAGS.shape_dim, # nonlinearity=FLAGS.nonlinearity, # viterbi=False, # viterbi=FLAGS.viterbi, # hidden_dim=FLAGS.lstm_dim, # char_embeddings=char_embeddings, # embeddings=embeddings, # use_geometric_feats=FLAGS.use_geometric_feats, # use_lexicons=FLAGS.use_lexicons) # Define Training procedure global_step = tf.Variable(0, name='global_step', trainable=False) optimizer = tf.train.AdamOptimizer(learning_rate=FLAGS.lr, beta1=FLAGS.beta1, beta2=FLAGS.beta2, epsilon=FLAGS.epsilon, name="optimizer") model_vars = [ v for v in tf.all_variables() if 'context_agg' not in v.name ] train_op = optimizer.minimize(model.loss, global_step=global_step, var_list=model_vars) print("model vars: %d" % len(model_vars)) print(map(lambda v: v.name, model_vars)) print() sys.stdout.flush() get_trainable_params() tf.initialize_all_variables() frontend_opt_vars = [ optimizer.get_slot(s, n) for n in optimizer.get_slot_names() for s in model_vars if optimizer.get_slot(s, n) is not None ] model_vars += frontend_opt_vars # load pretrained model if one is provided if 
FLAGS.load_dir: reader = tf.train.NewCheckpointReader(FLAGS.load_dir + ".tf") saved_var_map = reader.get_variable_to_shape_map() intersect_vars = [ k for k in tf.all_variables() if k.name.split(':')[0] in saved_var_map and k.get_shape() == saved_var_map[k.name.split(':')[0]] ] leftovers = [ k for k in tf.all_variables() if k.name.split(':')[0] not in saved_var_map or k.get_shape() != saved_var_map[k.name.split(':')[0]] ] print("WARNING: Loading pretrained frontend, but not loading: ", map(lambda v: v.name, leftovers)) frontend_loader = tf.train.Saver(var_list=intersect_vars) else: frontend_loader = tf.train.Saver(var_list=model_vars) frontend_saver = tf.train.Saver(var_list=model_vars) # create a supervisor sv = tf.python.train.Supervisor( logdir=FLAGS.model_dir if FLAGS.model_dir != '' else None, global_step=global_step, saver=None, save_model_secs=0, save_summaries_secs=0) training_start_time = time.time() # create session with sv.managed_session( FLAGS.master, config=tf.ConfigProto(allow_soft_placement=True)) as sess: print("session created") sys.stdout.flush() # start queue runner threads threads = tf.train.start_queue_runners(sess=sess) # load model if applicable if FLAGS.load_dir != '': print("Deserializing model: " + FLAGS.load_dir + ".tf") frontend_loader.restore(sess, FLAGS.load_dir + ".tf") # load batches print() dev_batches, train_batches, num_dev_examples, num_train_examples \ = load_batches(sess, train_batcher, train_eval_batcher, dev_batcher) # just run the evaluation if applicable if FLAGS.evaluate_only: if FLAGS.train_eval: w_f1, accuracy, preds, labels = evaluation.run_evaluation( sess, model, char_embedding_model, train_batches, labels_str_id_map, labels_id_str_map, "TRAIN") print() w_f1, accuracy, preds, labels = evaluation.run_evaluation( sess, model, char_embedding_model, dev_batches, labels_str_id_map, labels_id_str_map, "TEST", True, vocab_str_id_map, vocab_id_str_map) # write test set predictions to disk (for furthr analysis) print("writing predictions to disk:") # with open(FLAGS.model_dir + os.sep + 'test_preds.txt', 'w') as f: # for pred in preds: # f.write(pred + "\n") # with open(FLAGS.model_dir + os.sep + 'test_golds.txt', 'w') as f: # for label in labels: # f.write(label + "\n") np.save(FLAGS.model_dir + os.sep + "test_preds.npy", preds) np.save(FLAGS.model_dir + os.sep + "test_labels.npy", labels) # train a model else: best_score = 0 total_iterations = 0 # always train the front-end unless load dir was passed if FLAGS.load_dir == '' or (FLAGS.load_dir != '' and FLAGS.layers2 == ''): best_score, training_iteration, train_speed = train( sess, sv, model, char_embedding_model, train_batches, dev_batches, num_train_examples, num_dev_examples, train_batcher, labels_str_id_map, labels_id_str_map, train_op, frontend_saver, vocab_str_id_map) total_iterations += training_iteration if FLAGS.model_dir: print("Deserializing model: " + FLAGS.model_dir + "-frontend.tf") frontend_saver.restore( sess, FLAGS.model_dir + "-frontend.tf") sv.coord.request_stop() sv.coord.join(threads) sess.close() total_time = time.time() - training_start_time if FLAGS.evaluate_only: print("Testing time: %d seconds" % (total_time)) else: print( "Training time: %d minutes, %d iterations (%3.2f minutes/iteration)" % (total_time / 60, total_iterations, total_time / (60 * total_iterations))) print("Avg training speed: %f examples/second" % (train_speed)) print("Best dev F1: %2.2f" % (best_score * 100))
print("--------Training---------") print("Encoder Input Shape - {}".format(encoder_in.shape)) print("Decoder Input Shape - {}".format(decoder_in.shape)) print("Decoder Output Shape- {}".format(decoder_out.shape)) print("-------Validation--------") print("Encoder Input Shape - {}".format(val_encoder_in.shape)) print("Decoder Input Shape - {}".format(val_decoder_in.shape)) print("Decoder Output Shape- {}".format(val_decoder_out.shape)) train = batchgen(encoder_in, decoder_in, decoder_out, batchsize) val = batchgen(val_encoder_in, val_decoder_in, val_decoder_out, batchsize) if model_type == 'bilstm': modelname = 'BiLSTM' from models.bilstm import BiLSTM model = BiLSTM(comwords, refwords, seqlen, targetlen) model, encoder, decoder = model.create_model() print(model.summary()) elif model_type == 'bilstm-f': modelname = 'BiLSTMF' from models.bilstm_f import BiLSTM_F model = BiLSTM_F(comwords, len(srctok.word_counts) + 1, len(reftok.word_counts) + 1, seqlen, srclen, targetlen) model, encoder, decoder = model.create_model() print(model.summary()) elif model_type == 'bilstm-csatt': modelname = 'BiLSTMCSATT' from models.bilstm_csatt import BiLSTM_CSAtt model = BiLSTM_CSAtt( len(comtok.word_counts) + 1,
def main(argv): argv = docopt.docopt(__doc__) num_epochs = argv['--num_epochs'] mini_batch_size = argv['--mini_batch_size'] val_mini_batch_size = 64 num_classes = argv['--num_classes'] lstm_hidden_state_size = argv['--lstm_hidden_state'] random_seed = argv['--random_seed'] np.random.seed(random_seed) random.seed(random_seed) def read_ids(filename): ids = [] with open(filename, 'r') as fp: for row in fp: ids.append(row.strip()) return ids train_ids = read_ids(argv['--train_ids']) val_ids = read_ids(argv['--dev_ids']) test_ids = read_ids(argv['--test_ids']) ld = LoadData(argv['--word2vec']) train_pairs, train_e1, train_e2, train_y, train_ids, _, _ = ld.fit_transform( argv['--dataset'], train_ids) dev_pairs, dev_e1, dev_e2, dev_y, val_ids, dev_e1_ids, dev_e2_ids = ld.transform( argv['--dataset'], val_ids) test_pairs, test_e1, test_e2, test_y, test_ids, e1_ids, e2_ids = ld.transform( argv['--dataset'], test_ids) idxs = list(range(len(train_pairs))) dev_idxs = list(range(len(dev_pairs))) test_idxs = list(range(len(test_pairs))) last_loss = None avg_loss = [] avg_f1 = [] check_preds = None mod = BiLSTM(ld.embs, ld.pos, nc=num_classes, nh=lstm_hidden_state_size, de=ld.embs.shape[1]) best_dev_f1 = 0 for epoch in range(1, num_epochs + 1): mean_loss = [] random.shuffle(idxs) for start, end in zip( range(0, len(idxs), mini_batch_size), range(mini_batch_size, len(idxs) + mini_batch_size, mini_batch_size)): idxs_sample = idxs[start:end] if len(idxs_sample) < mini_batch_size: continue batch_labels = np.array(train_y[idxs_sample], dtype='int32') tpairs = ld.pad_data([train_pairs[i] for i in idxs_sample]) te1 = ld.pad_data([train_e1[i] for i in idxs_sample]) te2 = ld.pad_data([train_e2[i] for i in idxs_sample]) cost = mod.train_batch(tpairs, te1, te2, train_y[idxs_sample].astype('int32'), np.float32(0.), np.array(negs).astype('int32')) mean_loss.append(cost) print("EPOCH: %d loss: %.4f train_loss: %.4f" % (epoch, cost, np.mean(mean_loss))) sys.stdout.flush() all_dev_preds = [] scores = [] for start, end in zip( range(0, len(dev_idxs), val_mini_batch_size), range(val_mini_batch_size, len(dev_idxs) + val_mini_batch_size, val_mini_batch_size)): if len(dev_idxs[start:end]) == 0: continue vpairs = ld.pad_data([dev_pairs[i] for i in dev_idxs[start:end]]) ve1 = ld.pad_data([dev_e1[i] for i in dev_idxs[start:end]]) ve2 = ld.pad_data([dev_e2[i] for i in dev_idxs[start:end]]) preds = mod.predict_proba(vpairs, ve1, ve2, np.float32(1.)) for x in preds: if x > 0.5: all_dev_preds.append(1) else: all_dev_preds.append(0) dev_f1 = f1_score(dev_y, all_dev_preds, average='binary') print("EPOCH: %d train_loss: %.4f dev_f1: %.4f" % (epoch, np.mean(mean_loss), dev_f1)) sys.stdout.flush() if dev_f1 > best_dev_f1: with open(argv['--model'], 'w') as fp: pickle.dump({ 'model_params': mod.__getstate__(), 'token': ld }, fp, pickle.HIGHEST_PROTOCOL) best_dev_f1 = dev_f1 all_test_preds = [] scores = [] for start, end in zip( range(0, len(test_idxs), val_mini_batch_size), range(val_mini_batch_size, len(test_idxs) + val_mini_batch_size, val_mini_batch_size)): if len(test_idxs[start:end]) == 0: continue tpairs = ld.pad_data( [test_pairs[i] for i in test_idxs[start:end]]) te1 = ld.pad_data([test_e1[i] for i in test_idxs[start:end]]) te2 = ld.pad_data([test_e2[i] for i in test_idxs[start:end]]) preds = mod.predict_proba(tpairs, te1, te2, np.float32(1.)) for x in preds: if x > 0.5: all_test_preds.append(1) else: all_test_preds.append(0) test_f1 = f1_score(test_y, all_test_preds, average='binary') print("EPOCH: %d train_loss: %.4f dev_f1: 
%.4f test_f1: %.4f" % (epoch, np.mean(mean_loss), dev_f1, test_f1)) sys.stdout.flush()
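# Both the training and test scripts above threshold predict_proba outputs one
# value at a time; this is a small vectorized equivalent (a sketch, assuming
# `preds` is a 1-D array of probabilities for the positive class).
import numpy as np

def binarize(preds, threshold=0.5):
    return (np.asarray(preds) > threshold).astype(int).tolist()

# e.g. all_test_preds.extend(binarize(preds))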