def main(args):
    print('Loading dataset...')
    x_train, y_train = load_data_and_labels(args.train_data)
    x_valid, y_valid = load_data_and_labels(args.valid_data)
    x_test, y_test = load_data_and_labels(args.test_data)
    x_train = np.r_[x_train, x_valid]
    y_train = np.r_[y_train, y_valid]

    print('Transforming datasets...')
    p = ELMoTransformer()
    p.fit(x_train, y_train)

    print('Loading word embeddings...')
    embeddings = load_glove(EMBEDDING_PATH)
    embeddings = filter_embeddings(embeddings, p._word_vocab.vocab, 100)

    print('Building a model.')
    model = ELModel(char_embedding_dim=args.char_emb_size,
                    word_embedding_dim=args.word_emb_size,
                    char_lstm_size=args.char_lstm_units,
                    word_lstm_size=args.word_lstm_units,
                    char_vocab_size=p.char_vocab_size,
                    word_vocab_size=p.word_vocab_size,
                    num_labels=p.label_size,
                    embeddings=embeddings,
                    dropout=args.dropout)
    model, loss = model.build()
    model.compile(loss=loss, optimizer='adam')

    print('Training the model...')
    trainer = Trainer(model, preprocessor=p)
    trainer.train(x_train, y_train, x_test, y_test)

    print('Saving the model...')
    model.save(args.weights_file, args.params_file)
def main(args):
    print('Loading datasets...')
    X, y = load_data_and_labels(args.data_path)
    x_train, x_valid, y_train, y_valid = train_test_split(X, y,
                                                          test_size=0.1,
                                                          random_state=42)
    embeddings = KeyedVectors.load(args.embedding_path).wv

    print('Transforming datasets...')
    p = IndexTransformer()
    p.fit(X, y)
    embeddings = filter_embeddings(embeddings, p._word_vocab,
                                   embeddings.vector_size)

    print('Building a model...')
    model = BiLSTMCRF(char_vocab_size=p.char_vocab_size,
                      word_vocab_size=p.word_vocab_size,
                      num_labels=p.label_size,
                      embeddings=embeddings,
                      char_embedding_dim=50)
    model.build()

    print('Training the model...')
    trainer = Trainer(model, preprocessor=p)
    trainer.train(x_train, y_train, x_valid, y_valid)

    print('Saving the model...')
    model.save(args.weights_file, args.params_file)
    p.save(args.preprocessor_file)
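# A minimal sketch (not part of the original snippets) of restoring the artifacts
# saved by the script above and tagging one sentence. The file paths are
# hypothetical, and the load_model / IndexTransformer.load / Tagger calls are
# assumed to follow the same anago-style API used throughout these examples.
from anago.models import load_model
from anago.preprocessing import IndexTransformer
from anago.tagger import Tagger

model = load_model('weights.h5', 'params.json')
p = IndexTransformer.load('preprocessor.pkl')
tagger = Tagger(model, preprocessor=p)
print(tagger.predict('President Obama visited Berlin'))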
def train(self, x_train, y_train, x_valid=None, y_valid=None,
          vocab_init=None, verbose=1):
    self.p = prepare_preprocessor(x_train, y_train, vocab_init=vocab_init)
    embeddings = filter_embeddings(self.embeddings, self.p.vocab_word,
                                   self.model_config.word_embedding_size)
    self.model_config.vocab_size = len(self.p.vocab_word)
    self.model_config.char_vocab_size = len(self.p.vocab_char)
    self.model = SeqLabeling(self.model_config, embeddings,
                             len(self.p.vocab_tag))

    if not os.path.exists(self.log_dir):
        os.mkdir(self.log_dir)
        print('Successfully made a directory: {}'.format(self.log_dir))

    self.p.save(os.path.join(self.log_dir, self.preprocessor_file))
    self.model_config.save(os.path.join(self.log_dir, self.config_file))
    print('Successfully saved config and preprocessor files')

    trainer = Trainer(self.model, self.training_config,
                      checkpoint_path=self.log_dir,
                      preprocessor=self.p)
    return trainer.train(x_train, y_train, x_valid, y_valid, verbose)
def main(args):
    print('Loading dataset...')
    x_train, y_train = load_data_and_labels(args.train_data)
    x_valid, y_valid = load_data_and_labels(args.valid_data)

    print('Transforming datasets...')
    p = IndexTransformer(use_char=args.no_char_feature)
    p.fit(x_train, y_train)

    print('Building a model.')
    model = BiLSTMCRF(char_embedding_dim=args.char_emb_size,
                      word_embedding_dim=args.word_emb_size,
                      char_lstm_size=args.char_lstm_units,
                      word_lstm_size=args.word_lstm_units,
                      char_vocab_size=p.char_vocab_size,
                      word_vocab_size=p.word_vocab_size,
                      num_labels=p.label_size,
                      dropout=args.dropout,
                      use_char=args.no_char_feature,
                      use_crf=args.no_use_crf)
    model, loss = model.build()
    model.compile(loss=loss, optimizer='adam')

    print('Training the model...')
    trainer = Trainer(model, preprocessor=p)
    trainer.train(x_train, y_train, x_valid, y_valid)

    print('Saving the model...')
    model.save(args.weights_file, args.params_file)
    p.save(args.preprocessor_file)
def test_save(self):
    # Train the model.
    trainer = Trainer(self.model, preprocessor=self.p)
    trainer.train(self.x_train, self.y_train)

    # Save the model.
    save_model(self.model, self.weights_file, self.params_file)
    self.p.save(self.preprocessor_file)
def test_train_no_crf(self):
    model = BiLSTMCRF(char_vocab_size=self.p.char_vocab_size,
                      word_vocab_size=self.p.word_vocab_size,
                      num_labels=self.p.label_size,
                      use_crf=False)
    model, loss = model.build()
    model.compile(loss=loss, optimizer='adam')
    trainer = Trainer(model, preprocessor=self.p)
    trainer.train(self.x_train, self.y_train,
                  x_valid=self.x_valid, y_valid=self.y_valid)
def test_train_no_character(self):
    p = IndexTransformer(use_char=False)
    p.fit(self.x_train, self.y_train)
    model = BiLSTMCRF(word_vocab_size=p.word_vocab_size,
                      num_labels=p.label_size,
                      use_crf=False,
                      use_char=False)
    model, loss = model.build()
    model.compile(loss=loss, optimizer='adam')
    trainer = Trainer(model, preprocessor=p)
    trainer.train(self.x_train, self.y_train,
                  x_valid=self.x_valid, y_valid=self.y_valid)
def fit(self, x_train, y_train, x_valid=None, y_valid=None,
        epochs=1, batch_size=32, verbose=1, callbacks=None, shuffle=True):
    """Fit the model for a fixed number of epochs.

    Args:
        x_train: list of training data.
        y_train: list of training target (label) data.
        x_valid: list of validation data.
        y_valid: list of validation target (label) data.
        batch_size: Integer. Number of samples per gradient update.
            If unspecified, `batch_size` will default to 32.
        epochs: Integer. Number of epochs to train the model.
        verbose: Integer. 0, 1, or 2. Verbosity mode.
            0 = silent, 1 = progress bar, 2 = one line per epoch.
        callbacks: List of `keras.callbacks.Callback` instances.
            List of callbacks to apply during training.
        shuffle: Boolean (whether to shuffle the training data
            before each epoch). `shuffle` will default to True.
    """
    p = IndexTransformer(initial_vocab=self.initial_vocab, use_char=self.use_char)
    p.fit(x_train, y_train)
    embeddings = filter_embeddings(self.embeddings, p._word_vocab.vocab,
                                   self.word_embedding_dim)

    model = BiLSTMCRF(char_vocab_size=p.char_vocab_size,
                      word_vocab_size=p.word_vocab_size,
                      num_labels=p.label_size,
                      word_embedding_dim=self.word_embedding_dim,
                      char_embedding_dim=self.char_embedding_dim,
                      word_lstm_size=self.word_lstm_size,
                      char_lstm_size=self.char_lstm_size,
                      fc_dim=self.fc_dim,
                      dropout=self.dropout,
                      embeddings=embeddings,
                      use_char=self.use_char,
                      use_crf=self.use_crf)
    model, loss = model.build()
    model.compile(loss=loss, optimizer=self.optimizer)

    trainer = Trainer(model, preprocessor=p)
    trainer.train(x_train, y_train, x_valid, y_valid,
                  epochs=epochs, batch_size=batch_size,
                  verbose=verbose, callbacks=callbacks, shuffle=shuffle)

    self.p = p
    self.model = model
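# A minimal usage sketch for the wrapper that owns the fit() method above,
# assuming it is exposed as anago.Sequence and that x_train / y_train are lists
# of token lists and BIO tag lists. The toy data is purely illustrative.
import anago

x_train = [['EU', 'rejects', 'German', 'call'], ['Peter', 'Blackburn']]
y_train = [['B-ORG', 'O', 'B-MISC', 'O'], ['B-PER', 'I-PER']]

model = anago.Sequence()
model.fit(x_train, y_train, epochs=1, batch_size=1)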
def fit(self, X, y):
    """ Trains the NER model. Input is list of list of tokens and tags.

        Parameters
        ----------
        X : list(list(str))
            list of list of tokens
        y : list(list(str))
            list of list of BIO tags

        Returns
        -------
        self
    """
    log.info("Preprocessing dataset...")
    self.preprocessor_ = IndexTransformer(use_char=self.use_char)
    self.preprocessor_.fit(X, y)

    log.info("Building model...")
    self.model_ = BiLSTMCRF(
        char_embedding_dim=self.char_embedding_dim,
        word_embedding_dim=self.word_embedding_dim,
        char_lstm_size=self.char_lstm_size,
        word_lstm_size=self.word_lstm_size,
        char_vocab_size=self.preprocessor_.char_vocab_size,
        word_vocab_size=self.preprocessor_.word_vocab_size,
        num_labels=self.preprocessor_.label_size,
        dropout=self.dropout,
        use_char=self.use_char,
        use_crf=self.use_crf)
    self.model_, loss = self.model_.build()
    optimizer = Adam(lr=self.learning_rate)
    self.model_.compile(loss=loss, optimizer=optimizer)
    self.model_.summary()

    log.info('Training the model...')
    self.trainer_ = Trainer(self.model_, preprocessor=self.preprocessor_)
    x_train, x_valid, y_train, y_valid = train_test_split(X, y,
                                                          test_size=0.1,
                                                          random_state=42)
    self.trainer_.train(x_train, y_train, x_valid=x_valid, y_valid=y_valid,
                        batch_size=self.batch_size, epochs=self.max_iter)
    self.tagger_ = Tagger(self.model_, preprocessor=self.preprocessor_)

    return self
def train(self, x_train, y_train, x_valid=None, y_valid=None, vocab_init=None):
    self.p = prepare_preprocessor(x_train, y_train, vocab_init=vocab_init)
    embeddings = filter_embeddings(self.embeddings, self.p.vocab_word,
                                   self.model_config.word_embedding_size)
    self.model_config.vocab_size = len(self.p.vocab_word)
    self.model_config.char_vocab_size = len(self.p.vocab_char)
    self.model = SeqLabeling(self.model_config, embeddings,
                             len(self.p.vocab_tag))
    trainer = Trainer(self.model, self.training_config,
                      checkpoint_path=self.log_dir,
                      preprocessor=self.p,
                      save_path='./models')
    trainer.train(x_train, y_train, x_valid, y_valid)
class BiLstmCrfNER(NERModel):

    def __init__(self,
                 word_embedding_dim=100,
                 char_embedding_dim=25,
                 word_lstm_size=100,
                 char_lstm_size=25,
                 fc_dim=100,
                 dropout=0.5,
                 embeddings=None,
                 use_char=True,
                 use_crf=True,
                 batch_size=16,
                 learning_rate=0.001,
                 max_iter=10):
        """ Construct a BiLSTM-CRF NER model. Model is augmented with character
            level embeddings as well as word embeddings by default. Implementation
            is provided by the Anago project.

            Parameters
            ----------
            word_embedding_dim : int, optional, default 100
                word embedding dimensions.
            char_embedding_dim : int, optional, default 25
                character embedding dimensions.
            word_lstm_size : int, optional, default 100
                word tagger LSTM output dimensions.
            char_lstm_size : int, optional, default 25
                character LSTM feature extractor output dimensions.
            fc_dim : int, optional, default 100
                output fully-connected layer size.
            dropout : float, optional, default 0.5
                dropout rate.
            embeddings : numpy array
                word embedding matrix.
            use_char : bool, optional, default True
                add char feature.
            use_crf : bool, optional, default True
                use crf as last layer.
            batch_size : int, optional, default 16
                training batch size.
            learning_rate : float, optional, default 0.001
                learning rate for Adam optimizer.
            max_iter : int
                number of epochs of training.

            Attributes
            ----------
            preprocessor_ : reference to preprocessor
            model_ : reference to generated model
            trainer_ : internal reference to Anago Trainer (model)
            tagger_ : internal reference to Anago Tagger (predictor)
        """
        super().__init__()
        self.word_embedding_dim = word_embedding_dim
        self.char_embedding_dim = char_embedding_dim
        self.word_lstm_size = word_lstm_size
        self.char_lstm_size = char_lstm_size
        self.fc_dim = fc_dim
        self.dropout = dropout
        # store the constructor arguments instead of hard-coded values
        self.embeddings = embeddings
        self.use_char = use_char
        self.use_crf = use_crf
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.max_iter = max_iter
        # populated by fit() and load(), expected by save() and transform()
        self.preprocessor_ = None
        self.model_ = None
        self.trainer_ = None
        self.tagger_ = None

    def fit(self, X, y):
        """ Trains the NER model. Input is list of list of tokens and tags.

            Parameters
            ----------
            X : list(list(str))
                list of list of tokens
            y : list(list(str))
                list of list of BIO tags

            Returns
            -------
            self
        """
        log.info("Preprocessing dataset...")
        self.preprocessor_ = IndexTransformer(use_char=self.use_char)
        self.preprocessor_.fit(X, y)

        log.info("Building model...")
        self.model_ = BiLSTMCRF(
            char_embedding_dim=self.char_embedding_dim,
            word_embedding_dim=self.word_embedding_dim,
            char_lstm_size=self.char_lstm_size,
            word_lstm_size=self.word_lstm_size,
            char_vocab_size=self.preprocessor_.char_vocab_size,
            word_vocab_size=self.preprocessor_.word_vocab_size,
            num_labels=self.preprocessor_.label_size,
            dropout=self.dropout,
            use_char=self.use_char,
            use_crf=self.use_crf)
        self.model_, loss = self.model_.build()
        optimizer = Adam(lr=self.learning_rate)
        self.model_.compile(loss=loss, optimizer=optimizer)
        self.model_.summary()

        log.info('Training the model...')
        self.trainer_ = Trainer(self.model_, preprocessor=self.preprocessor_)
        x_train, x_valid, y_train, y_valid = train_test_split(X, y,
                                                              test_size=0.1,
                                                              random_state=42)
        self.trainer_.train(x_train, y_train, x_valid=x_valid, y_valid=y_valid,
                            batch_size=self.batch_size, epochs=self.max_iter)
        self.tagger_ = Tagger(self.model_, preprocessor=self.preprocessor_)

        return self

    def predict(self, X):
        """ Predicts using the NER model.

            Parameters
            ----------
            X : list(list(str))
                list of list of tokens.

            Returns
            -------
            y : list(list(str))
                list of list of predicted BIO tags.
        """
        if self.tagger_ is None:
            raise ValueError("No tagger found, either run fit() to train or load() a trained model")
        log.info("Predicting from model...")
        ypreds = [self.tagger_.predict(" ".join(x)) for x in X]
        return ypreds

    def save(self, dirpath):
        """ Saves model to local disk, given a dirpath.

            Parameters
            ----------
            dirpath : str
                a directory where model artifacts will be saved. Model saves a
                weights.h5 weights file, a params.json parameter file, and a
                preprocessor.pkl preprocessor file.

            Returns
            -------
            None
        """
        if self.model_ is None or self.preprocessor_ is None:
            raise ValueError("No model artifacts to save, either run fit() to train or load() a trained model")
        if not os.path.exists(dirpath):
            os.makedirs(dirpath)
        weights_file = os.path.join(dirpath, "weights.h5")
        params_file = os.path.join(dirpath, "params.json")
        preprocessor_file = os.path.join(dirpath, "preprocessor.pkl")
        save_model(self.model_, weights_file, params_file)
        self.preprocessor_.save(preprocessor_file)
        write_param_file(self.get_params(), os.path.join(dirpath, "params.yaml"))

    def load(self, dirpath):
        """ Loads a trained model from local disk, given the dirpath.

            Parameters
            ----------
            dirpath : str
                a directory where model artifacts are saved.

            Returns
            -------
            self
        """
        if not os.path.exists(dirpath):
            raise ValueError("Model directory not found: {:s}".format(dirpath))
        weights_file = os.path.join(dirpath, "weights.h5")
        params_file = os.path.join(dirpath, "params.json")
        preprocessor_file = os.path.join(dirpath, "preprocessor.pkl")
        # all three artifacts must be present for the model to be usable
        if not (os.path.exists(weights_file) and
                os.path.exists(params_file) and
                os.path.exists(preprocessor_file)):
            raise ValueError("Model files may be corrupted, exiting")
        self.model_ = load_model(weights_file, params_file)
        self.preprocessor_ = IndexTransformer.load(preprocessor_file)
        self.tagger_ = Tagger(self.model_, preprocessor=self.preprocessor_)
        return self
def bertFitV2(self, x_train, y_train, x_valid=None, y_valid=None,
              epochs=1, batch_size=32, verbose=1, callbacks=None, shuffle=True):
    sess = tf.Session()
    bert_path = "https://tfhub.dev/google/bert_multi_cased_L-12_H-768_A-12/1"
    max_seq_length = self._bretMaxLen

    p = IndexTransformer(initial_vocab=self.initial_vocab, use_char=self.use_char)
    p.fit(x_train, y_train)
    embeddings = filter_embeddings(self.embeddings, p._word_vocab.vocab,
                                   self.word_embedding_dim)

    # tokenizer = create_tokenizer_from_hub_module()
    # print("tokenizer done")
    # train_examples = convert_text_to_examples(x_train, y_train)
    # (train_input_ids, train_input_masks, train_segment_ids, train_labels) = \
    #     convert_examples_to_features(tokenizer, train_examples,
    #                                  max_seq_length=max_seq_length)

    model = ABM.BertBiLSTMCRF(num_labels=p.label_size,
                              char_embedding_dim=self.char_embedding_dim,
                              word_lstm_size=self.word_lstm_size,
                              char_lstm_size=self.char_lstm_size,
                              fc_dim=self.fc_dim,
                              use_char=self.use_char,
                              char_vocab_size=None,
                              use_crf=self.use_crf,
                              layer2Flag=self._layer2Flag,
                              layerdropout=self._layerdropout,
                              bretFlag=self._bretFlag,
                              bretMaxLen=self._bretMaxLen,
                              bert_path=self._bert_path)
    model, loss = model.build()

    # Instantiate variables
    ABM.initialize_vars(sess)

    model.compile(loss=loss, optimizer=self.optimizer)

    trainer = Trainer(model, preprocessor=p)
    trainer.train(x_train, y_train, x_valid, y_valid,
                  epochs=epochs, batch_size=batch_size,
                  verbose=verbose, callbacks=callbacks, shuffle=shuffle)

    self.p = p
    self.model = model
preprocessor = IndexTransformer(use_char=True)
x = x_train + x_valid
y = y_train + y_valid
preprocessor.fit(x, y)
print(len(x_train), 'train sequences')
print(len(x_valid), 'valid sequences')

embeddings = filter_embeddings(wv_model, preprocessor._word_vocab.vocab,
                               wv_model.vector_size)

# Use pre-trained word embeddings
model = anago.models.BiLSTMCRF(embeddings=embeddings,
                               use_crf=False,
                               use_char=True,
                               num_labels=preprocessor.label_size,
                               word_vocab_size=preprocessor.word_vocab_size,
                               char_vocab_size=preprocessor.char_vocab_size,
                               dropout=.5,
                               word_lstm_size=120)
model.build()
model.compile(loss=model.get_loss(), optimizer='adam', metrics=["acc"])
model.summary()

trainer = Trainer(model, preprocessor=preprocessor)
trainer.train(x_train, y_train, x_valid=x_valid, y_valid=y_valid, epochs=100)
class ElmoNER(NERModel):

    def __init__(self,
                 word_embedding_dim=100,
                 char_embedding_dim=25,
                 word_lstm_size=100,
                 char_lstm_size=25,
                 fc_dim=100,
                 dropout=0.5,
                 embeddings=None,
                 embeddings_file="glove.6B.100d.txt",
                 batch_size=16,
                 learning_rate=0.001,
                 max_iter=2):
        """ Construct an ELMo-based NER model. Model is similar to the BiLSTM-CRF
            model except that the word embeddings are contextual, since they are
            returned by a trained ELMo model. The ELMo model requires an additional
            embedding, which is GloVe-100 by default. The ELMo model is provided by
            the (dev) Anago project.

            Parameters
            ----------
            word_embedding_dim : int, optional, default 100
                word embedding dimensions.
            char_embedding_dim : int, optional, default 25
                character embedding dimensions.
            word_lstm_size : int, optional, default 100
                word tagger LSTM output dimensions.
            char_lstm_size : int, optional, default 25
                character LSTM feature extractor output dimensions.
            fc_dim : int, optional, default 100
                output fully-connected layer size.
            dropout : float, optional, default 0.5
                dropout rate.
            embeddings : numpy array
                word embedding matrix.
            embeddings_file : str
                path to embedding file.
            batch_size : int, optional, default 16
                training batch size.
            learning_rate : float, optional, default 0.001
                learning rate for Adam optimizer.
            max_iter : int, optional, default 2
                number of epochs of training.

            Attributes
            ----------
            preprocessor_ : reference to Anago preprocessor.
            model_ : reference to the internal Anago ELModel
            trainer_ : reference to the internal Anago Trainer object.
            tagger_ : reference to the internal Anago Tagger object.
        """
        super().__init__()
        self.word_embedding_dim = word_embedding_dim
        self.char_embedding_dim = char_embedding_dim
        self.word_lstm_size = word_lstm_size
        self.char_lstm_size = char_lstm_size
        self.fc_dim = fc_dim
        self.dropout = dropout
        self.embeddings = embeddings
        self.embeddings_file = embeddings_file
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.max_iter = max_iter
        # populated by fit() and load(), expected by save() and transform()
        self.preprocessor_ = None
        self.model_ = None
        self.trainer_ = None
        self.tagger_ = None

    def fit(self, X, y):
        """ Trains the NER model. Input is list of AnnotatedDocuments.

            Parameters
            ----------
            X : list(list(str))
                list of list of tokens
            y : list(list(str))
                list of list of BIO tags

            Returns
            -------
            self
        """
        if self.embeddings is None and self.embeddings_file is None:
            raise ValueError(
                "Either embeddings or embeddings_file should be provided, exiting.")

        log.info("Preprocessing dataset...")
        self.preprocessor_ = ELMoTransformer()
        self.preprocessor_.fit(X, y)

        if self.embeddings is None:
            self.embeddings = load_glove(self.embeddings_file)
        # infer the embedding dimension from the first vector in the lookup
        embeddings_dim = self.embeddings[list(self.embeddings.keys())[0]].shape[0]
        self.embeddings = filter_embeddings(self.embeddings,
                                            self.preprocessor_._word_vocab.vocab,
                                            embeddings_dim)

        log.info("Building model...")
        self.model_ = ELModel(
            char_embedding_dim=self.char_embedding_dim,
            word_embedding_dim=self.word_embedding_dim,
            char_lstm_size=self.char_lstm_size,
            word_lstm_size=self.word_lstm_size,
            char_vocab_size=self.preprocessor_.char_vocab_size,
            word_vocab_size=self.preprocessor_.word_vocab_size,
            num_labels=self.preprocessor_.label_size,
            embeddings=self.embeddings,
            dropout=self.dropout)
        self.model_, loss = self.model_.build()
        optimizer = Adam(lr=self.learning_rate)
        self.model_.compile(loss=loss, optimizer=optimizer)
        self.model_.summary()

        log.info('Training the model...')
        self.trainer_ = Trainer(self.model_, preprocessor=self.preprocessor_)
        x_train, x_valid, y_train, y_valid = train_test_split(X, y,
                                                              test_size=0.1,
                                                              random_state=42)
        self.trainer_.train(x_train, y_train, x_valid=x_valid, y_valid=y_valid,
                            batch_size=self.batch_size, epochs=self.max_iter)
        self.tagger_ = Tagger(self.model_, preprocessor=self.preprocessor_)

        return self

    def predict(self, X):
        """ Predicts using the NER model.

            Parameters
            ----------
            X : list(list(str))
                list of list of tokens.

            Returns
            -------
            y : list(list(str))
                list of list of predicted BIO tags.
        """
        if self.tagger_ is None:
            raise ValueError(
                "No tagger found, either run fit() to train or load() a trained model")
        log.info("Predicting from model...")
        ypreds = [self.tagger_.predict(" ".join(x)) for x in X]
        return ypreds

    def save(self, dirpath):
        """ Saves model to local disk, given a dirpath.

            Parameters
            ----------
            dirpath : str
                a directory where model artifacts will be saved. Model saves a
                weights.h5 weights file, a params.json parameter file, and a
                preprocessor.pkl preprocessor file.

            Returns
            -------
            None
        """
        if self.model_ is None or self.preprocessor_ is None:
            raise ValueError(
                "No model artifacts to save, either run fit() to train or load() a trained model")
        if not os.path.exists(dirpath):
            os.makedirs(dirpath)
        weights_file = os.path.join(dirpath, "weights.h5")
        params_file = os.path.join(dirpath, "params.json")
        preprocessor_file = os.path.join(dirpath, "preprocessor.pkl")
        save_model(self.model_, weights_file, params_file)
        self.preprocessor_.save(preprocessor_file)
        write_param_file(self.get_params(), os.path.join(dirpath, "params.yaml"))

    def load(self, dirpath):
        """ Loads a trained model from local disk, given the dirpath.

            Parameters
            ----------
            dirpath : str
                a directory where model artifacts are saved.

            Returns
            -------
            self
        """
        if not os.path.exists(dirpath):
            raise ValueError("Model directory not found: {:s}".format(dirpath))
        weights_file = os.path.join(dirpath, "weights.h5")
        params_file = os.path.join(dirpath, "params.json")
        preprocessor_file = os.path.join(dirpath, "preprocessor.pkl")
        # all three artifacts must be present for the model to be usable
        if not (os.path.exists(weights_file) and
                os.path.exists(params_file) and
                os.path.exists(preprocessor_file)):
            raise ValueError("Model files may be corrupted, exiting")
        self.model_ = load_model(weights_file, params_file)
        self.preprocessor_ = ELMoTransformer.load(preprocessor_file)
        self.tagger_ = Tagger(self.model_, preprocessor=self.preprocessor_)
        return self
def fit(self, X, y):
    """ Trains the NER model. Input is list of AnnotatedDocuments.

        Parameters
        ----------
        X : list(list(str))
            list of list of tokens
        y : list(list(str))
            list of list of BIO tags

        Returns
        -------
        self
    """
    if self.embeddings is None and self.embeddings_file is None:
        raise ValueError(
            "Either embeddings or embeddings_file should be provided, exiting.")

    log.info("Preprocessing dataset...")
    self.preprocessor_ = ELMoTransformer()
    self.preprocessor_.fit(X, y)

    if self.embeddings is None:
        self.embeddings = load_glove(self.embeddings_file)
    # infer the embedding dimension from the first vector in the lookup
    embeddings_dim = self.embeddings[list(self.embeddings.keys())[0]].shape[0]
    self.embeddings = filter_embeddings(self.embeddings,
                                        self.preprocessor_._word_vocab.vocab,
                                        embeddings_dim)

    log.info("Building model...")
    self.model_ = ELModel(
        char_embedding_dim=self.char_embedding_dim,
        word_embedding_dim=self.word_embedding_dim,
        char_lstm_size=self.char_lstm_size,
        word_lstm_size=self.word_lstm_size,
        char_vocab_size=self.preprocessor_.char_vocab_size,
        word_vocab_size=self.preprocessor_.word_vocab_size,
        num_labels=self.preprocessor_.label_size,
        embeddings=self.embeddings,
        dropout=self.dropout)
    self.model_, loss = self.model_.build()
    optimizer = Adam(lr=self.learning_rate)
    self.model_.compile(loss=loss, optimizer=optimizer)
    self.model_.summary()

    log.info('Training the model...')
    self.trainer_ = Trainer(self.model_, preprocessor=self.preprocessor_)
    x_train, x_valid, y_train, y_valid = train_test_split(X, y,
                                                          test_size=0.1,
                                                          random_state=42)
    self.trainer_.train(x_train, y_train, x_valid=x_valid, y_valid=y_valid,
                        batch_size=self.batch_size, epochs=self.max_iter)
    self.tagger_ = Tagger(self.model_, preprocessor=self.preprocessor_)

    return self
p.fit(x_train, y_train)

print('Loading word embeddings...')
embeddings = load_glove(EMBEDDING_PATH)
embeddings = filter_embeddings(embeddings, p._word_vocab.vocab, EMBEDDING_DIM)

print('Building a model.')
model = ELModel(char_embedding_dim=32,
                word_embedding_dim=EMBEDDING_DIM,
                char_lstm_size=32,
                word_lstm_size=EMBEDDING_DIM,
                char_vocab_size=p.char_vocab_size,
                word_vocab_size=p.word_vocab_size,
                num_labels=p.label_size,
                embeddings=embeddings)
model, loss = model.build()
model.compile(loss=loss, optimizer='adam')

print('Training the model...')
trainer = Trainer(model, preprocessor=p)
trainer.train(x_train, y_train, x_test, y_test,
              callbacks=[TensorBoard(log_dir=log_dir, write_graph=False),
                         ModelCheckpoint(weights_path, save_weights_only=True),
                         ReduceLROnPlateau(),
                         EarlyStopping(patience=EARLY_STOP)])

print('Saving the model...')
save_model(model, os.path.join(log_dir, 'weights.h5'),
           os.path.join(log_dir, 'params.json'))
p.save(os.path.join(log_dir, 'preprocessor.pkl'))
# model.save('weights.h5', 'params.json')
def test_train_no_valid(self):
    trainer = Trainer(self.model, preprocessor=self.p)
    trainer.train(self.x_train, self.y_train)
def test_train(self):
    trainer = Trainer(self.model, preprocessor=self.p)
    trainer.train(self.x_train, self.y_train,
                  x_valid=self.x_valid, y_valid=self.y_valid)
def train_anago(keras_model_name="WCP", data_name="laptops",
                task_name="ATEPC2", hand_features=None):
    DATA_ROOT = 'data'
    SAVE_ROOT = './models'  # trained models
    LOG_ROOT = './logs'     # checkpoint, tensorboard
    w_embedding_path = '/home/s1610434/Documents/Data/Vector/glove.twitter.27B.100d.txt'
    c_embedding_path = '/home/s1610434/Documents/Data/Vector/AmaYelp/GloVe/glove.char.100.txt'
    pos_embedding_path = '/home/s1610434/Documents/Data/Vector/AmaYelp/GloVe/glove.pos.100.txt'
    unipos_embedding_path = '/home/s1610434/Documents/Data/Vector/AmaYelp/GloVe/glove.unipos.100.txt'

    model_config = prepare_modelconfig(keras_model_name)
    training_config = TrainingConfig()
    training_config.max_epoch = 100
    training_config.early_stopping = 30

    print("-----{0}-----{1}-----{2}-----{3}-----".format(task_name, data_name,
                                                         keras_model_name, hand_features))
    save_path = SAVE_ROOT + "/{0}/{1}".format(data_name, task_name)
    train_path = os.path.join(DATA_ROOT, '{0}.{1}.train.tsv'.format(data_name, task_name))
    test_path = os.path.join(DATA_ROOT, '{0}.{1}.test.tsv'.format(data_name, task_name))
    train_dep_path = os.path.join(DATA_ROOT, '{0}.{1}.train.dep.tsv'.format(data_name, task_name))
    test_dep_path = os.path.join(DATA_ROOT, '{0}.{1}.test.dep.tsv'.format(data_name, task_name))

    # train set
    x_train_valid, y_train_valid, _ = collect_data_from_tsv(train_path)
    x_train_valid_dep = collect_dept_data_from_tsv(train_dep_path)
    # test set
    X_test, Y_test, _ = collect_data_from_tsv(test_path)
    X_test_dep = collect_dept_data_from_tsv(test_dep_path)
    # train_test set
    X_train_test = np.concatenate((x_train_valid, X_test), 0)
    X_train_test_dep = np.concatenate((x_train_valid_dep, X_test_dep), 0)
    Y_train_test = np.concatenate((y_train_valid, Y_test), 0)

    # preprocessor
    p = prepare_preprocessor(list(zip(X_train_test, X_train_test_dep)), Y_train_test,
                             keras_model_name=keras_model_name,
                             hand_features=hand_features)
    print(len(p.vocab_word))
    print(len(p.vocab_char))
    model_config.vocab_size = len(p.vocab_word)
    model_config.char_vocab_size = len(p.vocab_char)

    if keras_model_name.find("P") != -1:
        if hand_features is not None:
            if "UNIPOS" in hand_features:
                pos_embedding_path = unipos_embedding_path
        model_config.pos_vocab_size = len(p.pos_extractor.features_dict)
    if keras_model_name.find("H") != -1:
        # model_config.hand_feature_size = gen_no_hand_dimension(data_name, hand_features, keras_model_name)
        model_config.hand_feature_size = 53
        print("model_config.hand_feature_size: ", str(model_config.hand_feature_size))

    # load embedding
    W_embeddings = load_word_embeddings(p.vocab_word, w_embedding_path,
                                        model_config.word_embedding_size)
    print("Load W_embeddings: {0}".format(W_embeddings.shape))
    C_embeddings = None
    POS_embeddings = None
    # if "C" in keras_model_name:
    #     C_embeddings = load_word_embeddings(p.vocab_char, c_embedding_path, model_config.char_embedding_size)
    #     print("Load C_embeddings: {0}".format(C_embeddings.shape))
    # if "P" in keras_model_name:
    #     POS_embeddings = load_word_embeddings(p.pos_extractor.features_dict, pos_embedding_path, model_config.pos_embedding_size)
    #     print("Load POS_embeddings: {0}".format(POS_embeddings.shape))

    atepc_evaluator = ATEPCEvaluator()
    results = []

    # TODO Kfold split
    kf = KFold(n_splits=10)
    i_fold = 0
    for train_index, valid_index in kf.split(x_train_valid):
        model_name = "{0}.{1}.{2}".format(keras_model_name, "{0}".format(hand_features), i_fold)
        X_train, X_valid = x_train_valid[train_index], x_train_valid[valid_index]
        X_train_dep, X_valid_dep = x_train_valid_dep[train_index], x_train_valid_dep[valid_index]
        Y_train, Y_valid = y_train_valid[train_index], y_train_valid[valid_index]
        print("Data train: ", X_train.shape, Y_train.shape)
        print("Data valid: ", X_valid.shape, Y_valid.shape)
        print("Data test: ", X_test.shape, Y_test.shape)

        trainer = Trainer(model_config=model_config,
                          training_config=training_config,
                          checkpoint_path=LOG_ROOT,
                          save_path=save_path,
                          preprocessor=p,
                          W_embeddings=W_embeddings,
                          C_embeddings=C_embeddings,
                          POS_embeddings=POS_embeddings,
                          keras_model_name=keras_model_name,
                          model_name=model_name)
        # trainer = Trainer2(model_config=model_config,
        #                    training_config=training_config,
        #                    checkpoint_path=LOG_ROOT,
        #                    save_path=save_path,
        #                    preprocessor=p,
        #                    W_embeddings=W_embeddings,
        #                    C_embeddings=C_embeddings,
        #                    POS_embeddings=POS_embeddings,
        #                    keras_model_name=keras_model_name,
        #                    model_name=model_name)
        trainer.train(list(zip(X_train, X_train_dep)), Y_train,
                      list(zip(X_valid, X_valid_dep)), Y_valid)

        evaluator = anago.Evaluator(model_config,
                                    weights=model_name,
                                    save_path=save_path,
                                    preprocessor=p,
                                    keras_model_name=keras_model_name)
        print("--- Test phase --- " + model_name)
        print("Train ")
        f1_score_train = evaluator.eval(list(zip(X_train, X_train_dep)), Y_train)
        print("Validation ")
        f1_score_valid = evaluator.eval(list(zip(X_valid, X_valid_dep)), Y_valid)
        print("Test ")
        f1_score_test = evaluator.eval(list(zip(X_test, X_test_dep)), Y_test)
        print("---")
        i_fold += 1

        f_out_name = "data/{0}.{1}.test.pred.tsv".format(data_name, task_name)
        f_out = open(f_out_name, "w")
        tagger = anago.Tagger(model_config,
                              model_name,
                              save_path=save_path,
                              preprocessor=p,
                              keras_model_name=keras_model_name)
        for x, y in zip(list(zip(X_test, X_test_dep)), Y_test):
            result = tagger.predict(x)
            for word, label, pred in zip(x[0], y, result):
                f_out.write("{0}\t{1}\t{2}\n".format(word, label, pred))
            f_out.write("\n")
        f_out.close()

        ate_f1, apc_acc, c_apc_acc = atepc_evaluator.evaluate(f_out_name)
        results.append([ate_f1, apc_acc, c_apc_acc])
        print(results[-1])

    print("-----All-----{0}--{1}".format(keras_model_name, data_name))
    for result in results:
        print(result)
    print("-----AVG-----")
    results_np = np.array(results, dtype=np.float32)
    print(results_np.mean(axis=0))
    print("-------------")