def do_training(arguments, vocab):
    logging.debug("Init training")
    n_epochs = arguments.epochs
    batch_size = arguments.batch_size

    # prep data
    logging.info(">> Loading in data")
    logging.info("tokenizing train data ...")
    training_data = vocab.tokenize_conll(arguments.train)
    logging.info("... tokenized train data")

    if arguments.dev_mode:
        training_data = training_data[:100]

    logging.info("tokenizing dev data ...")
    dev_data = vocab.tokenize_conll(arguments.dev)
    logging.info("... tokenized dev data")

    # instantiate model
    logging.info("creating model ...")
    model = DependencyParser(vocab, arguments.upos_dim, arguments.word_dim,
                             arguments.hidden_dim)
    logging.info("... model created")

    callbacks = []
    tensorboard_logger = None
    if arguments.tb_dest:
        tensorboard_logger = TensorboardLoggerCallback(arguments.tb_dest)
        callbacks.append(tensorboard_logger)

    logging.info("creating ModelSaveCallback ...")
    save_callback = ModelSaveCallback(arguments.model_file)
    callbacks.append(save_callback)
    logging.info("... ModelSaveCallback created")

    # prep params
    logging.info("creating Model ...")
    parser = ParserModel(model,
                         decoder="eisner",
                         loss="kiperwasser",
                         optimizer="adam",
                         strategy="bucket",
                         vocab=vocab)
    logging.info("... Model created")

    logging.info("training Model ...")
    parser.train(training_data,
                 arguments.dev,
                 dev_data,
                 epochs=n_epochs,
                 batch_size=batch_size,
                 callbacks=callbacks,
                 patience=arguments.patience)
    logging.info("... Model trained")
    logging.info("Model maxed on dev at epoch %s" % save_callback.best_epoch)

    return parser
def do_training_big_datasets(arguments, vocab, embs, subset_size):
    logging.debug("Init training with big dataset (there is no dev mode)")
    n_epochs = arguments.epochs
    batch_size = arguments.batch_size

    logging.info("tokenizing dev data ...")
    dev_data = vocab.tokenize_conll(arguments.dev)
    logging.info("... tokenized dev data")

    # instantiate model
    logging.info("creating model ...")
    model = DependencyParser(vocab, embs, arguments.no_update_pretrained_emb)
    logging.info("... model created")

    callbacks = []
    if arguments.tb_dest:
        tensorboard_logger = TensorboardLoggerCallback(arguments.tb_dest)
        callbacks.append(tensorboard_logger)

    logging.info("creating ModelSaveCallback ...")
    save_callback = ModelSaveCallback(arguments.model_file)
    callbacks.append(save_callback)
    logging.info("... ModelSaveCallback created")

    # prep params
    logging.info("creating Model ...")
    parser = ParserModel(model,
                         decoder="eisner",
                         loss="kiperwasser",
                         optimizer="adam",
                         strategy="bucket",
                         vocab=vocab)
    logging.info("... Model created")

    logging.info("training Model ...")
    parser.train_big_datasets(arguments.train,
                              arguments.dev,
                              dev_data,
                              epochs=n_epochs,
                              batch_size=batch_size,
                              callbacks=callbacks,
                              patience=arguments.patience,
                              subset_size=subset_size)
    logging.info("... Model trained")
    logging.info("Model maxed on dev at epoch %s" % save_callback.best_epoch)

    return parser
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--only_words",
                        dest="only_words",
                        type=str2bool,
                        default=False,
                        help="Should we use only words to train? Lemmas and POS will be ignored",
                        required=True)
    parser.add_argument("--do_training",
                        dest="do_training",
                        type=str2bool,
                        default=False,
                        help="Should we train the model?",
                        required=True)
    parser.add_argument("--train_file",
                        dest="train",
                        help="Annotated CONLL train file",
                        metavar="FILE",
                        required=False)
    parser.add_argument("--dev_file",
                        dest="dev",
                        help="Annotated CONLL dev file",
                        metavar="FILE",
                        required=False)
    parser.add_argument("--test_file",
                        dest="test",
                        help="Annotated CONLL test file",
                        metavar="FILE",
                        required=True)
    parser.add_argument("--results_folder",
                        dest="results_folder",
                        help="Folder to store log, model, vocabulary and output",
                        metavar="FILE",
                        required=True)
    parser.add_argument("--logging_file",
                        dest="logging_file",
                        help="File to store the logs",
                        metavar="FILE",
                        required=True)
    parser.add_argument("--output_file",
                        dest="output_file",
                        help="CONLL output file",
                        metavar="FILE",
                        required=True)
    parser.add_argument("--vocab_file", dest="vocab_file", required=True)
    parser.add_argument("--model_file", dest="model_file", required=True)
    parser.add_argument("--epochs", dest="epochs", type=int, default=30)
    parser.add_argument("--batch_size", dest="batch_size", type=int, default=32)
    parser.add_argument("--tb_dest", dest="tb_dest")
    parser.add_argument("--embs",
                        dest="embs",
                        help="pre-trained embeddings file name",
                        required=False)
    parser.add_argument("--no_update_pretrained_emb",
                        dest="no_update_pretrained_emb",
                        type=str2bool,
                        default=False,
                        help="don't update the pretrained embeddings during training")
    parser.add_argument("--patience", dest="patience", type=int, default=-1)
    parser.add_argument("--dev_mode",
                        dest="dev_mode",
                        type=str2bool,
                        default=False,
                        help="small subset of training examples, for code testing")
    parser.add_argument("--big_dataset",
                        dest="big_dataset",
                        type=str2bool,
                        default=False,
                        help="Are you training with a huge dataset? (i.e. 1B benchmark)")

    arguments, unknown = parser.parse_known_args()

    # create results folder if needed
    if not os.path.exists(arguments.results_folder):
        os.makedirs(arguments.results_folder)

    # configure logging
    logging.basicConfig(filename=arguments.logging_file,
                        level=logging.DEBUG,
                        format="%(asctime)s:%(levelname)s:\t%(message)s")

    logging.info(
        "\n\n\n==================================================================================================="
    )
    logging.info("kiperwasser_main")
    logging.info(
        "===================================================================================================\n"
    )
    logging.info("\nArguments:")
    logging.info(arguments)
    logging.info("\n")

    # load or create vocabulary and embeddings
    vocab, embs = load_or_create_vocab_and_embs(arguments)

    # transform input files into conllu if needed
    arguments.train = transform_to_conllu(arguments.train)
    arguments.dev = transform_to_conllu(arguments.dev)
    arguments.test = transform_to_conllu(arguments.test)

    # load or train parser
    if arguments.do_training:
        if not arguments.big_dataset:
            logging.info('Training with normal dataset')
            parser = do_training(arguments, vocab, embs)
        else:
            subset_size = 10000
            logging.info('Training with big dataset; subset_size = %i' % subset_size)
            parser = do_training_big_datasets(arguments, vocab, embs, subset_size)
    else:
        logging.info('No training; loading model')
        model = DependencyParser(vocab, embs, arguments.no_update_pretrained_emb)
        parser = ParserModel(model,
                             decoder="eisner",
                             loss="kiperwasser",
                             optimizer="adam",
                             strategy="bucket",
                             vocab=vocab)
        parser.load_from_file(arguments.model_file)

    # parse test file
    test_data = vocab.tokenize_conll(arguments.test)
    output_file, temporal = parser.parse(arguments.test, test_data,
                                         arguments.batch_size,
                                         arguments.output_file)

    # evaluate output
    metrics = parser.evaluate(output_file, arguments.test)
    test_UAS = metrics["nopunct_uas"]
    test_LAS = metrics["nopunct_las"]
    logging.info(metrics)

    if arguments.tb_dest and tensorboard_logger:
        tensorboard_logger.raw_write("test_UAS", test_UAS)
        tensorboard_logger.raw_write("test_LAS", test_LAS)

    logging.info("\n--------------------------------------------------------")
    logging.info("Test score: %s %s" % (test_UAS, test_LAS))
    logging.info("--------------------------------------------------------\n")
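# Hypothetical invocation sketch (not part of the original script): the flags below are
# the ones defined by main()'s argument parser; file names and the script name
# kiperwasser_main.py are placeholders/assumptions.
#
#   python kiperwasser_main.py \
#       --only_words False \
#       --do_training True \
#       --train_file train.conllu \
#       --dev_file dev.conllu \
#       --test_file test.conllu \
#       --results_folder results/ \
#       --logging_file results/train.log \
#       --output_file results/test_output.conllu \
#       --vocab_file results/vocab.pkl \
#       --model_file results/model.model \
#       --epochs 30 \
#       --batch_size 32
#
# A standard entry-point guard is assumed so the module can be run directly:
if __name__ == "__main__":
    main()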
class EmbeddingsExtractor(object):

    def __init__(self, logging_file, model_config):
        # configure logging
        self.logging_file = logging_file
        self._configure_logging()

        self.model_config = model_config
        logging.info(model_config)

        # load vocabulary, parser and model
        self._load_model()

        # create lstms
        self._create_lstms()

    def _configure_logging(self):
        logging.basicConfig(filename=self.logging_file,
                            level=logging.DEBUG,
                            format="%(asctime)s:%(levelname)s:\t%(message)s")

    def _load_model(self):
        """ load original K&G model and vocab """
        self.vocab = Vocabulary(self.model_config['only_words'])
        self.vocab.load(self.model_config['vocab_file'])
        self.parser = DependencyParserPytorch(self.vocab,
                                              self.model_config['upos_dim'],
                                              self.model_config['word_dim'],
                                              self.model_config['hidden_dim'])
        self.model = ParserModel(self.parser,
                                 decoder="eisner",
                                 loss="kiperwasser",
                                 optimizer="adam",
                                 strategy="bucket",
                                 vocab=self.vocab)
        self.model.load_from_file(self.model_config['model_file'])

    def _create_lstms(self):
        # create and initialize FWD and BWD biLSTMs with model parameters
        input_size = self.model_config['word_dim'] + self.model_config['upos_dim']
        state_dict = self.parser.deep_bilstm.state_dict()

        # layer 0, forward direction
        self.lstm_fwd_0 = nn.LSTM(input_size=input_size,
                                  hidden_size=self.model_config['hidden_dim'],
                                  num_layers=1,
                                  batch_first=True,
                                  bidirectional=False)
        new_state_dict = collections.OrderedDict()
        new_state_dict['weight_hh_l0'] = state_dict['lstm.weight_hh_l0']
        new_state_dict['weight_ih_l0'] = state_dict['lstm.weight_ih_l0']
        new_state_dict['bias_hh_l0'] = state_dict['lstm.bias_hh_l0']
        new_state_dict['bias_ih_l0'] = state_dict['lstm.bias_ih_l0']
        self.lstm_fwd_0.load_state_dict(new_state_dict)

        # layer 0, backward direction
        self.lstm_bwd_0 = nn.LSTM(input_size=input_size,
                                  hidden_size=self.model_config['hidden_dim'],
                                  num_layers=1,
                                  batch_first=True,
                                  bidirectional=False)
        new_state_dict = collections.OrderedDict()
        new_state_dict['weight_hh_l0'] = state_dict['lstm.weight_hh_l0_reverse']
        new_state_dict['weight_ih_l0'] = state_dict['lstm.weight_ih_l0_reverse']
        new_state_dict['bias_hh_l0'] = state_dict['lstm.bias_hh_l0_reverse']
        new_state_dict['bias_ih_l0'] = state_dict['lstm.bias_ih_l0_reverse']
        self.lstm_bwd_0.load_state_dict(new_state_dict)

        # layer 1, forward direction
        # NOTICE! input_size = 2*hidden_dim?
        self.lstm_fwd_1 = nn.LSTM(input_size=2 * self.model_config['hidden_dim'],
                                  hidden_size=self.model_config['hidden_dim'],
                                  num_layers=1,
                                  batch_first=True,
                                  bidirectional=False)
        new_state_dict = collections.OrderedDict()
        new_state_dict['weight_hh_l0'] = state_dict['lstm.weight_hh_l1']
        new_state_dict['weight_ih_l0'] = state_dict['lstm.weight_ih_l1']
        new_state_dict['bias_hh_l0'] = state_dict['lstm.bias_hh_l1']
        new_state_dict['bias_ih_l0'] = state_dict['lstm.bias_ih_l1']
        self.lstm_fwd_1.load_state_dict(new_state_dict)

        # layer 1, backward direction
        # NOTICE! input_size = 2*hidden_dim?
        self.lstm_bwd_1 = nn.LSTM(input_size=2 * self.model_config['hidden_dim'],
                                  hidden_size=self.model_config['hidden_dim'],
                                  num_layers=1,
                                  batch_first=True,
                                  bidirectional=False)
        new_state_dict = collections.OrderedDict()
        new_state_dict['weight_hh_l0'] = state_dict['lstm.weight_hh_l1_reverse']
        new_state_dict['weight_ih_l0'] = state_dict['lstm.weight_ih_l1_reverse']
        new_state_dict['bias_hh_l0'] = state_dict['lstm.bias_hh_l1_reverse']
        new_state_dict['bias_ih_l0'] = state_dict['lstm.bias_ih_l1_reverse']
        self.lstm_bwd_1.load_state_dict(new_state_dict)

    def generate_embeddings(self, input_file):
        logging.info(
            "\n\n\n==================================================================================================="
        )
        logging.info("Generating K&G contextual embeddings for %s" % input_file)
        logging.info(
            "===================================================================================================\n"
        )

        # generate tokenized data
        tokenized_sentences = self.vocab.tokenize_conll(input_file)

        embs = {}
        for i, sample in enumerate(tokenized_sentences):
            self.model.backend.renew_cg()  # for pytorch it is just 'pass'

            # get embeddings
            words, lemmas, tags, heads, rels, chars = sample
            words = self.model.backend.input_tensor(np.array([words]), dtype="int")
            tags = self.model.backend.input_tensor(np.array([tags]), dtype="int")
            word_embs = self.parser.wlookup(words)
            tags_embs = self.parser.tlookup(tags)  # TODO think if it makes sense to use tag_embs or not!
            # dim 1x8x125 (if we have 8 words in the sentence)
            input_data0 = torch.cat([word_embs, tags_embs], dim=-1)
            input_data0_reversed = torch.flip(input_data0, (1,))

            # feed data
            out_lstm_fwd_0, hidden_lstm_fwd_0 = self.lstm_fwd_0(input_data0)
            out_lstm_bwd_0, hidden_lstm_bwd_0 = self.lstm_bwd_0(input_data0_reversed)
            input_data1 = torch.cat((out_lstm_fwd_0, out_lstm_bwd_0), 2)
            input_data1_reversed = torch.flip(input_data1, (1,))
            out_lstm_fwd_1, hidden_lstm_fwd_1 = self.lstm_fwd_1(input_data1)
            out_lstm_bwd_1, hidden_lstm_bwd_1 = self.lstm_bwd_1(input_data1_reversed)

            # generate embeddings
            out_lstm_bwd_0 = torch.flip(out_lstm_bwd_0, (1,))
            out_lstm_bwd_1 = torch.flip(out_lstm_bwd_1, (1,))

            # TODO in ELMo they perform a task-dependent weighted sum of the concatenation of L0 (initial embeddings), L1 and L2;
            # as our input has varying sizes and we are not weighting the layers, we'll just concatenate everything.
            # TODO for the syntactic probes, ELMo stores separately the three layers, so maybe we can do the same at least with layer 0 and layer 1?
            sentence_embeddings = torch.cat(
                (input_data0, out_lstm_fwd_0, out_lstm_bwd_0, out_lstm_fwd_1, out_lstm_bwd_1),
                2)  # 1 x 8 x (125 + 100 + 100 + 100 + 100) = 525
            embs[i] = sentence_embeddings

        return embs

    @staticmethod
    def save_to_hdf5(embeddings, file_path, skip_root=False):
        # Save embeddings in hdf5 format.
        # Write contextual word representations to disk for each of the train, dev, and test splits in hdf5 format,
        # where the index of the sentence in the conllx file is the key to the hdf5 dataset object. That is, the
        # dataset file should look a bit like
        # {'0': <np.ndarray(size=(1,SEQLEN1,FEATURE_COUNT))>, '1': <np.ndarray(size=(1,SEQLEN2,FEATURE_COUNT))>, ...}.
        # Note that SEQLEN for each sentence must be the number of tokens in the sentence as specified by the conllx file.
        with h5py.File(file_path, 'w') as f:
            for k, v in embeddings.items():
                logging.info('creating dataset for k %s' % str(k))
                sentence_embs = v.detach().numpy()
                if skip_root:
                    sentence_embs = sentence_embs[:, 1:, :]
                f.create_dataset(str(k), data=sentence_embs)

    @staticmethod
    def check_hdf5_file(file_path):
        with h5py.File(file_path, 'r') as f:
            for item in f.items():
                logging.info(item)
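# Minimal usage sketch for EmbeddingsExtractor (an assumption, not part of the original
# module): the model_config keys mirror the ones read by _load_model() and _create_lstms();
# file paths are placeholders and the dimensions are only illustrative (they should match
# the values the parser was trained with).
model_config = {
    'only_words': True,            # must match how the vocabulary was built
    'vocab_file': 'run1/vocab.pkl',
    'model_file': 'run1/model.model',
    'upos_dim': 25,                # assumed POS embedding size
    'word_dim': 100,               # assumed word embedding size
    'hidden_dim': 100,             # assumed biLSTM hidden size
}
extractor = EmbeddingsExtractor('extract_embeddings.log', model_config)
embeddings = extractor.generate_embeddings('en_ewt-ud-dev.conllu')
EmbeddingsExtractor.save_to_hdf5(embeddings, 'en_ewt-ud-dev.hdf5', skip_root=True)
EmbeddingsExtractor.check_hdf5_file('en_ewt-ud-dev.hdf5')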
vocab = Vocabulary()
vocab = vocab.fit(arguments.train)

# prep data
training_data = vocab.tokenize_conll(arguments.train)
dev_data = vocab.tokenize_conll(arguments.dev)
test_data = vocab.tokenize_conll(arguments.test)

model = DependencyParser(vocab)
save_callback = ModelSaveCallback(arguments.model)

# prep params
parser = ParserModel(model,
                     decoder=arguments.decoder,
                     loss="hinge",
                     optimizer="adam",
                     strategy="bucket",
                     vocab=vocab)
parser.train(training_data,
             arguments.dev,
             dev_data,
             epochs=arguments.epochs,
             batch_size=arguments.batch_size,
             callbacks=[save_callback])

# load best model
model.load_from_file(arguments.model)

# the original snippet is truncated here; the batch_size keyword is assumed,
# following the parse_and_evaluate calls used elsewhere
metrics = parser.parse_and_evaluate(arguments.test, test_data,
                                    batch_size=arguments.batch_size)
# instantiate model
model = DependencyParser(vocab, embs, arguments.no_update_pretrained_emb)

callbacks = []
tensorboard_logger = None
if arguments.tb_dest:
    tensorboard_logger = TensorboardLoggerCallback(arguments.tb_dest)
    callbacks.append(tensorboard_logger)

save_callback = ModelSaveCallback(arguments.model_dest)
callbacks.append(save_callback)

# prep params
parser = ParserModel(model,
                     decoder="eisner",
                     loss="kiperwasser",
                     optimizer="adam",
                     strategy="bucket",
                     vocab=vocab)
parser.train(training_data,
             arguments.dev,
             dev_data,
             epochs=n_epochs,
             batch_size=32,
             callbacks=callbacks,
             patience=arguments.patience)
parser.load_from_file(arguments.model_dest)

metrics = parser.parse_and_evaluate(arguments.test, test_data, batch_size=32)
test_UAS = metrics["nopunct_uas"]
test_LAS = metrics["nopunct_las"]

print(metrics)

if arguments.tb_dest and tensorboard_logger:
    tensorboard_logger.raw_write("test_UAS", test_UAS)
    tensorboard_logger.raw_write("test_LAS", test_LAS)

print()
print(">>> Model maxed on dev at epoch", save_callback.best_epoch)
optimizer = dy.AdamTrainer(model.parameter_collection, learning_rate, beta_1,
                           beta_2, epsilon)

""" Callbacks """
custom_learning_update_callback = UpdateParamsCallback()
save_callback = ModelSaveCallback(model_destination)
if arguments.tb_dest:
    tensorboard_logger = TensorboardLoggerCallback(tensorboard_destination)
    callbacks = [
        tensorboard_logger, custom_learning_update_callback, save_callback
    ]
else:
    callbacks = [custom_learning_update_callback, save_callback]

parser = ParserModel(model,
                     decoder="cle",
                     loss="crossentropy",
                     optimizer=optimizer,
                     strategy="scaled_batch",
                     vocab=vocab)

""" Prep data """
training_data = vocab.tokenize_conll(arguments.train)
dev_data = vocab.tokenize_conll(arguments.dev)
test_data = vocab.tokenize_conll(arguments.test)

parser.train(training_data,
             dev_file,
             dev_data,
             epochs=n_epochs,
             batch_size=batch_scale,
             callbacks=callbacks)
parser.load_from_file(model_destination)
"===================================================================================================\n" ) # load model and vocab vocab_file = '/home/lpmayos/hd/code/UniParse/models/kiperwasser/1b/bpe/mini/only_words_true/run1/vocab.pkl' model_file = '/home/lpmayos/hd/code/UniParse/models/kiperwasser/1b/bpe/mini/only_words_true/run1/model.model' only_words = True vocab = Vocabulary(only_words) vocab.load(vocab_file) embs = None parser = DependencyParser(vocab, embs, False) model = ParserModel(parser, decoder="eisner", loss="kiperwasser", optimizer="adam", strategy="bucket", vocab=vocab) model.load_from_file(model_file) # input_file = '/home/lpmayos/hd/code/cvt_text/data/raw_data/depparse/test_mini.txt' input_file = '/home/lpmayos/hd/code/structural-probes/example/data/en_ewt-ud-sample/en_ewt-ud-dev.conllu' input_file = transform_to_conllu(input_file) input_data = vocab.tokenize_conll(input_file) embeddings = parser.extract_embeddings( input_data, model.backend, format='concat', save=True,