def get_triviaqa_train_params(n_epochs, n_dev, n_train):
    """Build the `TrainParams` for a TriviaQA training run.

    The optimizer is Adadelta with ``learning_rate=1`` and the EMA setting
    is 0.9999; at most two checkpoints are kept.

    :param n_epochs: value forwarded as ``num_epochs``
    :param n_dev: value stored under the ``dev`` key of ``eval_samples``
    :param n_train: value stored under the ``train`` key of ``eval_samples``
    :return: the configured `TrainParams` instance
    """
    optimizer = SerializableOptimizer("Adadelta", {"learning_rate": 1})
    sample_sizes = {"dev": n_dev, "train": n_train}
    return TrainParams(
        optimizer,
        num_epochs=n_epochs,
        ema=0.9999,
        max_checkpoints_to_keep=2,
        async_encoding=10,
        log_period=30,
        eval_period=1800,
        save_period=1800,
        eval_samples=sample_sizes,
    )
def train_params(n_epochs):
    """Return the `TrainParams` for a run of `n_epochs` epochs.

    Uses Adadelta (``learning_rate=1.0``), an EMA setting of 0.999, keeps
    up to three checkpoints, and evaluates/saves with a period of 1200.
    ``eval_samples`` uses ``None`` for dev and 8000 for train.

    :param n_epochs: value forwarded as ``num_epochs``
    :return: the configured `TrainParams` instance
    """
    adadelta = SerializableOptimizer("Adadelta", {"learning_rate": 1.0})
    settings = {
        "ema": 0.999,
        "max_checkpoints_to_keep": 3,
        "async_encoding": 10,
        "num_epochs": n_epochs,
        "log_period": 30,
        "eval_period": 1200,
        "save_period": 1200,
        "eval_samples": {"dev": None, "train": 8000},
    }
    return TrainParams(adadelta, **settings)
def train_params(n_epochs):
    """Return the `TrainParams` for a run of `n_epochs` epochs.

    Uses Adadelta (``learning_rate=1.0``) and an EMA setting of 0.999,
    keeps up to three checkpoints, saves with a period of 300 and
    evaluates with a period of 900. ``best_weights`` is set to the
    ``("dev", "race data accuracy")`` pair.

    :param n_epochs: value forwarded as ``num_epochs``
    :return: the configured `TrainParams` instance
    """
    optimizer = SerializableOptimizer("Adadelta", {"learning_rate": 1.0})
    best_weights_key = ("dev", "race data accuracy")
    eval_sample_sizes = {"dev": None, "train": 5000}
    return TrainParams(
        optimizer,
        ema=0.999,
        max_checkpoints_to_keep=3,
        async_encoding=10,
        num_epochs=n_epochs,
        log_period=30,
        eval_period=900,
        save_period=300,
        best_weights=best_weights_key,
        eval_samples=eval_sample_sizes,
    )
def get_training_params(train_config):
    """Translate a training config object into `TrainParams`.

    The following attributes are read from `train_config`: ``optimizer``,
    ``learning_rate``, ``n_epochs``, ``ema``, ``max_checkpoints_to_keep``,
    ``async_encoding``, ``log_period``, ``eval_period`` and ``save_period``.
    ``best_weights``, ``eval_samples`` and ``eval_at_zero`` are fixed here
    and are not taken from the config.

    :param train_config: object exposing the attributes listed above
    :return: the configured `TrainParams` instance
    """
    cfg = train_config
    optimizer = SerializableOptimizer(
        cfg.optimizer, {"learning_rate": cfg.learning_rate})

    # Everything except the optimizer is passed by keyword.
    keyword_args = {
        "num_epochs": cfg.n_epochs,
        "ema": cfg.ema,
        "max_checkpoints_to_keep": cfg.max_checkpoints_to_keep,
        "async_encoding": cfg.async_encoding,
        "log_period": cfg.log_period,
        "eval_period": cfg.eval_period,
        "save_period": cfg.save_period,
        "best_weights": ("dev", "b8/question-text-f1"),
        "eval_samples": {"dev": None, "train": 6000},
        "eval_at_zero": False,
    }
    return TrainParams(optimizer, **keyword_args)
def main():
    """Command-line entry point: train a model on an XQA corpus.

    Parses the corpus, mode, output name, paragraph size and process count
    from the command line, builds the model and the mode-specific
    train/test dataset builders, preprocesses the data and starts training.
    The output directory is ``<name>-<MMDD-HHMMSS>``, and the text of this
    script (prefixed with the mode) is passed to the trainer as notes.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("corpus", choices=["en", "en_trans_de", "en_trans_zh"])
    # Note I haven't tested modes other than `shared-norm` on this corpus, so
    # some things might need adjusting
    cli.add_argument(
        'mode',
        choices=["confidence", "merge", "shared-norm", "sigmoid", "paragraph"])
    cli.add_argument("name", help="Where to store the model")
    cli.add_argument("-t", "--n_tokens", default=400, type=int,
                     help="Paragraph size")
    cli.add_argument(
        '-n', '--n_processes', type=int, default=2,
        help="Number of processes (i.e., select which paragraphs to train on) "
             "the data with")
    args = cli.parse_args()

    mode = args.mode
    corpus = args.corpus
    out_dir = args.name + "-" + datetime.now().strftime("%m%d-%H%M%S")

    model = get_model(100, 140, mode, WithIndicators())

    extractor = ExtractMultiParagraphsPerQuestion(
        MergeParagraphs(args.n_tokens),
        ShallowOpenWebRanker(16),
        model.preprocessor,
        intern=True,
    )

    evaluators = [
        LossEvaluator(),
        MultiParagraphSpanEvaluator(8, "triviaqa", mode != "merge",
                                    per_doc=False),
    ]

    # Sample the top two answer-containing paragraphs twice
    oversample = [1] * 2

    # Each mode gets its own epoch budget and its own pair of
    # (test, train) dataset builders.
    if mode == "paragraph":
        n_epochs = 120
        test_builder = RandomParagraphSetDatasetBuilder(
            120, "flatten", True, oversample)
        train_builder = StratifyParagraphsBuilder(
            ClusteredBatcher(60, ContextLenBucketedKey(3), True),
            oversample, only_answers=True)
    elif mode in ("confidence", "sigmoid"):
        n_epochs = 640 if mode == "sigmoid" else 160
        test_builder = RandomParagraphSetDatasetBuilder(
            120, "flatten", True, oversample)
        train_builder = StratifyParagraphsBuilder(
            ClusteredBatcher(60, ContextLenBucketedKey(3), True),
            oversample)
    else:
        n_epochs = 80
        is_merge = mode == "merge"
        test_builder = RandomParagraphSetDatasetBuilder(
            120, "merge" if is_merge else "group", True, oversample)
        train_builder = StratifyParagraphSetsBuilder(
            30, is_merge, True, oversample)

    raw_data = XQADataset(corpus)

    params = TrainParams(
        SerializableOptimizer("Adadelta", dict(learning_rate=1)),
        num_epochs=n_epochs,
        ema=0.999,
        max_checkpoints_to_keep=2,
        async_encoding=10,
        log_period=30,
        eval_period=1800,
        save_period=1800,
        best_weights=("dev", "b8/question-text-f1"),
        eval_samples=dict(dev=None, train=6000),
    )

    data = PreprocessedData(raw_data, extractor, train_builder, test_builder,
                            eval_on_verified=False)
    data.preprocess(args.n_processes, 1000)

    # Record this script's own source alongside the run.
    with open(__file__, "r") as script:
        script_text = script.read()
    notes = "Mode: " + args.mode + "\n" + script_text

    trainer.start_training(data, model, params, evaluators,
                           model_dir.ModelDir(out_dir), notes)
def main():
    """
    A close-as-possible implementation of BiDAF. It is based on the `dev`
    tensorflow 1.1 branch of Ming's repo which, in particular, uses Adam
    rather than Adadelta. I was not able to replicate the results in the
    paper using Adadelta, but with Adam I was able to get to 78.0 F1 on the
    dev set with this script. I believe this approach is an exact
    reproduction of the code in the repo, up to initializations.

    Notes:
    Exponential Moving Average is very important, as is early stopping.
    This is also, in particular, best run on a GPU due to the large number
    of parameters and the batch size involved.
    """
    out = get_output_name_from_cli()

    optimizer = SerializableOptimizer("Adam", dict(learning_rate=0.001))
    train_params = TrainParams(
        optimizer,
        num_epochs=12,
        ema=0.999,
        async_encoding=10,
        log_period=30,
        eval_period=1000,
        save_period=1000,
        eval_samples=dict(dev=None, train=8000),
    )

    # Alternative recurrent layers, left here for reference:
    #   BiRecurrentMapper(LstmCellSpec(100, keep_probs=0.8))
    #   FusedLstm()
    # The same `recurrent_layer` object is deliberately passed to several
    # components below.
    recurrent_layer = SequenceMapperSeq(DropoutLayer(0.8), CudnnLstm(100))

    # Sub-components are built in the same order as the keyword arguments
    # they are handed to.
    encoder = DocumentAndQuestionEncoder(SingleSpanAnswerEncoder())
    word_embed = FixedWordEmbedder(
        vec_name="glove.6B.100d", word_vec_init_scale=0, learn_unk=False)
    char_embed = CharWordEmbedder(
        embedder=LearnedCharEmbedder(16, 49, 8),
        layer=ReduceLayer("max", Conv1d(100, 5, 0.8), mask=False),
        shared_parameters=True,
    )
    embed_mapper = SequenceMapperSeq(
        HighwayLayer(activation="relu"),
        HighwayLayer(activation="relu"),
        recurrent_layer,
    )
    memory_builder = NullBiMapper()
    attention = BiAttention(TriLinear(bias=True), True)
    match_encoder = NullMapper()
    predictor = BoundsPredictor(
        ChainConcat(
            start_layer=SequenceMapperSeq(recurrent_layer, recurrent_layer),
            end_layer=recurrent_layer,
        )
    )

    model = Attention(
        encoder=encoder,
        word_embed=word_embed,
        char_embed=char_embed,
        word_embed_layer=None,
        embed_mapper=embed_mapper,
        preprocess=None,
        question_mapper=None,
        context_mapper=None,
        memory_builder=memory_builder,
        attention=attention,
        match_encoder=match_encoder,
        predictor=predictor,
    )

    # Record this script's own source alongside the run.
    with open(__file__, "r") as script:
        notes = script.read()

    evaluators = [LossEvaluator(), SpanEvaluator(bound=[17], text_eval="squad")]

    corpus = SquadCorpus()
    train_batching = ClusteredBatcher(60, ContextLenBucketedKey(3), True, False)
    eval_batching = ClusteredBatcher(60, ContextLenKey(), False, False)
    data = DocumentQaTrainingData(corpus, None, train_batching, eval_batching)

    trainer.start_training(data, model, train_params, evaluators,
                           model_dir.ModelDir(out), notes)
def main():
    """Command-line entry point: train a model on TriviaQA unfiltered.

    Parses the run configuration, builds a fresh model (or loads one from
    ``--init_from``), sets up the mode-specific train/test dataset builders,
    preprocesses the data and starts training.

    The output directory is ``models/<name>`` with a suffix appended for
    each of ``--char_th``, ``--hl_dim``, ``--n_epochs`` and ``--LR`` that
    was given a non-default value. The text of this script, prefixed with
    the mode, is passed to the trainer as notes.

    When ``--init_from`` is given, training is initialized from that model
    directory's best weights, falling back to its latest checkpoint when
    `get_best_weights()` returns ``None``.
    """
    parser = argparse.ArgumentParser(
        description='Train a model on TriviaQA unfiltered')
    parser.add_argument(
        'mode',
        choices=["confidence", "merge", "shared-norm", "sigmoid", "paragraph"])
    parser.add_argument("name", help="Where to store the model")
    parser.add_argument("-t", "--n_tokens", default=400, type=int,
                        help="Paragraph size")
    parser.add_argument(
        '-n', '--n_processes', type=int, default=2,
        help="Number of processes (i.e., select which paragraphs to train on) "
             "the data with")
    parser.add_argument("-s", "--source_dir", type=str, default=None,
                        help="where to take input files")
    parser.add_argument("--n_epochs", type=int, default=None,
                        help="Max number of epoches to train on ")
    parser.add_argument("--char_th", type=int, default=None,
                        help="char level embeddings")
    parser.add_argument("--hl_dim", type=int, default=None,
                        help="hidden layer dim size")
    # Was `type=int` with a help text copy-pasted from --hl_dim, and the
    # parsed value was never used. Float still accepts integer input.
    parser.add_argument("--regularization", type=float, default=None,
                        help="regularization weight passed to the trainer")
    parser.add_argument("--LR", type=float, default=1.0,
                        help="Adadelta learning rate")
    parser.add_argument("--save_every", type=int, default=1800,
                        help="save period")
    parser.add_argument("--init_from", type=str, default=None,
                        help="model to init from")
    args = parser.parse_args()

    mode = args.mode
    out = join('models', args.name)

    # argparse already converts these to int, so no extra cast is needed.
    char_th = 100
    hl_dim = 140
    if args.char_th is not None:
        print(args.char_th)
        char_th = args.char_th
        out += '--th' + str(char_th)
    if args.hl_dim is not None:
        print(args.hl_dim)
        hl_dim = args.hl_dim
        out += '--hl' + str(hl_dim)

    # Build the source ModelDir once; it is needed for both the model
    # definition here and the initial weights further down.
    if args.init_from is None:
        init_dir = None
        model = get_model(char_th, hl_dim, mode, WithIndicators())
    else:
        init_dir = model_dir.ModelDir(args.init_from)
        model = init_dir.get_model()

    extract = ExtractMultiParagraphsPerQuestion(
        MergeParagraphs(args.n_tokens),
        ShallowOpenWebRanker(16),
        model.preprocessor,
        intern=True,
    )

    # Named `evaluators` to avoid shadowing the builtin `eval`.
    evaluators = [
        LossEvaluator(),
        MultiParagraphSpanEvaluator(8, "triviaqa", mode != "merge",
                                    per_doc=False),
    ]

    oversample = [1] * 4

    # Each mode gets its own default epoch budget and its own pair of
    # (test, train) dataset builders.
    if mode == "paragraph":
        n_epochs = 120
        test = RandomParagraphSetDatasetBuilder(
            120, "flatten", True, oversample)
        train = StratifyParagraphsBuilder(
            ClusteredBatcher(60, ContextLenBucketedKey(3), True),
            oversample, only_answers=True)
    elif mode == "confidence" or mode == "sigmoid":
        n_epochs = 640 if mode == "sigmoid" else 160
        test = RandomParagraphSetDatasetBuilder(
            120, "flatten", True, oversample)
        train = StratifyParagraphsBuilder(
            ClusteredBatcher(60, ContextLenBucketedKey(3), True),
            oversample)
    else:
        n_epochs = 80
        test = RandomParagraphSetDatasetBuilder(
            120, "merge" if mode == "merge" else "group", True, oversample)
        train = StratifyParagraphSetsBuilder(
            30, mode == "merge", True, oversample)

    # Command-line overrides; each one is also reflected in the output name.
    if args.n_epochs is not None:
        n_epochs = args.n_epochs
        out += '--' + str(n_epochs)
    if args.LR != 1.0:
        out += '--' + str(args.LR)

    data = TriviaQaOpenDataset(args.source_dir)

    async_encoding = 10
    params = TrainParams(
        SerializableOptimizer("Adadelta", dict(learning_rate=args.LR)),
        num_epochs=n_epochs,
        num_of_steps=250000,
        ema=0.999,
        max_checkpoints_to_keep=2,
        async_encoding=async_encoding,
        log_period=30,
        eval_period=1800,
        save_period=args.save_every,
        eval_samples=dict(dev=None, train=6000),
        # Previously hard-coded to None, which silently discarded the
        # --regularization flag. Still None when the flag is omitted.
        regularization_weight=args.regularization,
    )

    data = PreprocessedData(data, extract, train, test, eval_on_verified=False)
    data.preprocess(args.n_processes, 1000)

    # Record this script's own source alongside the run.
    with open(__file__, "r") as f:
        notes = f.read()
    notes = "Mode: " + args.mode + "\n" + notes

    # Prefer the best weights of the source model; fall back to its latest
    # checkpoint when no best weights are available.
    init_from = None
    if init_dir is not None:
        init_from = init_dir.get_best_weights()
        if init_from is None:
            init_from = init_dir.get_latest_checkpoint()

    trainer.start_training(data, model, params, evaluators,
                           model_dir.ModelDir(out), notes,
                           initialize_from=init_from)