class MacNetwork(Configurable):
    """Stack of MAC cells run over the document, conditioned on the encoded question."""

    def __init__(self, num_mac_cells: int, hidden_dim: int):
        self.cells = num_mac_cells
        self.mac = Mac(hidden_dim)
        self.hidden_dim = hidden_dim
        self.acts = []
        self.qenc = CudnnGru(hidden_dim // 2, w_init=TruncatedNormal(stddev=0.05))
        self.question_drop = DropoutLayer(0.92)
        self.control_proj = FullyConnected(hidden_dim)
        for _ in range(num_mac_cells):
            self.acts.append(FullyConnected(hidden_dim))

    def apply(self, is_train, document, questions, document_mask=None, question_mask=None):
        # Encode the question; the CudnnGru layer handles sequence reversal for us,
        # so the last hidden state summarizes the whole question.
        question_hidden = self.qenc.apply(is_train, questions, question_mask)[:, -1]
        question_hidden = self.question_drop.apply(is_train, question_hidden)

        # Shared projection of the question representation
        question_vec = tf.tanh(self.control_proj.apply(is_train, question_hidden))

        # Initial control and memory states
        init_control = question_hidden
        init_memory = tf.get_variable(
            'init_memory', shape=(1, self.hidden_dim), trainable=True)
        init_memory = tf.tile(init_memory, [tf.shape(questions)[0], 1])

        # Run the MAC cells, sharing cell parameters after the first step
        control, memory = init_control, init_memory
        for i in range(self.cells):
            # Position-aware control projection for this step
            position_cont = self.acts[i].apply(is_train, question_vec)

            # call the MAC cell
            with tf.variable_scope('macmsc', reuse=i > 0):
                next_control, next_mem, out = self.mac.apply(
                    is_train, document, questions, question_vec, control,
                    position_cont, memory, i > 0, document_mask, question_mask)
            control, memory = next_control, next_mem

        # No yes/no questions, so only the final read-unit output is returned
        return out
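# A standalone sketch (not part of the original model) of the variable-sharing
# pattern used in the loop above: passing reuse=i > 0 to tf.variable_scope makes
# every MAC step after the first reuse the weights created on step 0. The scope
# and variable names here are hypothetical, and the sketch assumes TensorFlow 1.x,
# as the rest of this code does.
def _shared_weights_demo():
    def shared_step(x, i):
        # variables are created on step 0 and reused on every later step
        with tf.variable_scope("demo_macmsc", reuse=i > 0):
            w = tf.get_variable("w", shape=(4, 4))
        return tf.matmul(x, w)

    x = tf.placeholder(tf.float32, (None, 4))
    h = x
    for i in range(3):
        h = shared_step(h, i)
    # only one variable, 'demo_macmsc/w:0', despite three steps
    print([v.name for v in tf.trainable_variables()])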
def main(): parser = argparse.ArgumentParser("Train our ELMo model on SQuAD") parser.add_argument("loss_mode", choices=['default', 'confidence']) parser.add_argument("output_dir") parser.add_argument("--dim", type=int, default=90) parser.add_argument("--l2", type=float, default=0) parser.add_argument("--mode", choices=["input", "output", "both", "none"], default="both") parser.add_argument("--top_layer_only", action="store_true") parser.add_argument("--no-tfidf", action='store_true', help="Don't add TF-IDF negative examples") args = parser.parse_args() out = args.output_dir + "-" + datetime.now().strftime("%m%d-%H%M%S") dim = args.dim recurrent_layer = CudnnGru(dim, w_init=TruncatedNormal(stddev=0.05)) if args.loss_mode == 'default': n_epochs = 24 answer_encoder = SingleSpanAnswerEncoder() predictor = BoundsPredictor( ChainBiMapper(first_layer=recurrent_layer, second_layer=recurrent_layer)) batcher = ClusteredBatcher(45, ContextLenKey(), False, False) data = DocumentQaTrainingData(SquadCorpus(), None, batcher, batcher) elif args.loss_mode == 'confidence': if args.no_tfidf: prepro = SquadDefault() n_epochs = 15 else: prepro = SquadTfIdfRanker(NltkPlusStopWords(True), 4, True) n_epochs = 50 answer_encoder = DenseMultiSpanAnswerEncoder() predictor = ConfidencePredictor(ChainBiMapper( first_layer=recurrent_layer, second_layer=recurrent_layer, ), AttentionEncoder(), FullyConnected(80, activation="tanh"), aggregate="sum") eval_dataset = RandomParagraphSetDatasetBuilder( 100, 'flatten', True, 0) train_batching = ClusteredBatcher(45, ContextLenBucketedKey(3), True, False) data = PreprocessedData(SquadCorpus(), prepro, StratifyParagraphsBuilder(train_batching, 1), eval_dataset, eval_on_verified=False) data.preprocess(1) params = trainer.TrainParams(trainer.SerializableOptimizer( "Adadelta", dict(learning_rate=1.0)), ema=0.999, max_checkpoints_to_keep=2, async_encoding=10, num_epochs=n_epochs, log_period=30, eval_period=1200, save_period=1200, best_weights=("dev", "b17/text-f1"), eval_samples=dict(dev=None, train=8000)) lm_reduce = MapperSeq( ElmoLayer(args.l2, layer_norm=False, top_layer_only=args.top_layer_only), DropoutLayer(0.5), ) model = AttentionWithElmo( encoder=DocumentAndQuestionEncoder(answer_encoder), lm_model=SquadContextConcatSkip(), append_before_atten=(args.mode == "both" or args.mode == "output"), append_embed=(args.mode == "both" or args.mode == "input"), max_batch_size=128, word_embed=FixedWordEmbedder(vec_name="glove.840B.300d", word_vec_init_scale=0, learn_unk=False, cpu=True), char_embed=CharWordEmbedder(LearnedCharEmbedder(word_size_th=14, char_th=49, char_dim=20, init_scale=0.05, force_cpu=True), MaxPool(Conv1d(100, 5, 0.8)), shared_parameters=True), embed_mapper=SequenceMapperSeq( VariationalDropoutLayer(0.8), recurrent_layer, VariationalDropoutLayer(0.8), ), lm_reduce=None, lm_reduce_shared=lm_reduce, per_sentence=False, memory_builder=NullBiMapper(), attention=BiAttention(TriLinear(bias=True), True), match_encoder=SequenceMapperSeq( FullyConnected(dim * 2, activation="relu"), ResidualLayer( SequenceMapperSeq( VariationalDropoutLayer(0.8), recurrent_layer, VariationalDropoutLayer(0.8), StaticAttentionSelf(TriLinear(bias=True), ConcatWithProduct()), FullyConnected(dim * 2, activation="relu"), )), VariationalDropoutLayer(0.8)), predictor=predictor) with open(__file__, "r") as f: notes = f.read() notes = str(sorted(args.__dict__.items(), key=lambda x: x[0])) + "\n" + notes trainer.start_training( data, model, params, [LossEvaluator(), SpanEvaluator(bound=[17], 
text_eval="squad")], ModelDir(out), notes)
def get_model(char_th: int, dim: int, mode: str, preprocess: Optional[TextPreprocessor]):
    recurrent_layer = CudnnGru(dim, w_init=TruncatedNormal(stddev=0.05))

    if mode.startswith("shared-norm"):
        answer_encoder = GroupedSpanAnswerEncoder()
        predictor = BoundsPredictor(
            ChainBiMapper(
                first_layer=recurrent_layer,
                second_layer=recurrent_layer
            ),
            span_predictor=IndependentBoundsGrouped(aggregate="sum")
        )
    elif mode == "confidence":
        answer_encoder = DenseMultiSpanAnswerEncoder()
        predictor = ConfidencePredictor(
            ChainBiMapper(
                first_layer=recurrent_layer,
                second_layer=recurrent_layer,
            ),
            AttentionEncoder(),
            FullyConnected(80, activation="tanh"),
            aggregate="sum"
        )
    elif mode == "sigmoid":
        answer_encoder = DenseMultiSpanAnswerEncoder()
        predictor = BoundsPredictor(
            ChainBiMapper(
                first_layer=recurrent_layer,
                second_layer=recurrent_layer
            ),
            span_predictor=IndependentBoundsSigmoidLoss()
        )
    elif mode == "paragraph" or mode == "merge":
        answer_encoder = MultiChoiceAnswerEncoder()
        predictor = MultiChoicePredictor(4)
    else:
        raise NotImplementedError(mode)

    return Attention(
        encoder=DocumentAndQuestionEncoder(answer_encoder),
        word_embed=FixedWordEmbedder(vec_name="glove.840B.300d",
                                     word_vec_init_scale=0, learn_unk=False,
                                     cpu=True),
        char_embed=CharWordEmbedder(
            LearnedCharEmbedder(word_size_th=14, char_th=char_th, char_dim=20,
                                init_scale=0.05, force_cpu=True),
            MaxPool(Conv1d(100, 5, 0.8)),
            shared_parameters=True
        ),
        preprocess=preprocess,
        word_embed_layer=None,
        embed_mapper=SequenceMapperSeq(
            VariationalDropoutLayer(0.8),
            recurrent_layer,
            VariationalDropoutLayer(0.8),
        ),
        question_mapper=None,
        context_mapper=None,
        memory_builder=NullBiMapper(),
        attention=BiAttention(TriLinear(bias=True), True),
        match_encoder=SequenceMapperSeq(
            FullyConnected(dim * 2, activation="relu"),
            ResidualLayer(SequenceMapperSeq(
                VariationalDropoutLayer(0.8),
                recurrent_layer,
                VariationalDropoutLayer(0.8),
                StaticAttentionSelf(TriLinear(bias=True), ConcatWithProduct()),
                FullyConnected(dim * 2, activation="relu"),
            )),
            VariationalDropoutLayer(0.8)),
        predictor=predictor
    )
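# A hypothetical usage sketch (not from the original scripts): the mode string
# selects the answer encoder / predictor pair built above, e.g. the shared-norm
# multi-paragraph objective. char_th and dim mirror values used by the training
# scripts in this repo; preprocess may be None or any TextPreprocessor from this
# codebase.
def _example_shared_norm_model():
    return get_model(char_th=49, dim=100, mode="shared-norm", preprocess=None)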
def build_model(preprocess: Optional[TextPreprocessor], train_config, use_cudnn=False):
    if use_cudnn:
        print('Using CudnnGru')
        recurrent_layer = CudnnGru(
            train_config.dim,
            w_init=TruncatedNormal(stddev=train_config.recurrent_stdev))
    else:
        recurrent_layer = BiRecurrentMapper(CompatGruCellSpec(train_config.dim))

    lm_reduce = MapperSeq(
        ElmoLayer(
            train_config.l2,
            layer_norm=train_config.lm_layernorm,
            top_layer_only=train_config.top_layer_only
        ),
        DropoutLayer(train_config.elmo_dropout),
    )

    answer_encoder = GroupedSpanAnswerEncoder()
    predictor = BoundsPredictor(
        ChainBiMapper(
            first_layer=recurrent_layer,
            second_layer=recurrent_layer
        ),
        span_predictor=IndependentBoundsGrouped(aggregate="sum")
    )

    word_embed = FixedWordEmbedder(
        vec_name=train_config.word_vectors,
        word_vec_init_scale=0,
        learn_unk=train_config.learn_unk_vector,
        cpu=True
    )
    char_embed = CharWordEmbedder(
        LearnedCharEmbedder(
            word_size_th=14,
            char_th=train_config.char_th,
            char_dim=train_config.char_dim,
            init_scale=0.05,
            force_cpu=True
        ),
        MaxPool(Conv1d(100, 5, 0.8)),
        shared_parameters=True
    )
    embed_mapper = SequenceMapperSeq(
        VariationalDropoutLayer(train_config.var_dropout),
        recurrent_layer,
        VariationalDropoutLayer(train_config.var_dropout)
    )

    attention = BiAttention(TriLinear(bias=True), True)
    match_encoder = SequenceMapperSeq(
        FullyConnected(train_config.dim * 2, activation="relu"),
        ResidualLayer(SequenceMapperSeq(
            VariationalDropoutLayer(train_config.var_dropout),
            recurrent_layer,
            VariationalDropoutLayer(train_config.var_dropout),
            StaticAttentionSelf(TriLinear(bias=True), ConcatWithProduct()),
            FullyConnected(train_config.dim * 2, activation="relu"),
        )),
        VariationalDropoutLayer(train_config.var_dropout)
    )

    lm_model = LanguageModel(LM_VOCAB, LM_OPTIONS, LM_WEIGHTS, LM_TOKEN_WEIGHTS)

    model = CapeAttentionWithElmo(
        encoder=DocumentAndQuestionEncoder(answer_encoder),
        lm_model=lm_model,
        max_batch_size=train_config.max_batch_size,
        preprocess=preprocess,
        per_sentence=False,
        append_embed=(train_config.elmo_mode == "both" or train_config.elmo_mode == "input"),
        append_before_atten=(train_config.elmo_mode == "both" or train_config.elmo_mode == "output"),
        word_embed=word_embed,
        char_embed=char_embed,
        embed_mapper=embed_mapper,
        lm_reduce=None,
        lm_reduce_shared=lm_reduce,
        memory_builder=NullBiMapper(),
        attention=attention,
        match_encoder=match_encoder,
        predictor=predictor
    )
    return model
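# A hypothetical train_config sketch (not part of the original code) listing
# exactly the attributes build_model reads; the values are illustrative,
# roughly matching the hard-coded settings used elsewhere in this repo.
def _example_train_config():
    from types import SimpleNamespace
    return SimpleNamespace(
        dim=100,                       # recurrent hidden size
        recurrent_stdev=0.05,          # CudnnGru weight init stddev
        l2=0.0,                        # ElmoLayer l2 penalty
        lm_layernorm=False,
        top_layer_only=False,
        elmo_dropout=0.5,
        word_vectors="glove.840B.300d",
        learn_unk_vector=False,
        char_th=49,
        char_dim=20,
        var_dropout=0.8,
        max_batch_size=128,
        elmo_mode="both",              # "input", "output", "both", or "none"
    )
# e.g. model = build_model(preprocess=None, train_config=_example_train_config(), use_cudnn=True)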
def main():
    parser = argparse.ArgumentParser("Train rejection model on SQuAD")
    parser.add_argument("--corpus_dir", type=str, default="~/data/document-qa")
    parser.add_argument("--output_dir", type=str, default="~/model/document-qa/squad")
    parser.add_argument("--lm_dir", type=str, default="~/data/lm")
    parser.add_argument("--exp_id", type=str, default="rejection")
    parser.add_argument("--lr", type=float, default=0.5)
    parser.add_argument("--epoch", type=int, default=20)
    parser.add_argument("--dim", type=int, default=100)
    parser.add_argument("--batch_size", type=int, default=45)
    parser.add_argument("--l2", type=float, default=0)
    parser.add_argument("--mode", choices=["input", "output", "both", "none"],
                        default="both")
    parser.add_argument("--top_layer_only", action="store_true")
    args = parser.parse_args()
    print("Arguments: ", args)

    out = args.output_dir + "_" + args.exp_id + "_lr" + str(args.lr) + \
        "-" + datetime.now().strftime("%m%d-%H%M%S")
    dim = args.dim
    batch_size = args.batch_size
    out = expanduser(out)
    lm_dir = expanduser(args.lm_dir)
    corpus_dir = expanduser(args.corpus_dir)

    print("Make global recurrent_layer...")
    recurrent_layer = CudnnGru(
        dim, w_init=tf.keras.initializers.TruncatedNormal(stddev=0.05))

    params = trainer.TrainParams(
        trainer.SerializableOptimizer("Adadelta", dict(learning_rate=args.lr)),
        ema=0.999, max_checkpoints_to_keep=2, async_encoding=10,
        num_epochs=args.epoch, log_period=30, eval_period=1200, save_period=1200,
        best_weights=("dev", "b17/text-f1"),
        eval_samples=dict(dev=None, train=8000))

    lm_reduce = MapperSeq(
        ElmoLayer(args.l2, layer_norm=False, top_layer_only=args.top_layer_only),
        DropoutLayer(0.5),
    )

    model = AttentionWithElmo(
        encoder=DocumentAndQuestionEncoder(SingleSpanAnswerEncoder()),
        lm_model=SquadContextConcatSkip(lm_dir=lm_dir),
        append_before_atten=(args.mode == "both" or args.mode == "output"),
        append_embed=(args.mode == "both" or args.mode == "input"),
        max_batch_size=128,
        word_embed=FixedWordEmbedder(vec_name="glove.840B.300d",
                                     word_vec_init_scale=0, learn_unk=False,
                                     cpu=True),
        char_embed=CharWordEmbedder(
            LearnedCharEmbedder(word_size_th=14, char_th=49, char_dim=20,
                                init_scale=0.05, force_cpu=True),
            MaxPool(Conv1d(100, 5, 0.8)),
            shared_parameters=True),
        embed_mapper=SequenceMapperSeq(
            VariationalDropoutLayer(0.8),
            recurrent_layer,
            VariationalDropoutLayer(0.8),
        ),
        lm_reduce=None,
        lm_reduce_shared=lm_reduce,
        per_sentence=False,
        memory_builder=NullBiMapper(),
        attention=BiAttention(TriLinear(bias=True), True),
        match_encoder=SequenceMapperSeq(
            FullyConnected(dim * 2, activation="relu"),
            ResidualLayer(SequenceMapperSeq(
                VariationalDropoutLayer(0.8),
                recurrent_layer,
                VariationalDropoutLayer(0.8),
                StaticAttentionSelf(TriLinear(bias=True), ConcatWithProduct()),
                FullyConnected(dim * 2, activation="relu"),
            )),
            VariationalDropoutLayer(0.8)),
        predictor=BoundsPredictor(
            ChainBiMapper(first_layer=recurrent_layer,
                          second_layer=recurrent_layer)))

    batcher = ClusteredBatcher(batch_size, ContextLenKey(), False, False)
    data = DocumentQaTrainingData(SquadCorpus(corpus_dir), None, batcher, batcher)

    with open(__file__, "r") as f:
        notes = f.read()
    notes = str(sorted(args.__dict__.items(), key=lambda x: x[0])) + "\n" + notes

    trainer.start_training(
        data, model, params,
        [LossEvaluator(), SpanEvaluator(bound=[17], text_eval="squad")],
        ModelDir(out), notes)
class Mac(Configurable):
    def __init__(self, hidden_dim):
        # control unit
        self.control_lin = FullyConnected(hidden_dim)
        self.attn = FullyConnected(1)
        # read unit
        self.mem_drop = DropoutLayer(0.85)
        self.read_drop = DropoutLayer(0.85)
        self.mem_proj = FullyConnected(hidden_dim)
        self.kb_proj = FullyConnected(hidden_dim)
        self.concat = FullyConnected(hidden_dim)
        self.concat2 = FullyConnected(hidden_dim)
        self.bi = CtrlBiAttention(TriLinear(bias=True))
        self.lin = FullyConnected(hidden_dim)
        self.rattn = FullyConnected(1)
        # write unit
        self.write = FullyConnected(hidden_dim)
        self.gate = FullyConnected(1)

    def apply(self, is_train, document, question_words, question_vec, prev_cont,
              position_aware_cont, prev_mem, reuse, document_mask=None,
              question_mask=None):
        # control unit: attend over the question words, conditioned on the
        # previous control state and the position-aware question projection
        with tf.variable_scope("control", reuse=reuse):
            control = tf.concat([prev_cont, position_aware_cont], axis=1)  # B, 2xF
            control_question = self.control_lin.apply(is_train, control)   # B, F
            control_question = tf.expand_dims(control_question, axis=1)    # B, 1, F

            context_prod = control_question * question_words               # B, L, F
            attn_weight = tf.squeeze(self.attn.apply(is_train, context_prod), axis=2)  # B, L
            if question_mask is not None:
                m = tf.sequence_mask(question_mask)
                attn_weight += VERY_NEGATIVE_NUMBER * (1 - tf.cast(m, context_prod.dtype))

            ctrl_attn = tf.nn.softmax(attn_weight, 1)                      # B, L
            attn = tf.expand_dims(ctrl_attn, axis=2)                       # B, L, 1
            next_control = tf.reduce_sum(attn * question_words, axis=1)    # B, F

        # read unit: retrieve document information relevant to the current
        # control state and the previous memory
        with tf.variable_scope("read", reuse=reuse):
            last_mem = self.mem_drop.apply(is_train, prev_mem)
            know = self.read_drop.apply(is_train, document)
            proj_mem = tf.expand_dims(self.mem_proj.apply(is_train, last_mem), axis=1)
            proj_know = self.kb_proj.apply(is_train, know)
            concat = self.concat.apply(
                is_train, tf.concat([proj_mem * proj_know, proj_know], axis=2))
            out = tf.nn.elu(
                self.lin.apply(
                    is_train,
                    self.bi.apply(is_train, concat, question_words, ctrl_attn,
                                  document_mask, question_mask)))
            attn = self.read_drop.apply(is_train, out)
            attn = tf.squeeze(self.rattn.apply(is_train, attn), axis=-1)
            if document_mask is not None:
                m = tf.sequence_mask(document_mask)
                attn += VERY_NEGATIVE_NUMBER * (1 - tf.cast(m, attn.dtype))
            attn = tf.expand_dims(tf.nn.softmax(attn, 1), axis=2)
            read = tf.reduce_sum(attn * know, axis=1)

        # write unit, with a memory gate biased toward keeping the old memory
        with tf.variable_scope("write", reuse=reuse):
            concat = self.write.apply(
                is_train, tf.concat([read, prev_mem, next_control], axis=1))
            gate = tf.sigmoid(self.gate.apply(is_train, next_control) + 1.0)
            next_mem = gate * prev_mem + (1 - gate) * concat

        # return the updated control/memory states and the read-unit output
        return next_control, next_mem, out
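# A simplified NumPy sketch (not part of the original code) of what one MAC step
# computes: the control unit's attention over question words and the write
# unit's gated memory update. The learned FullyConnected projections are
# replaced by hypothetical stand-ins so the arithmetic runs on its own.
def _mac_step_demo():
    import numpy as np

    def softmax(x, axis=-1):
        x = x - x.max(axis=axis, keepdims=True)
        e = np.exp(x)
        return e / e.sum(axis=axis, keepdims=True)

    rng = np.random.default_rng(0)
    B, L, F = 2, 5, 8                                    # batch, question length, hidden dim
    question_words = rng.normal(size=(B, L, F))
    control_question = rng.normal(size=(B, F))           # stand-in for control_lin output
    prev_mem = rng.normal(size=(B, F))
    read = rng.normal(size=(B, F))                       # stand-in for the read unit's output

    # control unit: score each question word against the control state,
    # softmax over positions, take the attention-weighted sum
    scores = (question_words * control_question[:, None, :]).sum(-1)    # B, L
    ctrl_attn = softmax(scores, axis=1)                                  # B, L
    next_control = (ctrl_attn[:, :, None] * question_words).sum(1)       # B, F

    # write unit: a sigmoid gate (biased toward keeping the old memory via the
    # "+ 1.0", as above) interpolates between prev_mem and the new candidate
    gate = 1.0 / (1.0 + np.exp(-(next_control.mean(-1, keepdims=True) + 1.0)))
    candidate = np.tanh(read + prev_mem + next_control)                   # stand-in for self.write
    next_mem = gate * prev_mem + (1.0 - gate) * candidate
    return next_control, next_mem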