Example #1
0
 def __init__(
         self,
         kind: str,
         n_units,
         n_layers=1,
         # It's not obvious how to compute fan_in/fan_out for these
         # models, so we recommend avoiding glorot initialization for now.
         w_init=TruncatedNormal(stddev=0.05),
         recurrent_init=None,
         bidirectional=True,
         learn_initial_states: bool = False,
         lstm_bias=1,
         keep_recurrent: float = 1):
     """Configure a recurrent layer specification.

     Args:
         kind: Cell type; must be "GRU" or "LSTM".
         n_units: Number of hidden units (required, must not be None).
         n_layers: Number of stacked recurrent layers.
         w_init: Initializer for the layer weights.
         recurrent_init: Optional separate initializer for recurrent weights.
         bidirectional: Whether the layer runs in both directions.
         learn_initial_states: Whether initial states are trainable.
         lstm_bias: Bias value for LSTM cells (presumably the forget-gate
             bias -- confirm against the cell implementation).
         keep_recurrent: Presumably a dropout keep probability for
             recurrent connections -- confirm against usage.

     Raises:
         ValueError: If a required argument is None, or ``kind`` is not
             one of "GRU"/"LSTM".
     """
     # Fail fast with informative messages rather than bare ValueError().
     if bidirectional is None or n_layers is None or n_units is None:
         raise ValueError(
             "n_units, n_layers and bidirectional must not be None")
     if kind not in ("GRU", "LSTM"):
         raise ValueError("kind must be 'GRU' or 'LSTM', got %r" % kind)
     self._kind = kind
     self.keep_recurrent = keep_recurrent
     self.lstm_bias = lstm_bias
     self.n_units = n_units
     self.n_layers = n_layers
     self.bidirectional = bidirectional
     self.w_init = w_init
     self.recurrent_init = recurrent_init
     self.learn_initial_states = learn_initial_states
Example #2
0
 def __init__(self,
              n_units,
              n_layers=1,
              lstm_bias=1,
              w_init=TruncatedNormal(stddev=0.05),
              recurrent_init=None,
              bidirectional=True,
              learn_initial_states=False):
     """Construct an LSTM-flavored instance of the base recurrent layer."""
     # The parent constructor takes the cell kind first and lstm_bias
     # last, so pack the arguments in that positional order.
     base_args = ("LSTM", n_units, n_layers, w_init, recurrent_init,
                  bidirectional, learn_initial_states, lstm_bias)
     super().__init__(*base_args)
Example #3
0
 def __init__(self, num_mac_cells: int, hidden_dim: int):
     """Build a stack of MAC cells with question-encoding components."""
     self.cells = num_mac_cells
     self.hidden_dim = hidden_dim
     self.mac = Mac(hidden_dim)
     # Question encoder GRU at half the hidden dimension.
     self.qenc = CudnnGru(hidden_dim // 2,
                          w_init=TruncatedNormal(stddev=0.05))
     self.question_drop = DropoutLayer(0.92)
     self.control_proj = FullyConnected(hidden_dim)
     # One fully-connected projection per MAC cell.
     self.acts = [FullyConnected(hidden_dim) for _ in range(num_mac_cells)]
Example #4
0
def main():
    """Train an ELMo-augmented SQuAD QA model.

    Parses command-line options, builds the dataset pipeline and model
    for either the 'default' (single-span) or 'confidence' (dense
    multi-span) loss mode, then starts training, writing checkpoints to
    a timestamp-suffixed output directory.
    """
    parser = argparse.ArgumentParser("Train our ELMo model on SQuAD")
    parser.add_argument("loss_mode", choices=['default', 'confidence'])
    parser.add_argument("output_dir")
    parser.add_argument("--dim", type=int, default=90)
    parser.add_argument("--l2", type=float, default=0)
    # Where to append the ELMo representations relative to attention.
    parser.add_argument("--mode",
                        choices=["input", "output", "both", "none"],
                        default="both")
    parser.add_argument("--top_layer_only", action="store_true")
    parser.add_argument("--no-tfidf",
                        action='store_true',
                        help="Don't add TF-IDF negative examples")
    args = parser.parse_args()

    # Suffix the output directory with a month/day-hour/minute/second
    # timestamp so repeated runs get distinct directories.
    out = args.output_dir + "-" + datetime.now().strftime("%m%d-%H%M%S")

    dim = args.dim
    # Shared GRU instance reused in several places of the model below.
    recurrent_layer = CudnnGru(dim, w_init=TruncatedNormal(stddev=0.05))

    # argparse 'choices' guarantees exactly one of these branches runs,
    # so n_epochs / answer_encoder / predictor / data are always bound.
    if args.loss_mode == 'default':
        n_epochs = 24
        answer_encoder = SingleSpanAnswerEncoder()
        predictor = BoundsPredictor(
            ChainBiMapper(first_layer=recurrent_layer,
                          second_layer=recurrent_layer))
        batcher = ClusteredBatcher(45, ContextLenKey(), False, False)
        data = DocumentQaTrainingData(SquadCorpus(), None, batcher, batcher)
    elif args.loss_mode == 'confidence':
        # Confidence mode adds TF-IDF-ranked negative paragraphs unless
        # --no-tfidf is given; note the differing epoch counts.
        if args.no_tfidf:
            prepro = SquadDefault()
            n_epochs = 15
        else:
            prepro = SquadTfIdfRanker(NltkPlusStopWords(True), 4, True)
            n_epochs = 50
        answer_encoder = DenseMultiSpanAnswerEncoder()
        predictor = ConfidencePredictor(ChainBiMapper(
            first_layer=recurrent_layer,
            second_layer=recurrent_layer,
        ),
                                        AttentionEncoder(),
                                        FullyConnected(80, activation="tanh"),
                                        aggregate="sum")
        eval_dataset = RandomParagraphSetDatasetBuilder(
            100, 'flatten', True, 0)
        train_batching = ClusteredBatcher(45, ContextLenBucketedKey(3), True,
                                          False)
        data = PreprocessedData(SquadCorpus(),
                                prepro,
                                StratifyParagraphsBuilder(train_batching, 1),
                                eval_dataset,
                                eval_on_verified=False)
        # NOTE(review): the argument is presumably a worker count -- confirm.
        data.preprocess(1)

    # Training schedule; best weights tracked by dev "b17/text-f1".
    params = trainer.TrainParams(trainer.SerializableOptimizer(
        "Adadelta", dict(learning_rate=1.0)),
                                 ema=0.999,
                                 max_checkpoints_to_keep=2,
                                 async_encoding=10,
                                 num_epochs=n_epochs,
                                 log_period=30,
                                 eval_period=1200,
                                 save_period=1200,
                                 best_weights=("dev", "b17/text-f1"),
                                 eval_samples=dict(dev=None, train=8000))

    # Reduce the ELMo language-model layers to one representation,
    # followed by dropout.
    lm_reduce = MapperSeq(
        ElmoLayer(args.l2,
                  layer_norm=False,
                  top_layer_only=args.top_layer_only),
        DropoutLayer(0.5),
    )
    model = AttentionWithElmo(
        encoder=DocumentAndQuestionEncoder(answer_encoder),
        lm_model=SquadContextConcatSkip(),
        append_before_atten=(args.mode == "both" or args.mode == "output"),
        append_embed=(args.mode == "both" or args.mode == "input"),
        max_batch_size=128,
        word_embed=FixedWordEmbedder(vec_name="glove.840B.300d",
                                     word_vec_init_scale=0,
                                     learn_unk=False,
                                     cpu=True),
        char_embed=CharWordEmbedder(LearnedCharEmbedder(word_size_th=14,
                                                        char_th=49,
                                                        char_dim=20,
                                                        init_scale=0.05,
                                                        force_cpu=True),
                                    MaxPool(Conv1d(100, 5, 0.8)),
                                    shared_parameters=True),
        embed_mapper=SequenceMapperSeq(
            VariationalDropoutLayer(0.8),
            recurrent_layer,
            VariationalDropoutLayer(0.8),
        ),
        # The LM reduce layer is supplied as the *shared* variant only.
        lm_reduce=None,
        lm_reduce_shared=lm_reduce,
        per_sentence=False,
        memory_builder=NullBiMapper(),
        attention=BiAttention(TriLinear(bias=True), True),
        match_encoder=SequenceMapperSeq(
            FullyConnected(dim * 2, activation="relu"),
            ResidualLayer(
                SequenceMapperSeq(
                    VariationalDropoutLayer(0.8),
                    recurrent_layer,
                    VariationalDropoutLayer(0.8),
                    StaticAttentionSelf(TriLinear(bias=True),
                                        ConcatWithProduct()),
                    FullyConnected(dim * 2, activation="relu"),
                )), VariationalDropoutLayer(0.8)),
        predictor=predictor)

    # Record this script's own source plus the sorted CLI arguments as
    # run notes for reproducibility.
    with open(__file__, "r") as f:
        notes = f.read()
        notes = str(sorted(args.__dict__.items(),
                           key=lambda x: x[0])) + "\n" + notes

    trainer.start_training(
        data, model, params,
        [LossEvaluator(),
         SpanEvaluator(bound=[17], text_eval="squad")], ModelDir(out), notes)
def get_model(char_th: int, dim: int, mode: str,
              preprocess: Optional[TextPreprocessor]):
    """Build a MAC-attention QA model for the given training mode.

    Args:
        char_th: Character-count threshold for the learned char embedder.
        dim: Hidden dimension of the recurrent layers.
        mode: One of "shared-norm*", "confidence", "sigmoid",
            "paragraph", or "merge"; selects the answer encoder and
            span predictor.
        preprocess: Optional text preprocessor passed through to the model.

    Raises:
        NotImplementedError: If ``mode`` is not one of the supported values.
    """
    # Shared GRU instance reused in several places of the model below.
    recurrent_layer = CudnnGru(dim, w_init=TruncatedNormal(stddev=0.05))

    if mode.startswith("shared-norm"):
        answer_encoder = GroupedSpanAnswerEncoder()
        predictor = BoundsPredictor(
            ChainBiMapper(first_layer=recurrent_layer,
                          second_layer=recurrent_layer),
            span_predictor=IndependentBoundsGrouped(aggregate="sum"))
    elif mode == "confidence":
        answer_encoder = DenseMultiSpanAnswerEncoder()
        predictor = ConfidencePredictor(ChainBiMapper(
            first_layer=recurrent_layer,
            second_layer=recurrent_layer,
        ),
                                        AttentionEncoder(),
                                        FullyConnected(80, activation="tanh"),
                                        aggregate="sum")
    elif mode == "sigmoid":
        answer_encoder = DenseMultiSpanAnswerEncoder()
        predictor = BoundsPredictor(
            ChainBiMapper(first_layer=recurrent_layer,
                          second_layer=recurrent_layer),
            span_predictor=IndependentBoundsSigmoidLoss())
    elif mode == "paragraph" or mode == "merge":
        answer_encoder = DenseMultiSpanAnswerEncoder()
        predictor = BoundsPredictor(
            ChainBiMapper(first_layer=recurrent_layer,
                          second_layer=recurrent_layer))
    else:
        raise NotImplementedError(mode)

    return MacAttention(
        encoder=DocumentAndQuestionEncoder(answer_encoder),
        word_embed=FixedWordEmbedder(vec_name="glove.840B.300d",
                                     word_vec_init_scale=0,
                                     learn_unk=False,
                                     cpu=True),
        char_embed=CharWordEmbedder(LearnedCharEmbedder(word_size_th=14,
                                                        char_th=char_th,
                                                        char_dim=20,
                                                        init_scale=0.05,
                                                        force_cpu=True),
                                    MaxPool(Conv1d(100, 5, 0.8)),
                                    shared_parameters=True),
        preprocess=preprocess,
        word_embed_layer=None,
        embed_mapper=SequenceMapperSeq(
            VariationalDropoutLayer(0.8),
            recurrent_layer,
            VariationalDropoutLayer(0.8),
        ),
        question_mapper=None,
        context_mapper=None,
        memory_builder=NullBiMapper(),
        # Two MAC cells over a doubled hidden dimension.
        mac=MacNetwork(2, dim * 2),
        match_encoder=SequenceMapperSeq(
            FullyConnected(dim * 2, activation="relu"),
            ResidualLayer(
                SequenceMapperSeq(
                    VariationalDropoutLayer(0.8),
                    recurrent_layer,
                    VariationalDropoutLayer(0.8),
                    StaticAttentionSelf(TriLinear(bias=True),
                                        ConcatWithProduct()),
                    FullyConnected(dim * 2, activation="relu"),
                )), VariationalDropoutLayer(0.8)),
        predictor=predictor)