コード例 #1
0
def main():
    """Train the ELMo-augmented attention model on SQuAD.

    Command-line arguments:
        output_dir: prefix of the model directory; a ``-MMDD-HHMMSS``
            timestamp is appended to it.
        --dim: size passed to the GRU layer (the fully connected layers of
            the match encoder use ``2 * dim``).
        --l2: forwarded as the first argument of ``ElmoLayer``.
        --mode: "input" enables ``append_embed``, "output" enables
            ``append_before_atten``, "both" enables both flags and "none"
            enables neither.
        --top_layer_only: forwarded to ``ElmoLayer``.

    The sorted argument list followed by the text of this script is handed
    to the trainer as the run notes.
    """
    cli = argparse.ArgumentParser("Train our ELMo model on SQuAD")
    cli.add_argument("output_dir")
    cli.add_argument("--dim", type=int, default=90)
    cli.add_argument("--l2", type=float, default=0)
    cli.add_argument("--mode",
                     choices=["input", "output", "both", "none"],
                     default="both")
    cli.add_argument("--top_layer_only", action="store_true")
    args = cli.parse_args()

    # Timestamp suffix keeps separate runs in separate directories.
    stamp = datetime.now().strftime("%m%d-%H%M%S")
    out = args.output_dir + "-" + stamp

    dim = args.dim
    # The same GRU layer object is used at every recurrent position below.
    gru = CudnnGru(dim, w_init=TruncatedNormal(stddev=0.05))

    optimizer = trainer.SerializableOptimizer("Adadelta",
                                              dict(learning_rate=1.0))
    params = trainer.TrainParams(
        optimizer,
        ema=0.999,
        max_checkpoints_to_keep=2,
        async_encoding=10,
        num_epochs=24,
        log_period=30,
        eval_period=1200,
        save_period=1200,
        best_weights=("dev", "b17/text-f1"),
        eval_samples=dict(dev=None, train=8000),
    )

    elmo_layer = ElmoLayer(args.l2,
                           layer_norm=False,
                           top_layer_only=args.top_layer_only)
    lm_reduce = MapperSeq(elmo_layer, DropoutLayer(0.5))

    lm_on_output = args.mode in ("both", "output")
    lm_on_input = args.mode in ("both", "input")

    # Components are built in the same order the model receives them.
    encoder = DocumentAndQuestionEncoder(SingleSpanAnswerEncoder())
    lm_model = SquadContextConcatSkip()
    word_embed = FixedWordEmbedder(vec_name="glove.840B.300d",
                                   word_vec_init_scale=0,
                                   learn_unk=False,
                                   cpu=True)
    char_embed = CharWordEmbedder(
        LearnedCharEmbedder(word_size_th=14,
                            char_th=49,
                            char_dim=20,
                            init_scale=0.05,
                            force_cpu=True),
        MaxPool(Conv1d(100, 5, 0.8)),
        shared_parameters=True,
    )
    embed_mapper = SequenceMapperSeq(
        VariationalDropoutLayer(0.8),
        gru,
        VariationalDropoutLayer(0.8),
    )
    memory_builder = NullBiMapper()
    attention = BiAttention(TriLinear(bias=True), True)
    match_encoder = SequenceMapperSeq(
        FullyConnected(dim * 2, activation="relu"),
        ResidualLayer(
            SequenceMapperSeq(
                VariationalDropoutLayer(0.8),
                gru,
                VariationalDropoutLayer(0.8),
                StaticAttentionSelf(TriLinear(bias=True),
                                    ConcatWithProduct()),
                FullyConnected(dim * 2, activation="relu"),
            )),
        VariationalDropoutLayer(0.8),
    )
    predictor = BoundsPredictor(
        ChainBiMapper(first_layer=gru, second_layer=gru))

    model = AttentionWithElmo(
        encoder=encoder,
        lm_model=lm_model,
        append_before_atten=lm_on_output,
        append_embed=lm_on_input,
        max_batch_size=128,
        word_embed=word_embed,
        char_embed=char_embed,
        embed_mapper=embed_mapper,
        lm_reduce=None,
        lm_reduce_shared=lm_reduce,
        per_sentence=False,
        memory_builder=memory_builder,
        attention=attention,
        match_encoder=match_encoder,
        predictor=predictor,
    )

    # One batcher instance serves both the train and the eval slot.
    batcher = ClusteredBatcher(45, ContextLenKey(), False, False)
    data = DocumentQaTrainingData(SquadCorpus(), None, batcher, batcher)

    # Run notes: sorted CLI arguments, then this script's own source.
    with open(__file__, "r") as script:
        script_text = script.read()
    arg_items = sorted(args.__dict__.items(), key=lambda item: item[0])
    notes = str(arg_items) + "\n" + script_text

    evaluators = [
        LossEvaluator(),
        SpanEvaluator(bound=[17], text_eval="squad"),
    ]
    trainer.start_training(data, model, params, evaluators, ModelDir(out),
                           notes)
コード例 #2
0
def main():
    """Train a multiple-choice model in one of several paragraph modes.

    Positional command-line arguments:
        mode: "paragraph" trains in the known-paragraph setting with a
            ``MultiChoiceEvaluator``; the other modes ("confidence",
            "sigmoid", "shared-norm", "merge") build a ``PreprocessedData``
            pipeline and evaluate with the loss only.
        name: prefix of the output directory; a timestamp is appended.

    NOTE(review): the parser description mentions RACE, yet the data is
    loaded through ``SquadCorpus`` -- confirm which corpus is intended.
    """
    cli = argparse.ArgumentParser(
        description='Train a model on document-level RACE')
    cli.add_argument(
        'mode',
        choices=["paragraph", "confidence", "shared-norm", "merge", "sigmoid"])
    cli.add_argument("name", help="Output directory")
    args = cli.parse_args()
    mode = args.mode
    stamp = datetime.now().strftime("%m%d-%H%M%S")
    out = args.name + "-" + stamp

    corpus = SquadCorpus()
    # "merge" concatenates paragraphs, so paragraph start tokens are added.
    pre = (WithIndicators(True, para_tokens=False, doc_start_token=False)
           if mode == "merge" else None)

    model = get_model(50, 100, args.mode, pre)

    if mode == "paragraph":
        # The "standard" known-paragraph setting; no preprocessor supported.
        if model.preprocessor is not None:
            raise NotImplementedError()
        n_epochs = 25  # previously 26
        num_choices = 4
        train_batching = ClusteredBatcher(60, ContextLenBucketedKey(3), True,
                                          False)
        eval_batching = ClusteredBatcher(60, ContextLenKey(), False, False)
        data = DocumentQaTrainingData(corpus, None, train_batching,
                                      eval_batching, num_choices)
        evaluators = [LossEvaluator(), MultiChoiceEvaluator(num_choices)]
    else:
        # How the evaluation paragraphs of one question are batched.
        eval_layouts = {
            "confidence": "flatten",
            "sigmoid": "flatten",
            "shared-norm": "group",
            "merge": "merge",
        }
        eval_set_mode = eval_layouts[mode]
        eval_dataset = RandomParagraphSetDatasetBuilder(
            100, eval_set_mode, True, 0)

        if mode in ("confidence", "sigmoid"):
            # "sigmoid" needs to be trained for a really long time for
            # reasons unknown (even 100 might be too small); "confidence"
            # gets more epochs since the label is only "seen" roughly every
            # other epoch.
            n_epochs = 100 if mode == "sigmoid" else 50
            train_batching = ClusteredBatcher(45, ContextLenBucketedKey(3),
                                              True, False)
            data = PreprocessedData(
                SquadCorpus(),
                SquadTfIdfRanker(NltkPlusStopWords(True), 4, True,
                                 model.preprocessor),
                StratifyParagraphsBuilder(train_batching, 1),
                eval_dataset,
                eval_on_verified=False,
            )
        else:
            n_epochs = 26
            data = PreprocessedData(
                SquadCorpus(),
                SquadTfIdfRanker(NltkPlusStopWords(True), 4, True,
                                 model.preprocessor),
                StratifyParagraphSetsBuilder(25, args.mode == "merge", True,
                                             1),
                eval_dataset,
                eval_on_verified=False,
            )

        evaluators = [LossEvaluator()]
        data.preprocess(1)

    # Run notes: the chosen mode, then this script's own source.
    with open(__file__, "r") as script:
        script_text = script.read()
    notes = args.mode + "\n" + script_text

    trainer.start_training(data, model, train_params(n_epochs), evaluators,
                           model_dir.ModelDir(out), notes)
コード例 #3
0
def main():
    """Train a multi-paragraph model on TriviaQA.

    Command-line arguments:
        mode: one of "confidence", "merge", "shared-norm", "sigmoid" or
            "paragraph"; selects the epoch count and the train/test dataset
            builders.
        name: prefix of the model directory; a timestamp is appended.
        -t/--n_tokens: paragraph size handed to ``MergeParagraphs``.
        -n/--n_processes: first argument of ``data.preprocess``.

    NOTE(review): the parser description says "unfiltered" while the data
    comes from ``TriviaQaWebDataset`` -- confirm this is intended.
    """
    cli = argparse.ArgumentParser(
        description='Train a model on TriviaQA unfiltered')
    cli.add_argument(
        'mode',
        choices=["confidence", "merge", "shared-norm", "sigmoid", "paragraph"])
    cli.add_argument("name", help="Where to store the model")
    cli.add_argument("-t",
                     "--n_tokens",
                     default=400,
                     type=int,
                     help="Paragraph size")
    cli.add_argument(
        '-n',
        '--n_processes',
        type=int,
        default=2,
        help="Number of processes (i.e., select which paragraphs to train on) "
        "the data with")
    args = cli.parse_args()
    mode = args.mode

    stamp = datetime.now().strftime("%m%d-%H%M%S")
    out = args.name + "-" + stamp

    model = get_model(100, 140, mode, WithIndicators())

    extract = ExtractMultiParagraphsPerQuestion(MergeParagraphs(args.n_tokens),
                                                ShallowOpenWebRanker(16),
                                                model.preprocessor,
                                                intern=True)

    evaluators = [
        LossEvaluator(),
        MultiParagraphSpanEvaluator(8, "triviaqa", mode != "merge"),
    ]
    oversample = [1] * 4

    # Epoch budget per mode; the set-based modes ("merge", "shared-norm")
    # fall through to the default of 80.
    epochs_by_mode = {"paragraph": 120, "sigmoid": 640, "confidence": 160}
    n_epochs = epochs_by_mode.get(mode, 80)

    if mode in epochs_by_mode:
        # Single-paragraph training examples, flattened evaluation sets.
        test = RandomParagraphSetDatasetBuilder(120, "flatten", True,
                                                oversample)
        para_batcher = ClusteredBatcher(60, ContextLenBucketedKey(3), True)
        if mode == "paragraph":
            train = StratifyParagraphsBuilder(para_batcher,
                                              oversample,
                                              only_answers=True)
        else:
            train = StratifyParagraphsBuilder(para_batcher, oversample)
    else:
        merging = mode == "merge"
        test = RandomParagraphSetDatasetBuilder(
            120, "merge" if merging else "group", True, oversample)
        train = StratifyParagraphSetsBuilder(30, merging, True, oversample)

    data = TriviaQaWebDataset()

    optimizer = SerializableOptimizer("Adadelta", dict(learning_rate=1))
    params = TrainParams(optimizer,
                         num_epochs=n_epochs,
                         ema=0.999,
                         max_checkpoints_to_keep=2,
                         async_encoding=10,
                         log_period=30,
                         eval_period=1800,
                         save_period=1800,
                         eval_samples=dict(dev=None, train=6000))

    data = PreprocessedData(data, extract, train, test, eval_on_verified=False)

    data.preprocess(args.n_processes, 1000)

    # Run notes: the chosen mode, then this script's own source.
    with open(__file__, "r") as script:
        script_text = script.read()
    notes = "Mode: " + args.mode + "\n" + script_text

    trainer.start_training(data, model, params, evaluators,
                           model_dir.ModelDir(out), notes)
コード例 #4
0
def main():
    """Train a model on document-level SQuAD.

    Command-line arguments (unrecognised extra arguments are ignored):
        mode: "paragraph" trains in the known-paragraph setting; the other
            modes ("confidence", "sigmoid", "shared-norm", "merge") train on
            paragraphs chosen by a preprocessor.
        name: prefix of the output directory; a timestamp is appended.
        --no-tfidf: use ``SquadDefault``/``SquadWeighted`` instead of
            ``SquadTfIdfRanker`` as the preprocessor.
        --weighted-questions: use the weighted preprocessor and dataset
            builders; only honoured together with ``--no-tfidf``.

    Raises:
        NotImplementedError: in "paragraph" mode when the model carries a
            preprocessor, or when ``--weighted-questions`` is requested
            without ``--no-tfidf`` in any other mode.
    """
    cli = argparse.ArgumentParser(
        description='Train a model on document-level SQuAD')
    cli.add_argument(
        'mode',
        choices=["paragraph", "confidence", "shared-norm", "merge", "sigmoid"])
    cli.add_argument("name", help="Output directory")
    cli.add_argument("--no-tfidf",
                     action='store_true',
                     help="Don't add TF-IDF negative examples")
    cli.add_argument("--weighted-questions",
                     action='store_true',
                     help="Read a weighted training dataset")
    args, _unknown = cli.parse_known_args()
    mode = args.mode
    stamp = datetime.now().strftime("%m%d-%H%M%S")
    out = args.name + "-" + stamp

    corpus = SquadCorpus()
    # "merge" concatenates paragraphs, so paragraph start tokens are added.
    pre = (WithIndicators(True, para_tokens=False, doc_start_token=False)
           if mode == "merge" else None)

    model = get_model(50, 100, args.mode, pre)

    if mode == "paragraph":
        if model.preprocessor is not None:
            raise NotImplementedError()
        n_epochs = 26
        train_batching = ClusteredBatcher(45, ContextLenBucketedKey(3), True,
                                          False)
        eval_batching = ClusteredBatcher(45, ContextLenKey(), False, False)
        data = DocumentQaTrainingData(corpus, None, train_batching,
                                      eval_batching)
        evaluators = [
            LossEvaluator(),
            SpanEvaluator(bound=[17], text_eval="squad"),
        ]
    else:
        # How the evaluation paragraphs of one question are batched.
        eval_layouts = {
            "confidence": "flatten",
            "sigmoid": "flatten",
            "shared-norm": "group",
            "merge": "merge",
        }
        eval_set_mode = eval_layouts[mode]
        eval_dataset = RandomParagraphSetDatasetBuilder(
            100, eval_set_mode, True, 0)

        weighted = args.weighted_questions
        if not args.no_tfidf:
            if weighted:
                raise NotImplementedError(
                    "Weighted questions not supported for tf-idf mode yet")
            prepro = SquadTfIdfRanker(NltkPlusStopWords(True), 4, True,
                                      model.preprocessor)
        elif weighted:
            prepro = SquadWeighted(model.preprocessor)
        else:
            prepro = SquadDefault(model.preprocessor)

        if mode in ("confidence", "sigmoid"):
            if mode == "sigmoid":
                # Needs to be trained for a really long time for reasons
                # unknown; even this might be too small.
                n_epochs = 100
            elif args.no_tfidf:
                # The whole dataset is seen every epoch.
                n_epochs = 15
            else:
                # More epochs since the label is only "seen" roughly every
                # other epoch.
                n_epochs = 50

            train_batching = ClusteredBatcher(45, ContextLenBucketedKey(3),
                                              True, False)
            builder_cls = (WeightedStratifyParagraphsBuilder
                           if weighted else StratifyParagraphsBuilder)
            dataset_builder = builder_cls(train_batching, 1)
        else:
            n_epochs = 26
            builder_cls = (WeightedStratifyParagraphSetsBuilder
                           if weighted else StratifyParagraphSetsBuilder)
            dataset_builder = builder_cls(25, args.mode == "merge", True, 1)

        data = PreprocessedData(
            SquadCorpus(),
            prepro,
            dataset_builder,
            eval_dataset,
            eval_on_verified=False,
        )

        evaluators = [LossEvaluator(), MultiParagraphSpanEvaluator(17, "squad")]
        data.preprocess(1)

    # Run notes: the chosen mode, then this script's own source.
    with open(__file__, "r") as script:
        script_text = script.read()
    notes = args.mode + "\n" + script_text

    trainer.start_training(data, model, train_params(n_epochs), evaluators,
                           model_dir.ModelDir(out), notes)
コード例 #5
0
def main():
    """Train a model on TriviaQA web.

    Command-line arguments:
        mode: "paragraph-level" trains on a single extracted paragraph per
            example; "confidence"/"sigmoid" train on stratified single
            paragraphs; "merge", "shared-norm" and "shared-norm-600" train
            on paragraph sets ("shared-norm-600" merges paragraphs up to 600
            tokens instead of 400).
        name: prefix of the model directory; a timestamp is appended.
        -n/--n_processes: first argument of ``data.preprocess``.
    """
    cli = argparse.ArgumentParser(description='Train a model on TriviaQA web')
    cli.add_argument('mode',
                     choices=[
                         "paragraph-level", "confidence", "merge",
                         "shared-norm", "sigmoid", "shared-norm-600"
                     ])
    cli.add_argument("name", help="Where to store the model")
    cli.add_argument(
        '-n',
        '--n_processes',
        type=int,
        default=2,
        help="Number of processes (i.e., select which paragraphs to train on) "
        "the data with")
    args = cli.parse_args()
    mode = args.mode
    single_paragraph = mode == "paragraph-level"

    stamp = datetime.now().strftime("%m%d-%H%M%S")
    out = args.name + "-" + stamp

    model = get_model(100, 140, mode, WithIndicators())

    stop = NltkPlusStopWords(True)

    if single_paragraph:
        extract = ExtractSingleParagraph(MergeParagraphs(400),
                                         TopTfIdf(stop, 1),
                                         model.preprocessor,
                                         intern=True)
    else:
        merge_size = 600 if mode == "shared-norm-600" else 400
        extract = ExtractMultiParagraphs(MergeParagraphs(merge_size),
                                         TopTfIdf(stop, 4),
                                         model.preprocessor,
                                         intern=True)

    if single_paragraph:
        n_epochs = 16
        train = ParagraphAndQuestionsBuilder(
            ClusteredBatcher(60, ContextLenBucketedKey(3), True))
        test = ParagraphAndQuestionsBuilder(
            ClusteredBatcher(60, ContextLenKey(), False))
        n_dev, n_train = 21000, 12000
        evaluators = [LossEvaluator(), SpanEvaluator([4, 8], "triviaqa")]
    else:
        evaluators = [
            LossEvaluator(),
            MultiParagraphSpanEvaluator(8, "triviaqa", mode != "merge"),
        ]
        # Two paragraphs are sampled per (question, doc) pair, so evaluate
        # on fewer questions.
        n_dev, n_train = 15000, 8000

        if mode in ("confidence", "sigmoid"):
            # "sigmoid" trains very slowly, do this at your own risk.
            n_epochs = 71 if mode == "sigmoid" else 28
            test = RandomParagraphSetDatasetBuilder(120, "flatten", True, 1)
            train = StratifyParagraphsBuilder(
                ClusteredBatcher(60, ContextLenBucketedKey(3), True), 0, 1)
        else:
            n_epochs = 14
            merging = mode == "merge"
            test = RandomParagraphSetDatasetBuilder(
                120, "merge" if merging else "group", True, 1)
            train = StratifyParagraphSetsBuilder(35, merging, True, 1)

    data = TriviaQaWebDataset()

    params = get_triviaqa_train_params(n_epochs, n_dev, n_train)

    data = PreprocessedData(data, extract, train, test, eval_on_verified=False)

    data.preprocess(args.n_processes, 1000)

    # Run notes: a banner naming the mode, then this script's own source.
    with open(__file__, "r") as script:
        script_text = script.read()
    banner = "*" * 10
    notes = banner + "\nMode: " + args.mode + "\n" + banner + "\n" + script_text

    trainer.start_training(data, model, params, evaluators,
                           model_dir.ModelDir(out), notes)
コード例 #6
0
class RandomParagraphSetDataset(Dataset):
    """
    Sample multiple paragraphs for each question and include them in the same batch.

    Three batching modes are supported:

    * ``"flatten"``: every selected paragraph becomes an independent
      question/paragraph pair, and the pairs are batched.
    * ``"group"``: questions are batched, and each batch holds one pair per
      selected paragraph; pairs of the same question share a group id.
    * ``"merge"``: questions are batched, and the selected paragraphs of a
      question are merged into a single paragraph.
    """
    def __init__(self, questions: List[MultiParagraphQuestion], true_len: int,
                 n_paragraphs: int, batch_size: int, mode: str,
                 force_answer: bool, oversample_first_answer: List[int]):
        """
        :param questions: questions to sample paragraphs for
        :param true_len: reference question count; only used as the
            denominator in `percent_filtered`
        :param n_paragraphs: maximum number of paragraphs selected per question
        :param batch_size: batch size handed to the `ClusteredBatcher`
        :param mode: "flatten", "group" or "merge" (see the class docstring)
        :param force_answer: if True, one paragraph containing an answer span
            is always selected when the question has such a paragraph
        :param oversample_first_answer: the i-th entry is the number of extra
            lottery tickets the i-th answer-containing paragraph receives
            when the forced answer paragraph is drawn
        """
        self.mode = mode
        self.questions = questions
        self.force_answer = force_answer
        self.true_len = true_len
        self.n_paragraphs = n_paragraphs
        self.oversample_first_answer = oversample_first_answer
        # Number of (question, paragraph) pairs one epoch produces in
        # "flatten" mode.
        self._n_pairs = sum(
            min(len(q.paragraphs), n_paragraphs) for q in questions)
        self.batcher = ClusteredBatcher(batch_size,
                                        lambda x: x.n_context_words,
                                        truncate_batches=True)

    def get_vocab(self):
        """Return the set of all tokens in the questions and their paragraphs."""
        voc = set()
        for q in self.questions:
            voc.update(q.question)
            for para in q.paragraphs:
                voc.update(para.text)
        return voc

    def get_spec(self):
        """
        Build a `ParagraphAndQuestionSpec` from the longest question and the
        longest paragraph; the batch size is only fixed in "merge" mode.
        """
        max_q_len = max(len(q.question) for q in self.questions)
        max_c_len = max(
            max(len(p.text) for p in q.paragraphs) for q in self.questions)
        return ParagraphAndQuestionSpec(
            self.batcher.get_fixed_batch_size()
            if self.mode == "merge" else None, max_q_len, max_c_len, None)

    def get_epoch(self):
        """Return a generator over the batches of one epoch."""
        return self._build_expanded_batches(self.questions)

    def _build_expanded_batches(self, questions):
        """
        Generator: select paragraphs for each of `questions`, then yield
        batches laid out according to `self.mode`.

        :raises NotImplementedError: if over-sampling is configured without
            `force_answer` for a question with more than `n_paragraphs`
            paragraphs
        :raises RuntimeError: if `self.mode` is not a known mode
        """
        # We first pick paragraph(s) for each question in the entire training set so we
        # can cluster by context length accurately
        out = []
        for q in questions:
            if len(q.paragraphs) <= self.n_paragraphs:
                # Few enough paragraphs: take all of them.
                selected = np.arange(len(q.paragraphs))
            elif not self.force_answer and len(
                    self.oversample_first_answer) == 0:
                selected = np.random.choice(len(q.paragraphs),
                                            self.n_paragraphs,
                                            replace=False)
            else:
                if not self.force_answer:
                    raise NotImplementedError()
                with_answer = [
                    i for i, p in enumerate(q.paragraphs)
                    if len(p.answer_spans) > 0
                ]
                # Duplicate the leading answer paragraphs so they are drawn
                # more often; iterate over a copy since the list grows.
                for ix, over_sample in zip(list(with_answer),
                                           self.oversample_first_answer):
                    with_answer += [ix] * over_sample
                if with_answer:
                    answer_selection = with_answer[np.random.randint(
                        len(with_answer))]
                    other = np.array([
                        i for i, x in enumerate(q.paragraphs)
                        if i != answer_selection
                    ])
                    selected = np.random.choice(other,
                                                min(len(other),
                                                    self.n_paragraphs - 1),
                                                replace=False)
                    # The forced answer paragraph always comes first.
                    selected = np.insert(selected, 0, answer_selection)
                else:
                    # No paragraph has an answer: fall back to a plain draw.
                    selected = np.random.choice(len(q.paragraphs),
                                                self.n_paragraphs,
                                                replace=False)

            if self.mode == "flatten":
                for i in selected:
                    out.append(q.paragraphs[i].build_qa_pair(
                        q.question, q.question_id, q.answer_text))
            else:
                out.append(ParagraphSelection(q, selected))

        out.sort(key=lambda x: x.n_context_words)

        if self.mode == "flatten":
            for batch in self.batcher.get_epoch(out):
                yield batch
        elif self.mode == "group":
            # `group` is a running id shared by all pairs of one question.
            group = 0
            for selection_batch in self.batcher.get_epoch(out):
                batch = []
                for selected in selection_batch:
                    q = selected.question
                    for i in selected.selection:
                        para = q.paragraphs[i]
                        batch.append(
                            para.build_qa_pair(q.question, q.question_id,
                                               q.answer_text, group))
                    group += 1
                yield batch
        elif self.mode == "merge":
            for selection_batch in self.batcher.get_epoch(out):
                batch = []
                for selected in selection_batch:
                    q = selected.question
                    paras = [q.paragraphs[i] for i in selected.selection]
                    para = paras[0].merge(paras)
                    batch.append(
                        para.build_qa_pair(q.question, q.question_id,
                                           q.answer_text))
                yield batch
        else:
            raise RuntimeError("Unknown mode: %r" % (self.mode,))

    def get_samples(self, n_examples):
        """
        Return `(batches, n_batches)` for `n_examples` questions drawn
        without replacement.

        :raises ValueError: (from numpy) if `n_examples` exceeds the number
            of questions
        """
        # Draw indices once. Sampling the question objects themselves (and
        # then re-sampling that sample, as was done previously) only
        # reshuffled the same set and routed the objects through numpy.
        picked = np.random.choice(len(self.questions),
                                  n_examples,
                                  replace=False)
        questions = [self.questions[i] for i in picked]
        if self.mode == "flatten":
            n_batches = self.batcher.epoch_size(
                sum(
                    min(len(q.paragraphs), self.n_paragraphs)
                    for q in questions))
        else:
            n_batches = self.batcher.epoch_size(n_examples)
        return self._build_expanded_batches(questions), n_batches

    def percent_filtered(self):
        """Return the fraction of `true_len` that is missing from `questions`."""
        return (self.true_len - len(self.questions)) / self.true_len

    def __len__(self):
        """Return the number of batches in one epoch."""
        if self.mode == "flatten":
            return self.batcher.epoch_size(self._n_pairs)
        else:
            return self.batcher.epoch_size(len(self.questions))
コード例 #7
0
class StratifiedParagraphSetDataset(Dataset):
    """
    Sample multiple paragraphs each epoch and include them in the same batch,
    but stratify the sampling across epochs.

    For every question a shuffled list of paragraph index pairs is
    precomputed; each request for that question consumes the next pair and
    the list is reshuffled once it has been exhausted.
    """
    def __init__(self, questions: List[MultiParagraphQuestion], true_len: int,
                 batch_size: int, force_answer: bool,
                 overample_first_answer: List[int], merge: bool):
        """
        :param questions: questions to sample paragraph pairs for
        :param true_len: reference question count; only used as the
            denominator in `percent_filtered`
        :param batch_size: batch size handed to the `ClusteredBatcher`
        :param force_answer: if True, the first paragraph of every pair is
            one that contains an answer span
        :param overample_first_answer: the i-th entry is the number of extra
            times the i-th answer-containing paragraph is listed as a
            candidate for the first slot of a pair
        :param merge: if True, the paragraphs of a pair are concatenated into
            one context instead of being emitted as a group
        """
        self.overample_first_answer = overample_first_answer
        self.questions = questions
        self.merge = merge
        self.true_len = true_len
        self.batcher = ClusteredBatcher(batch_size,
                                        lambda x: x.n_context_words,
                                        truncate_batches=True)
        # _order[i]: paragraph selections for question i;
        # _on[i]: cursor into _order[i] of the next selection to use.
        self._order = []
        self._on = np.zeros(len(questions), dtype=np.int32)
        for q in questions:
            if len(q.paragraphs) == 1:
                # Only one paragraph: the single possible selection is [0].
                self._order.append(np.zeros((1, 1), dtype=np.int32))
                continue
            # Candidates for the first paragraph of each pair.
            # NOTE(review): with force_answer and no answer-containing
            # paragraph this stays empty and a later lookup would fail --
            # presumably callers filter such questions; confirm.
            if force_answer:
                sample1 = [
                    i for i, p in enumerate(q.paragraphs)
                    if len(p.answer_spans) > 0
                ]
            else:
                sample1 = list(range(len(q.paragraphs)))

            if (len(self.overample_first_answer) > 0
                    and not (force_answer and len(sample1) == 1)
                ):  # don't bother if there only is one answer
                ix = 0
                for i, p in enumerate(q.paragraphs):
                    if len(p.answer_spans) > 0:
                        sample1 += [i] * self.overample_first_answer[ix]
                        ix += 1
                        if ix >= len(self.overample_first_answer):
                            break

            # Pair every first-slot candidate with every other paragraph.
            permutations = []
            for i in sample1:
                for j in range(len(q.paragraphs)):
                    if j != i:
                        permutations.append((i, j))
            permutations = np.array(permutations, dtype=np.int32)
            np.random.shuffle(permutations)
            self._order.append(permutations)

    def get_vocab(self):
        """Return the set of all tokens in the questions and their paragraphs."""
        voc = set()
        for q in self.questions:
            voc.update(q.question)
            for para in q.paragraphs:
                voc.update(para.text)
        return voc

    def get_spec(self):
        """
        Build a `ParagraphAndQuestionSpec` from the longest question and the
        longest single paragraph, without a fixed batch size.
        """
        max_q_len = max(len(q.question) for q in self.questions)
        max_c_len = max(
            max(len(p.text) for p in q.paragraphs) for q in self.questions)
        return ParagraphAndQuestionSpec(None, max_q_len, max_c_len, None)

    def get_epoch(self):
        """Return a generator over the batches of one epoch."""
        return self._build_expanded_batches(self.questions)

    def _build_expanded_batches(self, questions, indices=None):
        """
        Generator: take the next paragraph selection of each question and
        yield the resulting batches.

        :param questions: the questions to build batches for
        :param indices: position of each entry of `questions` within
            `self.questions`, used to look up that question's selection
            order and cursor; defaults to `0..len(questions)-1`, which is
            only correct when `questions` is `self.questions` itself
        """
        if indices is None:
            indices = range(len(questions))

        out = []
        for i, q in zip(indices, questions):
            order = self._order[i]
            out.append(ParagraphSelection(q, order[self._on[i]]))
            # Advance the cursor; reshuffle once every selection was used.
            self._on[i] += 1
            if self._on[i] == len(order):
                self._on[i] = 0
                np.random.shuffle(order)

        out.sort(key=lambda x: x.n_context_words)

        # `group` is a running id shared by all pairs of one question.
        group = 0
        for selection_batch in self.batcher.get_epoch(out):
            batch = []
            for selected in selection_batch:
                q = selected.question
                if self.merge:
                    paras = [q.paragraphs[i] for i in selected.selection]
                    # Sort paragraphs by reading order, not rank order
                    paras.sort(key=lambda x: x.get_order())
                    answer_spans = []
                    text = []
                    for para in paras:
                        # Shift spans by the length of the text before them.
                        answer_spans.append(len(text) + para.answer_spans)
                        text += para.text
                    batch.append(
                        ParagraphAndQuestion(
                            text, q.question,
                            TokenSpans(q.answer_text,
                                       np.concatenate(answer_spans)),
                            q.question_id))
                else:
                    for i in selected.selection:
                        para = q.paragraphs[i]
                        batch.append(
                            para.build_qa_pair(q.question, q.question_id,
                                               q.answer_text, group))
                    group += 1
            yield batch

    def get_samples(self, n_examples):
        """
        Return `(batches, n_batches)` for `n_examples` questions drawn
        without replacement.

        :raises ValueError: (from numpy) if `n_examples` exceeds the number
            of questions
        """
        n_batches = self.batcher.epoch_size(n_examples)
        # Sample indices rather than question objects so that every sampled
        # question is paired with its OWN selection order and cursor; using
        # the position inside the sample would read another question's
        # paragraph indices.
        picked = np.random.choice(len(self.questions),
                                  n_examples,
                                  replace=False)
        sampled = [self.questions[i] for i in picked]
        return self._build_expanded_batches(sampled, picked), n_batches

    def percent_filtered(self):
        """Return the fraction of `true_len` that is missing from `questions`."""
        return (self.true_len - len(self.questions)) / self.true_len

    def __len__(self):
        """Return the number of batches in one epoch."""
        return self.batcher.epoch_size(len(self.questions))
コード例 #8
0
def main():
    """
    A close-as-possible implementation of BiDaF, based on the `dev`
    tensorflow 1.1 branch of Ming's repo which, in particular, uses Adam and
    not Adadelta. I was not able to replicate the results in the paper using
    Adadelta, but with Adam I was able to get to 78.0 F1 on the dev set with
    this script. I believe this approach is an exact reproduction of the code
    in the repo, up to initializations.

    Notes: Exponential Moving Average is very important, as is early
    stopping. This is also, in particular, best run on a GPU due to the large
    number of parameters and the batch size involved.
    """
    out = get_output_name_from_cli()

    optimizer = SerializableOptimizer("Adam", dict(learning_rate=0.001))
    params = TrainParams(optimizer,
                         num_epochs=12,
                         ema=0.999,
                         async_encoding=10,
                         log_period=30,
                         eval_period=1000,
                         save_period=1000,
                         eval_samples=dict(dev=None, train=8000))

    # The same dropout + LSTM layer object is used at every recurrent
    # position in the model.
    lstm = SequenceMapperSeq(DropoutLayer(0.8), CudnnLstm(100))

    # Components are built in the same order the model receives them.
    encoder = DocumentAndQuestionEncoder(SingleSpanAnswerEncoder())
    word_embed = FixedWordEmbedder(vec_name="glove.6B.100d",
                                   word_vec_init_scale=0,
                                   learn_unk=False)
    char_embed = CharWordEmbedder(
        embedder=LearnedCharEmbedder(16, 49, 8),
        layer=ReduceLayer("max", Conv1d(100, 5, 0.8), mask=False),
        shared_parameters=True,
    )
    embed_mapper = SequenceMapperSeq(HighwayLayer(activation="relu"),
                                     HighwayLayer(activation="relu"),
                                     lstm)
    memory_builder = NullBiMapper()
    attention = BiAttention(TriLinear(bias=True), True)
    match_encoder = NullMapper()
    predictor = BoundsPredictor(
        ChainConcat(start_layer=SequenceMapperSeq(lstm, lstm),
                    end_layer=lstm))

    model = Attention(
        encoder=encoder,
        word_embed=word_embed,
        char_embed=char_embed,
        word_embed_layer=None,
        embed_mapper=embed_mapper,
        question_mapper=None,
        context_mapper=None,
        memory_builder=memory_builder,
        attention=attention,
        match_encoder=match_encoder,
        predictor=predictor,
    )

    # The text of this script is stored as the run notes.
    with open(__file__, "r") as script:
        notes = script.read()

    evaluators = [
        LossEvaluator(),
        SquadSpanEvaluator(),
        BoundedSquadSpanEvaluator([18]),
        SentenceSpanEvaluator(),
    ]

    corpus = SquadCorpus()
    train_batching = ClusteredBatcher(60, ContextLenBucketedKey(3), True,
                                      False)
    eval_batching = ClusteredBatcher(60, ContextLenKey(), False, False)
    data = DocumentQaTrainingData(corpus, None, train_batching, eval_batching)

    trainer.start_training(data, model, params, evaluators,
                           model_dir.ModelDir(out), notes, False)
コード例 #9
0
def main():
    """Train a model on the unfiltered TriviaQA data.

    Parses the command line, builds a fresh model (or reloads the one stored
    in ``--init_from``), picks the training/evaluation dataset builders and
    the default epoch count for the requested ``mode``, preprocesses the
    data and hands everything to ``trainer.start_training``.

    The model is written to ``models/<name>``; a suffix is appended to that
    path for each of ``--char_th``, ``--hl_dim`` and ``--n_epochs`` that is
    given explicitly, and for a ``--LR`` different from 1.0.
    """
    parser = argparse.ArgumentParser(
        description='Train a model on TriviaQA unfiltered')
    parser.add_argument(
        'mode',
        choices=["confidence", "merge", "shared-norm", "sigmoid", "paragraph"])
    parser.add_argument("name", help="Where to store the model")
    parser.add_argument("-t", "--n_tokens", default=400, type=int,
                        help="Paragraph size")
    parser.add_argument(
        '-n', '--n_processes', type=int, default=2,
        help="Number of processes used to preprocess the data "
        "(i.e., select which paragraphs to train on)")
    parser.add_argument("-s", "--source_dir", type=str, default=None,
                        help="where to take input files")
    parser.add_argument("--n_epochs", type=int, default=None,
                        help="Max number of epochs to train on")
    parser.add_argument("--char_th", type=int, default=None,
                        help="char level embeddings")
    parser.add_argument("--hl_dim", type=int, default=None,
                        help="hidden layer dim size")
    # NOTE(review): this flag is accepted but never read below
    # (regularization_weight is always None) - confirm whether it should be
    # passed to TrainParams.
    parser.add_argument("--regularization", type=int, default=None,
                        help="regularization weight "
                        "(parsed but currently unused)")
    parser.add_argument("--LR", type=float, default=1.0,
                        help="learning rate of the Adadelta optimizer")
    parser.add_argument("--save_every", type=int, default=1800,
                        help="save period")
    parser.add_argument("--init_from", type=str, default=None,
                        help="model to init from")
    args = parser.parse_args()
    mode = args.mode

    out = join('models', args.name)

    # Defaults handed to get_model(); each explicit override is echoed and
    # recorded in the output directory name.
    char_th = 100
    hl_dim = 140
    if args.char_th is not None:
        print(args.char_th)
        char_th = args.char_th
        out += '--th' + str(char_th)
    if args.hl_dim is not None:
        print(args.hl_dim)
        hl_dim = args.hl_dim
        out += '--hl' + str(hl_dim)

    # Either build a new model or reload the one saved in --init_from.
    pretrained_dir = None
    if args.init_from is None:
        model = get_model(char_th, hl_dim, mode, WithIndicators())
    else:
        pretrained_dir = model_dir.ModelDir(args.init_from)
        model = pretrained_dir.get_model()

    extract = ExtractMultiParagraphsPerQuestion(MergeParagraphs(args.n_tokens),
                                                ShallowOpenWebRanker(16),
                                                model.preprocessor,
                                                intern=True)

    evaluators = [
        LossEvaluator(),
        MultiParagraphSpanEvaluator(8,
                                    "triviaqa",
                                    mode != "merge",
                                    per_doc=False)
    ]
    oversample = [1] * 4

    # Mode-specific dataset builders and default number of epochs.
    if mode == "paragraph":
        n_epochs = 120
        test = RandomParagraphSetDatasetBuilder(120, "flatten", True,
                                                oversample)
        train = StratifyParagraphsBuilder(
            ClusteredBatcher(60, ContextLenBucketedKey(3), True),
            oversample,
            only_answers=True)
    elif mode in ("confidence", "sigmoid"):
        n_epochs = 640 if mode == "sigmoid" else 160
        test = RandomParagraphSetDatasetBuilder(120, "flatten", True,
                                                oversample)
        train = StratifyParagraphsBuilder(
            ClusteredBatcher(60, ContextLenBucketedKey(3), True), oversample)
    else:
        n_epochs = 80
        test = RandomParagraphSetDatasetBuilder(
            120, "merge" if mode == "merge" else "group", True, oversample)
        train = StratifyParagraphSetsBuilder(30, mode == "merge", True,
                                             oversample)

    if args.n_epochs is not None:
        n_epochs = args.n_epochs
        out += '--' + str(n_epochs)

    if args.LR != 1.0:
        out += '--' + str(args.LR)

    corpus = TriviaQaOpenDataset(args.source_dir)

    async_encoding = 10
    params = TrainParams(SerializableOptimizer("Adadelta",
                                               dict(learning_rate=args.LR)),
                         num_epochs=n_epochs,
                         num_of_steps=250000,
                         ema=0.999,
                         max_checkpoints_to_keep=2,
                         async_encoding=async_encoding,
                         log_period=30,
                         eval_period=1800,
                         save_period=args.save_every,
                         eval_samples=dict(dev=None, train=6000),
                         regularization_weight=None)

    data = PreprocessedData(corpus, extract, train, test,
                            eval_on_verified=False)
    data.preprocess(args.n_processes, 1000)

    # The source of this script is passed along as the run notes, prefixed
    # with the selected mode.
    with open(__file__, "r") as f:
        notes = f.read()
    notes = "Mode: " + args.mode + "\n" + notes

    # Initial weights: best weights of the --init_from model if there are
    # any, otherwise its latest checkpoint; None when training from scratch.
    init_from = None
    if pretrained_dir is not None:
        init_from = pretrained_dir.get_best_weights()
        if init_from is None:
            init_from = pretrained_dir.get_latest_checkpoint()

    trainer.start_training(data,
                           model,
                           params,
                           evaluators,
                           model_dir.ModelDir(out),
                           notes,
                           initialize_from=init_from)
コード例 #10
0
def _plot_span_logits(point, span, start_dist, end_dist):
    """Show the start and end logits of one question in a two-row figure.

    The upper plot holds the start logits, titled with the question and its
    reference answers and annotated with the predicted text and span; the
    lower plot holds the end logits, with the value returned by
    ``point.paragraph.get_context()`` used as x tick labels. Blocks in
    ``plt.show()``.
    """
    s, e = span
    question = ' '.join(point.question)
    context = point.paragraph.get_context()
    gold = ' | '.join(point.answer.answer_text)
    pred_text = point.get_original_text(s, e)
    positions = np.arange(0.0, start_dist.shape[0], 1)

    plt.figure(1)
    plt.subplot(211)
    plt.plot(positions, start_dist, color='r')
    plt.title("Q : " + question + " // A : " + gold, fontsize=9)
    plt.text(0, 0, r'Predict : %s [%d:%d]' % (pred_text, s, e), color='b')
    plt.gca().set_ylim([-20, 20])

    plt.subplot(212)
    plt.plot(positions, end_dist, color='g')
    plt.xticks(positions, context, rotation=90, fontsize=5)
    plt.gca().set_ylim([-20, 20])
    plt.show()


def run():
    """Predict answer spans for a SQuAD-format file with a saved model.

    Positional arguments are the input data file and the JSON file that
    receives a ``{question_id: answer text}`` mapping. The model, language
    model and GloVe locations are taken from ``--model_dir``, ``--lm_dir``
    and ``--glove_dir``; ``--n`` limits the number of batches processed and
    ``--ema`` additionally loads the moving-average weights.

    NOTE: when ``--plot_dir`` is given it only acts as a switch: the logits
    of the first batch are plotted on screen, the loop then stops before any
    prediction is recorded, and the output file receives an empty mapping.
    Nothing is written to the plot directory itself.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("input_data")
    parser.add_argument("output_data")

    parser.add_argument("--plot_dir", type=str, default=None)

    parser.add_argument("--model_dir", type=str, default="/tmp/model/document-qa")
    parser.add_argument("--lm_dir", type=str, default="/home/castle/data/lm/squad-context-concat-skip")
    parser.add_argument("--glove_dir", type=str, default="/home/castle/data/glove")

    parser.add_argument("--n", type=int, default=None)
    parser.add_argument("-b", "--batch_size", type=int, default=30)
    parser.add_argument("--ema", action="store_true")
    args = parser.parse_args()

    output_path = args.output_data
    saved_model = ModelDir(args.model_dir)
    plot_logits = args.plot_dir is not None
    nltk.data.path.append("nltk_data")

    print("Loading data")
    docs = parse_squad_data(args.input_data, "", NltkAndPunctTokenizer(), False)
    pairs = split_docs(docs)
    dataset = ParagraphAndQuestionDataset(
        pairs, ClusteredBatcher(args.batch_size, ContextLenKey(), False, True))

    print("Done, init model")
    model = saved_model.get_model()
    # small hack, just load the vector file at its expected location rather
    # than using the config location
    loader = ResourceLoader(
        lambda a, b: load_word_vector_file(join(args.glove_dir, "glove.840B.300d.txt"), b))

    # Point the language model at the files found under --lm_dir.
    lm_model = model.lm_model
    basedir = args.lm_dir
    lm_model.lm_vocab_file = join(basedir, "squad_train_dev_all_unique_tokens.txt")
    lm_model.options_file = join(basedir, "options_squad_lm_2x4096_512_2048cnn_2xhighway_skip.json")
    lm_model.weight_file = join(basedir, "squad_context_concat_lm_2x4096_512_2048cnn_2xhighway_skip.hdf5")
    lm_model.embed_weights_file = None

    model.set_inputs([dataset], loader)

    print("Done, building graph")
    out = {}
    sess = tf.Session()
    try:
        with sess.as_default():
            pred = model.get_prediction()
        best_span = pred.get_best_span(17)[0]

        if plot_logits:
            start_logits_op, end_logits_op = pred.get_logits()

        # Variables whose name starts with "bilm" are not read from the
        # checkpoint; they are run through their initializers instead.
        all_vars = tf.global_variables() + tf.get_collection(tf.GraphKeys.SAVEABLE_OBJECTS)
        dont_restore_names = {x.name for x in all_vars if x.name.startswith("bilm")}
        print(sorted(dont_restore_names))
        restore_vars = [x for x in all_vars if x.name not in dont_restore_names]

        print("Done, loading weights")
        checkpoint = saved_model.get_best_weights()
        if checkpoint is None:
            print("Loading most recent checkpoint")
            checkpoint = saved_model.get_latest_checkpoint()
        else:
            print("Loading best weights")

        saver = tf.train.Saver(restore_vars)
        saver.restore(sess, checkpoint)

        if args.ema:
            # Load each trainable variable from the value the checkpoint
            # stores under that variable's moving-average name.
            ema = tf.train.ExponentialMovingAverage(0)
            saver = tf.train.Saver({ema.average_name(x): x for x in tf.trainable_variables()})
            saver.restore(sess, checkpoint)

        sess.run(tf.variables_initializer([x for x in all_vars if x.name in dont_restore_names]))

        print("Done, starting evaluation")
        for i, batch in enumerate(dataset.get_epoch()):
            if args.n is not None and i == args.n:
                break
            print("On batch size [%d], now in %d th batch" % (args.batch_size, i +1))
            enc = model.encode(batch, False)

            if plot_logits:
                spans, start_logits, end_logits = sess.run(
                    [best_span, start_logits_op, end_logits_op], feed_dict=enc)
                for bi, point in enumerate(batch):
                    _plot_span_logits(point, spans[bi], start_logits[bi], end_logits[bi])
                # Plot mode inspects a single batch and records nothing.
                break

            spans = sess.run(best_span, feed_dict=enc)
            for (s, e), point in zip(spans, batch):
                out[point.question_id] = point.get_original_text(s, e)
    finally:
        # Release the session even when graph building or evaluation fails.
        sess.close()

    print("Done, saving")
    with open(output_path, "w") as f:
        json.dump(out, f)

    print("Mission accomplished!")
コード例 #11
0
def main():
    """Evaluate a saved four-choice model on the SQuAD corpus.

    Loads the dev or train questions, optionally keeps a fixed-seed random
    subset of them, runs ``MultiChoiceEvaluator`` through ``trainer.test``
    with the selected checkpoint, prints the scalar metrics as a two-column
    table and, when ``--official_output`` is given, writes a per-question
    JSON report to that path.
    """
    parser = argparse.ArgumentParser(description='Evaluate a model on SQuAD')
    parser.add_argument('model', help='model directory to evaluate')
    parser.add_argument("-o", "--official_output", type=str, help="where to output an official result file")
    parser.add_argument('-n', '--sample_questions', type=int, default=None,
                        help="(for testing) run on a subset of questions")
    parser.add_argument('--answer_bounds', nargs='+', type=int, default=[17],
                        help="Max size of answer")
    parser.add_argument('-b', '--batch_size', type=int, default=45,
                        help="Batch size, larger sizes can be faster but uses more memory")
    parser.add_argument('-s', '--step', default=None,
                        help="Weights to load, can be a checkpoint step or 'latest'")
    parser.add_argument('-c', '--corpus', choices=["dev", "train"], default="dev")
    parser.add_argument('--no_ema', action="store_true", help="Don't use EMA weights even if they exist")
    args = parser.parse_args()

    num_choices = 4

    model_dir = ModelDir(args.model)

    corpus = SquadCorpus()
    if args.corpus == "dev":
        questions = corpus.get_dev()
    else:
        questions = corpus.get_train()
    questions = split_docs(questions)

    if args.sample_questions:
        # Sort by id first so the sample does not depend on load order, then
        # shuffle with a fixed seed. shuffle() works in place, so it must be
        # applied to the list that is actually sliced afterwards (the
        # previous code shuffled a temporary copy and discarded it).
        questions = sorted(questions, key=lambda x: x.question_id)
        np.random.RandomState(0).shuffle(questions)
        questions = questions[:args.sample_questions]

    # Longest contexts first.
    questions.sort(key=lambda x: x.n_context_words, reverse=True)

    # Honour --batch_size (its default, 45, is the value that used to be
    # hard-coded here).
    dataset = ParagraphAndQuestionDataset(
        questions,
        ClusteredBatcher(args.batch_size, ContextLenKey(), False, False),
        None,
        num_choices)

    evaluators = [MultiChoiceEvaluator(num_choices)]

    # Pick the weights: an explicit step / "latest", otherwise the best
    # weights if the model directory has any, else the latest checkpoint.
    if args.step is not None:
        if args.step == "latest":
            checkpoint = model_dir.get_latest_checkpoint()
        else:
            checkpoint = model_dir.get_checkpoint(int(args.step))
    else:
        checkpoint = model_dir.get_best_weights()
        if checkpoint is not None:
            print("Using best weights")
        else:
            print("Using latest checkpoint")
            checkpoint = model_dir.get_latest_checkpoint()

    model = model_dir.get_model()
    evaluation = trainer.test(model, evaluators, {args.corpus: dataset},
                              corpus.get_resource_loader(), checkpoint, not args.no_ema)[args.corpus]

    # Print the scalar results in a two column table
    scalars = evaluation.scalars
    cols = sorted(scalars.keys())
    header = ["Metric", ""]
    table = [cols, ["%s" % scalars[name] for name in cols]]
    print_table([header] + transpose_lists(table))

    # Save the official output
    if args.official_output is not None:
        choice_labels = ['A', 'B', 'C', 'D']

        per_sample = evaluation.per_sample
        question_ids = per_sample["question_id"]
        correct_ans = per_sample["correct answer"]
        correct_ids = per_sample["correct index"]
        pred_ids = per_sample["predictied index"]
        pred_ans = per_sample["predictied answer"]
        is_correct = per_sample["is correct"]

        # One record per question: correctness flag plus the predicted and
        # the correct answer, each as [answer text, choice letter].
        data_to_dump = {}
        for ix, q_id in enumerate(question_ids):
            data_to_dump[q_id] = {
                'Is Correct': 'True' if is_correct[ix] else 'False',
                'predictied': [' '.join(pred_ans[ix]), choice_labels[pred_ids[ix]]],
                'correct': [' '.join(correct_ans[ix]), choice_labels[correct_ids[ix]]],
            }

        with open(args.official_output, "w") as f:
            json.dump(data_to_dump, f)
コード例 #12
0
def run():
    """Predict answer spans for a SQuAD-format file with the saved model.

    Positional arguments are the input data file and the JSON file that
    receives a ``{question_id: answer text}`` mapping. ``--n`` limits the
    number of batches processed, ``-b/--batch_size`` sets the batch size and
    ``--ema`` additionally loads the moving-average weights.

    All resources are read from fixed relative paths: the model from
    ``model``, the language-model files from ``lm`` and the word vectors
    from ``glove.840B.300d.txt``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("input_data")
    parser.add_argument("output_data")
    parser.add_argument("--n", type=int, default=None)
    parser.add_argument("-b", "--batch_size", type=int, default=100)
    parser.add_argument("--ema", action="store_true")
    args = parser.parse_args()

    output_path = args.output_data
    saved_model = ModelDir("model")
    nltk.data.path.append("nltk_data")

    print("Loading data")
    docs = parse_squad_data(args.input_data, "", NltkAndPunctTokenizer(), False)
    pairs = split_docs(docs)
    dataset = ParagraphAndQuestionDataset(
        pairs, ClusteredBatcher(args.batch_size, ContextLenKey(), False, True))

    print("Done, init model")
    model = saved_model.get_model()
    # small hack, just load the vector file at its expected location rather
    # than using the config location
    loader = ResourceLoader(
        lambda a, b: load_word_vector_file("glove.840B.300d.txt", b))

    # Point the language model at the files shipped in the "lm" directory.
    lm_model = model.lm_model
    basedir = "lm"
    lm_model.lm_vocab_file = join(basedir,
                                  "squad_train_dev_all_unique_tokens.txt")
    lm_model.options_file = join(
        basedir, "options_squad_lm_2x4096_512_2048cnn_2xhighway_skip.json")
    lm_model.weight_file = join(
        basedir,
        "squad_context_concat_lm_2x4096_512_2048cnn_2xhighway_skip.hdf5")
    lm_model.embed_weights_file = None

    model.set_inputs([dataset], loader)

    print("Done, building graph")
    out = {}
    sess = tf.Session()
    try:
        with sess.as_default():
            pred = model.get_prediction()
        best_span = pred.get_best_span(17)[0]

        # Variables whose name starts with "bilm" are not read from the
        # checkpoint; they are run through their initializers instead.
        all_vars = tf.global_variables() + tf.get_collection(
            tf.GraphKeys.SAVEABLE_OBJECTS)
        dont_restore_names = {
            x.name
            for x in all_vars if x.name.startswith("bilm")
        }
        print(sorted(dont_restore_names))
        restore_vars = [
            x for x in all_vars if x.name not in dont_restore_names
        ]

        print("Done, loading weights")
        checkpoint = saved_model.get_best_weights()
        if checkpoint is None:
            print("Loading most recent checkpoint")
            checkpoint = saved_model.get_latest_checkpoint()
        else:
            print("Loading best weights")

        saver = tf.train.Saver(restore_vars)
        saver.restore(sess, checkpoint)

        if args.ema:
            # Load each trainable variable from the value the checkpoint
            # stores under that variable's moving-average name.
            ema = tf.train.ExponentialMovingAverage(0)
            saver = tf.train.Saver(
                {ema.average_name(x): x
                 for x in tf.trainable_variables()})
            saver.restore(sess, checkpoint)

        sess.run(
            tf.variables_initializer(
                [x for x in all_vars if x.name in dont_restore_names]))

        print("Done, starting evaluation")
        for i, batch in enumerate(dataset.get_epoch()):
            if args.n is not None and i == args.n:
                break
            print("On batch: %d" % (i + 1))
            enc = model.encode(batch, False)
            spans = sess.run(best_span, feed_dict=enc)
            for (s, e), point in zip(spans, batch):
                out[point.question_id] = point.get_original_text(s, e)
    finally:
        # Release the session even when graph building or evaluation fails.
        sess.close()

    print("Done, saving")
    with open(output_path, "w") as f:
        json.dump(out, f)

    print("Mission accomplished!")