Python ModelDir Exemples, docqa.model_dir.ModelDir Python Exemples

Exemple #1

0

Afficher le fichier

Fichier : cape_ablate.py Projet : trunghlt/cape-document-qa

def run_training(savename: str,
                 train_config: TrainConfig,
                 dataset_oversampling: Dict[str, int],
                 n_processes: int,
                 use_cudnn: bool
                 ):
    """Build and train a Cape-flavoured DocumentQA model.

    The model, the prepared training data, the evaluators and the training
    parameters are all derived from `train_config`; training is then started
    with a model directory named after `savename`. According to the original
    author, Tensorboard logs end up in the log subdirectory of that directory.

    `dataset_oversampling` selects the datasets to train on, e.g.
    {'squad': 2, 'wiki': 1} trains on one equivalence of triviaqa wiki and two
    equivalences of squad.

    :param savename: Name of the model (passed to `model_dir.ModelDir`)
    :param train_config: cape_config.TrainConfig object containing hyperparameters etc
    :param dataset_oversampling: dictionary mapping dataset names to integer counts of
       how much to oversample them
    :param n_processes: Number of processes to parallelize preprocessing on
    :param use_cudnn: Whether to train with GRUs optimized for Cudnn (recommended)
    """
    qa_model = build_model(WithIndicators(), train_config, use_cudnn=use_cudnn)
    training_data = prepare_data(qa_model, train_config, dataset_oversampling, n_processes)
    evaluators = get_evaluators(train_config)
    training_params = get_training_params(train_config)

    # Keep a copy of this script's source with the run, so the notes record
    # exactly which code produced the model.
    with open(__file__, "r", encoding='utf8') as source_file:
        script_source = source_file.read()
    notes = (
        "Mode: " + train_config.trivia_qa_mode + "\n" + script_source
        + '\nDataset oversampling : ' + str(dataset_oversampling)
    )

    # pull the trigger
    trainer.start_training(
        training_data,
        qa_model,
        training_params,
        evaluators,
        model_dir.ModelDir(savename),
        notes,
    )

Exemple #2

0

Afficher le fichier

Fichier : ablate_xqa.py Projet : zengyy8/XQA

def main():
    """Train a model on an XQA corpus, configured from the command line.

    Positional arguments are the corpus, the training mode and the model name.
    The mode selects the number of epochs and the train/eval dataset builders.
    The output directory name is ``<name>-<MMDD-HHMMSS>``. A copy of this
    script's source, prefixed with the mode, is handed to the trainer as notes.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("corpus", choices=["en", "en_trans_de", "en_trans_zh"])
    parser.add_argument(
        'mode',
        choices=["confidence", "merge", "shared-norm", "sigmoid", "paragraph"])
    # Note I haven't tested modes other than `shared-norm` on this corpus, so
    # some things might need adjusting
    parser.add_argument("name", help="Where to store the model")
    parser.add_argument("-t",
                        "--n_tokens",
                        default=400,
                        type=int,
                        help="Paragraph size")
    parser.add_argument(
        '-n',
        '--n_processes',
        type=int,
        default=2,
        help="Number of processes (i.e., select which paragraphs to train on) "
        "the data with")
    args = parser.parse_args()
    mode = args.mode
    corpus = args.corpus

    # Timestamp the output name so repeated runs do not share a directory name.
    out = args.name + "-" + datetime.now().strftime("%m%d-%H%M%S")

    model = get_model(100, 140, mode, WithIndicators())

    extract = ExtractMultiParagraphsPerQuestion(MergeParagraphs(args.n_tokens),
                                                ShallowOpenWebRanker(16),
                                                model.preprocessor,
                                                intern=True)

    # Named `evaluators` rather than `eval` to avoid shadowing the builtin.
    evaluators = [
        LossEvaluator(),
        MultiParagraphSpanEvaluator(8,
                                    "triviaqa",
                                    mode != "merge",
                                    per_doc=False)
    ]
    oversample = [
        1
    ] * 2  # Sample the top two answer-containing paragraphs twice

    if mode == "paragraph":
        n_epochs = 120
        test = RandomParagraphSetDatasetBuilder(120, "flatten", True,
                                                oversample)
        train = StratifyParagraphsBuilder(ClusteredBatcher(
            60, ContextLenBucketedKey(3), True),
                                          oversample,
                                          only_answers=True)
    elif mode == "confidence" or mode == "sigmoid":
        n_epochs = 640 if mode == "sigmoid" else 160
        test = RandomParagraphSetDatasetBuilder(120, "flatten", True,
                                                oversample)
        train = StratifyParagraphsBuilder(
            ClusteredBatcher(60, ContextLenBucketedKey(3), True), oversample)
    else:
        n_epochs = 80
        test = RandomParagraphSetDatasetBuilder(
            120, "merge" if mode == "merge" else "group", True, oversample)
        train = StratifyParagraphSetsBuilder(30, mode == "merge", True,
                                             oversample)

    raw_data = XQADataset(corpus)

    params = TrainParams(SerializableOptimizer("Adadelta",
                                               dict(learning_rate=1)),
                         num_epochs=n_epochs,
                         ema=0.999,
                         max_checkpoints_to_keep=2,
                         async_encoding=10,
                         log_period=30,
                         eval_period=1800,
                         save_period=1800,
                         best_weights=("dev", "b8/question-text-f1"),
                         eval_samples=dict(dev=None, train=6000))

    data = PreprocessedData(raw_data, extract, train, test,
                            eval_on_verified=False)

    data.preprocess(args.n_processes, 1000)

    # Read the script as UTF-8 (Python's default source encoding) instead of
    # relying on the platform's locale encoding.
    with open(__file__, "r", encoding="utf8") as f:
        notes = f.read()
    notes = "Mode: " + args.mode + "\n" + notes
    trainer.start_training(data, model, params, evaluators,
                           model_dir.ModelDir(out), notes)

Exemple #3

0

Afficher le fichier

Fichier : ablate_triviaqa.py Projet : artiom-zayats/docqa_squad

def main():
    """Train a model on TriviaQA web, configured from the command line.

    Positional arguments are the training mode and the model name. The mode
    selects the paragraph extractor, the dataset builders, the evaluators, the
    number of epochs and the dev/train sample counts passed to
    ``get_triviaqa_train_params``. The output directory name is
    ``<name>-<MMDD-HHMMSS>``. A copy of this script's source, prefixed with a
    mode banner, is handed to the trainer as notes.
    """
    parser = argparse.ArgumentParser(description='Train a model on TriviaQA web')
    parser.add_argument('mode', choices=["paragraph-level", "confidence", "merge",
                                         "shared-norm", "sigmoid", "shared-norm-600"])
    parser.add_argument("name", help="Where to store the model")
    parser.add_argument('-n', '--n_processes', type=int, default=2,
                        help="Number of processes (i.e., select which paragraphs to train on) "
                             "the data with")
    args = parser.parse_args()
    mode = args.mode

    out = args.name + "-" + datetime.now().strftime("%m%d-%H%M%S")

    model = get_model(100, 140, mode, WithIndicators())

    stop = NltkPlusStopWords(True)

    if mode == "paragraph-level":
        extract = ExtractSingleParagraph(MergeParagraphs(400), TopTfIdf(stop, 1),
                                         model.preprocessor, intern=True)
        n_epochs = 16
        train = ParagraphAndQuestionsBuilder(ClusteredBatcher(60, ContextLenBucketedKey(3), True))
        test = ParagraphAndQuestionsBuilder(ClusteredBatcher(60, ContextLenKey(), False))
        n_dev, n_train = 21000, 12000
        # Named `evaluators` rather than `eval` to avoid shadowing the builtin.
        evaluators = [LossEvaluator(), SpanEvaluator([4, 8], "triviaqa")]
    else:
        # "shared-norm-600" passes 600 to MergeParagraphs; every other
        # multi-paragraph mode passes 400.
        paragraph_size = 600 if mode == "shared-norm-600" else 400
        extract = ExtractMultiParagraphs(MergeParagraphs(paragraph_size), TopTfIdf(stop, 4),
                                         model.preprocessor, intern=True)
        evaluators = [LossEvaluator(), MultiParagraphSpanEvaluator(8, "triviaqa", mode != "merge")]
        # we sample two paragraphs per a (question, doc) pair, so evaluate on fewer questions
        n_dev, n_train = 15000, 8000

        if mode == "confidence" or mode == "sigmoid":
            # "sigmoid" trains very slowly, do this at your own risk
            n_epochs = 71 if mode == "sigmoid" else 28
            test = RandomParagraphSetDatasetBuilder(120, "flatten", True, 1)
            train = StratifyParagraphsBuilder(ClusteredBatcher(60, ContextLenBucketedKey(3), True), 0, 1)
        else:
            n_epochs = 14
            test = RandomParagraphSetDatasetBuilder(120, "merge" if mode == "merge" else "group", True, 1)
            train = StratifyParagraphSetsBuilder(35, mode == "merge", True, 1)

    raw_data = TriviaQaWebDataset()

    params = get_triviaqa_train_params(n_epochs, n_dev, n_train)

    data = PreprocessedData(raw_data, extract, train, test, eval_on_verified=False)

    data.preprocess(args.n_processes, 1000)

    # Read the script as UTF-8 (Python's default source encoding) instead of
    # relying on the platform's locale encoding.
    with open(__file__, "r", encoding="utf8") as f:
        notes = f.read()
    notes = "*" * 10 + "\nMode: " + args.mode + "\n" + "*"*10 + "\n" + notes

    trainer.start_training(data, model, params, evaluators, model_dir.ModelDir(out), notes)

Exemple #4

0

Afficher le fichier

def main():
    """Train a model on document-level SQuAD, configured from the command line.

    Positional arguments are the training mode and the output name. In
    "paragraph" mode the model is trained in the known-paragraph setting on
    ``DocumentQaTrainingData``. In every other mode the data is a
    ``PreprocessedData`` built with a ``SquadTfIdfRanker`` and a mode-specific
    train builder, and it is preprocessed before training starts. The output
    directory name is ``<name>-<MMDD-HHMMSS>``. A copy of this script's source,
    prefixed with the mode, is handed to the trainer as notes.

    :raises NotImplementedError: in "paragraph" mode when the model has a
        preprocessor
    """
    parser = argparse.ArgumentParser(
        description='Train a model on document-level SQuAD')
    parser.add_argument(
        'mode',
        choices=["paragraph", "confidence", "shared-norm", "merge", "sigmoid"])
    parser.add_argument("name", help="Output directory")
    args = parser.parse_args()
    mode = args.mode
    out = args.name + "-" + datetime.now().strftime("%m%d-%H%M%S")

    corpus = SquadCorpus()
    if mode == "merge":
        # Adds paragraph start tokens, since we will be concatenating paragraphs together
        pre = WithIndicators(True, para_tokens=False, doc_start_token=False)
    else:
        pre = None

    model = get_model(50, 100, mode, pre)

    if mode == "paragraph":
        # Run in the "standard" known-paragraph setting
        if model.preprocessor is not None:
            raise NotImplementedError()
        n_epochs = 26
        train_batching = ClusteredBatcher(45, ContextLenBucketedKey(3), True,
                                          False)
        eval_batching = ClusteredBatcher(45, ContextLenKey(), False, False)
        data = DocumentQaTrainingData(corpus, None, train_batching,
                                      eval_batching)
        # Named `evaluators` rather than `eval` to avoid shadowing the builtin.
        evaluators = [LossEvaluator(),
                      SpanEvaluator(bound=[17], text_eval="squad")]
    else:
        eval_set_mode = {
            "confidence": "flatten",
            "sigmoid": "flatten",
            "shared-norm": "group",
            "merge": "merge"
        }[mode]
        eval_dataset = RandomParagraphSetDatasetBuilder(
            100, eval_set_mode, True, 0)

        # Only the epoch count and the train builder depend on the mode; the
        # rest of the PreprocessedData construction is shared below.
        if mode == "confidence" or mode == "sigmoid":
            if mode == "sigmoid":
                # needs to be trained for a really long time for reasons unknown, even this might be too small
                n_epochs = 100
            else:
                n_epochs = 50  # more epochs since we only "see" the label every other epoch-ish
            train_batching = ClusteredBatcher(45, ContextLenBucketedKey(3),
                                              True, False)
            train_builder = StratifyParagraphsBuilder(train_batching, 1)
        else:
            n_epochs = 26
            train_builder = StratifyParagraphSetsBuilder(
                25, mode == "merge", True, 1)

        data = PreprocessedData(
            SquadCorpus(),
            SquadTfIdfRanker(NltkPlusStopWords(True), 4, True,
                             model.preprocessor),
            train_builder,
            eval_dataset,
            eval_on_verified=False,
        )

        evaluators = [LossEvaluator(), MultiParagraphSpanEvaluator(17, "squad")]
        data.preprocess(1)

    # Read the script as UTF-8 (Python's default source encoding) instead of
    # relying on the platform's locale encoding.
    with open(__file__, "r", encoding="utf8") as f:
        notes = f.read()
    notes = args.mode + "\n" + notes

    trainer.start_training(data, model, train_params(n_epochs), evaluators,
                           model_dir.ModelDir(out), notes)

Exemple #5

0

Afficher le fichier

def main():
    """
    Train a close-as-possible implementation of BiDaF on SQuAD.

    It is based on the `dev` tensorflow 1.1 branch of Ming's repo which, in particular, uses Adam not Adadelta.
    I was not able to replicate the results in the paper using Adadelta, but with Adam I was able to get to
    78.0 F1 on the dev set with this script. I believe this approach is an exact reproduction of the code in
    that repo, up to initializations.

    Notes: Exponential Moving Average is very important, as is early stopping. This is also, in particular,
    best run on a GPU due to the large number of parameters and batch size involved.

    The output name comes from `get_output_name_from_cli()`, and this script's source is handed to the
    trainer as notes.
    """
    out = get_output_name_from_cli()

    params = TrainParams(SerializableOptimizer(
        "Adam", dict(learning_rate=0.001)),
                         num_epochs=12,
                         ema=0.999,
                         async_encoding=10,
                         log_period=30,
                         eval_period=1000,
                         save_period=1000,
                         eval_samples=dict(dev=None, train=8000))

    # Alternative recurrent layers, kept for reference:
    # recurrent_layer = BiRecurrentMapper(LstmCellSpec(100, keep_probs=0.8))
    # recurrent_layer = FusedLstm()
    # The same `recurrent_layer` object is passed to the embedding mapper and
    # to both predictor layers below.
    recurrent_layer = SequenceMapperSeq(DropoutLayer(0.8), CudnnLstm(100))

    model = Attention(
        encoder=DocumentAndQuestionEncoder(SingleSpanAnswerEncoder()),
        word_embed=FixedWordEmbedder(vec_name="glove.6B.100d",
                                     word_vec_init_scale=0,
                                     learn_unk=False),
        char_embed=CharWordEmbedder(embedder=LearnedCharEmbedder(16, 49, 8),
                                    layer=ReduceLayer("max",
                                                      Conv1d(100, 5, 0.8),
                                                      mask=False),
                                    shared_parameters=True),
        word_embed_layer=None,
        embed_mapper=SequenceMapperSeq(HighwayLayer(activation="relu"),
                                       HighwayLayer(activation="relu"),
                                       recurrent_layer),
        preprocess=None,
        question_mapper=None,
        context_mapper=None,
        memory_builder=NullBiMapper(),
        attention=BiAttention(TriLinear(bias=True), True),
        match_encoder=NullMapper(),
        predictor=BoundsPredictor(
            ChainConcat(start_layer=SequenceMapperSeq(recurrent_layer,
                                                      recurrent_layer),
                        end_layer=recurrent_layer)),
    )

    # Read the script as UTF-8 (Python's default source encoding) instead of
    # relying on the platform's locale encoding.
    with open(__file__, "r", encoding="utf8") as f:
        notes = f.read()

    # Named `evaluators` rather than `eval` to avoid shadowing the builtin.
    evaluators = [LossEvaluator(), SpanEvaluator(bound=[17], text_eval="squad")]

    corpus = SquadCorpus()
    train_batching = ClusteredBatcher(60, ContextLenBucketedKey(3), True,
                                      False)
    eval_batching = ClusteredBatcher(60, ContextLenKey(), False, False)
    data = DocumentQaTrainingData(corpus, None, train_batching, eval_batching)

    trainer.start_training(data, model, params, evaluators,
                           model_dir.ModelDir(out), notes)

Exemple #6

0

Afficher le fichier

def main():
    """Train a model on TriviaQA unfiltered, configured from the command line.

    Positional arguments are the training mode and the model name. The mode
    selects the default number of epochs and the train/eval dataset builders.
    The output directory is ``models/<name>``, extended with ``--th<char_th>``,
    ``--hl<hl_dim>``, ``--<n_epochs>`` and ``--<LR>`` suffixes for whichever of
    those options were overridden.

    With ``--init_from`` the model is loaded from that model directory rather
    than built with ``get_model``. Training is then initialized from that
    directory's best weights, or from its latest checkpoint when
    ``get_best_weights()`` returns None.

    A copy of this script's source, prefixed with the mode, is handed to the
    trainer as notes.
    """
    parser = argparse.ArgumentParser(
        description='Train a model on TriviaQA unfiltered')
    parser.add_argument(
        'mode',
        choices=["confidence", "merge", "shared-norm", "sigmoid", "paragraph"])
    parser.add_argument("name", help="Where to store the model")
    parser.add_argument("-t",
                        "--n_tokens",
                        default=400,
                        type=int,
                        help="Paragraph size")
    parser.add_argument(
        '-n',
        '--n_processes',
        type=int,
        default=2,
        help="Number of processes (i.e., select which paragraphs to train on) "
        "the data with")
    parser.add_argument("-s",
                        "--source_dir",
                        type=str,
                        default=None,
                        help="where to take input files")
    parser.add_argument("--n_epochs",
                        type=int,
                        default=None,
                        help="Max number of epoches to train on ")
    parser.add_argument("--char_th",
                        type=int,
                        default=None,
                        help="char level embeddings")
    parser.add_argument("--hl_dim",
                        type=int,
                        default=None,
                        help="hidden layer dim size")
    # The help texts of the next two options used to be copies of the
    # --hl_dim text; they now describe the options they belong to.
    parser.add_argument("--regularization",
                        type=int,
                        default=None,
                        help="regularization weight")
    parser.add_argument("--LR",
                        type=float,
                        default=1.0,
                        help="learning rate")
    parser.add_argument("--save_every",
                        type=int,
                        default=1800,
                        help="save period")

    parser.add_argument("--init_from",
                        type=str,
                        default=None,
                        help="model to init from")
    args = parser.parse_args()
    mode = args.mode

    out = join('models', args.name)

    # argparse already converts these options with type=int, so no extra cast
    # is needed.
    char_th = 100
    hl_dim = 140
    if args.char_th is not None:
        print(args.char_th)
        char_th = args.char_th
        out += '--th' + str(char_th)
    if args.hl_dim is not None:
        print(args.hl_dim)
        hl_dim = args.hl_dim
        out += '--hl' + str(hl_dim)

    # Build the source model directory once; it is used both to load the model
    # here and to locate the initial weights further down.
    if args.init_from is None:
        init_dir = None
        model = get_model(char_th, hl_dim, mode, WithIndicators())
    else:
        init_dir = model_dir.ModelDir(args.init_from)
        model = init_dir.get_model()

    extract = ExtractMultiParagraphsPerQuestion(MergeParagraphs(args.n_tokens),
                                                ShallowOpenWebRanker(16),
                                                model.preprocessor,
                                                intern=True)

    # Named `evaluators` rather than `eval` to avoid shadowing the builtin.
    evaluators = [
        LossEvaluator(),
        MultiParagraphSpanEvaluator(8,
                                    "triviaqa",
                                    mode != "merge",
                                    per_doc=False)
    ]
    oversample = [1] * 4

    if mode == "paragraph":
        n_epochs = 120
        test = RandomParagraphSetDatasetBuilder(120, "flatten", True,
                                                oversample)
        train = StratifyParagraphsBuilder(ClusteredBatcher(
            60, ContextLenBucketedKey(3), True),
                                          oversample,
                                          only_answers=True)
    elif mode == "confidence" or mode == "sigmoid":
        n_epochs = 640 if mode == "sigmoid" else 160
        test = RandomParagraphSetDatasetBuilder(120, "flatten", True,
                                                oversample)
        train = StratifyParagraphsBuilder(
            ClusteredBatcher(60, ContextLenBucketedKey(3), True), oversample)
    else:
        n_epochs = 80
        test = RandomParagraphSetDatasetBuilder(
            120, "merge" if mode == "merge" else "group", True, oversample)
        train = StratifyParagraphSetsBuilder(30, mode == "merge", True,
                                             oversample)

    # An explicit --n_epochs overrides the per-mode default.
    if args.n_epochs is not None:
        n_epochs = args.n_epochs
        out += '--' + str(n_epochs)

    if args.LR != 1.0:
        out += '--' + str(args.LR)

    raw_data = TriviaQaOpenDataset(args.source_dir)

    async_encoding = 10
    params = TrainParams(SerializableOptimizer("Adadelta",
                                               dict(learning_rate=args.LR)),
                         num_epochs=n_epochs,
                         num_of_steps=250000,
                         ema=0.999,
                         max_checkpoints_to_keep=2,
                         async_encoding=async_encoding,
                         log_period=30,
                         eval_period=1800,
                         save_period=args.save_every,
                         eval_samples=dict(dev=None, train=6000),
                         # --regularization was parsed but ignored (None was
                         # hard-coded here). Its default is None, so runs that
                         # omit the flag are unchanged.
                         regularization_weight=args.regularization)

    data = PreprocessedData(raw_data, extract, train, test,
                            eval_on_verified=False)

    data.preprocess(args.n_processes, 1000)

    # Read the script as UTF-8 (Python's default source encoding) instead of
    # relying on the platform's locale encoding.
    with open(__file__, "r", encoding="utf8") as f:
        notes = f.read()
    notes = "Mode: " + args.mode + "\n" + notes

    # Prefer the best weights of the source model; fall back to its latest
    # checkpoint when no best weights are reported.
    if init_dir is not None:
        init_from = init_dir.get_best_weights()
        if init_from is None:
            init_from = init_dir.get_latest_checkpoint()
    else:
        init_from = None

    trainer.start_training(data,
                           model,
                           params,
                           evaluators,
                           model_dir.ModelDir(out),
                           notes,
                           initialize_from=init_from)