Example #1
def main(pytorch: bool = False, gpu_id: int = -1):
    global CONFIG
    fix_random_seed(0)
    if gpu_id >= 0:
        require_gpu(gpu_id)
        print("Set GPU", gpu_id)
    backends = {"pytorch": pytorch}
    for name, use_backend in backends.items():
        if not use_backend:
            print(f"Skipping {name}")
            continue
        set_backend(name, gpu_id)
        C = registry.resolve(Config().from_str(CONFIG))
        model = C["model"]
        X, Y = get_dummy_data(**C["data"])
        print("Copy to device")
        X = [model.ops.asarray(x) for x in X]
        Y = [model.ops.asarray(y) for y in Y]
        print("Begin init", len(X))
        model.initialize(X=X[:5])
        print("Pre-batch")
        n_words = sum(len(x) for x in X)
        X = [
            model.layers[0].predict(batch)
            for batch in model.ops.minibatch(16, X)
        ]
        model.layers.pop(0)
        print("Start")
        start_time = timer()
        end_time = timer()
        print(name, n_words, end_time - start_time)
Example #2
def test_issue5551(textcat_config):
    """Test that after fixing the random seed, the results of the pipeline are truly identical"""
    component = "textcat"

    pipe_cfg = Config().from_str(textcat_config)
    results = []
    for i in range(3):
        fix_random_seed(0)
        nlp = English()
        text = "Once hot, form ping-pong-ball-sized balls of the mixture, each weighing roughly 25 g."
        annots = {"cats": {"Labe1": 1.0, "Label2": 0.0, "Label3": 0.0}}
        pipe = nlp.add_pipe(component, config=pipe_cfg, last=True)
        for label in set(annots["cats"]):
            pipe.add_label(label)
        # Train
        nlp.initialize()
        doc = nlp.make_doc(text)
        nlp.update([Example.from_dict(doc, annots)])
        # Store the result of each iteration
        result = pipe.model.predict([doc])
        results.append(result[0])
    # All results should be the same because of the fixed seed
    assert len(results) == 3
    ops = get_current_ops()
    assert_almost_equal(ops.to_numpy(results[0]), ops.to_numpy(results[1]))
    assert_almost_equal(ops.to_numpy(results[0]), ops.to_numpy(results[2]))
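The core mechanism this test exercises can be reduced to a few lines: reseeding before each run makes weight initialization deterministic. A minimal standalone sketch (assuming only thinc; this is not part of the original test):

from thinc.api import fix_random_seed, Linear

fix_random_seed(0)
model_a = Linear(nO=2, nI=2).initialize()
fix_random_seed(0)
model_b = Linear(nO=2, nI=2).initialize()
# The same seed yields identical initial weights.
assert (model_a.get_param("W") == model_b.get_param("W")).all()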
Example #3
def test_LSTM_learns():
    fix_random_seed(0)

    nO = 2
    nI = 2

    def sgd(key, weights, gradient):
        weights -= 0.001 * gradient
        return weights, gradient * 0

    model = with_padded(LSTM(nO, nI))
    X = [[0.1, 0.1], [0.2, 0.2], [0.3, 0.3]]
    Y = [[0.2, 0.2], [0.3, 0.3], [0.4, 0.4]]
    X = [model.ops.asarray(x, dtype="f").reshape((1, -1)) for x in X]
    Y = [model.ops.asarray(y, dtype="f").reshape((1, -1)) for y in Y]
    model = model.initialize(X, Y)
    Yhs, bp_Yhs = model.begin_update(X)
    loss1 = sum([((yh - y) ** 2).sum() for yh, y in zip(Yhs, Y)])
    Yhs, bp_Yhs = model.begin_update(X)
    dYhs = [yh - y for yh, y in zip(Yhs, Y)]
    dXs = bp_Yhs(dYhs)
    model.finish_update(sgd)
    Yhs, bp_Yhs = model.begin_update(X)
    dYhs = [yh - y for yh, y in zip(Yhs, Y)]
    dXs = bp_Yhs(dYhs)  # noqa: F841
    loss2 = sum([((yh - y) ** 2).sum() for yh, y in zip(Yhs, Y)])
    assert loss1 > loss2, (loss1, loss2)
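The hand-written sgd above implements the callable interface that finish_update expects: the optimizer is invoked as optimizer(key, weights, gradient) and must return the updated weights and a reset gradient. A built-in optimizer satisfies the same interface, so the test could equally pass one in; a sketch (assuming thinc.api.SGD):

from thinc.api import SGD

optimizer = SGD(learn_rate=0.001)
model.finish_update(optimizer)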
Example #4
def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language":
    raw_config = config
    config = raw_config.interpolate()
    if "seed" not in config["training"]:
        raise ValueError(Errors.E1015.format(value="[training] seed"))
    if "gpu_allocator" not in config["training"]:
        raise ValueError(Errors.E1015.format(value="[training] gpu_allocator"))
    if config["training"]["seed"] is not None:
        fix_random_seed(config["training"]["seed"])
    allocator = config["training"]["gpu_allocator"]
    if use_gpu >= 0 and allocator:
        set_gpu_allocator(allocator)
    # Use original config here before it's resolved to functions
    sourced = get_sourced_components(config)
    nlp = load_model_from_config(raw_config, auto_fill=True)
    logger.info("Set up nlp object from config")
    config = nlp.config.interpolate()
    # Resolve all training-relevant sections using the filled nlp config
    T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
    dot_names = [T["train_corpus"], T["dev_corpus"]]
    if not isinstance(T["train_corpus"], str):
        raise ConfigValidationError(
            desc=Errors.E897.format(
                field="training.train_corpus", type=type(T["train_corpus"])
            )
        )
    if not isinstance(T["dev_corpus"], str):
        raise ConfigValidationError(
            desc=Errors.E897.format(
                field="training.dev_corpus", type=type(T["dev_corpus"])
            )
        )
    train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
    optimizer = T["optimizer"]
    # Components that shouldn't be updated during training
    frozen_components = T["frozen_components"]
    # Sourced components that require resume_training
    resume_components = [p for p in sourced if p not in frozen_components]
    logger.info(f"Pipeline: {nlp.pipe_names}")
    if resume_components:
        with nlp.select_pipes(enable=resume_components):
            logger.info(f"Resuming training for: {resume_components}")
            nlp.resume_training(sgd=optimizer)
    # Make sure that listeners are defined before initializing further
    nlp._link_components()
    with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
        nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
        logger.info(f"Initialized pipeline components: {nlp.pipe_names}")
    # Detect components with listeners that are not frozen consistently
    for name, proc in nlp.pipeline:
        if getattr(proc, "listening_components", None):  # e.g. tok2vec/transformer
            for listener in proc.listening_components:
                if listener in frozen_components and name not in frozen_components:
                    logger.warning(Warnings.W087.format(name=name, listener=listener))
                # We always check this regardless, in case user freezes tok2vec
                if listener not in frozen_components and name in frozen_components:
                    logger.warning(Warnings.W086.format(name=name, listener=listener))
    return nlp
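A hypothetical invocation of init_nlp (the config path is a placeholder, not from the original source; init_nlp expects the raw, non-interpolated config):

from spacy import util

raw_config = util.load_config("config.cfg", interpolate=False)  # hypothetical path
nlp = init_nlp(raw_config, use_gpu=-1)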
Example #5
def test_overfitting_IO():
    # Simple test to try and quickly overfit the single-label textcat component - ensuring the ML models work correctly
    fix_random_seed(0)
    nlp = English()
    textcat = nlp.add_pipe("textcat")

    train_examples = []
    for text, annotations in TRAIN_DATA_SINGLE_LABEL:
        train_examples.append(
            Example.from_dict(nlp.make_doc(text), annotations))
    optimizer = nlp.initialize(get_examples=lambda: train_examples)
    assert textcat.model.get_dim("nO") == 2

    for i in range(50):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
    assert losses["textcat"] < 0.01

    # test the trained model
    test_text = "I am happy."
    doc = nlp(test_text)
    cats = doc.cats
    assert cats["POSITIVE"] > 0.9
    assert cats["POSITIVE"] + cats["NEGATIVE"] == pytest.approx(1.0, 0.001)

    # Also test the results are still the same after IO
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        nlp2 = util.load_model_from_path(tmp_dir)
        doc2 = nlp2(test_text)
        cats2 = doc2.cats
        assert cats2["POSITIVE"] > 0.9
        assert cats2["POSITIVE"] + cats2["NEGATIVE"] == pytest.approx(
            1.0, 0.001)

    # Test scoring
    scores = nlp.evaluate(train_examples)
    assert scores["cats_micro_f"] == 1.0
    assert scores["cats_macro_f"] == 1.0
    assert scores["cats_macro_auc"] == 1.0
    assert scores["cats_score"] == 1.0
    assert "cats_score_desc" in scores

    # Make sure that running pipe twice, or comparing to call, always amounts to the same predictions
    texts = [
        "Just a sentence.", "I like green eggs.", "I am happy.", "I eat ham."
    ]
    batch_cats_1 = [doc.cats for doc in nlp.pipe(texts)]
    batch_cats_2 = [doc.cats for doc in nlp.pipe(texts)]
    no_batch_cats = [doc.cats for doc in [nlp(text) for text in texts]]
    for cats_1, cats_2 in zip(batch_cats_1, batch_cats_2):
        for cat in cats_1:
            assert_almost_equal(cats_1[cat], cats_2[cat], decimal=5)
    for cats_1, cats_2 in zip(batch_cats_1, no_batch_cats):
        for cat in cats_1:
            assert_almost_equal(cats_1[cat], cats_2[cat], decimal=5)
Example #6
def test_models_initialize_consistently(seed, model_func, kwargs):
    fix_random_seed(seed)
    model1 = model_func(**kwargs)
    model1.initialize()
    fix_random_seed(seed)
    model2 = model_func(**kwargs)
    model2.initialize()
    params1 = get_all_params(model1)
    params2 = get_all_params(model2)
    assert_array_equal(params1, params2)
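This test expects pytest to inject seed, model_func and kwargs (and get_all_params is assumed from the surrounding test module). A hypothetical parametrization, with illustrative layers and dimensions only:

import pytest
from thinc.api import Linear, Relu

@pytest.mark.parametrize(
    "seed,model_func,kwargs",
    [(0, Linear, {"nO": 4, "nI": 4}), (1, Relu, {"nO": 4, "nI": 4})],
)
def test_models_initialize_consistently(seed, model_func, kwargs):
    ...  # body as in the test above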
Example #7
def test_cat_readers(reader, additional_config):
    nlp_config_string = """
    [training]
    seed = 0

    [training.score_weights]
    cats_macro_auc = 1.0

    [corpora]
    @readers = "PLACEHOLDER"

    [nlp]
    lang = "en"
    pipeline = ["tok2vec", "textcat_multilabel"]

    [components]

    [components.tok2vec]
    factory = "tok2vec"

    [components.textcat_multilabel]
    factory = "textcat_multilabel"
    """
    config = Config().from_str(nlp_config_string)
    fix_random_seed(config["training"]["seed"])
    config["corpora"]["@readers"] = reader
    config["corpora"].update(additional_config)
    nlp = load_model_from_config(config, auto_fill=True)
    T = registry.resolve(nlp.config["training"], schema=ConfigSchemaTraining)
    dot_names = [T["train_corpus"], T["dev_corpus"]]
    train_corpus, dev_corpus = resolve_dot_names(nlp.config, dot_names)
    optimizer = T["optimizer"]
    # simulate a training loop
    nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
    for example in train_corpus(nlp):
        assert example.y.cats
        # this shouldn't fail if each training example has at least one positive label
        assert sorted(list(set(example.y.cats.values()))) == [0.0, 1.0]
        nlp.update([example], sgd=optimizer)
    # simulate performance benchmark on dev corpus
    dev_examples = list(dev_corpus(nlp))
    for example in dev_examples:
        # this shouldn't fail if each dev example has at least one positive label
        assert sorted(list(set(example.y.cats.values()))) == [0.0, 1.0]
    scores = nlp.evaluate(dev_examples)
    assert scores["cats_score"]
    # ensure the pipeline runs
    doc = nlp("Quick test")
    assert doc.cats
Example #8
def test_overfitting_IO_multi():
    # Simple test to try and quickly overfit the multi-label textcat component - ensuring the ML models work correctly
    fix_random_seed(0)
    nlp = English()
    textcat = nlp.add_pipe("textcat_multilabel")

    train_examples = []
    for text, annotations in TRAIN_DATA_MULTI_LABEL:
        train_examples.append(
            Example.from_dict(nlp.make_doc(text), annotations))
    optimizer = nlp.initialize(get_examples=lambda: train_examples)
    assert textcat.model.get_dim("nO") == 3

    for i in range(100):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
    assert losses["textcat_multilabel"] < 0.01

    # test the trained model
    test_text = "I am confused but happy."
    doc = nlp(test_text)
    cats = doc.cats
    assert cats["HAPPY"] > 0.9
    assert cats["CONFUSED"] > 0.9

    # Also test the results are still the same after IO
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        nlp2 = util.load_model_from_path(tmp_dir)
        doc2 = nlp2(test_text)
        cats2 = doc2.cats
        assert cats2["HAPPY"] > 0.9
        assert cats2["CONFUSED"] > 0.9

    # Test scoring
    scores = nlp.evaluate(train_examples)
    assert scores["cats_micro_f"] == 1.0
    assert scores["cats_macro_f"] == 1.0
    assert "cats_score_desc" in scores

    # Make sure that running pipe twice, or comparing to call, always amounts to the same predictions
    texts = [
        "Just a sentence.", "I like green eggs.", "I am happy.", "I eat ham."
    ]
    batch_cats_1 = [doc.cats for doc in nlp.pipe(texts)]
    batch_cats_2 = [doc.cats for doc in nlp.pipe(texts)]
    no_batch_cats = [doc.cats for doc in [nlp(text) for text in texts]]
    assert_equal(batch_cats_1, batch_cats_2)
    assert_equal(batch_cats_1, no_batch_cats)
Example #9
def get_updated_model():
    fix_random_seed(seed)
    optimizer = Adam(0.001)
    model = model_func(**kwargs).initialize()
    initial_params = get_all_params(model)
    set_dropout_rate(model, dropout)
    for _ in range(5):
        Y, get_dX = model.begin_update(get_X())
        dY = get_gradient(model, Y)
        get_dX(dY)
        model.finish_update(optimizer)
    updated_params = get_all_params(model)
    with pytest.raises(AssertionError):
        assert_array_equal(initial_params, updated_params)
    return model
Example #10
def _train_parser(parser):
    fix_random_seed(1)
    parser.add_label("left")
    parser.initialize(lambda: [_parser_example(parser)])
    sgd = Adam(0.001)

    for i in range(5):
        losses = {}
        doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
        gold = {
            "heads": [1, 1, 3, 3],
            "deps": ["left", "ROOT", "left", "ROOT"]
        }
        example = Example.from_dict(doc, gold)
        parser.update([example], sgd=sgd, losses=losses)
    return parser
Example #11
def test_multibatch():
    fix_random_seed(0)
    ops = get_current_ops()
    arr1 = numpy.asarray([1, 2, 3, 4])
    arr2 = numpy.asarray([5, 6, 7, 8])
    batches = list(ops.multibatch(2, arr1, arr2))
    assert numpy.concatenate(batches).tolist() == [[1, 2], [5, 6], [3, 4], [7, 8]]
    batches = list(ops.multibatch(2, arr1, arr2, shuffle=True))
    assert len(batches) == 2
    assert len(batches[0]) == 2
    assert len(batches[1]) == 2
    batches = list(ops.multibatch(2, [1, 2, 3, 4], [5, 6, 7, 8]))
    assert batches == [[[1, 2], [5, 6]], [[3, 4], [7, 8]]]
    with pytest.raises(ValueError):
        ops.multibatch(10, (i for i in range(100)), (i for i in range(100)))
    with pytest.raises(ValueError):
        ops.multibatch(10, arr1, (i for i in range(100)), arr2)
Example #12
def test_resize_same_results(name, textcat_config):
    # Ensure that the resized textcat classifiers still produce the same results for old labels
    fix_random_seed(0)
    nlp = English()
    pipe_config = {"model": textcat_config}
    textcat = nlp.add_pipe(name, config=pipe_config)

    train_examples = []
    for text, annotations in TRAIN_DATA_SINGLE_LABEL:
        train_examples.append(
            Example.from_dict(nlp.make_doc(text), annotations))
    optimizer = nlp.initialize(get_examples=lambda: train_examples)
    assert textcat.model.maybe_get_dim("nO") in [2, None]

    for i in range(5):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)

    # test the trained model before resizing
    test_text = "I am happy."
    doc = nlp(test_text)
    assert len(doc.cats) == 2
    pos_pred = doc.cats["POSITIVE"]
    neg_pred = doc.cats["NEGATIVE"]

    # test the trained model again after resizing
    textcat.add_label("NEUTRAL")
    doc = nlp(test_text)
    assert len(doc.cats) == 3
    assert doc.cats["POSITIVE"] == pos_pred
    assert doc.cats["NEGATIVE"] == neg_pred
    assert doc.cats["NEUTRAL"] <= 1

    for i in range(5):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)

    # test the trained model again after training further with new label
    doc = nlp(test_text)
    assert len(doc.cats) == 3
    assert doc.cats["POSITIVE"] != pos_pred
    assert doc.cats["NEGATIVE"] != neg_pred
    for cat in doc.cats:
        assert doc.cats[cat] <= 1
Example #13
def test_minibatch():
    fix_random_seed(0)
    ops = get_current_ops()
    items = [1, 2, 3, 4, 5, 6]
    batches = ops.minibatch(3, items)
    assert list(batches) == [[1, 2, 3], [4, 5, 6]]
    batches = ops.minibatch((i for i in (3, 2, 1)), items)
    assert list(batches) == [[1, 2, 3], [4, 5], [6]]
    batches = list(ops.minibatch(3, numpy.asarray(items)))
    assert isinstance(batches[0], numpy.ndarray)
    assert numpy.array_equal(batches[0], numpy.asarray([1, 2, 3]))
    assert numpy.array_equal(batches[1], numpy.asarray([4, 5, 6]))
    batches = list(ops.minibatch((i for i in (3, 2, 1)), items, shuffle=True))
    assert batches != [[1, 2, 3], [4, 5], [6]]
    assert len(batches[0]) == 3
    assert len(batches[1]) == 2
    assert len(batches[2]) == 1
    with pytest.raises(ValueError):
        ops.minibatch(10, (i for i in range(100)))
    with pytest.raises(ValueError):
        ops.minibatch(10, True)
Example #14
def main(numpy: bool = False,
         pytorch: bool = False,
         generic: bool = False,
         gpu_id: int = -1):
    global CONFIG
    fix_random_seed(0)
    if gpu_id >= 0:
        require_gpu(gpu_id)
        print("Set GPU", gpu_id)
    backends = {"pytorch": pytorch, "numpy": numpy, "generic": generic}
    for name, use_backend in backends.items():
        if not use_backend:
            print(f"Skipping {name}")
            continue
        set_backend(name, gpu_id)
        print("Getting data")
        C = registry.resolve(Config().from_str(CONFIG))
        model = C["model"]
        X, Y = get_dummy_data(**C["data"])
        print("Copy to device")
        X = [model.ops.asarray(x) for x in X]
        Y = [model.ops.asarray(y) for y in Y]
        print("Begin init", len(X))
        model.initialize(X=X[:5])
        print("Pre-batch")
        n_words = sum(len(x) for x in X)
        batches = model.ops.multibatch(16, X, Y)
        batches = [(model.layers[0].predict(x), y) for x, y in batches]
        model.layers.pop(0)
        print("Start")
        start_time = timer()
        total = run_forward(model, [x for x, y in batches])
        end_time = timer()
        print(name, n_words, total, end_time - start_time)
        start_time = timer()
        total = run_forward_backward(model, batches)
        end_time = timer()
        print(name, n_words, total, end_time - start_time)
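Neither this benchmark nor Example #1 defines run_forward and run_forward_backward. A plausible sketch of the helpers being timed (an assumption, not the original code):

def run_forward(model, Xs):
    # Predict over the pre-embedded batches; return a checksum of the outputs.
    total = 0.0
    for X in Xs:
        Y = model.predict(X)
        total += float(Y.sum())
    return total

def run_forward_backward(model, batches):
    # Run forward and backward passes over (X, Y) batches.
    total = 0.0
    for X, Y in batches:
        Yh, backprop = model.begin_update(X)
        backprop(Yh - Y)
        total += float(Yh.sum())
    return total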
Example #15
def test_models_predict_consistently(seed, model_func, kwargs, get_X):
    fix_random_seed(seed)
    model1 = model_func(**kwargs).initialize()
    Y1 = model1.predict(get_X())
    fix_random_seed(seed)
    model2 = model_func(**kwargs).initialize()
    Y2 = model2.predict(get_X())

    if model1.has_ref("tok2vec"):
        tok2vec1 = model1.get_ref("tok2vec").predict(get_X())
        tok2vec2 = model2.get_ref("tok2vec").predict(get_X())
        for i in range(len(tok2vec1)):
            for j in range(len(tok2vec1[i])):
                assert_array_equal(
                    numpy.asarray(model1.ops.to_numpy(tok2vec1[i][j])),
                    numpy.asarray(model2.ops.to_numpy(tok2vec2[i][j])),
                )

    try:
        Y1 = model1.ops.to_numpy(Y1)
        Y2 = model2.ops.to_numpy(Y2)
    except Exception:
        pass
    if isinstance(Y1, numpy.ndarray):
        assert_array_equal(Y1, Y2)
    elif isinstance(Y1, list):
        assert len(Y1) == len(Y2)
        for y1, y2 in zip(Y1, Y2):
            try:
                y1 = model1.ops.to_numpy(y1)
                y2 = model2.ops.to_numpy(y2)
            except Exception:
                pass
            assert_array_equal(y1, y2)
    else:
        raise ValueError(f"Could not compare type {type(Y1)}")
Example #16
def evaluate(
    model: str,
    data_path: Path,
    output: Optional[Path] = None,
    use_gpu: int = -1,
    gold_preproc: bool = False,
    displacy_path: Optional[Path] = None,
    displacy_limit: int = 25,
    silent: bool = True,
    spans_key: str = "sc",
) -> Dict[str, Any]:
    msg = Printer(no_print=silent, pretty=not silent)
    fix_random_seed()
    setup_gpu(use_gpu, silent=silent)
    data_path = util.ensure_path(data_path)
    output_path = util.ensure_path(output)
    displacy_path = util.ensure_path(displacy_path)
    if not data_path.exists():
        msg.fail("Evaluation data not found", data_path, exits=1)
    if displacy_path and not displacy_path.exists():
        msg.fail("Visualization output directory not found", displacy_path, exits=1)
    corpus = Corpus(data_path, gold_preproc=gold_preproc)
    nlp = util.load_model(model)
    dev_dataset = list(corpus(nlp))
    scores = nlp.evaluate(dev_dataset)
    metrics = {
        "TOK": "token_acc",
        "TAG": "tag_acc",
        "POS": "pos_acc",
        "MORPH": "morph_acc",
        "LEMMA": "lemma_acc",
        "UAS": "dep_uas",
        "LAS": "dep_las",
        "NER P": "ents_p",
        "NER R": "ents_r",
        "NER F": "ents_f",
        "TEXTCAT": "cats_score",
        "SENT P": "sents_p",
        "SENT R": "sents_r",
        "SENT F": "sents_f",
        "SPAN P": f"spans_{spans_key}_p",
        "SPAN R": f"spans_{spans_key}_r",
        "SPAN F": f"spans_{spans_key}_f",
        "SPEED": "speed",
    }
    results = {}
    data = {}
    for metric, key in metrics.items():
        if key in scores:
            if key == "cats_score":
                metric = metric + " (" + scores.get("cats_score_desc", "unk") + ")"
            if isinstance(scores[key], (int, float)):
                if key == "speed":
                    results[metric] = f"{scores[key]:.0f}"
                else:
                    results[metric] = f"{scores[key]*100:.2f}"
            else:
                results[metric] = "-"
            data[re.sub(r"[\s/]", "_", key.lower())] = scores[key]

    msg.table(results, title="Results")
    data = handle_scores_per_type(scores, data, spans_key=spans_key, silent=silent)

    if displacy_path:
        factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names]
        docs = list(nlp.pipe(ex.reference.text for ex in dev_dataset[:displacy_limit]))
        render_deps = "parser" in factory_names
        render_ents = "ner" in factory_names
        render_parses(
            docs,
            displacy_path,
            model_name=model,
            limit=displacy_limit,
            deps=render_deps,
            ents=render_ents,
        )
        msg.good(f"Generated {displacy_limit} parses as HTML", displacy_path)

    if output_path is not None:
        srsly.write_json(output_path, data)
        msg.good(f"Saved results to {output_path}")
    return data
Example #17
def train(
    nlp: "Language",
    output_path: Optional[Path] = None,
    *,
    use_gpu: int = -1,
    stdout: IO = sys.stdout,
    stderr: IO = sys.stderr,
) -> Tuple["Language", Optional[Path]]:
    """Train a pipeline.

    nlp (Language): The initialized nlp object with the full config.
    output_path (Path): Optional output path to save trained model to.
    use_gpu (int): Whether to train on GPU. Make sure to call require_gpu
        before calling this function.
    stdout (file): A file-like object to write output messages. To disable
        printing, set to io.StringIO.
    stderr (file): A second file-like object to write output messages. To disable
        printing, set to io.StringIO.

    RETURNS (tuple): The final nlp object and the path to the exported model.
    """
    # We use no_print here so we can respect the stdout/stderr options.
    msg = Printer(no_print=True)
    config = nlp.config.interpolate()
    if config["training"]["seed"] is not None:
        fix_random_seed(config["training"]["seed"])
    allocator = config["training"]["gpu_allocator"]
    if use_gpu >= 0 and allocator:
        set_gpu_allocator(allocator)
    T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
    dot_names = [T["train_corpus"], T["dev_corpus"]]
    train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
    optimizer = T["optimizer"]
    score_weights = T["score_weights"]
    batcher = T["batcher"]
    train_logger = T["logger"]
    before_to_disk = create_before_to_disk_callback(T["before_to_disk"])

    # Helper function to save checkpoints. This is a closure for convenience,
    # to avoid passing in all the args all the time.
    def save_checkpoint(is_best):
        with nlp.use_params(optimizer.averages):
            before_to_disk(nlp).to_disk(output_path / DIR_MODEL_LAST)
        if is_best:
            # Avoid saving twice (saving will be more expensive than
            # the dir copy)
            if (output_path / DIR_MODEL_BEST).exists():
                shutil.rmtree(output_path / DIR_MODEL_BEST)
            shutil.copytree(output_path / DIR_MODEL_LAST,
                            output_path / DIR_MODEL_BEST)

    # Components that shouldn't be updated during training
    frozen_components = T["frozen_components"]
    # Components that should set annotations on update
    annotating_components = T["annotating_components"]
    # Create iterator, which yields out info after each optimization step.
    training_step_iterator = train_while_improving(
        nlp,
        optimizer,
        create_train_batches(nlp, train_corpus, batcher, T["max_epochs"]),
        create_evaluation_callback(nlp, dev_corpus, score_weights),
        dropout=T["dropout"],
        accumulate_gradient=T["accumulate_gradient"],
        patience=T["patience"],
        max_steps=T["max_steps"],
        eval_frequency=T["eval_frequency"],
        exclude=frozen_components,
        annotating_components=annotating_components,
    )
    clean_output_dir(output_path)
    stdout.write(msg.info(f"Pipeline: {nlp.pipe_names}") + "\n")
    if frozen_components:
        stdout.write(
            msg.info(f"Frozen components: {frozen_components}") + "\n")
    if annotating_components:
        stdout.write(
            msg.info(f"Set annotations on update for: {annotating_components}")
            + "\n")
    stdout.write(
        msg.info(f"Initial learn rate: {optimizer.learn_rate}") + "\n")
    with nlp.select_pipes(disable=frozen_components):
        log_step, finalize_logger = train_logger(nlp, stdout, stderr)
    try:
        for batch, info, is_best_checkpoint in training_step_iterator:
            if is_best_checkpoint is not None:
                with nlp.select_pipes(disable=frozen_components):
                    update_meta(T, nlp, info)
                if output_path is not None:
                    save_checkpoint(is_best_checkpoint)
                    info["output_path"] = str(output_path / DIR_MODEL_LAST)
            log_step(info if is_best_checkpoint is not None else None)
    except Exception as e:
        if output_path is not None:
            stdout.write(
                msg.warn(f"Aborting and saving the final best model. "
                         f"Encountered exception: {repr(e)}") + "\n")
        raise e
    finally:
        finalize_logger()
        if output_path is not None:
            save_checkpoint(False)
    # This will only run if we didn't hit an error
    if optimizer.averages:
        nlp.use_params(optimizer.averages)
    if output_path is not None:
        stdout.write(
            msg.good("Saved pipeline to output directory", output_path /
                     DIR_MODEL_LAST) + "\n")
        return (nlp, output_path / DIR_MODEL_LAST)
    else:
        return (nlp, None)
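A hypothetical call, following the docstring's advice for silencing output (the output path is a placeholder, and nlp must already be initialized, e.g. via init_nlp from Example #4):

import io
from pathlib import Path

nlp, model_path = train(nlp, Path("output"), use_gpu=-1,
                        stdout=io.StringIO(), stderr=io.StringIO())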
Example #18
def pretrain(
    config: Config,
    output_dir: Path,
    resume_path: Optional[Path] = None,
    epoch_resume: Optional[int] = None,
    use_gpu: int = -1,
    silent: bool = True,
):
    msg = Printer(no_print=silent)
    if config["training"]["seed"] is not None:
        fix_random_seed(config["training"]["seed"])
    allocator = config["training"]["gpu_allocator"]
    if use_gpu >= 0 and allocator:
        set_gpu_allocator(allocator)
    nlp = load_model_from_config(config)
    _config = nlp.config.interpolate()
    P = registry.resolve(_config["pretraining"], schema=ConfigSchemaPretrain)
    corpus = dot_to_object(_config, P["corpus"])
    corpus = registry.resolve({"corpus": corpus})["corpus"]
    batcher = P["batcher"]
    model = create_pretraining_model(nlp, P)
    optimizer = P["optimizer"]
    # Load in pretrained weights to resume from
    if resume_path is not None:
        _resume_model(model, resume_path, epoch_resume, silent=silent)
    else:
        # Without '--resume-path' the '--epoch-resume' argument is ignored
        epoch_resume = 0
    objective = model.attrs["loss"]
    # TODO: move this to logger function?
    tracker = ProgressTracker(frequency=10000)
    msg.divider(
        f"Pre-training tok2vec layer - starting at epoch {epoch_resume}")
    row_settings = {
        "widths": (3, 10, 10, 6, 4),
        "aligns": ("r", "r", "r", "r", "r")
    }
    msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings)

    def _save_model(epoch, is_temp=False):
        is_temp_str = ".temp" if is_temp else ""
        with model.use_params(optimizer.averages):
            with (output_dir /
                  f"model{epoch}{is_temp_str}.bin").open("wb") as file_:
                file_.write(model.get_ref("tok2vec").to_bytes())
            log = {
                "nr_word": tracker.nr_word,
                "loss": tracker.loss,
                "epoch_loss": tracker.epoch_loss,
                "epoch": epoch,
            }
            with (output_dir / "log.jsonl").open("a") as file_:
                file_.write(srsly.json_dumps(log) + "\n")

    # TODO: I think we probably want this to look more like the
    # 'create_train_batches' function?
    for epoch in range(epoch_resume, P["max_epochs"]):
        for batch_id, batch in enumerate(batcher(corpus(nlp))):
            docs = ensure_docs(batch)
            loss = make_update(model, docs, optimizer, objective)
            progress = tracker.update(epoch, loss, docs)
            if progress:
                msg.row(progress, **row_settings)
            if P["n_save_every"] and (batch_id % P["n_save_every"] == 0):
                _save_model(epoch, is_temp=True)
        _save_model(epoch)
        tracker.epoch_loss = 0.0
Example #19
def debug_model_cli(
    # fmt: off
    ctx: typer.Context,  # This is only used to read additional arguments
    config_path: Path = Arg(...,
                            help="Path to config file",
                            exists=True,
                            allow_dash=True),
    component: str = Arg(
        ...,
        help=
        "Name of the pipeline component of which the model should be analysed"
    ),
    layers: str = Opt("",
                      "--layers",
                      "-l",
                      help="Comma-separated names of layer IDs to print"),
    dimensions: bool = Opt(False,
                           "--dimensions",
                           "-DIM",
                           help="Show dimensions"),
    parameters: bool = Opt(False,
                           "--parameters",
                           "-PAR",
                           help="Show parameters"),
    gradients: bool = Opt(False, "--gradients", "-GRAD",
                          help="Show gradients"),
    attributes: bool = Opt(False,
                           "--attributes",
                           "-ATTR",
                           help="Show attributes"),
    P0: bool = Opt(False,
                   "--print-step0",
                   "-P0",
                   help="Print model before training"),
    P1: bool = Opt(False,
                   "--print-step1",
                   "-P1",
                   help="Print model after initialization"),
    P2: bool = Opt(False,
                   "--print-step2",
                   "-P2",
                   help="Print model after training"),
    P3: bool = Opt(False,
                   "--print-step3",
                   "-P3",
                   help="Print final predictions"),
    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
    # fmt: on
):
    """
    Analyze a Thinc model implementation. Includes checks for internal structure
    and activations during training.

    DOCS: https://spacy.io/api/cli#debug-model
    """
    setup_gpu(use_gpu)
    layers = string_to_list(layers, intify=True)
    print_settings = {
        "dimensions": dimensions,
        "parameters": parameters,
        "gradients": gradients,
        "attributes": attributes,
        "layers": layers,
        "print_before_training": P0,
        "print_after_init": P1,
        "print_after_training": P2,
        "print_prediction": P3,
    }
    config_overrides = parse_config_overrides(ctx.args)
    with show_validation_error(config_path):
        raw_config = util.load_config(config_path,
                                      overrides=config_overrides,
                                      interpolate=False)
    config = raw_config.interpolate()
    allocator = config["training"]["gpu_allocator"]
    if use_gpu >= 0 and allocator:
        set_gpu_allocator(allocator)
    with show_validation_error(config_path):
        nlp = util.load_model_from_config(raw_config)
        config = nlp.config.interpolate()
        T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
    seed = T["seed"]
    if seed is not None:
        msg.info(f"Fixing random seed: {seed}")
        fix_random_seed(seed)
    pipe = nlp.get_pipe(component)
    if not hasattr(pipe, "model"):
        msg.fail(
            f"The component '{component}' does not specify an object that holds a Model.",
            exits=1,
        )
    model = pipe.model
    debug_model(config, T, nlp, model, print_settings=print_settings)
Example #20
loaded_config = api.registry.make_from_config(config)
batch_size = loaded_config['training']['batch_size']
n_iter = loaded_config['training']['n_iter']

# dataset
(train_X, train_Y), (dev_X, dev_Y) = ml_datasets.mnist()
cowsay.cow(f"Training size={len(train_X)}, dev size={len(dev_X)}")

# model
model = api.Softmax()
model.initialize(X=train_X, Y=train_Y)
cowsay.cow(
    f"Initialized model with input dimension "
    f"nI={model.get_dim('nI')} and output dimension nO={model.get_dim('nO')}")

api.fix_random_seed(0)
optimizer = loaded_config['optimizer']
print("Training")
for _ in range(n_iter):
    for X, Y in model.ops.multibatch(batch_size,
                                     train_X,
                                     train_Y,
                                     shuffle=True):
        Yh, backprop = model.begin_update(X)
        backprop(Yh - Y)
        model.finish_update(optimizer)

print("Testing")
n_correct = 0
n_total = 0
for X, Y in model.ops.multibatch(batch_size, dev_X, dev_Y, shuffle=True):
    # Loop body completed from context: Y is one-hot, so score by argmax.
    Yh = model.predict(X)
    n_correct += int((Yh.argmax(axis=1) == Y.argmax(axis=1)).sum())
    n_total += Yh.shape[0]
print(f"Accuracy: {n_correct / n_total:.3f}")
Example #21
CONFIG = """
[hyper_params]
width = 32
vector_width = 16
learn_rate = 0.001

[training]
n_iter = 10
batch_size = 128

[model]
@call = cnn_tagger
width = ${hyper_params.width}
vector_width = ${hyper_params.vector_width}
nr_classes = 17

[optimizer]
@call = thinc.api:Adam
learn_rate = ${hyper_params.learn_rate}
"""

fix_random_seed(0)

oh.config.load_str(CONFIG)

model = oh.config.model()
optimizer = oh.config.optimizer()
n_iter = oh.config.training["n_iter"]
batch_size = oh.config.training["batch_size"]
train_model(model, optimizer, n_iter, batch_size)
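The ${hyper_params.*} references are resolved by interpolation when the string is parsed. For comparison, thinc's own Config class accepts the same syntax (a sketch; the oh helper above belongs to the original example's code, not to thinc):

from thinc.api import Config

cfg = Config().from_str(CONFIG)  # interpolates ${...} by default
assert cfg["hyper_params"]["width"] == 32
assert cfg["model"]["width"] == 32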