Example #1
def main(
    # fmt: off
    in_file: str = typer.Argument(..., help="Vectors file (text-based)"),
    vocab_file: str = typer.Argument(..., help="Vocabulary file"),
    out_dir: str = typer.Argument(..., help="Path to output directory"),
    min_freq_ratio: float = typer.Option(
        0.0,
        "--min-freq-ratio",
        "-r",
        help="Frequency ratio threshold for discarding minority senses or casings",
    ),
    min_distance: float = typer.Option(
        0.0,
        "--min-distance",
        "-s",
        help="Similarity threshold for discarding redundant keys"),
    # fmt: on
):
    """
    Step 5: Export a sense2vec component

    Expects a vectors.txt and a vocab file trained with GloVe and exports
    a component that can be loaded with Sense2vec.from_disk.
    """
    input_path = Path(in_file)
    vocab_path = Path(vocab_file)
    output_path = Path(out_dir)
    if not input_path.exists():
        msg.fail("Can't find input file", in_file, exits=1)
    if input_path.suffix == ".bin":
        msg.fail("Need text-based vectors file, not binary", in_file, exits=1)
    if not vocab_path.exists():
        msg.fail("Can't find vocab file", vocab_file, exits=1)
    if not output_path.exists():
        output_path.mkdir(parents=True)
        msg.good(f"Created output directory {out_dir}")
    with input_path.open("r", encoding="utf8") as f:
        (n_vectors, vector_size), f = _get_shape(f)
        vectors_data = f.readlines()
    with vocab_path.open("r", encoding="utf8") as f:
        vocab = read_vocab(f)
    vectors = {}
    all_senses = set()
    for item in vectors_data:
        item = item.rstrip().rsplit(" ", vector_size)
        key = item[0]
        try:
            _, sense = split_key(key)
        except ValueError:
            continue
        vec = item[1:]
        if len(vec) != vector_size:
            msg.fail(f"Wrong vector size: {len(vec)} (expected {vector_size})",
                     exits=1)
        all_senses.add(sense)
        vectors[key] = numpy.asarray(vec, dtype=numpy.float32)
    discarded = set()
    discarded.update(get_minority_keys(vocab, min_freq_ratio))
    discarded.update(get_redundant_keys(vocab, vectors, min_distance))
    n_vectors = len(vectors) - len(discarded)
    s2v = Sense2Vec(shape=(n_vectors, vector_size), senses=list(all_senses))
    for key, vector in vectors.items():
        if key not in discarded:
            s2v.add(key, vector)
            s2v.set_freq(key, vocab[key])
    msg.good("Created the sense2vec model")
    msg.info(f"{n_vectors} vectors, {len(all_senses)} total senses")
    s2v.to_disk(output_path)
    msg.good("Saved model to directory", out_dir)
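# --- Usage sketch (not part of the original script): loading the exported
# component with Sense2Vec.from_disk, as the docstring describes. The path
# and query key below are placeholders; keys follow the "phrase|SENSE" format
# used by split_key above.
from sense2vec import Sense2Vec

s2v = Sense2Vec().from_disk("/path/to/out_dir")
query = "natural_language_processing|NOUN"  # hypothetical key
if query in s2v:
    for key, score in s2v.most_similar(query, n=3):
        print(key, score)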
Example #2
def train(
    lang,
    output_path,
    train_path,
    dev_path,
    raw_text=None,
    base_model=None,
    pipeline="tagger,parser,ner",
    replace_components=False,
    vectors=None,
    width=96,
    conv_depth=4,
    cnn_window=1,
    cnn_pieces=3,
    use_chars=False,
    bilstm_depth=0,
    embed_rows=2000,
    n_iter=30,
    n_early_stopping=None,
    n_examples=0,
    use_gpu=-1,
    version="0.0.0",
    meta_path=None,
    init_tok2vec=None,
    parser_multitasks="",
    entity_multitasks="",
    noise_level=0.0,
    orth_variant_level=0.0,
    eval_beam_widths="",
    gold_preproc=False,
    learn_tokens=False,
    textcat_multilabel=False,
    textcat_arch="bow",
    textcat_positive_label=None,
    tag_map_path=None,
    verbose=False,
    debug=False,
):
    """
    Train or update a spaCy model. Requires data to be formatted in spaCy's
    JSON format. To convert data from other formats, use the `spacy convert`
    command.
    """
    util.fix_random_seed()
    util.set_env_log(verbose)

    # Make sure all files and paths exists if they are needed
    train_path = util.ensure_path(train_path)
    dev_path = util.ensure_path(dev_path)
    meta_path = util.ensure_path(meta_path)
    output_path = util.ensure_path(output_path)
    if raw_text is not None:
        raw_text = list(srsly.read_jsonl(raw_text))
    if not train_path or not train_path.exists():
        msg.fail("Training data not found", train_path, exits=1)
    if not dev_path or not dev_path.exists():
        msg.fail("Development data not found", dev_path, exits=1)
    if meta_path is not None and not meta_path.exists():
        msg.fail("Can't find model meta.json", meta_path, exits=1)
    meta = srsly.read_json(meta_path) if meta_path else {}
    if output_path.exists() and [p for p in output_path.iterdir() if p.is_dir()]:
        msg.warn(
            "Output directory is not empty",
            "This can lead to unintended side effects when saving the model. "
            "Please use an empty directory or a different path instead. If "
            "the specified output path doesn't exist, the directory will be "
            "created for you.",
        )
    if not output_path.exists():
        output_path.mkdir()
        msg.good("Created output directory: {}".format(output_path))

    tag_map = {}
    if tag_map_path is not None:
        tag_map = srsly.read_json(tag_map_path)
    # Take dropout and batch size as generators of values -- dropout
    # starts high and decays sharply, to force the optimizer to explore.
    # Batch size starts at 1 and grows, so that we make updates quickly
    # at the beginning of training.
    dropout_rates = util.decaying(
        util.env_opt("dropout_from", 0.2),
        util.env_opt("dropout_to", 0.2),
        util.env_opt("dropout_decay", 0.0),
    )
    batch_sizes = util.compounding(
        util.env_opt("batch_from", 100.0),
        util.env_opt("batch_to", 1000.0),
        util.env_opt("batch_compound", 1.001),
    )
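    # With the defaults above, dropout_rates yields a constant 0.2 (decay is
    # 0.0), while batch_sizes yields 100.0, 100.1, 100.2001, ... -- each value
    # multiplied by 1.001 and capped at 1000.0.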

    if not eval_beam_widths:
        eval_beam_widths = [1]
    else:
        eval_beam_widths = [int(bw) for bw in eval_beam_widths.split(",")]
        if 1 not in eval_beam_widths:
            eval_beam_widths.append(1)
        eval_beam_widths.sort()
    has_beam_widths = eval_beam_widths != [1]

    # Set up the base model and pipeline. If a base model is specified, load
    # the model and make sure the pipeline matches the pipeline setting. If
    # training starts from a blank model, initialize the language class.
    pipeline = [p.strip() for p in pipeline.split(",")]
    disabled_pipes = None
    pipes_added = False
    msg.text("Training pipeline: {}".format(pipeline))
    if use_gpu >= 0:
        activated_gpu = None
        try:
            activated_gpu = set_gpu(use_gpu)
        except Exception as e:
            msg.warn("Exception: {}".format(e))
        if activated_gpu is not None:
            msg.text("Using GPU: {}".format(use_gpu))
        else:
            msg.warn("Unable to activate GPU: {}".format(use_gpu))
            msg.text("Using CPU only")
            use_gpu = -1
    if base_model:
        msg.text("Starting with base model '{}'".format(base_model))
        nlp = util.load_model(base_model)
        if nlp.lang != lang:
            msg.fail(
                "Model language ('{}') doesn't match language specified as "
                "`lang` argument ('{}') ".format(nlp.lang, lang),
                exits=1,
            )
        for pipe in pipeline:
            pipe_cfg = {}
            if pipe == "parser":
                pipe_cfg = {"learn_tokens": learn_tokens}
            elif pipe == "textcat":
                pipe_cfg = {
                    "exclusive_classes": not textcat_multilabel,
                    "architecture": textcat_arch,
                    "positive_label": textcat_positive_label,
                }
            if pipe not in nlp.pipe_names:
                msg.text("Adding component to base model '{}'".format(pipe))
                nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg))
                pipes_added = True
            elif replace_components:
                msg.text("Replacing component from base model '{}'".format(pipe))
                nlp.replace_pipe(pipe, nlp.create_pipe(pipe, config=pipe_cfg))
                pipes_added = True
            else:
                if pipe == "textcat":
                    textcat_cfg = nlp.get_pipe("textcat").cfg
                    base_cfg = {
                        "exclusive_classes": textcat_cfg["exclusive_classes"],
                        "architecture": textcat_cfg["architecture"],
                        "positive_label": textcat_cfg["positive_label"],
                    }
                    if base_cfg != pipe_cfg:
                        msg.fail(
                            "The base textcat model configuration does"
                            "not match the provided training options. "
                            "Existing cfg: {}, provided cfg: {}".format(
                                base_cfg, pipe_cfg
                            ),
                            exits=1,
                        )
                msg.text("Extending component from base model '{}'".format(pipe))
        disabled_pipes = nlp.disable_pipes(
            [p for p in nlp.pipe_names if p not in pipeline]
        )
    else:
        msg.text("Starting with blank model '{}'".format(lang))
        lang_cls = util.get_lang_class(lang)
        nlp = lang_cls()
        for pipe in pipeline:
            if pipe == "parser":
                pipe_cfg = {"learn_tokens": learn_tokens}
            elif pipe == "textcat":
                pipe_cfg = {
                    "exclusive_classes": not textcat_multilabel,
                    "architecture": textcat_arch,
                    "positive_label": textcat_positive_label,
                }
            else:
                pipe_cfg = {}
            nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg))

    # Update tag map with provided mapping
    nlp.vocab.morphology.tag_map.update(tag_map)

    if vectors:
        msg.text("Loading vectors from model '{}'".format(vectors))
        _load_vectors(nlp, vectors)

    # Multitask objectives
    multitask_options = [("parser", parser_multitasks), ("ner", entity_multitasks)]
    for pipe_name, multitasks in multitask_options:
        if multitasks:
            if pipe_name not in pipeline:
                msg.fail(
                    "Can't use multitask objective without '{}' in the "
                    "pipeline".format(pipe_name)
                )
            pipe = nlp.get_pipe(pipe_name)
            for objective in multitasks.split(","):
                pipe.add_multitask_objective(objective)

    # Prepare training corpus
    msg.text("Counting training words (limit={})".format(n_examples))
    corpus = GoldCorpus(train_path, dev_path, limit=n_examples)
    n_train_words = corpus.count_train()

    if base_model and not pipes_added:
        # Start with an existing model, use default optimizer
        optimizer = create_default_optimizer(Model.ops)
    else:
        # Start with a blank model, call begin_training
        cfg = {"device": use_gpu}
        cfg["conv_depth"] = conv_depth
        cfg["token_vector_width"] = width
        cfg["bilstm_depth"] = bilstm_depth
        cfg["cnn_maxout_pieces"] = cnn_pieces
        cfg["embed_size"] = embed_rows
        cfg["conv_window"] = cnn_window
        cfg["subword_features"] = not use_chars
        optimizer = nlp.begin_training(lambda: corpus.train_tuples, **cfg)

    nlp._optimizer = None

    # Load in pretrained weights
    if init_tok2vec is not None:
        components = _load_pretrained_tok2vec(nlp, init_tok2vec)
        msg.text("Loaded pretrained tok2vec for: {}".format(components))

    # Verify textcat config
    if "textcat" in pipeline:
        textcat_labels = nlp.get_pipe("textcat").cfg.get("labels", [])
        if textcat_positive_label and textcat_positive_label not in textcat_labels:
            msg.fail(
                "The textcat_positive_label (tpl) '{}' does not match any "
                "label in the training data.".format(textcat_positive_label),
                exits=1,
            )
        if textcat_positive_label and len(textcat_labels) != 2:
            msg.fail(
                "A textcat_positive_label (tpl) '{}' was provided for training "
                "data that does not appear to be a binary classification "
                "problem with two labels.".format(textcat_positive_label),
                exits=1,
            )
        train_docs = corpus.train_docs(
            nlp,
            noise_level=noise_level,
            gold_preproc=gold_preproc,
            max_length=0,
            ignore_misaligned=True,
        )
        train_labels = set()
        if textcat_multilabel:
            multilabel_found = False
            for text, gold in train_docs:
                train_labels.update(gold.cats.keys())
                if list(gold.cats.values()).count(1.0) != 1:
                    multilabel_found = True
            if not multilabel_found and not base_model:
                msg.warn(
                    "The textcat training instances look like they have "
                    "mutually-exclusive classes. Remove the flag "
                    "'--textcat-multilabel' to train a classifier with "
                    "mutually-exclusive classes."
                )
        if not textcat_multilabel:
            for text, gold in train_docs:
                train_labels.update(gold.cats.keys())
                if list(gold.cats.values()).count(1.0) != 1 and not base_model:
                    msg.warn(
                        "Some textcat training instances do not have exactly "
                        "one positive label. Modifying training options to "
                        "include the flag '--textcat-multilabel' for classes "
                        "that are not mutually exclusive."
                    )
                    nlp.get_pipe("textcat").cfg["exclusive_classes"] = False
                    textcat_multilabel = True
                    break
        if base_model and set(textcat_labels) != train_labels:
            msg.fail(
                "Cannot extend textcat model using data with different "
                "labels. Base model labels: {}, training data labels: "
                "{}.".format(textcat_labels, list(train_labels)),
                exits=1,
            )
        if textcat_multilabel:
            msg.text(
                "Textcat evaluation score: ROC AUC score macro-averaged across "
                "the labels '{}'".format(", ".join(textcat_labels))
            )
        elif textcat_positive_label and len(textcat_labels) == 2:
            msg.text(
                "Textcat evaluation score: F1-score for the "
                "label '{}'".format(textcat_positive_label)
            )
        elif len(textcat_labels) > 1:
            if len(textcat_labels) == 2:
                msg.warn(
                    "If the textcat component is a binary classifier with "
                    "exclusive classes, provide '--textcat-positive-label' for "
                    "an evaluation on the positive class."
                )
            msg.text(
                "Textcat evaluation score: F1-score macro-averaged across "
                "the labels '{}'".format(", ".join(textcat_labels))
            )
        else:
            msg.fail(
                "Unsupported textcat configuration. Use `spacy debug-data` "
                "for more information."
            )

    # fmt: off
    row_head, output_stats = _configure_training_output(pipeline, use_gpu, has_beam_widths)
    row_widths = [len(w) for w in row_head]
    row_settings = {"widths": row_widths, "aligns": tuple(["r" for i in row_head]), "spacing": 2}
    # fmt: on
    print("")
    msg.row(row_head, **row_settings)
    msg.row(["-" * width for width in row_settings["widths"]], **row_settings)
    try:
        iter_since_best = 0
        best_score = 0.0
        for i in range(n_iter):
            train_docs = corpus.train_docs(
                nlp,
                noise_level=noise_level,
                orth_variant_level=orth_variant_level,
                gold_preproc=gold_preproc,
                max_length=0,
                ignore_misaligned=True,
            )
            if raw_text:
                random.shuffle(raw_text)
                raw_batches = util.minibatch(
                    (nlp.make_doc(rt["text"]) for rt in raw_text), size=8
                )
            words_seen = 0
            with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
                losses = {}
                for batch in util.minibatch_by_words(train_docs, size=batch_sizes):
                    if not batch:
                        continue
                    docs, golds = zip(*batch)
                    try:
                        nlp.update(
                            docs,
                            golds,
                            sgd=optimizer,
                            drop=next(dropout_rates),
                            losses=losses,
                        )
                    except ValueError as e:
                        err = "Error during training"
                        if init_tok2vec:
                            err += " Did you provide the same parameters during 'train' as during 'pretrain'?"
                        msg.fail(err, "Original error message: {}".format(e), exits=1)
                    if raw_text:
                        # If raw text is available, perform 'rehearsal' updates,
                        # which use unlabelled data to reduce overfitting.
                        raw_batch = list(next(raw_batches))
                        nlp.rehearse(raw_batch, sgd=optimizer, losses=losses)
                    if not int(os.environ.get("LOG_FRIENDLY", 0)):
                        pbar.update(sum(len(doc) for doc in docs))
                    words_seen += sum(len(doc) for doc in docs)
            with nlp.use_params(optimizer.averages):
                util.set_env_log(False)
                epoch_model_path = output_path / ("model%d" % i)
                nlp.to_disk(epoch_model_path)
                nlp_loaded = util.load_model_from_path(epoch_model_path)
                for beam_width in eval_beam_widths:
                    for name, component in nlp_loaded.pipeline:
                        if hasattr(component, "cfg"):
                            component.cfg["beam_width"] = beam_width
                    dev_docs = list(
                        corpus.dev_docs(
                            nlp_loaded,
                            gold_preproc=gold_preproc,
                            ignore_misaligned=True,
                        )
                    )
                    nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
                    start_time = timer()
                    scorer = nlp_loaded.evaluate(dev_docs, verbose=verbose)
                    end_time = timer()
                    if use_gpu < 0:
                        gpu_wps = None
                        cpu_wps = nwords / (end_time - start_time)
                    else:
                        gpu_wps = nwords / (end_time - start_time)
                        # Only evaluate on CPU in the first iteration (for
                        # timing) if GPU is enabled
                        if i == 0:
                            with Model.use_device("cpu"):
                                nlp_loaded = util.load_model_from_path(epoch_model_path)
                                for name, component in nlp_loaded.pipeline:
                                    if hasattr(component, "cfg"):
                                        component.cfg["beam_width"] = beam_width
                                dev_docs = list(
                                    corpus.dev_docs(
                                        nlp_loaded,
                                        gold_preproc=gold_preproc,
                                        ignore_misaligned=True,
                                    )
                                )
                                start_time = timer()
                                scorer = nlp_loaded.evaluate(dev_docs, verbose=verbose)
                                end_time = timer()
                                cpu_wps = nwords / (end_time - start_time)
                    acc_loc = output_path / ("model%d" % i) / "accuracy.json"
                    srsly.write_json(acc_loc, scorer.scores)

                    # Update model meta.json
                    meta["lang"] = nlp.lang
                    meta["pipeline"] = nlp.pipe_names
                    meta["spacy_version"] = ">=%s" % about.__version__
                    if beam_width == 1:
                        meta["speed"] = {
                            "nwords": nwords,
                            "cpu": cpu_wps,
                            "gpu": gpu_wps,
                        }
                        meta.setdefault("accuracy", {})
                        for component in nlp.pipe_names:
                            for metric in _get_metrics(component):
                                meta["accuracy"][metric] = scorer.scores[metric]
                    else:
                        meta.setdefault("beam_accuracy", {})
                        meta.setdefault("beam_speed", {})
                        for component in nlp.pipe_names:
                            for metric in _get_metrics(component):
                                meta["beam_accuracy"][metric] = scorer.scores[metric]
                        meta["beam_speed"][beam_width] = {
                            "nwords": nwords,
                            "cpu": cpu_wps,
                            "gpu": gpu_wps,
                        }
                    meta["vectors"] = {
                        "width": nlp.vocab.vectors_length,
                        "vectors": len(nlp.vocab.vectors),
                        "keys": nlp.vocab.vectors.n_keys,
                        "name": nlp.vocab.vectors.name,
                    }
                    meta.setdefault("name", "model%d" % i)
                    meta.setdefault("version", version)
                    meta["labels"] = nlp.meta["labels"]
                    meta_loc = output_path / ("model%d" % i) / "meta.json"
                    srsly.write_json(meta_loc, meta)
                    util.set_env_log(verbose)

                    progress = _get_progress(
                        i,
                        losses,
                        scorer.scores,
                        output_stats,
                        beam_width=beam_width if has_beam_widths else None,
                        cpu_wps=cpu_wps,
                        gpu_wps=gpu_wps,
                    )
                    if i == 0 and "textcat" in pipeline:
                        textcats_per_cat = scorer.scores.get("textcats_per_cat", {})
                        for cat, cat_score in textcats_per_cat.items():
                            if cat_score.get("roc_auc_score", 0) < 0:
                                msg.warn(
                                    "Textcat ROC AUC score is undefined due to "
                                    "only one value in label '{}'.".format(cat)
                                )
                    msg.row(progress, **row_settings)
                # Early stopping
                if n_early_stopping is not None:
                    current_score = _score_for_model(meta)
                    if current_score < best_score:
                        iter_since_best += 1
                    else:
                        iter_since_best = 0
                        best_score = current_score
                    if iter_since_best >= n_early_stopping:
                        msg.text(
                            "Early stopping, best iteration "
                            "is: {}".format(i - iter_since_best)
                        )
                        msg.text(
                            "Best score = {}; Final iteration "
                            "score = {}".format(best_score, current_score)
                        )
                        break
    except Exception as e:
        msg.warn(
            "Aborting and saving the final best model. "
            "Encountered exception: {}".format(e),
            exits=1,
        )
    finally:
        best_pipes = nlp.pipe_names
        if disabled_pipes:
            disabled_pipes.restore()
        with nlp.use_params(optimizer.averages):
            final_model_path = output_path / "model-final"
            nlp.to_disk(final_model_path)
            meta_loc = output_path / "model-final" / "meta.json"
            final_meta = srsly.read_json(meta_loc)
            final_meta.setdefault("accuracy", {})
            final_meta["accuracy"].update(meta.get("accuracy", {}))
            final_meta.setdefault("speed", {})
            final_meta["speed"].setdefault("cpu", None)
            final_meta["speed"].setdefault("gpu", None)
            meta.setdefault("speed", {})
            meta["speed"].setdefault("cpu", None)
            meta["speed"].setdefault("gpu", None)
            # combine cpu and gpu speeds with the base model speeds
            if final_meta["speed"]["cpu"] and meta["speed"]["cpu"]:
                speed = _get_total_speed(
                    [final_meta["speed"]["cpu"], meta["speed"]["cpu"]]
                )
                final_meta["speed"]["cpu"] = speed
            if final_meta["speed"]["gpu"] and meta["speed"]["gpu"]:
                speed = _get_total_speed(
                    [final_meta["speed"]["gpu"], meta["speed"]["gpu"]]
                )
                final_meta["speed"]["gpu"] = speed
            # if there were no speeds to update, overwrite with meta
            if (
                final_meta["speed"]["cpu"] is None
                and final_meta["speed"]["gpu"] is None
            ):
                final_meta["speed"].update(meta["speed"])
            # note: beam speeds are not combined with the base model
            if has_beam_widths:
                final_meta.setdefault("beam_accuracy", {})
                final_meta["beam_accuracy"].update(meta.get("beam_accuracy", {}))
                final_meta.setdefault("beam_speed", {})
                final_meta["beam_speed"].update(meta.get("beam_speed", {}))
            srsly.write_json(meta_loc, final_meta)
        msg.good("Saved model to output directory", final_model_path)
        with msg.loading("Creating best model..."):
            best_model_path = _collate_best_model(final_meta, output_path, best_pipes)
        msg.good("Created best model", best_model_path)
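# --- Usage sketch (not part of the original function): a minimal invocation.
# All paths are placeholders, and the data must already be in spaCy's JSON format.
train(
    lang="en",
    output_path="./output",
    train_path="./train.json",
    dev_path="./dev.json",
    pipeline="ner",  # train only the entity recognizer
    n_iter=10,
    use_gpu=-1,      # CPU; pass a device ID to try the GPU code path
)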
Example #3
    for fund in funds:
        for stock in fund.stocks:
            if stocks.get(stock.code):
                stocks[stock.code].proportion += float(
                    stock.proportion) / fund_count
            else:
                stocks[stock.code] = stock
                stocks[stock.code].proportion = float(
                    stocks[stock.code].proportion) / fund_count
    total_money = 0
    sorted_stocks = sorted(stocks.values(),
                           key=lambda item: item.proportion,
                           reverse=True)
    stock_table = PrettyTable()
    stock_table.title = 'Stock List'
    stock_table.field_names = ['Code', 'Name', '% of Net Assets', 'Buy Amount']
    stock_table.align['% of Net Assets'] = 'r'
    stock_table.align['Buy Amount'] = 'r'
    for stock in sorted_stocks:
        # proportions were already averaged over fund_count when collected
        # above, so don't divide by fund_count a second time here
        cost = int(money * stock.proportion / 100)
        total_money += cost
        stock_table.add_row([
            stock.code, stock.name, f'{stock.proportion:.02f}%',
            f'{cost:.02f}'
        ])
    print(stock_table)
    msg.good(
        f'Total buy-in: {total_money:.02f} yuan, {money - total_money:.02f} yuan left; '
        "consider parking the remainder in a money-market product such as Yu'e Bao, "
        'Yulibao or Lingqiantong.'
    )
    msg.warn('The above is for reference only. Stock markets are risky; invest responsibly!')
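# --- Sketch (assumption, not from the original source): the fragment above
# never defines funds, stocks, fund_count or money. A plausible shape for
# those inputs would be:
from dataclasses import dataclass
from typing import Dict, List

@dataclass
class Stock:
    code: str          # e.g. "600519"
    name: str
    proportion: float  # percentage of the fund's net assets

@dataclass
class Fund:
    stocks: List[Stock]

funds: List[Fund] = []         # populated elsewhere, e.g. from a fund API
fund_count = len(funds)
stocks: Dict[str, Stock] = {}  # accumulator keyed by stock code
money = 10000                  # total budget in yuan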
def main(
    model="./zh_vectors_web_ud_lg/model-final",
    new_model_name="zh_vectors_web_ud_clue_lg",
    output_dir="./zh_vectors_web_ud_clue_lg",
    train_path="./clue_spacy_train.jsonl",
    dev_path="./clue_spacy_dev.jsonl",
    meta_path="./meta.json",
    use_gpu=0,
    n_iter=50
):
    """Set up the pipeline and entity recognizer, and train the new entity."""
    import tqdm
    random.seed(0)
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)

    # Add entity recognizer to model if it's not in the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner)
    # otherwise, get it, so we can add labels to it
    else:
        ner = nlp.get_pipe("ner")

    for label in LABEL:
        if label not in ner.labels:
            ner.add_label(label)  # add new entity label to entity recognizer

    train_path = ensure_path(train_path)
    dev_path = ensure_path(dev_path)
    output_dir = ensure_path(output_dir)  # the str default must become a Path before .exists()/.mkdir()

    if not train_path or not train_path.exists():
        msg.fail("Training data not found", train_path, exits=1)
    if not dev_path or not dev_path.exists():
        msg.fail("Development data not found", dev_path, exits=1)

    if output_dir.exists() and [p for p in output_dir.iterdir() if p.is_dir()]:
        msg.warn(
            "Output directory is not empty",
            "This can lead to unintended side effects when saving the model. "
            "Please use an empty directory or a different path instead. If "
            "the specified output path doesn't exist, the directory will be "
            "created for you.",
        )
    if not output_dir.exists():
        output_dir.mkdir()

    meta = srsly.read_json(meta_path) if meta_path else {}

    # Prepare training corpus
    msg.text("Counting training words (limit={})".format(0))
    corpus = GoldCorpus(train_path, dev_path, limit=0)
    n_train_words = corpus.count_train()
    
    if model is None:   
        optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu)
    else:
        optimizer = create_default_optimizer(Model.ops)
        # Todo: gpu train?

    dropout_rates = decaying(0.2, 0.2, 0.0)

    batch_sizes = compounding(100.0, 1000.0, 1.001)

    # get names of other pipes to disable them during training
    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [
        pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]


    # Compute the beam-width settings up front, so 'has_beam_widths' is always
    # assigned before it's used below
    # fmt: off
    eval_beam_widths = ""
    if not eval_beam_widths:
        eval_beam_widths = [1]
    else:
        eval_beam_widths = [int(bw) for bw in eval_beam_widths.split(",")]
        if 1 not in eval_beam_widths:
            eval_beam_widths.append(1)
        eval_beam_widths.sort()
    has_beam_widths = eval_beam_widths != [1]
    row_head, output_stats = _configure_training_output(["ner"], use_gpu, has_beam_widths)
    row_widths = [len(w) for w in row_head]
    row_settings = {"widths": row_widths, "aligns": tuple(["r" for i in row_head]), "spacing": 2}
    # fmt: on
    print("")
    msg.row(row_head, **row_settings)
    msg.row(["-" * width for width in row_settings["widths"]], **row_settings)
    try:
        noise_level = 0.0
        orth_variant_level = 0.0
        gold_preproc = False
        verbose = False

        best_score = 0.0
        with nlp.disable_pipes(*other_pipes):  # only train NER
            for itn in range(n_iter):
                train_docs = corpus.train_docs(
                    nlp,
                    noise_level=noise_level,
                    orth_variant_level=orth_variant_level,
                    gold_preproc=gold_preproc,
                    max_length=0,
                    ignore_misaligned=True,
                )
                words_seen = 0
                with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
                    losses = {}
                    for batch in minibatch_by_words(train_docs, size=batch_sizes):
                        if not batch:
                            continue
                        docs, golds = zip(*batch)
                        nlp.update(
                            docs,
                            golds,
                            sgd=optimizer,
                            drop=next(dropout_rates),
                            losses=losses,
                        )
                        if not int(os.environ.get("LOG_FRIENDLY", 0)):
                            pbar.update(sum(len(doc) for doc in docs))
                        words_seen += sum(len(doc) for doc in docs)
                with nlp.use_params(optimizer.averages):
                    set_env_log(False)
                    epoch_model_path = output_dir / ("model%d" % itn)
                    nlp.to_disk(epoch_model_path)
                    nlp_loaded = load_model_from_path(epoch_model_path)
                    for beam_width in eval_beam_widths:
                        for name, component in nlp_loaded.pipeline:
                            if hasattr(component, "cfg"):
                                component.cfg["beam_width"] = beam_width
                        dev_docs = list(
                            corpus.dev_docs(
                                nlp_loaded,
                                gold_preproc=gold_preproc,
                                ignore_misaligned=True,
                            )
                        )
                        nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
                        start_time = timer()
                        scorer = nlp_loaded.evaluate(dev_docs, verbose=verbose)
                        end_time = timer()
                        if use_gpu < 0:
                            gpu_wps = None
                            cpu_wps = nwords / (end_time - start_time)
                        else:
                            gpu_wps = nwords / (end_time - start_time)
                            with Model.use_device("cpu"):
                                nlp_loaded = load_model_from_path(epoch_model_path)
                                for name, component in nlp_loaded.pipeline:
                                    if hasattr(component, "cfg"):
                                        component.cfg["beam_width"] = beam_width
                                dev_docs = list(
                                    corpus.dev_docs(
                                        nlp_loaded,
                                        gold_preproc=gold_preproc,
                                        ignore_misaligned=True,
                                    )
                                )
                                start_time = timer()
                                scorer = nlp_loaded.evaluate(dev_docs, verbose=verbose)
                                end_time = timer()
                                cpu_wps = nwords / (end_time - start_time)
                        acc_loc = output_dir / ("model%d" % itn) / "accuracy.json"
                        srsly.write_json(acc_loc, scorer.scores)

                        # Update model meta.json
                        meta["lang"] = nlp.lang
                        meta["pipeline"] = nlp.pipe_names
                        meta["spacy_version"] = ">=%s" % spacy.__version__
                        if beam_width == 1:
                            meta["speed"] = {
                                "nwords": nwords,
                                "cpu": cpu_wps,
                                "gpu": gpu_wps,
                            }
                            meta["accuracy"] = scorer.scores
                        else:
                            meta.setdefault("beam_accuracy", {})
                            meta.setdefault("beam_speed", {})
                            meta["beam_accuracy"][beam_width] = scorer.scores
                            meta["beam_speed"][beam_width] = {
                                "nwords": nwords,
                                "cpu": cpu_wps,
                                "gpu": gpu_wps,
                            }
                        meta["vectors"] = {
                            "width": nlp.vocab.vectors_length,
                            "vectors": len(nlp.vocab.vectors),
                            "keys": nlp.vocab.vectors.n_keys,
                            "name": nlp.vocab.vectors.name,
                        }
                        meta.setdefault("name", "model%d" % itn)
                        meta.setdefault("version", "0.0.1")
                        meta["labels"] = nlp.meta["labels"]
                        meta_loc = output_dir / ("model%d" % itn) / "meta.json"
                        srsly.write_json(meta_loc, meta)
                        set_env_log(verbose)

                        progress = _get_progress(
                            itn,
                            losses,
                            scorer.scores,
                            output_stats,
                            beam_width=beam_width if has_beam_widths else None,
                            cpu_wps=cpu_wps,
                            gpu_wps=gpu_wps,
                        )

                        msg.row(progress, **row_settings)

    finally:
        with nlp.use_params(optimizer.averages):
            final_model_path = output_dir / "model-final"
            nlp.to_disk(final_model_path)
        msg.good("Saved model to output directory", final_model_path)
        meta["pipeline"] = nlp.pipe_names
        meta["labels"] = nlp.meta["labels"]
        meta["factories"] = nlp.meta["factories"]
        with msg.loading("Creating best model..."):
            best_model_path = _collate_best_model(meta, output_dir, nlp.pipe_names)
        msg.good("Created best model", best_model_path)
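# --- Sketch (assumption, not shown in the excerpt): the function relies on a
# module-level LABEL iterable of new entity types. A minimal setup might be:
LABEL = ["ORG", "PRODUCT", "EVENT"]  # example labels only

if __name__ == "__main__":
    main()  # uses the default paths baked into the signature (use_gpu=0 assumes a GPU)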
Example #5
def main(vectors,
         gpu_id=-1,
         n_neighbors=100,
         batch_size=1024,
         cutoff=0,
         start=0,
         end=None):
    """
    Step 6: Precompute nearest-neighbor queries (optional)

    Precompute nearest-neighbor queries for every entry in the vocab to make
    Sense2Vec.most_similar faster. The --cutoff option lets you define the
    number of earliest rows to limit the neighbors to. For instance, if cutoff
    is 100000, no word will have a nearest neighbor outside of the top 100k
    vectors.
    """
    if gpu_id == -1:
        xp = numpy
    else:
        import cupy as xp
        import cupy.cuda.device

        xp.take_along_axis = take_along_axis
        device = cupy.cuda.device.Device(gpu_id)
        cupy.cuda.get_cublas_handle()
        device.use()
    vectors_dir = Path(vectors)
    vectors_file = vectors_dir / "vectors"
    if not vectors_dir.is_dir() or not vectors_file.exists():
        err = "Are you passing in the exported sense2vec directory containing a vectors file?"
        msg.fail(f"Can't load vectors from {vectors}", err, exits=1)
    with msg.loading(f"Loading vectors from {vectors}"):
        vectors = xp.load(str(vectors_file))
    msg.good(
        f"Loaded {vectors.shape[0]:,} vectors with dimension {vectors.shape[1]}"
    )
    norms = xp.linalg.norm(vectors, axis=1, keepdims=True)
    norms[norms == 0] = 1
    # Normalize to unit norm
    vectors /= norms
    if cutoff < 1:
        cutoff = vectors.shape[0]
    if end is None:
        end = vectors.shape[0]
    mean = float(norms.mean())
    var = float(norms.var())
    msg.good(f"Normalized (mean {mean:,.2f}, variance {var:,.2f})")
    msg.info(
        f"Finding {n_neighbors:,} neighbors among {cutoff:,} most frequent")
    n = min(n_neighbors, vectors.shape[0])
    subset = vectors[:cutoff]
    best_rows = xp.zeros((end - start, n), dtype="i")
    scores = xp.zeros((end - start, n), dtype="f")
    for i in tqdm.tqdm(list(range(start, end, batch_size))):
        size = min(batch_size, end - i)
        batch = vectors[i:i + size]
        sims = xp.dot(batch, subset.T)
        # Set self-similarities to -inf, so that we don't return them.
        for j in range(size):
            if i + j < sims.shape[1]:
                sims[j, i + j] = -xp.inf
        # This used to use argpartition to do a partial sort... but that ended
        # up being a rat's nest of terrible numpy crap. Just sorting the whole
        # list isn't really slower, and it's much simpler to read.
        ranks = xp.argsort(sims, axis=1)
        batch_rows = ranks[:, -n:]
        # Reverse
        batch_rows = batch_rows[:, ::-1]
        batch_scores = xp.take_along_axis(sims, batch_rows, axis=1)
        best_rows[i:i + size] = batch_rows
        scores[i:i + size] = batch_scores
    msg.info("Saving output")
    if not isinstance(best_rows, numpy.ndarray):
        best_rows = best_rows.get()
    if not isinstance(scores, numpy.ndarray):
        scores = scores.get()
    output = {
        "indices": best_rows,
        "scores": scores.astype("float16"),
        "start": start,
        "end": end,
        "cutoff": cutoff,
    }
    output_file = vectors_dir / "cache"
    with msg.loading("Saving output..."):
        srsly.write_msgpack(output_file, output)
    msg.good(f"Saved cache to {output_file}")
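# --- Usage sketch (not part of the original script): the cache is a plain
# msgpack file, so it can be inspected with the same srsly dependency used
# above. The path below is a placeholder.
cache = srsly.read_msgpack(Path("/path/to/s2v_dir") / "cache")
print(cache["indices"].shape)  # (end - start, n) neighbor row indices
print(cache["scores"].dtype)   # float16 similarity scores
print(cache["start"], cache["end"], cache["cutoff"])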
Example #6
def pretrain(
    texts_loc,
    vectors_model,
    output_dir,
    width=96,
    depth=4,
    bilstm_depth=0,
    cnn_pieces=3,
    sa_depth=0,
    use_chars=False,
    cnn_window=1,
    embed_rows=2000,
    loss_func="cosine",
    use_vectors=False,
    dropout=0.2,
    n_iter=1000,
    batch_size=3000,
    max_length=500,
    min_length=5,
    seed=0,
    n_save_every=None,
    init_tok2vec=None,
    epoch_start=None,
):
    """
    Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components,
    using an approximate language-modelling objective. Specifically, we load
    pretrained vectors, and train a component like a CNN, BiLSTM, etc to predict
    vectors which match the pretrained ones. The weights are saved to a directory
    after each epoch. You can then pass a path to one of these pretrained weights
    files to the 'spacy train' command.

    This technique may be especially helpful if you have little labelled data.
    However, it's still quite experimental, so your mileage may vary.

    To load the weights back in during 'spacy train', you need to ensure
    all settings are the same between pretraining and training. The API and
    errors around this need some improvement.
    """
    config = dict(locals())
    for key in config:
        if isinstance(config[key], Path):
            config[key] = str(config[key])
    util.fix_random_seed(seed)

    has_gpu = prefer_gpu()
    if has_gpu:
        import torch

        torch.set_default_tensor_type("torch.cuda.FloatTensor")
    msg.info("Using GPU" if has_gpu else "Not using GPU")

    output_dir = Path(output_dir)
    if not output_dir.exists():
        output_dir.mkdir()
        msg.good("Created output directory")
    srsly.write_json(output_dir / "config.json", config)
    msg.good("Saved settings to config.json")

    # Load texts from file or stdin
    if texts_loc != "-":  # reading from a file
        texts_loc = Path(texts_loc)
        if not texts_loc.exists():
            msg.fail("Input text file doesn't exist", texts_loc, exits=1)
        with msg.loading("Loading input texts..."):
            texts = list(srsly.read_jsonl(texts_loc))
        if not texts:
            msg.fail("Input file is empty", texts_loc, exits=1)
        msg.good("Loaded input texts")
        random.shuffle(texts)
    else:  # reading from stdin
        msg.text("Reading input text from stdin...")
        texts = srsly.read_jsonl("-")

    with msg.loading("Loading model '{}'...".format(vectors_model)):
        nlp = util.load_model(vectors_model)
    msg.good("Loaded model '{}'".format(vectors_model))
    pretrained_vectors = None if not use_vectors else nlp.vocab.vectors.name
    model = create_pretraining_model(
        nlp,
        Tok2Vec(
            width,
            embed_rows,
            conv_depth=depth,
            pretrained_vectors=pretrained_vectors,
            bilstm_depth=bilstm_depth,  # Requires PyTorch. Experimental.
            subword_features=not use_chars,  # Set to False for Chinese etc
            cnn_maxout_pieces=cnn_pieces,  # If set to 1, use Mish activation.
        ),
    )
    # Load in pretrained weights
    if init_tok2vec is not None:
        components = _load_pretrained_tok2vec(nlp, init_tok2vec)
        msg.text("Loaded pretrained tok2vec for: {}".format(components))
        # Parse the epoch number from the given weight file
        model_name = re.search(r"model\d+\.bin", str(init_tok2vec))
        if model_name:
            # Default weight file name, so read epoch_start from it by stripping 'model' and '.bin'
            epoch_start = int(model_name.group(0)[5:][:-4]) + 1
        else:
            if not epoch_start:
                msg.fail(
                    "You have to use the '--epoch-start' argument when using a renamed weight file for "
                    "'--init-tok2vec'",
                    exits=True,
                )
            elif epoch_start < 0:
                msg.fail(
                    "The argument '--epoch-start' has to be greater or equal to 0. '%d' is invalid"
                    % epoch_start,
                    exits=True,
                )
    else:
        # Without '--init-tok2vec' the '--epoch-start' argument is ignored
        epoch_start = 0

    optimizer = create_default_optimizer(model.ops)
    tracker = ProgressTracker(frequency=10000)
    msg.divider("Pre-training tok2vec layer - starting at epoch %d" % epoch_start)
    row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")}
    msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings)

    def _save_model(epoch, is_temp=False):
        is_temp_str = ".temp" if is_temp else ""
        with model.use_params(optimizer.averages):
            with (output_dir / ("model%d%s.bin" % (epoch, is_temp_str))).open(
                "wb"
            ) as file_:
                file_.write(model.tok2vec.to_bytes())
            log = {
                "nr_word": tracker.nr_word,
                "loss": tracker.loss,
                "epoch_loss": tracker.epoch_loss,
                "epoch": epoch,
            }
            with (output_dir / "log.jsonl").open("a") as file_:
                file_.write(srsly.json_dumps(log) + "\n")

    skip_counter = 0
    for epoch in range(epoch_start, n_iter + epoch_start):
        for batch_id, batch in enumerate(
            util.minibatch_by_words(((text, None) for text in texts), size=batch_size)
        ):
            docs, count = make_docs(
                nlp,
                [text for (text, _) in batch],
                max_length=max_length,
                min_length=min_length,
            )
            skip_counter += count
            loss = make_update(
                model, docs, optimizer, objective=loss_func, drop=dropout
            )
            progress = tracker.update(epoch, loss, docs)
            if progress:
                msg.row(progress, **row_settings)
                if texts_loc == "-" and tracker.words_per_epoch[epoch] >= 10 ** 7:
                    break
            if n_save_every and (batch_id % n_save_every == 0):
                _save_model(epoch, is_temp=True)
        _save_model(epoch)
        tracker.epoch_loss = 0.0
        if texts_loc != "-":
            # Reshuffle the texts if texts were loaded from a file
            random.shuffle(texts)
    if skip_counter > 0:
        msg.warn("Skipped {count} empty values".format(count=str(skip_counter)))
    msg.good("Successfully finished pretrain")
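# --- Usage sketch (not part of the original function): a minimal invocation.
# Paths and the vectors model name are placeholders.
pretrain(
    texts_loc="./texts.jsonl",          # one {"text": ...} object per line
    vectors_model="en_vectors_web_lg",  # any model providing word vectors
    output_dir="./pretrain_output",
    n_iter=100,
)
# The resulting model<epoch>.bin files can then be passed to training via
# the init_tok2vec argument (see the train example above).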
Example #7
def project_assets(
    project_dir: Path,
    *,
    overrides: Dict[str, Any] = SimpleFrozenDict(),
    sparse_checkout: bool = False,
) -> None:
    """Fetch assets for a project using DVC if possible.

    project_dir (Path): Path to project directory.
    """
    project_path = ensure_path(project_dir)
    config = load_project_config(project_path, overrides=overrides)
    assets = config.get("assets", {})
    if not assets:
        msg.warn(f"No assets specified in {PROJECT_FILE}", exits=0)
    msg.info(f"Fetching {len(assets)} asset(s)")
    for asset in assets:
        dest = (project_dir / asset["dest"]).resolve()
        checksum = asset.get("checksum")
        if "git" in asset:
            git_err = (
                f"Cloning spaCy project templates requires Git and the 'git' command. "
                f"Make sure it's installed and that the executable is available."
            )
            get_git_version(error=git_err)
            if dest.exists():
                # If there's already a file, check for checksum
                if checksum and checksum == get_checksum(dest):
                    msg.good(
                        f"Skipping download with matching checksum: {asset['dest']}"
                    )
                    continue
                else:
                    if dest.is_dir():
                        shutil.rmtree(dest)
                    else:
                        dest.unlink()
            if "repo" not in asset["git"] or asset["git"]["repo"] is None:
                msg.fail(
                    "A git asset must include 'repo', the repository address.",
                    exits=1)
            if "path" not in asset["git"] or asset["git"]["path"] is None:
                msg.fail(
                    "A git asset must include 'path' - use \"\" to get the entire repository.",
                    exits=1,
                )
            git_checkout(
                asset["git"]["repo"],
                asset["git"]["path"],
                dest,
                branch=asset["git"].get("branch"),
                sparse=sparse_checkout,
            )
            msg.good(f"Downloaded asset {dest}")
        else:
            url = asset.get("url")
            if not url:
                # project.yml defines asset without URL that the user has to place
                check_private_asset(dest, checksum)
                continue
            fetch_asset(project_path, url, dest, checksum)
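# --- Reference sketch (assumption): the shape of the 'assets' entries the
# loop above consumes, as loaded from project.yml. Keys mirror the lookups
# in the code; all values are placeholders.
assets = [
    {   # URL asset: downloaded via fetch_asset
        "dest": "assets/training.jsonl",
        "url": "https://example.com/training.jsonl",
        "checksum": "63373dd656daa1fd3043ce166a59474c",
    },
    {   # Git asset: fetched via git_checkout
        "dest": "assets/templates",
        "git": {
            "repo": "https://github.com/example/repo",
            "path": "templates",  # "" fetches the entire repository
            "branch": "main",
        },
    },
    {   # Private asset: no 'url'; the user places the file at 'dest'
        "dest": "assets/private.zip",
        "checksum": "5113dc04e03f079525edd8df3f4f39e3",
    },
]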
Example #8
def package(input_dir,
            output_dir,
            meta_path=None,
            create_meta=False,
            force=False):
    """
    Generate Python package for model data, including meta and required
    installation files. A new directory will be created in the specified
    output directory, and model data will be copied over. If --create-meta is
    set and a meta.json already exists in the output directory, the existing
    values will be used as the defaults in the command-line prompt.
    """
    input_path = util.ensure_path(input_dir)
    output_path = util.ensure_path(output_dir)
    meta_path = util.ensure_path(meta_path)
    if not input_path or not input_path.exists():
        msg.fail("Can't locate model data", input_path, exits=1)
    if not output_path or not output_path.exists():
        msg.fail("Output directory not found", output_path, exits=1)
    if meta_path and not meta_path.exists():
        msg.fail("Can't find model meta.json", meta_path, exits=1)

    meta_path = meta_path or input_path / "meta.json"
    if meta_path.is_file():
        meta = srsly.read_json(meta_path)
        if not create_meta:  # only print if user doesn't want to overwrite
            msg.good("Loaded meta.json from file", meta_path)
        else:
            meta = generate_meta(input_dir, meta, msg)
    for key in ("lang", "name", "version"):
        if key not in meta or meta[key] == "":
            msg.fail(
                "No '{}' setting found in meta.json".format(key),
                "This setting is required to build your package.",
                exits=1,
            )
    model_name = meta["lang"] + "_" + meta["name"]
    model_name_v = model_name + "-" + meta["version"]
    main_path = output_path / model_name_v
    package_path = main_path / model_name

    if package_path.exists():
        if force:
            shutil.rmtree(path2str(package_path))
        else:
            msg.fail(
                "Package directory already exists",
                "Please delete the directory and try again, or use the "
                "`--force` flag to overwrite existing directories.",
                exits=1,
            )
    Path.mkdir(package_path, parents=True)
    shutil.copytree(path2str(input_path),
                    path2str(package_path / model_name_v))
    create_file(main_path / "meta.json", srsly.json_dumps(meta, indent=2))
    create_file(main_path / "setup.py", TEMPLATE_SETUP)
    create_file(main_path / "MANIFEST.in", TEMPLATE_MANIFEST)
    create_file(package_path / "__init__.py", TEMPLATE_INIT)
    msg.good("Successfully created package '{}'".format(model_name_v),
             main_path)
    msg.text(
        "To build the package, run `python setup.py sdist` in this directory.")
Example #9
def run_test(command, directory):
    """Run a single test command in the given directory and report the result."""
    msg.text("RUNNING  " + command)
    wrapped_command = f"cd {directory} && {command}"
    pipe = subprocess.Popen(
        wrapped_command, shell=True,
    )
    pipe.wait()
    if pipe.returncode == 0:
        msg.good("TEST PASSED")
    else:
        msg.fail("TEST FAILED")
    msg.text('')
    return pipe.returncode


root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

# run the tests ('tests' is expected to be defined earlier, as a command
# string or a list/tuple of command strings)
if isinstance(tests, str):
    returncode = run_test(tests, root_dir)
elif isinstance(tests, (list, tuple)):
    returncode = 0
    for test in tests:
        returncode += run_test(test, root_dir)
else:
    returncode = 1  # nothing runnable was provided

if returncode == 0:
    msg.good("ALL TESTS PASSED")
else:
    msg.fail("SOME TESTS FAILED, SEE ABOVE")

sys.exit(returncode)
def speech_to_text_demo(asr: "ASROnlineAudioFrame") -> None:
    """Speech to Text (ASR) Microphone Demo.

    Interrupt the notebook's kernel to stop the app from recording.
    """
    asr.reset()
    audio = pyaudio.PyAudio()

    offset = {"count": 0}
    columns = []
    devices = []

    for idx in range(audio.get_device_count()):
        device = audio.get_device_info_by_index(idx)
        if not device.get("maxInputChannels"):
            continue
        devices.append(idx)
        columns.append((idx, device.get("name")))

    if columns:
        msg.good("Found the following input devices!")
        msg.table(columns, header=("ID", "Devices"), divider=True)

    if devices:
        device_index = -2
        while device_index not in devices:
            msg.info("Please enter the device ID")
            device_index = int(input())

        def callback(in_data, frame_count, time_info, status):
            signal = np.frombuffer(in_data, dtype=np.int16)
            text = asr.transcribe(signal)
            if text:
                print(text, end="")
                offset["count"] = asr.params.offset
            elif offset["count"] > 0:
                offset["count"] -= 1
                if offset["count"] == 0:
                    print(" ", end="")
            return (in_data, pyaudio.paContinue)

        stream = audio.open(
            input=True,
            format=pyaudio.paInt16,
            input_device_index=device_index,
            stream_callback=callback,
            channels=asr.params.channels,
            rate=asr.params.sample_rate,
            frames_per_buffer=asr.chunk_size,
        )

        msg.loading("Listening...")
        stream.start_stream()

        try:
            while stream.is_active():
                time.sleep(0.1)
        except (KeyboardInterrupt, Exception) as e:
            stream.stop_stream()
            stream.close()
            audio.terminate()
            msg.warn("WARNING: ASR stream stopped.", e)
    else:
        msg.fail("ERROR", "No audio input device found.")
Example #11
def debug_model(
    config,
    resolved_train_config,
    nlp,
    pipe,
    *,
    print_settings: Optional[Dict[str, Any]] = None,
):
    if not hasattr(pipe, "model"):
        msg.fail(
            f"The component '{pipe}' does not specify an object that holds a Model.",
            exits=1,
        )
    model = pipe.model
    if not isinstance(model, Model):
        msg.fail(
            f"Requires a Thinc Model to be analysed, but found {type(model)} instead.",
            exits=1,
        )
    if print_settings is None:
        print_settings = {}

    # STEP 0: Printing before training
    msg.info(f"Analysing model with ID {model.id}")
    if print_settings.get("print_before_training"):
        msg.divider(f"STEP 0 - before training")
        _print_model(model, print_settings)

    # STEP 1: Initializing the model and printing again
    with data_validation(False):
        try:
            dot_names = [resolved_train_config["train_corpus"]]
            with show_validation_error():
                (train_corpus, ) = resolve_dot_names(config, dot_names)
                nlp.initialize(lambda: train_corpus(nlp))
            msg.info("Initialized the model with the training corpus.")
            examples = list(itertools.islice(train_corpus(nlp), 5))
        except ValueError:
            try:
                _set_output_dim(nO=7, model=model)
                with show_validation_error():
                    examples = [Example.from_dict(x, {}) for x in _get_docs()]
                    nlp.initialize(lambda: examples)
                msg.info("Initialized the model with dummy data.")
            except Exception:
                msg.fail(
                    "Could not initialize the model: you'll have to provide a valid 'train_corpus' argument in the config file.",
                    exits=1,
                )

    if print_settings.get("print_after_init"):
        msg.divider(f"STEP 1 - after initialization")
        _print_model(model, print_settings)

    # STEP 2: Updating the model and printing again
    set_dropout_rate(model, 0.2)
    # ugly hack to deal with Tok2Vec/Transformer listeners
    upstream_component = None
    if model.has_ref("tok2vec") and "tok2vec-listener" in model.get_ref(
            "tok2vec").name:
        upstream_component = nlp.get_pipe("tok2vec")
    if (model.has_ref("tok2vec")
            and "transformer-listener" in model.get_ref("tok2vec").name):
        upstream_component = nlp.get_pipe("transformer")
    for e in range(3):
        if upstream_component:
            upstream_component.update(examples)
        pipe.update(examples)
    if print_settings.get("print_after_training"):
        msg.divider(f"STEP 2 - after training")
        _print_model(model, print_settings)

    # STEP 3: the final prediction
    prediction = model.predict([ex.predicted for ex in examples])
    if print_settings.get("print_prediction"):
        msg.divider(f"STEP 3 - prediction")
        msg.info(str(prediction))

    msg.good(f"Succesfully ended analysis - model looks good.")
Exemple #12
0
def create_wikigraph(
    output_path: Path,
    wiki="en",
    version="latest",
    dumps_path: Path = None,
    max_workers: int = None,
    silent: bool = None,
    force: bool = None,
):
    """
    Create a `WikiGraph` from a specific dump.

    It can then be used by directly loading it, or
    it can be packaged with the `package-wikigraph` command.

    Parameters
    ----------
    output_path : Path
        Path in which to store the `WikiGraph`.
    wiki : str, optional
        Wikipedia dump type to use, by default "en".
    version : str, optional
        Wikipedia dump version to use, by default "latest".
    dumps_path : Path, optional
        Path in which to find previously downloaded dumps,
        or where to save dumps downloaded in this call, by default None.
    max_workers : int, optional
        Maximum number of processes to use, by default None.
    silent : bool, optional
        Do not print anything to stdout, by default None.
    force : bool, optional
        Overwrite content in output_path, if any, by default None.
    """
    if not output_path.exists():
        output_path.mkdir()
        msg.good(f"Created output directory: {output_path}")
    graph_name = f"{wiki}wiki_core"
    graph_path = output_path.joinpath(graph_name)
    if not force and graph_path.exists():
        msg.fail(
            f"Output path already contains {graph_name} directory",
            "Use --force to overwrite it",
            exits=1,
        )
    kwargs = {
        "dumps_path": dumps_path,
        "max_workers": max_workers,
        "wiki": wiki,
        "version": version,
        "verbose": not silent,
    }
    wg = WikiGraph.build(**kwargs)
    if not graph_path.exists():
        graph_path.mkdir()
    with msg.loading("dump to disk..."):
        wg.dump(graph_path)
    spikex_ver = ".".join(spikex_version.split(".")[:2])
    meta = get_meta()
    meta["name"] = graph_name
    meta["wiki"] = wiki
    meta["version"] = wg.version
    meta["spikex_version"] = f">={spikex_ver}"
    meta["fullname"] = f"{graph_name}-{spikex_ver}"
    meta["sources"].append("Wikipedia")
    meta_path = graph_path.joinpath("meta.json")
    meta_path.write_text(json_dumps(meta, indent=2))
    msg.good(f"Successfully created {graph_name}.")
def init_model(
    lang,
    output_dir,
    freqs_loc=None,
    clusters_loc=None,
    jsonl_loc=None,
    vectors_loc=None,
    truncate_vectors=0,
    prune_vectors=-1,
    vectors_name=None,
    model_name=None,
    omit_extra_lookups=False,
    base_model=None,
):
    """
    Create a new model from raw data, like word frequencies, Brown clusters
    and word vectors. If vectors are provided in Word2Vec format, they can
    be either a .txt or zipped as a .zip or .tar.gz.
    """
    if jsonl_loc is not None:
        if freqs_loc is not None or clusters_loc is not None:
            settings = ["-j"]
            if freqs_loc:
                settings.append("-f")
            if clusters_loc:
                settings.append("-c")
            msg.warn(
                "Incompatible arguments",
                "The -f and -c arguments are deprecated, and not compatible "
                "with the -j argument, which should specify the same "
                "information. Either merge the frequencies and clusters data "
                "into the JSONL-formatted file (recommended), or use only the "
                "-f and -c files, without the other lexical attributes.",
            )
        jsonl_loc = ensure_path(jsonl_loc)
        lex_attrs = srsly.read_jsonl(jsonl_loc)
    else:
        clusters_loc = ensure_path(clusters_loc)
        freqs_loc = ensure_path(freqs_loc)
        if freqs_loc is not None and not freqs_loc.exists():
            msg.fail("Can't find words frequencies file", freqs_loc, exits=1)
        lex_attrs = read_attrs_from_deprecated(freqs_loc, clusters_loc)

    with msg.loading("Creating model..."):
        nlp = create_model(lang, lex_attrs, name=model_name, base_model=base_model)

    # Create empty extra lexeme tables so the data from spacy-lookups-data
    # isn't loaded if these features are accessed
    if omit_extra_lookups:
        nlp.vocab.lookups_extra = Lookups()
        nlp.vocab.lookups_extra.add_table("lexeme_cluster")
        nlp.vocab.lookups_extra.add_table("lexeme_prob")
        nlp.vocab.lookups_extra.add_table("lexeme_settings")

    msg.good("Successfully created model")
    if vectors_loc is not None:
        add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, vectors_name)
    vec_added = len(nlp.vocab.vectors)
    lex_added = len(nlp.vocab)
    msg.good(
        "Sucessfully compiled vocab",
        "{} entries, {} vectors".format(lex_added, vec_added),
    )
    if not output_dir.exists():
        output_dir.mkdir()
    nlp.to_disk(output_dir)
    return nlp
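The jsonl_loc (-j) path is expected to hold one lexeme per line. A sketch of writing such a file with srsly (the attribute names mirror spaCy's lexeme attributes, but the exact schema accepted here is an assumption):

# Writing a JSONL attributes file for jsonl_loc; the schema is an assumption.
import srsly

lex_attrs = [
    {"orth": "the", "prob": -3.2, "cluster": 10},
    {"orth": "apple", "prob": -9.1, "cluster": 42},
]
srsly.write_jsonl("lexemes.jsonl", lex_attrs)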
                        help='Path to save answers',
                        default='answers.jsonl')

    return parser


if __name__ == '__main__':

    parser = create_argparser()
    args = parser.parse_args()

    if args.pre_trained_model_path:
        with msg.loading("Loading pre trained model"):
            with open(args.pre_trained_model_path, "rb") as f:
                covid_q = pickle.load(f)
        msg.good(f"Loaded {args.pre_trained_model_path}")
    else:
        msg.text("Training new model")
        covid_q = train()
        msg.good("Trained")
        with open("models/covid_q.pkl", "wb") as f:
            pickle.dump(covid_q, f)

    challenge_tasks = [{
        "task":
        "What is known about transmission, incubation, and environmental stability?",
        "questions": [
            "Is the virus transmitted by aerosol, droplets, food, close contact, fecal matter, or water?",
            "How long is the incubation period for the virus?",
            "Can the virus be transmitted asymptomatically or during the incubation period?",
            "How does weather, heat, and humidity affect the tramsmission of 2019-nCoV?",
Exemple #15
0
def link(origin, link_name, force=False, model_path=None):
    """
    Create a symlink for models within the spacy/data directory. Accepts
    either the name of a pip package, or the local path to the model data
    directory. Linking models allows loading them via spacy.load(link_name).
    """
    if util.is_package(origin):
        model_path = util.get_package_path(origin)
    else:
        model_path = Path(origin) if model_path is None else Path(model_path)
    if not model_path.exists():
        msg.fail(
            "Can't locate model data",
            "The data should be located in {}".format(path2str(model_path)),
            exits=1,
        )
    data_path = util.get_data_path()
    if not data_path or not data_path.exists():
        spacy_loc = Path(__file__).parent.parent
        msg.fail(
            "Can't find the spaCy data path to create model symlink",
            "Make sure a directory `/data` exists within your spaCy "
            "installation and try again. The data directory should be located "
            "here:".format(path=spacy_loc),
            exits=1,
        )
    link_path = util.get_data_path() / link_name
    if link_path.is_symlink() and not force:
        msg.fail(
            "Link '{}' already exists".format(link_name),
            "To overwrite an existing link, use the --force flag",
            exits=1,
        )
    elif link_path.is_symlink():  # does a symlink exist?
        # NB: It's important to check for is_symlink here and not for exists,
        # because invalid/outdated symlinks would return False otherwise.
        link_path.unlink()
    elif link_path.exists():  # does it exist otherwise?
        # NB: Check this last because valid symlinks also "exist".
        msg.fail(
            "Can't overwrite symlink '{}'".format(link_name),
            "This can happen if your data directory contains a directory or "
            "file of the same name.",
            exits=1,
        )
    details = "%s --> %s" % (path2str(model_path), path2str(link_path))
    try:
        symlink_to(link_path, model_path)
    except:  # noqa: E722
        # This is quite dirty, but just making sure other errors are caught.
        msg.fail(
            "Couldn't link model to '{}'".format(link_name),
            "Creating a symlink in spacy/data failed. Make sure you have the "
            "required permissions and try re-running the command as admin, or "
            "use a virtualenv. You can still import the model as a module and "
            "call its load() method, or create the symlink manually.",
        )
        msg.text(details)
        raise
    msg.good("Linking successful", details)
    msg.text("You can now load the model via spacy.load('{}')".format(link_name))
sess = requests.session()

base_url = "https://exporter.nih.gov/"

# Get the links from the target webpage
r = sess.get(url_exporter)

assert r.ok
soup = bs4.BeautifulSoup(r.content, "lxml")

# Filter for only our matches
target = "CSVs/final/RePORTER_"
save_links = [a["href"] for a in soup.find_all("a", href=True)]
save_links = [link for link in save_links if target in link]

msg.good(f"Found {len(save_links)} links in Exporter")


def download(f0, f1):
    url = base_url + f0

    r = sess.get(url)
    if not r.ok:
        print(r.content)
        print(r.status_code)
        msg.fail(f"Failed {url}")
        exit()

    with open(f1, "wb") as FOUT:
        FOUT.write(r.content)
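A hedged driver for the download helper above, fetching every matched link into a local directory (the directory name is a placeholder):

# Hypothetical driver: save each matched Exporter CSV locally, skipping files
# that already exist. "data/exporter" is a placeholder.
import os

os.makedirs("data/exporter", exist_ok=True)
for link in save_links:
    fname = os.path.join("data/exporter", os.path.basename(link))
    if not os.path.exists(fname):
        download(link, fname)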
def main(
    # fmt: off
    glove_dir: str = typer.Argument(
        ..., help="Directory containing the GloVe build"),
    in_file: str = typer.Argument(...,
                                  help="Input file (shuffled cooccurrences)"),
    vocab_file: str = typer.Argument(..., help="Vocabulary file"),
    out_dir: str = typer.Argument(..., help="Path to output directory"),
    n_threads: int = typer.Option(8,
                                  "--n-threads",
                                  "-t",
                                  help="Number of threads"),
    n_iter: int = typer.Option(15,
                               "--n-iter",
                               "-n",
                               help="Number of iterations"),
    x_max: int = typer.Option(
        10,
        "--x-max",
        "-x",
        help="Parameter specifying cutoff in weighting function"),
    vector_size: int = typer.Option(
        128,
        "--vector-size",
        "-s",
        help="Dimension of word vector representations"),
    verbose: int = typer.Option(2,
                                "--verbose",
                                "-v",
                                help="Set verbosity: 0, 1, or 2"),
    # fmt: on
):
    """
    Step 4: Train the vectors

    Expects a file containing the shuffled cooccurrences and a vocab file and
    will output a plain-text vectors file.

    Note that this script will call into GloVe and expects you to pass in the
    GloVe build directory (/build if you run the Makefile). The commands will
    also be printed if you want to run them separately.
    """
    output_path = Path(out_dir)
    if not Path(glove_dir).exists():
        msg.fail("Can't find GloVe build directory", glove_dir, exits=1)
    if not Path(in_file).exists():
        msg.fail("Can't find input file", in_file, exits=1)
    if not Path(vocab_file).exists():
        msg.fail("Can't find vocab file", vocab_file, exits=1)
    if not output_path.exists():
        output_path.mkdir(parents=True)
        msg.good(f"Created output directory {out_dir}")
    output_file = output_path / f"vectors_glove_{vector_size}dim"
    msg.info("Training vectors")
    cmd = (f"{glove_dir}/glove -save-file {output_file} -threads {n_threads} "
           f"-input-file {in_file} -x-max {x_max} -iter {n_iter} "
           f"-vector-size {vector_size} -binary 0 -vocab-file {vocab_file} "
           f"-verbose {verbose}")
    print(cmd)
    train_cmd = os.system(cmd)
    if train_cmd != 0:
        msg.fail("Failed training vectors", exits=1)
    msg.good("Successfully trained vectors")
                stats["filtered_out"] += 1
                continue

            stats["kept_articles"] += 1

            # Concatenate
            item = {"meta": {}}
            item["text"] = "\n".join([row["title"], row["abstract"]])

            # Build the meta information
            for k in meta_columns:
                item["meta"][k] = row[k]

            # PMID should always exist and be an integer
            item["meta"]["pmid"] = int(item["meta"]["pmid"])

            # Save to the master file
            FOUT.write(item)


P = Pipe(source="data/baseline/parsed/", input_suffix=".jsonl", shuffle=True)

P(compute, 1)
msg.good(f"Saved to {f_save}")
msg.info(
    f"Saved {stats['kept_articles']:,}, filtered {stats['filtered_out']:,} articles that overlapped in PMC"
)

filesize = Path(f_save).stat().st_size
msg.info(f"Compressed filesize {filesize:,}")
Exemple #19
0
def pretrain_cli(
    # fmt: off
    ctx: typer.Context,  # This is only used to read additional arguments
    config_path: Path = Arg(..., help="Path to config file", exists=True, dir_okay=False, allow_dash=True),
    output_dir: Path = Arg(..., help="Directory to write weights to on each epoch"),
    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
    resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
    epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using --resume-path. Prevents unintended overwriting of existing weight files."),
    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
    # fmt: on
):
    """
    Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components,
    using an approximate language-modelling objective. Two objective types
    are available, vector-based and character-based.

    In the vector-based objective, we load word vectors that have been trained
    using a word2vec-style distributional similarity algorithm, and train a
    component like a CNN, BiLSTM, etc. to predict vectors which match the
    pretrained ones. The weights are saved to a directory after each epoch. You
    can then pass a path to one of these pretrained weights files to the
    'spacy train' command.

    This technique may be especially helpful if you have little labelled data.
    However, it's still quite experimental, so your mileage may vary.

    To load the weights back in during 'spacy train', you need to ensure
    all settings are the same between pretraining and training. Ideally,
    this is done by using the same config file for both commands.

    DOCS: https://spacy.io/api/cli#pretrain
    """
    config_overrides = parse_config_overrides(ctx.args)
    import_code(code_path)
    verify_cli_args(config_path, output_dir, resume_path, epoch_resume)
    setup_gpu(use_gpu)
    msg.info(f"Loading config from: {config_path}")

    with show_validation_error(config_path):
        raw_config = load_config(
            config_path, overrides=config_overrides, interpolate=False
        )
    config = raw_config.interpolate()
    if not config.get("pretraining"):
        # TODO: What's the solution here? How do we handle optional blocks?
        msg.fail("The [pretraining] block in your config is empty", exits=1)
    if not output_dir.exists():
        output_dir.mkdir(parents=True)
        msg.good(f"Created output directory: {output_dir}")
    # Save non-interpolated config
    raw_config.to_disk(output_dir / "config.cfg")
    msg.good("Saved config file in the output directory")

    pretrain(
        config,
        output_dir,
        resume_path=resume_path,
        epoch_resume=epoch_resume,
        use_gpu=use_gpu,
        silent=False,
    )
    msg.good("Successfully finished pretrain")
Exemple #20
0
def game_loop(num_players, ai_file=None):
    p1 = create_player(1)

    if num_players == 1:
        with open_policy(ai_file) as policy:
            p2 = create_player(2, policy)
    else:
        p2 = create_player(2)

    player_sequence = [p1, p2] if randrange(2) else [p2, p1]

    msg.text(f"{player_sequence[0].name} goes first.")

    game = [0, 0, 0]
    state = None
    player, opp = player_sequence
    while 1:
        while 1:
            msg.text(
                f"{player.name} total score is {game[0]} and current round total is {game[2]}. "
                + f"\n{opp.name} score is {game[1]}")

            if player.decision_function is None:
                msg.text(
                    f"{player.name}, please choose your move (R to roll or H to hold): "
                )
                move = input()
            else:
                move = player.decision_function(*game)

            if move in ('r', 'R', 'roll'):
                roll = randrange(1, 7)
                if roll > 1:
                    msg.text(
                        f"{player.name} rolled a {roll}. {player.name} current round total "
                        + f"is {roll + game[2]}")
                    game[2] += roll
                else:
                    msg.text(
                        f"{player.name} rolled a {roll} and lost all his points!"
                    )
                    break
            elif move in ('h', 'H', 'hold'):
                msg.text(
                    f"{player.name} held. Adding {game[2]} to {game[0]} for " +
                    f"{player.name} total of {game[0]+game[2]}.")
                game[0] += game[2]
                break
            elif move in ('x', 'X', 'q', 'Q', 'quit', 'end'):
                state = const.EXIT
                break

        if state is const.EXIT:
            msg.text("Exiting...")
            break

        if game[0] >= 100:
            msg.good(f"{player.name} wins!")
            break

        game[0], game[1], game[2] = game[1], game[0], 0
        player, opp = opp, player
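decision_function receives the three game counters and must return a move string. An example "hold at 20" policy that plugs into the loop above (the strategy itself is just an illustration):

# Example decision_function: hold once the round total reaches 20 or the game
# can be won outright; argument order mirrors the `game` list above.
def hold_at_20(my_score: int, opp_score: int, round_total: int) -> str:
    if my_score + round_total >= 100 or round_total >= 20:
        return "h"
    return "r"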
Exemple #21
0
def main(glove_dir,
         in_dir,
         out_dir,
         min_count=5,
         memory=4.0,
         window_size=15,
         verbose=2):
    """
    Step 3: Build vocabulary and frequency counts

    Expects a directory of preprocessed .s2v input files and will use GloVe to
    collect unigram counts and construct and shuffle cooccurrence data. See here
    for installation instructions: https://github.com/stanfordnlp/GloVe

    Note that this script will call into GloVe and expects you to pass in the
    GloVe build directory (/build if you run the Makefile). The commands will
    also be printed if you want to run them separately.
    """
    input_path = Path(in_dir)
    output_path = Path(out_dir)
    if not Path(glove_dir).exists():
        msg.fail("Can't find GloVe build directory", glove_dir, exits=1)
    if not input_path.exists() or not input_path.is_dir():
        msg.fail("Not a valid input directory", in_dir, exits=1)
    input_files = [
        str(fp) for fp in input_path.iterdir() if fp.suffix == ".s2v"
    ]
    if not input_files:
        msg.fail("No .s2v files found in input directory", in_dir, exits=1)
    msg.info(f"Using {len(input_files)} input files")
    if not output_path.exists():
        output_path.mkdir(parents=True)
        msg.good(f"Created output directory {out_dir}")

    vocab_file = output_path / "vocab.txt"
    cooc_file = output_path / "cooccurrence.bin"
    cooc_shuffle_file = output_path / "cooccurrence.shuf.bin"

    msg.info("Creating vocabulary counts")
    cmd = (f"cat {' '.join(input_files)} | {glove_dir}/vocab_count "
           f"-min-count {min_count} -verbose {verbose} > {vocab_file}")
    print(cmd)
    vocab_cmd = os.system(cmd)
    if vocab_cmd != 0 or not Path(vocab_file).exists():
        msg.fail("Failed creating vocab counts", exits=1)
    msg.good("Created vocab counts", vocab_file)

    msg.info("Creating cooccurrence statistics")
    cmd = (
        f"cat {' '.join(input_files)} | {glove_dir}/cooccur -memory {memory} "
        f"-vocab-file {vocab_file} -verbose {verbose} "
        f"-window-size {window_size} > {cooc_file}")
    print(cmd)
    cooccur_cmd = os.system(cmd)
    if cooccur_cmd != 0 or not Path(cooc_file).exists():
        msg.fail("Failed creating cooccurrence statistics", exits=1)
    msg.good("Created cooccurrence statistics", cooc_file)

    msg.info("Shuffling cooccurrence file")
    cmd = (f"{glove_dir}/shuffle -memory {memory} -verbose {verbose} "
           f"< {cooc_file} > {cooc_shuffle_file}")
    print(cmd)
    shuffle_cmd = os.system(cmd)
    if shuffle_cmd != 0 or not Path(cooc_shuffle_file).exists():
        msg.fail("Failed to shuffle cooccurrence file", exits=1)
    msg.good("Shuffled cooccurrence file", cooc_shuffle_file)
Exemple #22
0
def train_cli(
    # fmt: off
    ctx: typer.Context,  # This is only used to read additional arguments
    config_path: Path = Arg(...,
                            help="Path to config file",
                            exists=True,
                            allow_dash=True),
    output_path: Optional[Path] = Opt(
        None,
        "--output",
        "--output-path",
        "-o",
        help="Output directory to store trained pipeline in"),
    code_path: Optional[Path] = Opt(
        None,
        "--code",
        "-c",
        help=
        "Path to Python file with additional code (registered functions) to be imported"
    ),
    verbose: bool = Opt(
        False,
        "--verbose",
        "-V",
        "-VV",
        help="Display more information for debugging purposes"),
    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
    # fmt: on
):
    """
    Train or update a spaCy pipeline. Requires data in spaCy's binary format. To
    convert data from other formats, use the `spacy convert` command. The
    config file includes all settings and hyperparameters used during training.
    To override settings in the config, e.g. settings that point to local
    paths or that you want to experiment with, you can override them as
    command line options. For instance, --training.batch_size 128 overrides
    the value of "batch_size" in the block "[training]". The --code argument
    lets you pass in a Python file that's imported before training. It can be
    used to register custom functions and architectures that can then be
    referenced in the config.

    DOCS: https://spacy.io/api/cli#train
    """
    util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
    # Make sure all files and paths exists if they are needed
    if not config_path or (str(config_path) != "-"
                           and not config_path.exists()):
        msg.fail("Config file not found", config_path, exits=1)
    if output_path is not None and not output_path.exists():
        output_path.mkdir(parents=True)
        msg.good(f"Created output directory: {output_path}")
    overrides = parse_config_overrides(ctx.args)
    import_code(code_path)
    setup_gpu(use_gpu)
    with show_validation_error(config_path):
        config = util.load_config(config_path,
                                  overrides=overrides,
                                  interpolate=False)
    msg.divider("Initializing pipeline")
    with show_validation_error(config_path, hint_fill=False):
        nlp = init_nlp(config, use_gpu=use_gpu)
    msg.good("Initialized pipeline")
    msg.divider("Training pipeline")
    train(nlp,
          output_path,
          use_gpu=use_gpu,
          stdout=sys.stdout,
          stderr=sys.stderr)
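The dotted CLI overrides described in the docstring map directly onto keys in the overrides dict. A sketch using spaCy v3's config loader (the config path is a placeholder):

# --training.batch_size 128 on the CLI corresponds to this dotted override.
from spacy import util

config = util.load_config("config.cfg",
                          overrides={"training.batch_size": 128},
                          interpolate=False)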
Exemple #23
0
def build(
    # fmt: off
    repo: str,
    commit: str,
    package_name: str = Option(None,
                               help="Package name (if different from repo)"),
    py35: bool = Option(False, "--py35", help="Build wheels for Python 3.5"),
    llvm: bool = Option(False, "--llvm", help="Requires LLVM to be installed"),
    universal: bool = Option(
        False,
        "--universal",
        help="Build universal (pure Python) wheel and sdist"),
    skip_tests: bool = Option(
        False,
        "--skip-tests",
        help="Don't run tests (e.g. if package doesn't have any)"),
    build_constraints: bool = Option(
        False,
        "--build-constraints",
        help="Use build constraints for build requirements"),
    # fmt: on
):
    """Build wheels for a given repo and commit / tag."""
    print(LOGO)
    repo_id = get_repo_id()
    repo_id = repo_id.replace(".git", "")
    user, package = repo.lower().split("/", 1)
    if package_name is None:
        package_name = package.replace("-", "_")
    msg.info(f"Building in repo {repo_id}")
    msg.info(f"Building wheels for {user}/{package}\n")
    if universal:
        msg.warn(
            "Building only universal sdist and wheel, no cross-platform wheels"
        )
    if skip_tests:
        msg.warn("Not running any tests")
    clone_url = DEFAULT_CLONE_TEMPLATE.format(f"{user}/{package}")
    repo = get_gh().get_repo(repo_id)
    with msg.loading("Finding a unique name for this release..."):
        # Pick the release_name by finding an unused one
        i = 1
        while True:
            release_name = f"{package_name}-{commit}"
            if i > 1:
                release_name += f"-{i}"
            try:
                repo.get_release(release_name)
            except github.UnknownObjectException:
                break
            i += 1
    branch_name = f"branch-for-{release_name}"
    bs = {
        "clone-url": clone_url,
        "package-name": package_name,
        "commit": commit,
        "options": {
            "llvm": llvm,
            "py35": py35,
            "universal": universal,
            "skip_tests": skip_tests,
            "build_constraints": build_constraints,
        },
        "upload-to": {
            "type": "github-release",
            "repo-id": repo_id,
            "release-id": release_name,
        },
    }
    bs_json = json.dumps(bs)
    bs_json_formatted = json.dumps(bs, indent=4)
    msg.text(f"Creating release {release_name} to collect assets")
    release_text = f"https://github.com/{user}/{package}\n\n### Build spec\n\n```json\n{bs_json_formatted}\n```"
    release = repo.create_git_release(release_name, release_name, release_text)
    with msg.loading("Creating build branch..."):
        # 'master' is a 'Commit'. 'master.commit' is a 'GitCommit'. These are
        # different types that are mostly *not* interchangeable:
        #   https://pygithub.readthedocs.io/en/latest/github_objects/Commit.html
        #   https://pygithub.readthedocs.io/en/latest/github_objects/GitCommit.html
        master = repo.get_commit("master")
        master_gitcommit = master.commit
        patch = github.InputGitTreeElement(
            "build-spec.json",
            "100644",
            "blob",
            content=bs_json,
        )
        tree = repo.create_git_tree([patch], master_gitcommit.tree)
        our_gitcommit = repo.create_git_commit(f"Building: {release_name}",
                                               tree, [master_gitcommit])
        repo.create_git_ref(f"refs/heads/{branch_name}", our_gitcommit.sha)
    msg.good(f"Commit is {our_gitcommit.sha[:8]} in branch {branch_name}")
    msg.text(f"Release: {release.html_url}")
    msg.text(
        f"Checks:  https://github.com/{repo_id}/commit/{our_gitcommit.sha}/checks"
    )
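The get_gh helper used above is not shown; a minimal sketch, assuming it authenticates PyGithub with a token from the environment:

# Hypothetical sketch of get_gh (assumption: token-based PyGithub auth).
import os
import github

def get_gh() -> github.Github:
    return github.Github(os.environ["GITHUB_TOKEN"])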
Exemple #24
0
def main(path_file: str = typer.Argument(..., help='Path to input file')):
    """
    Limpieza de la Base de Datos
    
    Este script contiene todas las funciones necesarias para limpiar y transformar la base en la versión más adecuada para el proyecto.

    Los objetivos son los siguientes:
     
     * Depurar los campos que contengan texto con algún error de escritura o caracteres inadecuados para el procesamiento.
     * Fijar un tipo de dato adecuado para el manejo con otros archivos
     * Compactar u organizar las columnas que en un formato que permita una mejor visualización y organización de la información referente a los reportes mensuales.

    """

    # Load data
    msg.info("Loading data...")
    data = pd.read_csv(path_file,
                       encoding='latin1',
                       low_memory=False,
                       dtype=SCHEMA)

    msg.info("General information:\n")
    data.info()
    # Remove rows with DESC_NIVEL == 'FID'
    msg.info("Removing FID rows...")
    data = data[data.DESC_NIVEL != 'FID'].copy()

    # Clean the description fields (DESC_*)
    msg.info("Cleaning DESC_* fields...")
    data.DESC_RAMO = data.DESC_RAMO.apply(lambda x: cln_txt(str(x)))
    data.DESC_UR = data.DESC_UR.apply(lambda x: cln_txt(str(x)))
    data.DESC_AI = data.DESC_AI.apply(lambda x: cln_txt(str(x)))
    data.DESC_PP = data.DESC_PP.apply(lambda x: cln_txt(str(x)))
    data.OBJETIVO_PND = data.OBJETIVO_PND.apply(lambda x: cln_txt(str(x)))
    data.DESC_OBJETIVO_PROGRAMA_PND = data.DESC_OBJETIVO_PROGRAMA_PND.apply(
        lambda x: cln_txt(str(x)))
    data.OBJETIVO_ESTRATEGICO = data.OBJETIVO_ESTRATEGICO.apply(
        lambda x: cln_txt(str(x)))
    data.DESC_MATRIZ = data.DESC_MATRIZ.apply(lambda x: cln_txt(str(x)))
    data.DESC_OBJETIVO = data.DESC_OBJETIVO.apply(lambda x: cln_txt(str(x)))

    # Fix wrong names via mapping dicts
    msg.info("Fixing names...")
    data.TIPO_INDICADOR = data.TIPO_INDICADOR.map(DTIPO_INDICADOR)
    data.DIMENSION = data.DIMENSION.map(DDIMENSION)

    #Change data type
    msg.info("Change data type...")
    data.ID_OBJETIVO = data.ID_OBJETIVO.astype('int')
    data.ID_OBJETIVO_PADRE = data.ID_OBJETIVO_PADRE.fillna(-1).astype('int')
    data.ID_INDICADOR_CICLO_ANTERIOR = data.ID_INDICADOR_CICLO_ANTERIOR.fillna(
        -1).astype('int')
    data.CICLO_LINEA_BASE = data.CICLO_LINEA_BASE.fillna(-1).astype('int')

    # Lists of columns used to group the monthly data
    msg.info("Creating column lists...")
    META_MES_COL = data.columns[data.columns.str.startswith(
        'META_MES')].tolist()
    META_AJUSTADA_MES_COL = data.columns[data.columns.str.startswith(
        'META_AJUSTADA_MES')].tolist()
    AVANCE_MES_COL = data.columns[data.columns.str.startswith(
        'AVANCE_MES')].tolist()
    JUSTIFICACION_AJUSTE_MES_COL = data.columns[data.columns.str.startswith(
        'JUSTIFICACION_AJUSTE_MES')].tolist()
    AVANCE_CAUSA_MES_COL = data.columns[data.columns.str.startswith(
        'AVANCE_CAUSA_MES')].tolist()
    AVANCE_EFECTO_MES_COL = data.columns[data.columns.str.startswith(
        'AVANCE_EFECTO_MES')].tolist()
    AVANCE_OTROS_MOTIVOS_MES_COL = data.columns[data.columns.str.startswith(
        'AVANCE_OTROS_MOTIVOS_MES')].tolist()

    # META by months
    msg.info("Meta by months...")
    for i in range(12):
        data[f'RECORDS_META_MES{i+1}'] = (
            data[f'META_MES{i+1}'].astype('string') + ':'
            + data[f'META_MES{i+1}_NUM'].astype('string') + ':'
            + data[f'META_MES{i+1}_DEN'].astype('string'))

    # META AJUSTADA by months
    msg.info("Meta Ajustada by months...")
    for i in range(12):
        data[f'RECORDS_META_AJUSTADA_MES{i+1}'] = (
            data[f'META_AJUSTADA_MES{i+1}'].astype('string') + ':'
            + data[f'META_AJUSTADA_MES{i+1}_NUM'].astype('string') + ':'
            + data[f'META_AJUSTADA_MES{i+1}_DEN'].astype('string'))

    # AVANCE by months
    msg.info("AVANCE by months...")
    for i in range(12):
        data[f'RECORDS_AVANCE_MES{i+1}'] = (
            data[f'AVANCE_MES{i+1}'].astype('string') + ':'
            + data[f'AVANCE_MES{i+1}_NUM'].astype('string') + ':'
            + data[f'AVANCE_MES{i+1}_DEN'].astype('string'))

    # JUSTIFICACION by months
    msg.info("JUSTIFICACION by months...")

    func = '|'.join

    data['JUSTIFICACIONES_AJUSTE_POR_MES'] = (
        data[JUSTIFICACION_AJUSTE_MES_COL].fillna('#').astype('str')
        .apply(func, axis=1))

    # AVANCE CAUSA by months
    msg.info("AVANCE CAUSA by months...")
    data['AVANCE_CAUSA_POR_MES'] = (
        data[AVANCE_CAUSA_MES_COL].fillna('#').astype('str')
        .apply(func, axis=1))

    # AVANCE EFECTO by months
    msg.info("AVANCE EFECTO by months...")
    data['AVANCE_EFECTO_POR_MES'] = (
        data[AVANCE_EFECTO_MES_COL].fillna('#').astype('str')
        .apply(func, axis=1))

    # AVANCE OTROS MOTIVOS by months
    msg.info("AVANCE OTROS MOTIVOS by months...")
    data['AVANCE_OTROS_MOTIVOS_POR_MES'] = (
        data[AVANCE_OTROS_MOTIVOS_MES_COL].fillna('#').astype('str')
        .apply(func, axis=1))

    # Drop the source columns that were grouped above
    msg.info("Dropping grouped columns...")
    data.drop(labels=META_MES_COL + META_AJUSTADA_MES_COL + AVANCE_MES_COL,
              inplace=True,
              axis=1)
    data.drop(labels=JUSTIFICACION_AJUSTE_MES_COL + AVANCE_CAUSA_MES_COL +
              AVANCE_EFECTO_MES_COL + AVANCE_OTROS_MOTIVOS_MES_COL,
              inplace=True,
              axis=1)

    msg.info("General Information:\n")
    data.info()

    # Save file
    msg.info("Saving the files...")
    data.reset_index().to_feather('base')  # feather version
    #data.to_csv('base.csv.zip', encoding='latin1', index=False, compression='zip')  # to save a csv version
    msg.good("OK!!!")
Exemple #25
0
def download(model, direct=False, *pip_args):
    """
    Download compatible model from default download path using pip. Model
    can be shortcut, model name or, if --direct flag is set, full model name
    with version. For direct downloads, the compatibility check will be skipped.
    """
    if not require_package("spacy") and "--no-deps" not in pip_args:
        msg.warn(
            "Skipping model package dependencies and setting `--no-deps`. "
            "You don't seem to have the spaCy package itself installed "
            "(maybe because you've built from source?), so installing the "
            "model dependencies would cause spaCy to be downloaded, which "
            "probably isn't what you want. If the model package has other "
            "dependencies, you'll have to install them manually.")
        pip_args = pip_args + ("--no-deps", )
    dl_tpl = "{m}-{v}/{m}-{v}.tar.gz#egg={m}=={v}"
    if direct:
        components = model.split("-")
        model_name = "".join(components[:-1])
        version = components[-1]
        dl = download_model(dl_tpl.format(m=model_name, v=version), pip_args)
    else:
        shortcuts = get_json(about.__shortcuts__, "available shortcuts")
        model_name = shortcuts.get(model, model)
        compatibility = get_compatibility()
        version = get_version(model_name, compatibility)
        dl = download_model(dl_tpl.format(m=model_name, v=version), pip_args)
        if dl != 0:  # if download subprocess doesn't return 0, exit
            sys.exit(dl)
        msg.good(
            "Download and installation successful",
            "You can now load the model via spacy.load('{}')".format(
                model_name),
        )
        # Only create symlink if the model is installed via a shortcut like 'en'.
        # There's no real advantage over an additional symlink for en_core_web_sm
        # and if anything, it's more error prone and causes more confusion.
        if model in shortcuts:
            try:
                # Get package path here because link uses
                # pip.get_installed_distributions() to check if model is a
                # package, which fails if model was just installed via
                # subprocess
                package_path = get_package_path(model_name)
                link(model_name, model, force=True, model_path=package_path)
            except:  # noqa: E722
                # Dirty, but since spacy.download and the auto-linking is
                # mostly a convenience wrapper, it's best to show a success
                # message and loading instructions, even if linking fails.
                msg.warn(
                    "Download successful but linking failed",
                    "Creating a shortcut link for '{}' didn't work (maybe you "
                    "don't have admin permissions?), but you can still load "
                    "the model via its full package name: "
                    "nlp = spacy.load('{}')".format(model, model_name),
                )
        # If a model is downloaded and then loaded within the same process, our
        # is_package check currently fails, because pkg_resources.working_set
        # is not refreshed automatically (see #3923). We're trying to work
        # around this here by requiring the package explicitly.
        require_package(model_name)
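download_model is not shown above; a hypothetical sketch, assuming models are installed with pip from a release URL (spaCy keeps this URL in about.__download_url__):

# Hypothetical sketch of download_model; the URL constant is an assumption.
import subprocess
import sys

DOWNLOAD_URL = "https://github.com/explosion/spacy-models/releases/download"

def download_model(filename: str, user_pip_args=()) -> int:
    """Install a model package from its release URL; returns pip's exit code."""
    cmd = [sys.executable, "-m", "pip", "install"]
    cmd += list(user_pip_args) + [f"{DOWNLOAD_URL}/{filename}"]
    return subprocess.call(cmd)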
Exemple #26
0
def init_vectors_cli(
    # fmt: off
    lang: str = Arg(..., help="The language of the nlp object to create"),
    vectors_loc: Path = Arg(...,
                            help="Vectors file in Word2Vec format",
                            exists=True),
    output_dir: Path = Arg(..., help="Pipeline output directory"),
    prune: int = Opt(-1,
                     "--prune",
                     "-p",
                     help="Optional number of vectors to prune to"),
    truncate: int = Opt(
        0,
        "--truncate",
        "-t",
        help=
        "Optional number of vectors to truncate to when reading in vectors file"
    ),
    mode: str = Opt("default",
                    "--mode",
                    "-m",
                    help="Vectors mode: default or floret"),
    name: Optional[str] = Opt(
        None,
        "--name",
        "-n",
        help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"
    ),
    verbose: bool = Opt(
        False,
        "--verbose",
        "-V",
        "-VV",
        help="Display more information for debugging purposes"),
    jsonl_loc: Optional[Path] = Opt(
        None,
        "--lexemes-jsonl",
        "-j",
        help="Location of JSONL-formatted attributes file",
        hidden=True),
    # fmt: on
):
    """Convert word vectors for use with spaCy. Will export an nlp object that
    you can use in the [initialize] block of your config to initialize
    a model with vectors.
    """
    util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
    msg.info(f"Creating blank nlp object for language '{lang}'")
    nlp = util.get_lang_class(lang)()
    if jsonl_loc is not None:
        update_lexemes(nlp, jsonl_loc)
    convert_vectors(
        nlp,
        vectors_loc,
        truncate=truncate,
        prune=prune,
        name=name,
        mode=mode,
    )
    msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors")
    nlp.to_disk(output_dir)
    msg.good(
        "Saved nlp object with vectors to output directory. You can now use the "
        "path to it in your config as the 'vectors' setting in [initialize].",
        output_dir.resolve(),
    )
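A quick sanity check of the exported artifact (the path is a placeholder): the output directory is itself a loadable pipeline whose vocab carries the converted vectors.

# Load the exported nlp object and inspect its vectors; "output_dir" is a placeholder.
import spacy

nlp = spacy.load("output_dir")
print(len(nlp.vocab.vectors), nlp.vocab.vectors.shape)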
Exemple #27
0
def debug_data(
    config_path: Path,
    *,
    config_overrides: Dict[str, Any] = {},
    ignore_warnings: bool = False,
    verbose: bool = False,
    no_format: bool = True,
    silent: bool = True,
):
    msg = Printer(
        no_print=silent, pretty=not no_format, ignore_warnings=ignore_warnings
    )
    # Make sure all files and paths exists if they are needed
    with show_validation_error(config_path):
        cfg = util.load_config(config_path, overrides=config_overrides)
        nlp = util.load_model_from_config(cfg)
        config = nlp.config.interpolate()
        T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
    # Use original config here, not resolved version
    sourced_components = get_sourced_components(cfg)
    frozen_components = T["frozen_components"]
    resume_components = [p for p in sourced_components if p not in frozen_components]
    pipeline = nlp.pipe_names
    factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names]
    msg.divider("Data file validation")

    # Create the gold corpus to be able to better analyze data
    dot_names = [T["train_corpus"], T["dev_corpus"]]
    train_corpus, dev_corpus = resolve_dot_names(config, dot_names)

    nlp.initialize(lambda: train_corpus(nlp))
    msg.good("Pipeline can be initialized with data")

    train_dataset = list(train_corpus(nlp))
    dev_dataset = list(dev_corpus(nlp))
    msg.good("Corpus is loadable")

    # Create all gold data here to avoid iterating over the train_dataset constantly
    gold_train_data = _compile_gold(train_dataset, factory_names, nlp, make_proj=True)
    gold_train_unpreprocessed_data = _compile_gold(
        train_dataset, factory_names, nlp, make_proj=False
    )
    gold_dev_data = _compile_gold(dev_dataset, factory_names, nlp, make_proj=True)

    train_texts = gold_train_data["texts"]
    dev_texts = gold_dev_data["texts"]
    frozen_components = T["frozen_components"]

    msg.divider("Training stats")
    msg.text(f"Language: {nlp.lang}")
    msg.text(f"Training pipeline: {', '.join(pipeline)}")
    if resume_components:
        msg.text(f"Components from other pipelines: {', '.join(resume_components)}")
    if frozen_components:
        msg.text(f"Frozen components: {', '.join(frozen_components)}")
    msg.text(f"{len(train_dataset)} training docs")
    msg.text(f"{len(dev_dataset)} evaluation docs")

    if not len(gold_dev_data):
        msg.fail("No evaluation docs")
    overlap = len(train_texts.intersection(dev_texts))
    if overlap:
        msg.warn(f"{overlap} training examples also in evaluation data")
    else:
        msg.good("No overlap between training and evaluation data")
    # TODO: make this feedback more fine-grained and report on updated
    # components vs. blank components
    if not resume_components and len(train_dataset) < BLANK_MODEL_THRESHOLD:
        text = f"Low number of examples to train a new pipeline ({len(train_dataset)})"
        if len(train_dataset) < BLANK_MODEL_MIN_THRESHOLD:
            msg.fail(text)
        else:
            msg.warn(text)
        msg.text(
            f"It's recommended to use at least {BLANK_MODEL_THRESHOLD} examples "
            f"(minimum {BLANK_MODEL_MIN_THRESHOLD})",
            show=verbose,
        )

    msg.divider("Vocab & Vectors")
    n_words = gold_train_data["n_words"]
    msg.info(
        f"{n_words} total word(s) in the data ({len(gold_train_data['words'])} unique)"
    )
    if gold_train_data["n_misaligned_words"] > 0:
        n_misaligned = gold_train_data["n_misaligned_words"]
        msg.warn(f"{n_misaligned} misaligned tokens in the training data")
    if gold_dev_data["n_misaligned_words"] > 0:
        n_misaligned = gold_dev_data["n_misaligned_words"]
        msg.warn(f"{n_misaligned} misaligned tokens in the dev data")
    most_common_words = gold_train_data["words"].most_common(10)
    msg.text(
        f"10 most common words: {_format_labels(most_common_words, counts=True)}",
        show=verbose,
    )
    if len(nlp.vocab.vectors):
        msg.info(
            f"{len(nlp.vocab.vectors)} vectors ({nlp.vocab.vectors.n_keys} "
            f"unique keys, {nlp.vocab.vectors_length} dimensions)"
        )
        n_missing_vectors = sum(gold_train_data["words_missing_vectors"].values())
        msg.warn(
            "{} words in training data without vectors ({:.0f}%)".format(
                n_missing_vectors,
                100 * (n_missing_vectors / gold_train_data["n_words"]),
            ),
        )
        msg.text(
            "10 most common words without vectors: {}".format(
                _format_labels(
                    gold_train_data["words_missing_vectors"].most_common(10),
                    counts=True,
                )
            ),
            show=verbose,
        )
    else:
        msg.info("No word vectors present in the package")

    if "ner" in factory_names:
        # Get all unique NER labels present in the data
        labels = set(
            label for label in gold_train_data["ner"] if label not in ("O", "-", None)
        )
        label_counts = gold_train_data["ner"]
        model_labels = _get_labels_from_model(nlp, "ner")
        has_low_data_warning = False
        has_no_neg_warning = False
        has_ws_ents_error = False
        has_punct_ents_warning = False

        msg.divider("Named Entity Recognition")
        msg.info(f"{len(model_labels)} label(s)")
        missing_values = label_counts["-"]
        msg.text(f"{missing_values} missing value(s) (tokens with '-' label)")
        for label in labels:
            if len(label) == 0:
                msg.fail("Empty label found in train data")
        labels_with_counts = [
            (label, count)
            for label, count in label_counts.most_common()
            if label != "-"
        ]
        labels_with_counts = _format_labels(labels_with_counts, counts=True)
        msg.text(f"Labels in train data: {_format_labels(labels)}", show=verbose)
        missing_labels = model_labels - labels
        if missing_labels:
            msg.warn(
                "Some model labels are not present in the train data. The "
                "model performance may be degraded for these labels after "
                f"training: {_format_labels(missing_labels)}."
            )
        if gold_train_data["ws_ents"]:
            msg.fail(f"{gold_train_data['ws_ents']} invalid whitespace entity spans")
            has_ws_ents_error = True

        if gold_train_data["punct_ents"]:
            msg.warn(f"{gold_train_data['punct_ents']} entity span(s) with punctuation")
            has_punct_ents_warning = True

        for label in labels:
            if label_counts[label] <= NEW_LABEL_THRESHOLD:
                msg.warn(
                    f"Low number of examples for label '{label}' ({label_counts[label]})"
                )
                has_low_data_warning = True

                with msg.loading("Analyzing label distribution..."):
                    neg_docs = _get_examples_without_label(train_dataset, label)
                if neg_docs == 0:
                    msg.warn(f"No examples for texts WITHOUT new label '{label}'")
                    has_no_neg_warning = True

        if not has_low_data_warning:
            msg.good("Good amount of examples for all labels")
        if not has_no_neg_warning:
            msg.good("Examples without occurrences available for all labels")
        if not has_ws_ents_error:
            msg.good("No entities consisting of or starting/ending with whitespace")
        if not has_punct_ents_warning:
            msg.good("No entities consisting of or starting/ending with punctuation")

        if has_low_data_warning:
            msg.text(
                f"To train a new entity type, your data should include at "
                f"least {NEW_LABEL_THRESHOLD} instances of the new label",
                show=verbose,
            )
        if has_no_neg_warning:
            msg.text(
                "Training data should always include examples of entities "
                "in context, as well as examples without a given entity "
                "type.",
                show=verbose,
            )
        if has_ws_ents_error:
            msg.text(
                "As of spaCy v2.1.0, entity spans consisting of or starting/ending "
                "with whitespace characters are considered invalid."
            )

        if has_punct_ents_warning:
            msg.text(
                "Entity spans consisting of or starting/ending "
                "with punctuation can not be trained with a noise level > 0."
            )

    if "textcat" in factory_names:
        msg.divider("Text Classification (Exclusive Classes)")
        labels = _get_labels_from_model(nlp, "textcat")
        msg.info(f"Text Classification: {len(labels)} label(s)")
        msg.text(f"Labels: {_format_labels(labels)}", show=verbose)
        missing_labels = labels - set(gold_train_data["cats"])
        if missing_labels:
            msg.warn(
                "Some model labels are not present in the train data. The "
                "model performance may be degraded for these labels after "
                f"training: {_format_labels(missing_labels)}."
            )
        if set(gold_train_data["cats"]) != set(gold_dev_data["cats"]):
            msg.warn(
                "Potential train/dev mismatch: the train and dev labels are "
                "not the same. "
                f"Train labels: {_format_labels(gold_train_data['cats'])}. "
                f"Dev labels: {_format_labels(gold_dev_data['cats'])}."
            )
        if len(labels) < 2:
            msg.fail(
                "The model does not have enough labels. 'textcat' requires at "
                "least two labels due to mutually-exclusive classes, e.g. "
                "LABEL/NOT_LABEL or POSITIVE/NEGATIVE for a binary "
                "classification task."
            )
        if (
            gold_train_data["n_cats_bad_values"] > 0
            or gold_dev_data["n_cats_bad_values"] > 0
        ):
            msg.fail(
                "Unsupported values for cats: the supported values are "
                "1.0/True and 0.0/False."
            )
        if gold_train_data["n_cats_multilabel"] > 0:
            # Note: you should never get here because you run into E895 on
            # initialization first.
            msg.fail(
                "The train data contains instances without mutually-exclusive "
                "classes. Use the component 'textcat_multilabel' instead of "
                "'textcat'."
            )
        if gold_dev_data["n_cats_multilabel"] > 0:
            msg.fail(
                "The dev data contains instances without mutually-exclusive "
                "classes. Use the component 'textcat_multilabel' instead of "
                "'textcat'."
            )

    if "textcat_multilabel" in factory_names:
        msg.divider("Text Classification (Multilabel)")
        labels = _get_labels_from_model(nlp, "textcat_multilabel")
        msg.info(f"Text Classification: {len(labels)} label(s)")
        msg.text(f"Labels: {_format_labels(labels)}", show=verbose)
        missing_labels = labels - set(gold_train_data["cats"])
        if missing_labels:
            msg.warn(
                "Some model labels are not present in the train data. The "
                "model performance may be degraded for these labels after "
                f"training: {_format_labels(missing_labels)}."
            )
        if set(gold_train_data["cats"]) != set(gold_dev_data["cats"]):
            msg.warn(
                "Potential train/dev mismatch: the train and dev labels are "
                "not the same. "
                f"Train labels: {_format_labels(gold_train_data['cats'])}. "
                f"Dev labels: {_format_labels(gold_dev_data['cats'])}."
            )
        if (
            gold_train_data["n_cats_bad_values"] > 0
            or gold_dev_data["n_cats_bad_values"] > 0
        ):
            msg.fail(
                "Unsupported values for cats: the supported values are "
                "1.0/True and 0.0/False."
            )
        if gold_train_data["n_cats_multilabel"] > 0:
            if gold_dev_data["n_cats_multilabel"] == 0:
                msg.warn(
                    "Potential train/dev mismatch: the train data contains "
                    "instances without mutually-exclusive classes while the "
                    "dev data contains only instances with mutually-exclusive "
                    "classes."
                )
        else:
            msg.warn(
                "The train data contains only instances with "
                "mutually-exclusive classes. You can potentially use the "
                "component 'textcat' instead of 'textcat_multilabel'."
            )
            if gold_dev_data["n_cats_multilabel"] > 0:
                msg.fail(
                    "Train/dev mismatch: the dev data contains instances "
                    "without mutually-exclusive classes while the train data "
                    "contains only instances with mutually-exclusive classes."
                )

    if "tagger" in factory_names:
        msg.divider("Part-of-speech Tagging")
        labels = [label for label in gold_train_data["tags"]]
        model_labels = _get_labels_from_model(nlp, "tagger")
        msg.info(f"{len(labels)} label(s) in train data")
        missing_labels = model_labels - set(labels)
        if missing_labels:
            msg.warn(
                "Some model labels are not present in the train data. The "
                "model performance may be degraded for these labels after "
                f"training: {_format_labels(missing_labels)}."
            )
        labels_with_counts = _format_labels(
            gold_train_data["tags"].most_common(), counts=True
        )
        msg.text(labels_with_counts, show=verbose)

    if "morphologizer" in factory_names:
        msg.divider("Morphologizer (POS+Morph)")
        labels = [label for label in gold_train_data["morphs"]]
        model_labels = _get_labels_from_model(nlp, "morphologizer")
        msg.info(f"{len(labels)} label(s) in train data")
        missing_labels = model_labels - set(labels)
        if missing_labels:
            msg.warn(
                "Some model labels are not present in the train data. The "
                "model performance may be degraded for these labels after "
                f"training: {_format_labels(missing_labels)}."
            )
        labels_with_counts = _format_labels(
            gold_train_data["morphs"].most_common(), counts=True
        )
        msg.text(labels_with_counts, show=verbose)

    if "parser" in factory_names:
        has_low_data_warning = False
        msg.divider("Dependency Parsing")

        # profile sentence length
        msg.info(
            f"Found {gold_train_data['n_sents']} sentence(s) with an average "
            f"length of {gold_train_data['n_words'] / gold_train_data['n_sents']:.1f} words."
        )

        # check for documents with multiple sentences
        sents_per_doc = gold_train_data["n_sents"] / len(gold_train_data["texts"])
        if sents_per_doc < 1.1:
            msg.warn(
                f"The training data contains {sents_per_doc:.2f} sentences per "
                f"document. When there are very few documents containing more "
                f"than one sentence, the parser will not learn how to segment "
                f"longer texts into sentences."
            )

        # profile labels
        labels_train = [label for label in gold_train_data["deps"]]
        labels_train_unpreprocessed = [
            label for label in gold_train_unpreprocessed_data["deps"]
        ]
        labels_dev = [label for label in gold_dev_data["deps"]]

        if gold_train_unpreprocessed_data["n_nonproj"] > 0:
            n_nonproj = gold_train_unpreprocessed_data["n_nonproj"]
            msg.info(f"Found {n_nonproj} nonprojective train sentence(s)")
        if gold_dev_data["n_nonproj"] > 0:
            n_nonproj = gold_dev_data["n_nonproj"]
            msg.info(f"Found {n_nonproj} nonprojective dev sentence(s)")
        msg.info(f"{len(labels_train_unpreprocessed)} label(s) in train data")
        msg.info(f"{len(labels_train)} label(s) in projectivized train data")
        labels_with_counts = _format_labels(
            gold_train_unpreprocessed_data["deps"].most_common(), counts=True
        )
        msg.text(labels_with_counts, show=verbose)

        # rare labels in train
        for label in gold_train_unpreprocessed_data["deps"]:
            if gold_train_unpreprocessed_data["deps"][label] <= DEP_LABEL_THRESHOLD:
                msg.warn(
                    f"Low number of examples for label '{label}' "
                    f"({gold_train_unpreprocessed_data['deps'][label]})"
                )
                has_low_data_warning = True

        # rare labels in projectivized train
        rare_projectivized_labels = []
        for label in gold_train_data["deps"]:
            if (
                gold_train_data["deps"][label] <= DEP_LABEL_THRESHOLD
                and DELIMITER in label
            ):
                rare_projectivized_labels.append(
                    f"{label}: {gold_train_data['deps'][label]}"
                )

        if len(rare_projectivized_labels) > 0:
            msg.warn(
                f"Low number of examples for {len(rare_projectivized_labels)} "
                "label(s) in the projectivized dependency trees used for "
                "training. You may want to projectivize labels such as punct "
                "before training in order to improve parser performance."
            )
            msg.warn(
                f"Projectivized labels with low numbers of examples: ",
                ", ".join(rare_projectivized_labels),
                show=verbose,
            )
            has_low_data_warning = True

        # labels only in train
        if set(labels_train) - set(labels_dev):
            msg.warn(
                "The following labels were found only in the train data:",
                ", ".join(set(labels_train) - set(labels_dev)),
                show=verbose,
            )

        # labels only in dev
        if set(labels_dev) - set(labels_train):
            msg.warn(
                "The following labels were found only in the dev data:",
                ", ".join(set(labels_dev) - set(labels_train)),
                show=verbose,
            )

        if has_low_data_warning:
            msg.text(
                f"To train a parser, your data should include at "
                f"least {DEP_LABEL_THRESHOLD} instances of each label.",
                show=verbose,
            )

        # multiple root labels
        if len(gold_train_unpreprocessed_data["roots"]) > 1:
            msg.warn(
                f"Multiple root labels "
                f"({', '.join(gold_train_unpreprocessed_data['roots'])}) "
                f"found in training data. spaCy's parser uses a single root "
                f"label ROOT so this distinction will not be available."
            )

        # these should not happen, but just in case
        if gold_train_data["n_nonproj"] > 0:
            msg.fail(
                f"Found {gold_train_data['n_nonproj']} nonprojective "
                f"projectivized train sentence(s)"
            )
        if gold_train_data["n_cycles"] > 0:
            msg.fail(
                f"Found {gold_train_data['n_cycles']} projectivized train sentence(s) with cycles"
            )

    msg.divider("Summary")
    good_counts = msg.counts[MESSAGES.GOOD]
    warn_counts = msg.counts[MESSAGES.WARN]
    fail_counts = msg.counts[MESSAGES.FAIL]
    if good_counts:
        msg.good(f"{good_counts} {'check' if good_counts == 1 else 'checks'} passed")
    if warn_counts:
        msg.warn(f"{warn_counts} {'warning' if warn_counts == 1 else 'warnings'}")
    if fail_counts:
        msg.fail(f"{fail_counts} {'error' if fail_counts == 1 else 'errors'}")
        sys.exit(1)
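# --- Hedged sketch (not part of the original example) ---
# The debug code above relies on helpers that the excerpt does not show.
# A minimal guess at what _format_labels and _get_labels_from_model might
# look like, assuming a spaCy v3 Language object; the real implementations
# may differ.
def _format_labels(labels, counts=False):
    # Render labels (or (label, count) pairs when counts=True) as one string.
    if counts:
        return ", ".join(f"'{label}' ({count})" for label, count in labels)
    return ", ".join(f"'{label}'" for label in labels)


def _get_labels_from_model(nlp, factory_name):
    # Collect the labels of every pipe created by the given factory.
    labels = set()
    for pipe_name in nlp.pipe_names:
        if nlp.get_pipe_meta(pipe_name).factory == factory_name:
            labels.update(nlp.get_pipe(pipe_name).labels)
    return labels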
Example #28
def main(
    in_file,
    out_dir=None,
    model_file=None,
    config_file=None,
    spacy_model=None,
    fine_tune=False,
):
    """Train CRF entity tagger."""
    if config_file:
        msg.info("Loading config from disk")
        component_config = srsly.read_json(config_file)
        msg.good("Successfully loaded config from file.", config_file)
    else:
        component_config = None

    crf_extractor = CRFExtractor(component_config=component_config)

    if model_file is not None:
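        # a previously saved model overrides the freshly configured extractor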
        msg.info(f"Loading model from disk.")
        crf_extractor = crf_extractor.from_disk(model_file)
        msg.good("Successfully loaded model from file.", model_file)

    msg.info("Loading training examples.")
    train_examples = read_file(in_file)
    msg.good(
        f"Successfully loaded {len(train_examples)} training examples from file.",
        in_file,
    )

    if spacy_model is not None:
        nlp = spacy.load(spacy_model)
        msg.info(f"Using spaCy model: {spacy_model}")
    else:
        nlp = spacy.blank("en")
        msg.info(f"Using spaCy blank: 'en'")

    tokenizer = SpacyTokenizer(nlp=nlp)
    use_dense_features = crf_extractor.use_dense_features()
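    # convert each gold example into CRF-ready tokens, including dense
    # features if the extractor is configured to use them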
    train_crf = [
        gold_example_to_crf_tokens(
            ex, tokenizer=tokenizer, use_dense_features=use_dense_features
        )
        for ex in train_examples
    ]

    if fine_tune:
        msg.info("Fine-tuning hyper params.")
        rs = crf_extractor.fine_tune(
            train_crf, cv=5, n_iter=30, random_state=42
        )
        msg.good("Setting fine-tuned hyper params:", rs.best_params_)
        crf_extractor.component_config.update(rs.best_params_)

    msg.info("Training entity tagger with CRF.")
    crf_extractor.train(train_crf)

    model_path = pathlib.Path(out_dir or ".").resolve() / "model.pkl"
    msg.info("Saving model to disk")
    model_path.parent.mkdir(parents=True, exist_ok=True)
    crf_extractor.to_disk(model_path)
    msg.good("Successfully saved model to file.",
             str(model_path.relative_to(os.getcwd())))
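# --- Hedged sketch (not part of the original example) ---
# One way this entry point might be wired up from the command line; the
# original project may use plac or typer instead, so treat the flag names
# below as illustrative only.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Train CRF entity tagger.")
    parser.add_argument("in_file", help="Training examples file")
    parser.add_argument("--out-dir", default=None, help="Output directory")
    parser.add_argument("--model-file", default=None, help="Existing model to load")
    parser.add_argument("--config-file", default=None, help="Component config JSON")
    parser.add_argument("--spacy-model", default=None, help="spaCy model name")
    parser.add_argument("--fine-tune", action="store_true")
    args = parser.parse_args()
    main(
        args.in_file,
        out_dir=args.out_dir,
        model_file=args.model_file,
        config_file=args.config_file,
        spacy_model=args.spacy_model,
        fine_tune=args.fine_tune,
    )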