def main(
    # fmt: off
    in_file: str = typer.Argument(..., help="Vectors file (text-based)"),
    vocab_file: str = typer.Argument(..., help="Vocabulary file"),
    out_dir: str = typer.Argument(..., help="Path to output directory"),
    min_freq_ratio: float = typer.Option(0.0, "--min-freq-ratio", "-r", help="Frequency ratio threshold for discarding minority senses or casings"),
    min_distance: float = typer.Option(0.0, "--min-distance", "-s", help="Similarity threshold for discarding redundant keys"),
    # fmt: on
):
    """
    Step 5: Export a sense2vec component

    Expects a vectors.txt and a vocab file trained with GloVe and exports
    a component that can be loaded with Sense2vec.from_disk.
    """
    input_path = Path(in_file)
    vocab_path = Path(vocab_file)
    output_path = Path(out_dir)
    # Validate inputs up front; msg.fail(..., exits=1) terminates the process.
    if not input_path.exists():
        msg.fail("Can't find input file", in_file, exits=1)
    if input_path.suffix == ".bin":
        msg.fail("Need text-based vectors file, not binary", in_file, exits=1)
    if not vocab_path.exists():
        msg.fail("Can't find vocab file", vocab_file, exits=1)
    if not output_path.exists():
        output_path.mkdir(parents=True)
        msg.good(f"Created output directory {out_dir}")
    # _get_shape consumes the header line(s) and returns the remaining handle,
    # so readlines() below yields only vector rows.
    with input_path.open("r", encoding="utf8") as f:
        (n_vectors, vector_size), f = _get_shape(f)
        vectors_data = f.readlines()
    with vocab_path.open("r", encoding="utf8") as f:
        vocab = read_vocab(f)
    vectors = {}
    all_senses = set()
    for item in vectors_data:
        # rsplit with maxsplit=vector_size keeps multi-word keys intact:
        # only the trailing vector_size fields are treated as numbers.
        item = item.rstrip().rsplit(" ", vector_size)
        key = item[0]
        try:
            _, sense = split_key(key)
        except ValueError:
            # Keys that don't follow the word|SENSE convention are skipped.
            continue
        vec = item[1:]
        if len(vec) != vector_size:
            msg.fail(f"Wrong vector size: {len(vec)} (expected {vector_size})", exits=1)
        all_senses.add(sense)
        vectors[key] = numpy.asarray(vec, dtype=numpy.float32)
    # Drop minority-cased duplicates and near-duplicate keys before export.
    discarded = set()
    discarded.update(get_minority_keys(vocab, min_freq_ratio))
    discarded.update(get_redundant_keys(vocab, vectors, min_distance))
    n_vectors = len(vectors) - len(discarded)
    s2v = Sense2Vec(shape=(n_vectors, vector_size), senses=list(all_senses))
    for key, vector in vectors.items():
        if key not in discarded:
            s2v.add(key, vector)
            s2v.set_freq(key, vocab[key])
    msg.good("Created the sense2vec model")
    msg.info(f"{n_vectors} vectors, {len(all_senses)} total senses")
    s2v.to_disk(output_path)
    msg.good("Saved model to directory", out_dir)
def train(
    lang,
    output_path,
    train_path,
    dev_path,
    raw_text=None,
    base_model=None,
    pipeline="tagger,parser,ner",
    replace_components=False,
    vectors=None,
    width=96,
    conv_depth=4,
    cnn_window=1,
    cnn_pieces=3,
    use_chars=False,
    bilstm_depth=0,
    embed_rows=2000,
    n_iter=30,
    n_early_stopping=None,
    n_examples=0,
    use_gpu=-1,
    version="0.0.0",
    meta_path=None,
    init_tok2vec=None,
    parser_multitasks="",
    entity_multitasks="",
    noise_level=0.0,
    orth_variant_level=0.0,
    eval_beam_widths="",
    gold_preproc=False,
    learn_tokens=False,
    textcat_multilabel=False,
    textcat_arch="bow",
    textcat_positive_label=None,
    tag_map_path=None,
    verbose=False,
    debug=False,
):
    """
    Train or update a spaCy model. Requires data to be formatted in spaCy's
    JSON format. To convert data from other formats, use the `spacy convert`
    command.

    Writes one model directory per epoch (model0, model1, ...) plus a
    model-final directory, each with meta.json/accuracy.json, and finally
    collates the best-scoring components into a "best model" directory.
    """
    util.fix_random_seed()
    util.set_env_log(verbose)
    # Make sure all files and paths exists if they are needed
    train_path = util.ensure_path(train_path)
    dev_path = util.ensure_path(dev_path)
    meta_path = util.ensure_path(meta_path)
    output_path = util.ensure_path(output_path)
    if raw_text is not None:
        raw_text = list(srsly.read_jsonl(raw_text))
    if not train_path or not train_path.exists():
        msg.fail("Training data not found", train_path, exits=1)
    if not dev_path or not dev_path.exists():
        msg.fail("Development data not found", dev_path, exits=1)
    if meta_path is not None and not meta_path.exists():
        msg.fail("Can't find model meta.json", meta_path, exits=1)
    meta = srsly.read_json(meta_path) if meta_path else {}
    if output_path.exists() and [p for p in output_path.iterdir() if p.is_dir()]:
        msg.warn(
            "Output directory is not empty",
            "This can lead to unintended side effects when saving the model. "
            "Please use an empty directory or a different path instead. If "
            "the specified output path doesn't exist, the directory will be "
            "created for you.",
        )
    if not output_path.exists():
        output_path.mkdir()
        msg.good("Created output directory: {}".format(output_path))
    tag_map = {}
    if tag_map_path is not None:
        tag_map = srsly.read_json(tag_map_path)
    # Take dropout and batch size as generators of values -- dropout
    # starts high and decays sharply, to force the optimizer to explore.
    # Batch size starts at 1 and grows, so that we make updates quickly
    # at the beginning of training.
    dropout_rates = util.decaying(
        util.env_opt("dropout_from", 0.2),
        util.env_opt("dropout_to", 0.2),
        util.env_opt("dropout_decay", 0.0),
    )
    batch_sizes = util.compounding(
        util.env_opt("batch_from", 100.0),
        util.env_opt("batch_to", 1000.0),
        util.env_opt("batch_compound", 1.001),
    )
    # Beam width 1 (greedy) is always evaluated; extra widths come from the
    # comma-separated eval_beam_widths string.
    if not eval_beam_widths:
        eval_beam_widths = [1]
    else:
        eval_beam_widths = [int(bw) for bw in eval_beam_widths.split(",")]
        if 1 not in eval_beam_widths:
            eval_beam_widths.append(1)
        eval_beam_widths.sort()
    has_beam_widths = eval_beam_widths != [1]
    # Set up the base model and pipeline. If a base model is specified, load
    # the model and make sure the pipeline matches the pipeline setting. If
    # training starts from a blank model, intitalize the language class.
    pipeline = [p.strip() for p in pipeline.split(",")]
    disabled_pipes = None
    pipes_added = False
    msg.text("Training pipeline: {}".format(pipeline))
    if use_gpu >= 0:
        activated_gpu = None
        try:
            activated_gpu = set_gpu(use_gpu)
        except Exception as e:
            msg.warn("Exception: {}".format(e))
        if activated_gpu is not None:
            msg.text("Using GPU: {}".format(use_gpu))
        else:
            # Fall back to CPU-only training when the GPU can't be activated.
            msg.warn("Unable to activate GPU: {}".format(use_gpu))
            msg.text("Using CPU only")
            use_gpu = -1
    if base_model:
        msg.text("Starting with base model '{}'".format(base_model))
        nlp = util.load_model(base_model)
        if nlp.lang != lang:
            msg.fail(
                "Model language ('{}') doesn't match language specified as "
                "`lang` argument ('{}') ".format(nlp.lang, lang),
                exits=1,
            )
        for pipe in pipeline:
            pipe_cfg = {}
            if pipe == "parser":
                pipe_cfg = {"learn_tokens": learn_tokens}
            elif pipe == "textcat":
                pipe_cfg = {
                    "exclusive_classes": not textcat_multilabel,
                    "architecture": textcat_arch,
                    "positive_label": textcat_positive_label,
                }
            if pipe not in nlp.pipe_names:
                msg.text("Adding component to base model '{}'".format(pipe))
                nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg))
                pipes_added = True
            elif replace_components:
                msg.text("Replacing component from base model '{}'".format(pipe))
                nlp.replace_pipe(pipe, nlp.create_pipe(pipe, config=pipe_cfg))
                pipes_added = True
            else:
                # Extending an existing textcat requires an identical config.
                if pipe == "textcat":
                    textcat_cfg = nlp.get_pipe("textcat").cfg
                    base_cfg = {
                        "exclusive_classes": textcat_cfg["exclusive_classes"],
                        "architecture": textcat_cfg["architecture"],
                        "positive_label": textcat_cfg["positive_label"],
                    }
                    if base_cfg != pipe_cfg:
                        msg.fail(
                            "The base textcat model configuration does"
                            "not match the provided training options. "
                            "Existing cfg: {}, provided cfg: {}".format(
                                base_cfg, pipe_cfg
                            ),
                            exits=1,
                        )
                msg.text("Extending component from base model '{}'".format(pipe))
        # Components of the base model not listed in `pipeline` are frozen
        # during training and restored in the finally block below.
        disabled_pipes = nlp.disable_pipes(
            [p for p in nlp.pipe_names if p not in pipeline]
        )
    else:
        msg.text("Starting with blank model '{}'".format(lang))
        lang_cls = util.get_lang_class(lang)
        nlp = lang_cls()
        for pipe in pipeline:
            if pipe == "parser":
                pipe_cfg = {"learn_tokens": learn_tokens}
            elif pipe == "textcat":
                pipe_cfg = {
                    "exclusive_classes": not textcat_multilabel,
                    "architecture": textcat_arch,
                    "positive_label": textcat_positive_label,
                }
            else:
                pipe_cfg = {}
            nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg))
    # Update tag map with provided mapping
    nlp.vocab.morphology.tag_map.update(tag_map)
    if vectors:
        msg.text("Loading vector from model '{}'".format(vectors))
        _load_vectors(nlp, vectors)
    # Multitask objectives
    multitask_options = [("parser", parser_multitasks), ("ner", entity_multitasks)]
    for pipe_name, multitasks in multitask_options:
        if multitasks:
            if pipe_name not in pipeline:
                msg.fail(
                    "Can't use multitask objective without '{}' in the "
                    "pipeline".format(pipe_name)
                )
            pipe = nlp.get_pipe(pipe_name)
            for objective in multitasks.split(","):
                pipe.add_multitask_objective(objective)
    # Prepare training corpus
    msg.text("Counting training words (limit={})".format(n_examples))
    corpus = GoldCorpus(train_path, dev_path, limit=n_examples)
    n_train_words = corpus.count_train()
    if base_model and not pipes_added:
        # Start with an existing model, use default optimizer
        optimizer = create_default_optimizer(Model.ops)
    else:
        # Start with a blank model, call begin_training
        cfg = {"device": use_gpu}
        cfg["conv_depth"] = conv_depth
        cfg["token_vector_width"] = width
        cfg["bilstm_depth"] = bilstm_depth
        cfg["cnn_maxout_pieces"] = cnn_pieces
        cfg["embed_size"] = embed_rows
        cfg["conv_window"] = cnn_window
        cfg["subword_features"] = not use_chars
        optimizer = nlp.begin_training(lambda: corpus.train_tuples, **cfg)
    nlp._optimizer = None
    # Load in pretrained weights
    if init_tok2vec is not None:
        components = _load_pretrained_tok2vec(nlp, init_tok2vec)
        msg.text("Loaded pretrained tok2vec for: {}".format(components))
    # Verify textcat config
    if "textcat" in pipeline:
        textcat_labels = nlp.get_pipe("textcat").cfg.get("labels", [])
        if textcat_positive_label and textcat_positive_label not in textcat_labels:
            msg.fail(
                "The textcat_positive_label (tpl) '{}' does not match any "
                "label in the training data.".format(textcat_positive_label),
                exits=1,
            )
        if textcat_positive_label and len(textcat_labels) != 2:
            msg.fail(
                "A textcat_positive_label (tpl) '{}' was provided for training "
                "data that does not appear to be a binary classification "
                "problem with two labels.".format(textcat_positive_label),
                exits=1,
            )
        train_docs = corpus.train_docs(
            nlp,
            noise_level=noise_level,
            gold_preproc=gold_preproc,
            max_length=0,
            ignore_misaligned=True,
        )
        train_labels = set()
        if textcat_multilabel:
            multilabel_found = False
            for text, gold in train_docs:
                train_labels.update(gold.cats.keys())
                if list(gold.cats.values()).count(1.0) != 1:
                    multilabel_found = True
            if not multilabel_found and not base_model:
                msg.warn(
                    "The textcat training instances look like they have "
                    "mutually-exclusive classes. Remove the flag "
                    "'--textcat-multilabel' to train a classifier with "
                    "mutually-exclusive classes."
                )
        if not textcat_multilabel:
            for text, gold in train_docs:
                train_labels.update(gold.cats.keys())
                if list(gold.cats.values()).count(1.0) != 1 and not base_model:
                    # Auto-switch to multilabel when labels aren't exclusive.
                    msg.warn(
                        "Some textcat training instances do not have exactly "
                        "one positive label. Modifying training options to "
                        "include the flag '--textcat-multilabel' for classes "
                        "that are not mutually exclusive."
                    )
                    nlp.get_pipe("textcat").cfg["exclusive_classes"] = False
                    textcat_multilabel = True
                    break
        if base_model and set(textcat_labels) != train_labels:
            msg.fail(
                "Cannot extend textcat model using data with different "
                "labels. Base model labels: {}, training data labels: "
                "{}.".format(textcat_labels, list(train_labels)),
                exits=1,
            )
        if textcat_multilabel:
            msg.text(
                "Textcat evaluation score: ROC AUC score macro-averaged across "
                "the labels '{}'".format(", ".join(textcat_labels))
            )
        elif textcat_positive_label and len(textcat_labels) == 2:
            msg.text(
                "Textcat evaluation score: F1-score for the "
                "label '{}'".format(textcat_positive_label)
            )
        elif len(textcat_labels) > 1:
            if len(textcat_labels) == 2:
                msg.warn(
                    "If the textcat component is a binary classifier with "
                    "exclusive classes, provide '--textcat-positive-label' for "
                    "an evaluation on the positive class."
                )
            msg.text(
                "Textcat evaluation score: F1-score macro-averaged across "
                "the labels '{}'".format(", ".join(textcat_labels))
            )
        else:
            msg.fail(
                "Unsupported textcat configuration. Use `spacy debug-data` "
                "for more information."
            )
    # fmt: off
    row_head, output_stats = _configure_training_output(pipeline, use_gpu, has_beam_widths)
    row_widths = [len(w) for w in row_head]
    row_settings = {"widths": row_widths, "aligns": tuple(["r" for i in row_head]), "spacing": 2}
    # fmt: on
    print("")
    msg.row(row_head, **row_settings)
    msg.row(["-" * width for width in row_settings["widths"]], **row_settings)
    try:
        iter_since_best = 0
        best_score = 0.0
        for i in range(n_iter):
            train_docs = corpus.train_docs(
                nlp,
                noise_level=noise_level,
                orth_variant_level=orth_variant_level,
                gold_preproc=gold_preproc,
                max_length=0,
                ignore_misaligned=True,
            )
            if raw_text:
                random.shuffle(raw_text)
                raw_batches = util.minibatch(
                    (nlp.make_doc(rt["text"]) for rt in raw_text), size=8
                )
            words_seen = 0
            with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
                losses = {}
                for batch in util.minibatch_by_words(train_docs, size=batch_sizes):
                    if not batch:
                        continue
                    docs, golds = zip(*batch)
                    try:
                        nlp.update(
                            docs,
                            golds,
                            sgd=optimizer,
                            drop=next(dropout_rates),
                            losses=losses,
                        )
                    except ValueError as e:
                        err = "Error during training"
                        if init_tok2vec:
                            err += " Did you provide the same parameters during 'train' as during 'pretrain'?"
                        msg.fail(err, "Original error message: {}".format(e), exits=1)
                    if raw_text:
                        # If raw text is available, perform 'rehearsal' updates,
                        # which use unlabelled data to reduce overfitting.
                        raw_batch = list(next(raw_batches))
                        nlp.rehearse(raw_batch, sgd=optimizer, losses=losses)
                    if not int(os.environ.get("LOG_FRIENDLY", 0)):
                        pbar.update(sum(len(doc) for doc in docs))
                    words_seen += sum(len(doc) for doc in docs)
            # Evaluate with averaged parameters: save the epoch model, reload
            # it, and score it on the dev set at every requested beam width.
            with nlp.use_params(optimizer.averages):
                util.set_env_log(False)
                epoch_model_path = output_path / ("model%d" % i)
                nlp.to_disk(epoch_model_path)
                nlp_loaded = util.load_model_from_path(epoch_model_path)
                for beam_width in eval_beam_widths:
                    for name, component in nlp_loaded.pipeline:
                        if hasattr(component, "cfg"):
                            component.cfg["beam_width"] = beam_width
                    dev_docs = list(
                        corpus.dev_docs(
                            nlp_loaded,
                            gold_preproc=gold_preproc,
                            ignore_misaligned=True,
                        )
                    )
                    nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
                    start_time = timer()
                    scorer = nlp_loaded.evaluate(dev_docs, verbose=verbose)
                    end_time = timer()
                    if use_gpu < 0:
                        gpu_wps = None
                        cpu_wps = nwords / (end_time - start_time)
                    else:
                        gpu_wps = nwords / (end_time - start_time)
                        # Only evaluate on CPU in the first iteration (for
                        # timing) if GPU is enabled
                        if i == 0:
                            with Model.use_device("cpu"):
                                nlp_loaded = util.load_model_from_path(epoch_model_path)
                                for name, component in nlp_loaded.pipeline:
                                    if hasattr(component, "cfg"):
                                        component.cfg["beam_width"] = beam_width
                                dev_docs = list(
                                    corpus.dev_docs(
                                        nlp_loaded,
                                        gold_preproc=gold_preproc,
                                        ignore_misaligned=True,
                                    )
                                )
                                start_time = timer()
                                scorer = nlp_loaded.evaluate(dev_docs, verbose=verbose)
                                end_time = timer()
                                cpu_wps = nwords / (end_time - start_time)
                    acc_loc = output_path / ("model%d" % i) / "accuracy.json"
                    srsly.write_json(acc_loc, scorer.scores)
                    # Update model meta.json
                    meta["lang"] = nlp.lang
                    meta["pipeline"] = nlp.pipe_names
                    meta["spacy_version"] = ">=%s" % about.__version__
                    if beam_width == 1:
                        meta["speed"] = {
                            "nwords": nwords,
                            "cpu": cpu_wps,
                            "gpu": gpu_wps,
                        }
                        meta.setdefault("accuracy", {})
                        for component in nlp.pipe_names:
                            for metric in _get_metrics(component):
                                meta["accuracy"][metric] = scorer.scores[metric]
                    else:
                        meta.setdefault("beam_accuracy", {})
                        meta.setdefault("beam_speed", {})
                        for component in nlp.pipe_names:
                            for metric in _get_metrics(component):
                                meta["beam_accuracy"][metric] = scorer.scores[metric]
                        meta["beam_speed"][beam_width] = {
                            "nwords": nwords,
                            "cpu": cpu_wps,
                            "gpu": gpu_wps,
                        }
                    meta["vectors"] = {
                        "width": nlp.vocab.vectors_length,
                        "vectors": len(nlp.vocab.vectors),
                        "keys": nlp.vocab.vectors.n_keys,
                        "name": nlp.vocab.vectors.name,
                    }
                    meta.setdefault("name", "model%d" % i)
                    meta.setdefault("version", version)
                    meta["labels"] = nlp.meta["labels"]
                    meta_loc = output_path / ("model%d" % i) / "meta.json"
                    srsly.write_json(meta_loc, meta)
                    util.set_env_log(verbose)
                    progress = _get_progress(
                        i,
                        losses,
                        scorer.scores,
                        output_stats,
                        beam_width=beam_width if has_beam_widths else None,
                        cpu_wps=cpu_wps,
                        gpu_wps=gpu_wps,
                    )
                    if i == 0 and "textcat" in pipeline:
                        textcats_per_cat = scorer.scores.get("textcats_per_cat", {})
                        for cat, cat_score in textcats_per_cat.items():
                            if cat_score.get("roc_auc_score", 0) < 0:
                                msg.warn(
                                    "Textcat ROC AUC score is undefined due to "
                                    "only one value in label '{}'.".format(cat)
                                )
                    msg.row(progress, **row_settings)
                # Early stopping
                if n_early_stopping is not None:
                    current_score = _score_for_model(meta)
                    if current_score < best_score:
                        iter_since_best += 1
                    else:
                        iter_since_best = 0
                        best_score = current_score
                    if iter_since_best >= n_early_stopping:
                        msg.text(
                            "Early stopping, best iteration "
                            "is: {}".format(i - iter_since_best)
                        )
                        msg.text(
                            "Best score = {}; Final iteration "
                            "score = {}".format(best_score, current_score)
                        )
                        break
    except Exception as e:
        msg.warn(
            "Aborting and saving the final best model. "
            "Encountered exception: {}".format(e),
            exits=1,
        )
    finally:
        # Always save the final model, even on error/early exit.
        best_pipes = nlp.pipe_names
        if disabled_pipes:
            disabled_pipes.restore()
        with nlp.use_params(optimizer.averages):
            final_model_path = output_path / "model-final"
            nlp.to_disk(final_model_path)
            meta_loc = output_path / "model-final" / "meta.json"
            final_meta = srsly.read_json(meta_loc)
            final_meta.setdefault("accuracy", {})
            final_meta["accuracy"].update(meta.get("accuracy", {}))
            final_meta.setdefault("speed", {})
            final_meta["speed"].setdefault("cpu", None)
            final_meta["speed"].setdefault("gpu", None)
            meta.setdefault("speed", {})
            meta["speed"].setdefault("cpu", None)
            meta["speed"].setdefault("gpu", None)
            # combine cpu and gpu speeds with the base model speeds
            if final_meta["speed"]["cpu"] and meta["speed"]["cpu"]:
                speed = _get_total_speed(
                    [final_meta["speed"]["cpu"], meta["speed"]["cpu"]]
                )
                final_meta["speed"]["cpu"] = speed
            if final_meta["speed"]["gpu"] and meta["speed"]["gpu"]:
                speed = _get_total_speed(
                    [final_meta["speed"]["gpu"], meta["speed"]["gpu"]]
                )
                final_meta["speed"]["gpu"] = speed
            # if there were no speeds to update, overwrite with meta
            if (
                final_meta["speed"]["cpu"] is None
                and final_meta["speed"]["gpu"] is None
            ):
                final_meta["speed"].update(meta["speed"])
            # note: beam speeds are not combined with the base model
            if has_beam_widths:
                final_meta.setdefault("beam_accuracy", {})
                final_meta["beam_accuracy"].update(meta.get("beam_accuracy", {}))
                final_meta.setdefault("beam_speed", {})
                final_meta["beam_speed"].update(meta.get("beam_speed", {}))
            srsly.write_json(meta_loc, final_meta)
            msg.good("Saved model to output directory", final_model_path)
            with msg.loading("Creating best model..."):
                best_model_path = _collate_best_model(final_meta, output_path, best_pipes)
            msg.good("Created best model", best_model_path)
# Aggregate the per-fund stock holdings into a single portfolio and print a
# suggested buy table. NOTE(review): `funds`, `stocks`, `fund_count` and
# `money` are defined outside this visible fragment — presumably `stocks` is
# an initially-empty dict keyed by stock code and `fund_count == len(funds)`;
# confirm against the enclosing scope.
for fund in funds:
    for stock in fund.stocks:
        if stocks.get(stock.code):
            # Already seen in another fund: accumulate its averaged weight.
            stocks[stock.code].proportion += float(
                stock.proportion) / fund_count
        else:
            stocks[stock.code] = stock
            stocks[stock.code].proportion = float(
                stocks[stock.code].proportion) / fund_count
total_money = 0
# Highest portfolio weight first.
sorted_stocks = sorted(stocks.values(),
                       key=lambda item: item.proportion,
                       reverse=True)
stock_table = PrettyTable()
stock_table.title = '股票列表'
stock_table.field_names = ['股票代码', '股票名称', '占净值比例', '买入金额']
stock_table.align['占净值比例'] = 'r'
stock_table.align['买入金额'] = 'r'
for stock in sorted_stocks:
    # NOTE(review): proportion was already divided by fund_count during
    # aggregation above, yet it is divided by fund_count again here (and in
    # the percentage column) — looks like a double division; verify intent.
    cost = int(money * stock.proportion / fund_count / 100)
    total_money += cost
    stock_table.add_row([
        stock.code, stock.name, f'{stock.proportion / fund_count:.02f}%',
        f'{cost:.02f}'
    ])
print(stock_table)
msg.good(
    f'总计买入: {total_money:.02f} 元,剩余 {money - total_money:.02f} 元,建议存入余额宝/余利宝/零钱通等。'
)
msg.warn('以上信息仅供参考,股市有风险,请理性投资!')
def main(
    model="./zh_vectors_web_ud_lg/model-final",
    new_model_name="zh_vectors_web_ud_clue_lg",
    output_dir="./zh_vectors_web_ud_clue_lg",
    train_path="./clue_spacy_train.jsonl",
    dev_path="./clue_spacy_dev.jsonl",
    meta_path="./meta.json",
    use_gpu=0,
    n_iter=50
):
    import tqdm
    # NOTE(review): because `import tqdm` precedes it, the string below is a
    # plain expression statement, not the function's docstring.
    """Set up the pipeline and entity recognizer, and train the new entity."""
    random.seed(0)
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    # Add entity recognizer to model if it's not in the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner)
    # otherwise, get it, so we can add labels to it
    else:
        ner = nlp.get_pipe("ner")
    # LABEL is a module-level collection of new entity labels (defined
    # elsewhere in this file).
    for label in LABEL:
        if label not in ner.labels:
            ner.add_label(label)  # add new entity label to entity recognizer
    train_path = ensure_path(train_path)
    dev_path = ensure_path(dev_path)
    if not train_path or not train_path.exists():
        msg.fail("Training data not found", train_path, exits=1)
    if not dev_path or not dev_path.exists():
        msg.fail("Development data not found", dev_path, exits=1)
    # NOTE(review): output_dir is passed in as a str but used with Path-only
    # APIs (.exists(), .iterdir(), the / operator below) — presumably callers
    # pass a Path, or this should be wrapped with ensure_path(); confirm.
    if output_dir.exists() and [p for p in output_dir.iterdir() if p.is_dir()]:
        msg.warn(
            "Output directory is not empty",
            "This can lead to unintended side effects when saving the model. "
            "Please use an empty directory or a different path instead. If "
            "the specified output path doesn't exist, the directory will be "
            "created for you.",
        )
    if not output_dir.exists():
        output_dir.mkdir()
    meta = srsly.read_json(meta_path) if meta_path else {}
    # Prepare training corpus
    msg.text("Counting training words (limit={})".format(0))
    corpus = GoldCorpus(train_path, dev_path, limit=0)
    n_train_words = corpus.count_train()
    if model is None:
        optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu)
    else:
        optimizer = create_default_optimizer(Model.ops)
        # Todo: gpu train?
    dropout_rates = decaying(0.2, 0.2, 0.0)
    batch_sizes = compounding(100.0, 1000.0, 1.001)
    # get names of other pipes to disable them during training
    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [
        pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
    # UnboundLocalError: local variable 'has_beam_widths' referenced before assignment
    # fmt: off
    eval_beam_widths = ""
    if not eval_beam_widths:
        eval_beam_widths = [1]
    else:
        eval_beam_widths = [int(bw) for bw in eval_beam_widths.split(",")]
        if 1 not in eval_beam_widths:
            eval_beam_widths.append(1)
        eval_beam_widths.sort()
    has_beam_widths = eval_beam_widths != [1]
    row_head, output_stats = _configure_training_output(["ner"], use_gpu, has_beam_widths)
    row_widths = [len(w) for w in row_head]
    row_settings = {"widths": row_widths, "aligns": tuple(["r" for i in row_head]), "spacing": 2}
    # fmt: on
    print("")
    msg.row(row_head, **row_settings)
    msg.row(["-" * width for width in row_settings["widths"]], **row_settings)
    try:
        noise_level = 0.0
        orth_variant_level = 0.0
        gold_preproc = False
        verbose = False
        best_score = 0.0
        with nlp.disable_pipes(*other_pipes):  # only train NER
            for itn in range(n_iter):
                train_docs = corpus.train_docs(
                    nlp,
                    noise_level=noise_level,
                    orth_variant_level=orth_variant_level,
                    gold_preproc=gold_preproc,
                    max_length=0,
                    ignore_misaligned=True,
                )
                words_seen = 0
                with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
                    losses = {}
                    for batch in minibatch_by_words(train_docs, size=batch_sizes):
                        if not batch:
                            continue
                        docs, golds = zip(*batch)
                        nlp.update(
                            docs,
                            golds,
                            sgd=optimizer,
                            drop=next(dropout_rates),
                            losses=losses,
                        )
                        if not int(os.environ.get("LOG_FRIENDLY", 0)):
                            pbar.update(sum(len(doc) for doc in docs))
                        words_seen += sum(len(doc) for doc in docs)
                # Evaluate the saved epoch model with averaged parameters at
                # every requested beam width.
                with nlp.use_params(optimizer.averages):
                    set_env_log(False)
                    epoch_model_path = output_dir / ("model%d" % itn)
                    nlp.to_disk(epoch_model_path)
                    nlp_loaded = load_model_from_path(epoch_model_path)
                    for beam_width in eval_beam_widths:
                        for name, component in nlp_loaded.pipeline:
                            if hasattr(component, "cfg"):
                                component.cfg["beam_width"] = beam_width
                        dev_docs = list(
                            corpus.dev_docs(
                                nlp_loaded,
                                gold_preproc=gold_preproc,
                                ignore_misaligned=True,
                            )
                        )
                        nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
                        start_time = timer()
                        scorer = nlp_loaded.evaluate(dev_docs, verbose=verbose)
                        end_time = timer()
                        if use_gpu < 0:
                            gpu_wps = None
                            cpu_wps = nwords / (end_time - start_time)
                        else:
                            gpu_wps = nwords / (end_time - start_time)
                            # Re-run the evaluation on CPU to get a CPU wps figure.
                            with Model.use_device("cpu"):
                                nlp_loaded = load_model_from_path(epoch_model_path)
                                for name, component in nlp_loaded.pipeline:
                                    if hasattr(component, "cfg"):
                                        component.cfg["beam_width"] = beam_width
                                dev_docs = list(
                                    corpus.dev_docs(
                                        nlp_loaded,
                                        gold_preproc=gold_preproc,
                                        ignore_misaligned=True,
                                    )
                                )
                                start_time = timer()
                                scorer = nlp_loaded.evaluate(dev_docs, verbose=verbose)
                                end_time = timer()
                                cpu_wps = nwords / (end_time - start_time)
                        acc_loc = output_dir / ("model%d" % itn) / "accuracy.json"
                        srsly.write_json(acc_loc, scorer.scores)
                        # Update model meta.json
                        meta["lang"] = nlp.lang
                        meta["pipeline"] = nlp.pipe_names
                        meta["spacy_version"] = ">=%s" % spacy.__version__
                        if beam_width == 1:
                            meta["speed"] = {
                                "nwords": nwords,
                                "cpu": cpu_wps,
                                "gpu": gpu_wps,
                            }
                            meta["accuracy"] = scorer.scores
                        else:
                            meta.setdefault("beam_accuracy", {})
                            meta.setdefault("beam_speed", {})
                            meta["beam_accuracy"][beam_width] = scorer.scores
                            meta["beam_speed"][beam_width] = {
                                "nwords": nwords,
                                "cpu": cpu_wps,
                                "gpu": gpu_wps,
                            }
                        meta["vectors"] = {
                            "width": nlp.vocab.vectors_length,
                            "vectors": len(nlp.vocab.vectors),
                            "keys": nlp.vocab.vectors.n_keys,
                            "name": nlp.vocab.vectors.name,
                        }
                        meta.setdefault("name", "model%d" % itn)
                        meta.setdefault("version", "0.0.1")
                        meta["labels"] = nlp.meta["labels"]
                        meta_loc = output_dir / ("model%d" % itn) / "meta.json"
                        srsly.write_json(meta_loc, meta)
                        set_env_log(verbose)
                        progress = _get_progress(
                            itn,
                            losses,
                            scorer.scores,
                            output_stats,
                            beam_width=beam_width if has_beam_widths else None,
                            cpu_wps=cpu_wps,
                            gpu_wps=gpu_wps,
                        )
                        msg.row(progress, **row_settings)
    finally:
        # Always persist the final model, even if training is interrupted.
        with nlp.use_params(optimizer.averages):
            final_model_path = output_dir / "model-final"
            nlp.to_disk(final_model_path)
        msg.good("Saved model to output directory", final_model_path)
        meta["pipeline"] = nlp.pipe_names
        meta["labels"] = nlp.meta["labels"]
        meta["factories"] = nlp.meta["factories"]
        with msg.loading("Creating best model..."):
            best_model_path = _collate_best_model(meta, output_dir, nlp.pipe_names)
        msg.good("Created best model", best_model_path)
def main(vectors, gpu_id=-1, n_neighbors=100, batch_size=1024, cutoff=0, start=0, end=None):
    """
    Step 6: Precompute nearest-neighbor queries (optional)

    Precompute nearest-neighbor queries for every entry in the vocab to make
    Sense2Vec.most_similar faster. The --cutoff option lets you define the
    number of earliest rows to limit the neighbors to. For instance, if cutoff
    is 100000, no word will have a nearest neighbor outside of the top 100k
    vectors. The --start/--end options restrict which rows neighbors are
    computed for (useful for sharding the work across machines).
    """
    # Pick the array backend: NumPy on CPU, CuPy on the requested GPU.
    if gpu_id == -1:
        xp = numpy
    else:
        import cupy as xp
        import cupy.cuda.device

        # CuPy lacks take_along_axis (at the version targeted here), so a
        # compatible implementation is patched in.
        xp.take_along_axis = take_along_axis
        device = cupy.cuda.device.Device(gpu_id)
        cupy.cuda.get_cublas_handle()
        device.use()
    vectors_dir = Path(vectors)
    vectors_file = vectors_dir / "vectors"
    if not vectors_dir.is_dir() or not vectors_file.exists():
        err = "Are you passing in the exported sense2vec directory containing a vectors file?"
        msg.fail(f"Can't load vectors from {vectors}", err, exits=1)
    with msg.loading(f"Loading vectors from {vectors}"):
        vectors = xp.load(str(vectors_file))
    msg.good(
        f"Loaded {vectors.shape[0]:,} vectors with dimension {vectors.shape[1]}"
    )
    norms = xp.linalg.norm(vectors, axis=1, keepdims=True)
    norms[norms == 0] = 1  # avoid division by zero for all-zero rows
    # Normalize to unit norm so that dot products below are cosine similarities.
    vectors /= norms
    if cutoff < 1:
        cutoff = vectors.shape[0]
    if end is None:
        end = vectors.shape[0]
    mean = float(norms.mean())
    var = float(norms.var())
    msg.good(f"Normalized (mean {mean:,.2f}, variance {var:,.2f})")
    msg.info(
        f"Finding {n_neighbors:,} neighbors among {cutoff:,} most frequent")
    n = min(n_neighbors, vectors.shape[0])
    subset = vectors[:cutoff]
    # One output row per queried vector in [start, end).
    best_rows = xp.zeros((end - start, n), dtype="i")
    scores = xp.zeros((end - start, n), dtype="f")
    for i in tqdm.tqdm(list(range(start, end, batch_size))):
        size = min(batch_size, end - i)
        batch = vectors[i:i + size]
        sims = xp.dot(batch, subset.T)
        # Set self-similarities to -inf, so that we don't return them.
        # (Only possible when the query row falls inside the cutoff subset.)
        for j in range(size):
            if i + j < sims.shape[1]:
                sims[j, i + j] = -xp.inf
        # This used to use argpartition, to do a partial sort...But this ended
        # up being a ratsnest of terrible numpy crap. Just sorting the whole
        # list isn't really slower, and it's much simpler to read.
        ranks = xp.argsort(sims, axis=1)
        batch_rows = ranks[:, -n:]
        # Reverse so the highest-similarity neighbor comes first.
        batch_rows = batch_rows[:, ::-1]
        batch_scores = xp.take_along_axis(sims, batch_rows, axis=1)
        # BUGFIX: the output arrays only cover rows [start, end), so index
        # them relative to `start`. Indexing with the absolute `i` wrote to
        # the wrong rows (and silently clipped) whenever start > 0.
        offset = i - start
        best_rows[offset:offset + size] = batch_rows
        scores[offset:offset + size] = batch_scores
    msg.info("Saving output")
    # Move GPU arrays back to host memory before serialization.
    if not isinstance(best_rows, numpy.ndarray):
        best_rows = best_rows.get()
    if not isinstance(scores, numpy.ndarray):
        scores = scores.get()
    output = {
        "indices": best_rows,
        "scores": scores.astype("float16"),
        "start": start,
        "end": end,
        "cutoff": cutoff,
    }
    output_file = vectors_dir / "cache"
    with msg.loading("Saving output..."):
        srsly.write_msgpack(output_file, output)
    msg.good(f"Saved cache to {output_file}")
def pretrain(
    texts_loc,
    vectors_model,
    output_dir,
    width=96,
    depth=4,
    bilstm_depth=0,
    cnn_pieces=3,
    sa_depth=0,
    use_chars=False,
    cnn_window=1,
    embed_rows=2000,
    loss_func="cosine",
    use_vectors=False,
    dropout=0.2,
    n_iter=1000,
    batch_size=3000,
    max_length=500,
    min_length=5,
    seed=0,
    n_save_every=None,
    init_tok2vec=None,
    epoch_start=None,
):
    """
    Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components,
    using an approximate language-modelling objective. Specifically, we load
    pretrained vectors, and train a component like a CNN, BiLSTM, etc to predict
    vectors which match the pretrained ones. The weights are saved to a directory
    after each epoch. You can then pass a path to one of these pretrained weights
    files to the 'spacy train' command.

    This technique may be especially helpful if you have little labelled data.
    However, it's still quite experimental, so your mileage may vary. To load
    the weights back in during 'spacy train', you need to ensure all settings
    are the same between pretraining and training. The API and errors around
    this need some improvement.
    """
    # Snapshot the call arguments (Path objects stringified) so the exact
    # settings can be written to config.json for reproducibility.
    config = dict(locals())
    for key in config:
        if isinstance(config[key], Path):
            config[key] = str(config[key])
    util.fix_random_seed(seed)
    has_gpu = prefer_gpu()
    if has_gpu:
        import torch

        torch.set_default_tensor_type("torch.cuda.FloatTensor")
    msg.info("Using GPU" if has_gpu else "Not using GPU")
    output_dir = Path(output_dir)
    if not output_dir.exists():
        output_dir.mkdir()
        msg.good("Created output directory")
    srsly.write_json(output_dir / "config.json", config)
    msg.good("Saved settings to config.json")
    # Load texts from file or stdin
    if texts_loc != "-":  # reading from a file
        texts_loc = Path(texts_loc)
        if not texts_loc.exists():
            msg.fail("Input text file doesn't exist", texts_loc, exits=1)
        with msg.loading("Loading input texts..."):
            texts = list(srsly.read_jsonl(texts_loc))
        if not texts:
            msg.fail("Input file is empty", texts_loc, exits=1)
        msg.good("Loaded input texts")
        random.shuffle(texts)
    else:  # reading from stdin
        msg.text("Reading input text from stdin...")
        texts = srsly.read_jsonl("-")
    with msg.loading("Loading model '{}'...".format(vectors_model)):
        nlp = util.load_model(vectors_model)
    msg.good("Loaded model '{}'".format(vectors_model))
    pretrained_vectors = None if not use_vectors else nlp.vocab.vectors.name
    model = create_pretraining_model(
        nlp,
        Tok2Vec(
            width,
            embed_rows,
            conv_depth=depth,
            pretrained_vectors=pretrained_vectors,
            bilstm_depth=bilstm_depth,  # Requires PyTorch. Experimental.
            subword_features=not use_chars,  # Set to False for Chinese etc
            cnn_maxout_pieces=cnn_pieces,  # If set to 1, use Mish activation.
        ),
    )
    # Load in pretrained weights
    if init_tok2vec is not None:
        components = _load_pretrained_tok2vec(nlp, init_tok2vec)
        msg.text("Loaded pretrained tok2vec for: {}".format(components))
        # Parse the epoch number from the given weight file
        model_name = re.search(r"model\d+\.bin", str(init_tok2vec))
        if model_name:
            # Default weight file name so read epoch_start from it by cutting off 'model' and '.bin'
            epoch_start = int(model_name.group(0)[5:][:-4]) + 1
        else:
            if not epoch_start:
                msg.fail(
                    "You have to use the '--epoch-start' argument when using a renamed weight file for "
                    "'--init-tok2vec'",
                    exits=True,
                )
            elif epoch_start < 0:
                msg.fail(
                    "The argument '--epoch-start' has to be greater or equal to 0. '%d' is invalid"
                    % epoch_start,
                    exits=True,
                )
    else:
        # Without '--init-tok2vec' the '--epoch-start' argument is ignored
        epoch_start = 0
    optimizer = create_default_optimizer(model.ops)
    tracker = ProgressTracker(frequency=10000)
    msg.divider("Pre-training tok2vec layer - starting at epoch %d" % epoch_start)
    row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")}
    msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings)

    def _save_model(epoch, is_temp=False):
        # Save the tok2vec weights (with averaged parameters) and append a
        # progress record to log.jsonl. is_temp marks mid-epoch checkpoints.
        is_temp_str = ".temp" if is_temp else ""
        with model.use_params(optimizer.averages):
            with (output_dir / ("model%d%s.bin" % (epoch, is_temp_str))).open(
                "wb"
            ) as file_:
                file_.write(model.tok2vec.to_bytes())
            log = {
                "nr_word": tracker.nr_word,
                "loss": tracker.loss,
                "epoch_loss": tracker.epoch_loss,
                "epoch": epoch,
            }
            with (output_dir / "log.jsonl").open("a") as file_:
                file_.write(srsly.json_dumps(log) + "\n")

    skip_counter = 0
    for epoch in range(epoch_start, n_iter + epoch_start):
        for batch_id, batch in enumerate(
            util.minibatch_by_words(((text, None) for text in texts), size=batch_size)
        ):
            docs, count = make_docs(
                nlp,
                [text for (text, _) in batch],
                max_length=max_length,
                min_length=min_length,
            )
            skip_counter += count
            loss = make_update(
                model, docs, optimizer, objective=loss_func, drop=dropout
            )
            progress = tracker.update(epoch, loss, docs)
            if progress:
                msg.row(progress, **row_settings)
                # When streaming from stdin, cap each "epoch" at 10M words.
                if texts_loc == "-" and tracker.words_per_epoch[epoch] >= 10 ** 7:
                    break
            if n_save_every and (batch_id % n_save_every == 0):
                _save_model(epoch, is_temp=True)
        _save_model(epoch)
        tracker.epoch_loss = 0.0
        if texts_loc != "-":
            # Reshuffle the texts if texts were loaded from a file
            random.shuffle(texts)
    if skip_counter > 0:
        msg.warn("Skipped {count} empty values".format(count=str(skip_counter)))
    msg.good("Successfully finished pretrain")
def project_assets(
    project_dir: Path,
    *,
    overrides: Dict[str, Any] = SimpleFrozenDict(),
    sparse_checkout: bool = False,
) -> None:
    """Fetch the assets declared in a project's config.

    For each entry under "assets" in the project config, either checks out a
    git path, downloads from a URL, or (when no URL is given) verifies a
    user-supplied private asset against its checksum.

    project_dir (Path): Path to project directory.
    overrides (Dict[str, Any]): Config overrides passed to load_project_config.
    sparse_checkout (bool): Use sparse checkout for git assets.
    """
    project_path = ensure_path(project_dir)
    config = load_project_config(project_path, overrides=overrides)
    assets = config.get("assets", {})
    if not assets:
        # exits=0: nothing to do is not an error
        msg.warn(f"No assets specified in {PROJECT_FILE}", exits=0)
    msg.info(f"Fetching {len(assets)} asset(s)")
    for asset in assets:
        # NOTE(review): uses the raw `project_dir` here rather than the
        # normalized `project_path` computed above — confirm they're always
        # interchangeable for path joining.
        dest = (project_dir / asset["dest"]).resolve()
        checksum = asset.get("checksum")
        if "git" in asset:
            git_err = (
                f"Cloning spaCy project templates requires Git and the 'git' command. "
                f"Make sure it's installed and that the executable is available."
            )
            get_git_version(error=git_err)
            if dest.exists():
                # If there's already a file, check for checksum
                if checksum and checksum == get_checksum(dest):
                    msg.good(
                        f"Skipping download with matching checksum: {asset['dest']}"
                    )
                    continue
                else:
                    # Stale or unverifiable copy: remove it before re-fetching
                    if dest.is_dir():
                        shutil.rmtree(dest)
                    else:
                        dest.unlink()
            # A git asset needs both "repo" and "path" keys; fail early if not
            if "repo" not in asset["git"] or asset["git"]["repo"] is None:
                msg.fail(
                    "A git asset must include 'repo', the repository address.",
                    exits=1,
                )
            if "path" not in asset["git"] or asset["git"]["path"] is None:
                msg.fail(
                    "A git asset must include 'path' - use \"\" to get the entire repository.",
                    exits=1,
                )
            git_checkout(
                asset["git"]["repo"],
                asset["git"]["path"],
                dest,
                branch=asset["git"].get("branch"),
                sparse=sparse_checkout,
            )
            msg.good(f"Downloaded asset {dest}")
        else:
            url = asset.get("url")
            if not url:
                # project.yml defines asset without URL that the user has to place
                check_private_asset(dest, checksum)
                continue
            fetch_asset(project_path, url, dest, checksum)
def package(input_dir, output_dir, meta_path=None, create_meta=False, force=False):
    """
    Generate Python package for model data, including meta and required
    installation files. A new directory will be created in the specified
    output directory, and model data will be copied over. If --create-meta is
    set and a meta.json already exists in the output directory, the existing
    values will be used as the defaults in the command-line prompt.

    input_dir: Directory containing the model data.
    output_dir: Existing directory the package is created in.
    meta_path: Optional path to an existing meta.json (defaults to
        input_dir/meta.json).
    create_meta: Re-generate the meta interactively, using existing values
        as defaults.
    force: Overwrite an existing package directory.
    """
    input_path = util.ensure_path(input_dir)
    output_path = util.ensure_path(output_dir)
    meta_path = util.ensure_path(meta_path)
    if not input_path or not input_path.exists():
        msg.fail("Can't locate model data", input_path, exits=1)
    if not output_path or not output_path.exists():
        msg.fail("Output directory not found", output_path, exits=1)
    if meta_path and not meta_path.exists():
        msg.fail("Can't find model meta.json", meta_path, exits=1)
    meta_path = meta_path or input_path / "meta.json"
    if not meta_path.is_file():
        # BUG FIX: previously this case fell through and `meta` was never
        # assigned, raising a NameError below — fail with a clear message.
        msg.fail("Can't load model meta.json", meta_path, exits=1)
    meta = srsly.read_json(meta_path)
    if not create_meta:  # only print if user doesn't want to overwrite
        msg.good("Loaded meta.json from file", meta_path)
    else:
        meta = generate_meta(input_dir, meta, msg)
    # These keys are required to derive the package name and directory layout
    for key in ("lang", "name", "version"):
        if key not in meta or meta[key] == "":
            msg.fail(
                "No '{}' setting found in meta.json".format(key),
                "This setting is required to build your package.",
                exits=1,
            )
    model_name = meta["lang"] + "_" + meta["name"]
    model_name_v = model_name + "-" + meta["version"]
    main_path = output_path / model_name_v
    package_path = main_path / model_name
    if package_path.exists():
        if force:
            shutil.rmtree(path2str(package_path))
        else:
            msg.fail(
                "Package directory already exists",
                # BUG FIX: removed a no-op .format(path=...) call — the
                # template string had no placeholder, so the argument was
                # silently discarded.
                "Please delete the directory and try again, or use the "
                "`--force` flag to overwrite existing directories.",
                exits=1,
            )
    Path.mkdir(package_path, parents=True)
    shutil.copytree(path2str(input_path), path2str(package_path / model_name_v))
    create_file(main_path / "meta.json", srsly.json_dumps(meta, indent=2))
    create_file(main_path / "setup.py", TEMPLATE_SETUP)
    create_file(main_path / "MANIFEST.in", TEMPLATE_MANIFEST)
    create_file(package_path / "__init__.py", TEMPLATE_INIT)
    msg.good("Successfully created package '{}'".format(model_name_v), main_path)
    msg.text(
        "To build the package, run `python setup.py sdist` in this directory.")
msg.text("RUNNING " + command) wrapped_command = f"cd {directory} && {command}" pipe = subprocess.Popen( wrapped_command, shell=True, ) pipe.wait() if pipe.returncode == 0: msg.good("TEST PASSED") else: msg.fail("TEST FAILED") msg.text('') return pipe.returncode root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) # run the tests if isinstance(tests, str): returncode = run_test(tests, root_dir) elif isinstance(tests, (list, tuple)): returncode = 0 for test in tests: returncode += run_test(test, root_dir) if returncode == 0: msg.good("ALL TESTS PASSED") else: msg.fail("SOME TESTS FAILED, SEE ABOVE") sys.exit(returncode)
def speech_to_text_demo(asr: "ASROnlineAudioFrame") -> None:
    """Speech to Text (ASR) Microphone Demo.

    Lists available input devices, prompts for a device ID, then streams
    microphone audio through `asr.transcribe` and prints partial transcripts.
    Interrupt the notebook's kernel to stop the app from recording.
    """
    asr.reset()
    audio = pyaudio.PyAudio()
    # Mutable cell so the callback closure can update the silence countdown
    offset = {"count": 0}
    columns = []
    devices = []
    for idx in range(audio.get_device_count()):
        device = audio.get_device_info_by_index(idx)
        # Skip output-only devices (no input channels)
        if not device.get("maxInputChannels"):
            continue
        devices.append(idx)
        columns.append((idx, device.get("name")))
    if columns:
        msg.good("Found the following input devices!")
        msg.table(columns, header=("ID", "Devices"), divider=True)
    if devices:
        # -2 is just an impossible ID so the prompt loop runs at least once
        device_index = -2
        while device_index not in devices:
            msg.info("Please enter the device ID")
            device_index = int(input())

        def callback(in_data, frame_count, time_info, status):
            # pyaudio stream callback: decode the raw frame and transcribe it.
            # assumes 16-bit PCM input — matches format=paInt16 below
            signal = np.frombuffer(in_data, dtype=np.int16)
            text = asr.transcribe(signal)
            if text:
                print(text, end="")
                offset["count"] = asr.params.offset
            elif offset["count"] > 0:
                # Count down through silent frames; emit a space once quiet
                offset["count"] -= 1
                if offset["count"] == 0:
                    print(" ", end="")
            return (in_data, pyaudio.paContinue)

        stream = audio.open(
            input=True,
            format=pyaudio.paInt16,
            input_device_index=device_index,
            stream_callback=callback,
            channels=asr.params.channels,
            rate=asr.params.sample_rate,
            frames_per_buffer=asr.chunk_size,
        )
        msg.loading("Listening...")
        stream.start_stream()
        try:
            # Keep the main thread alive while the callback does the work
            while stream.is_active():
                time.sleep(0.1)
        except (KeyboardInterrupt, Exception) as e:
            # NOTE(review): broad `Exception` here swallows real errors from
            # the loop; the stream/audio are also only closed on this error
            # path, not on normal exit — confirm this is intentional.
            stream.stop_stream()
            stream.close()
            audio.terminate()
            msg.warn("WARNING: ASR stream stopped.", e)
    else:
        msg.fail("ERROR", "No audio input device found.")
def debug_model(
    config,
    resolved_train_config,
    nlp,
    pipe,
    *,
    print_settings: Optional[Dict[str, Any]] = None,
):
    """Analyse a pipeline component's Thinc model before/during/after training.

    config: The loaded (non-interpolated) config.
    resolved_train_config: Resolved [training] section; "train_corpus" is used
        to initialize the model with real data when possible.
    nlp: The Language pipeline the component belongs to.
    pipe: The pipeline component; must expose a `.model` that is a Thinc Model.
    print_settings: Flags controlling which steps are printed
        (print_before_training, print_after_init, print_after_training,
        print_prediction).
    """
    if not hasattr(pipe, "model"):
        msg.fail(
            f"The component '{pipe}' does not specify an object that holds a Model.",
            exits=1,
        )
    model = pipe.model
    if not isinstance(model, Model):
        msg.fail(
            f"Requires a Thinc Model to be analysed, but found {type(model)} instead.",
            exits=1,
        )
    if print_settings is None:
        print_settings = {}

    # STEP 0: Printing before training
    msg.info(f"Analysing model with ID {model.id}")
    if print_settings.get("print_before_training"):
        msg.divider("STEP 0 - before training")
        _print_model(model, print_settings)

    # STEP 1: Initializing the model and printing again.
    # Prefer the real training corpus; fall back to dummy data if resolution
    # fails (e.g. no valid 'train_corpus' in the config).
    with data_validation(False):
        try:
            dot_names = [resolved_train_config["train_corpus"]]
            with show_validation_error():
                (train_corpus,) = resolve_dot_names(config, dot_names)
                nlp.initialize(lambda: train_corpus(nlp))
            msg.info("Initialized the model with the training corpus.")
            examples = list(itertools.islice(train_corpus(nlp), 5))
        except ValueError:
            try:
                _set_output_dim(nO=7, model=model)
                with show_validation_error():
                    examples = [Example.from_dict(x, {}) for x in _get_docs()]
                    nlp.initialize(lambda: examples)
                msg.info("Initialized the model with dummy data.")
            except Exception:
                msg.fail(
                    "Could not initialize the model: you'll have to provide a valid 'train_corpus' argument in the config file.",
                    exits=1,
                )
    if print_settings.get("print_after_init"):
        msg.divider("STEP 1 - after initialization")
        _print_model(model, print_settings)

    # STEP 2: Updating the model and printing again
    set_dropout_rate(model, 0.2)
    # ugly hack to deal with Tok2Vec/Transformer listeners
    upstream_component = None
    if model.has_ref("tok2vec") and "tok2vec-listener" in model.get_ref("tok2vec").name:
        upstream_component = nlp.get_pipe("tok2vec")
    if (
        model.has_ref("tok2vec")
        and "transformer-listener" in model.get_ref("tok2vec").name
    ):
        upstream_component = nlp.get_pipe("transformer")
    for e in range(3):
        # The upstream listener source must be updated alongside the pipe
        if upstream_component:
            upstream_component.update(examples)
        pipe.update(examples)
    if print_settings.get("print_after_training"):
        msg.divider("STEP 2 - after training")
        _print_model(model, print_settings)

    # STEP 3: the final prediction
    prediction = model.predict([ex.predicted for ex in examples])
    if print_settings.get("print_prediction"):
        msg.divider("STEP 3 - prediction")
        msg.info(str(prediction))

    # BUG FIX: corrected "Succesfully" typo in the user-facing message
    msg.good("Successfully ended analysis - model looks good.")
def create_wikigraph(
    output_path: Path,
    wiki="en",
    version="latest",
    dumps_path: Path = None,
    max_workers: int = None,
    silent: bool = None,
    force: bool = None,
):
    """
    Build a `WikiGraph` from a Wikipedia dump and store it on disk.

    The result can be loaded directly, or packaged afterwards with the
    `package-wikigraph` command.

    Parameters
    ----------
    output_path : Path
        Directory the `WikiGraph` is written to (created if missing).
    wiki : str, optional
        Wikipedia dump type to use, by default "en".
    version : str, optional
        Wikipedia dump version to use, by default "latest".
    dumps_path : Path, optional
        Where previously downloaded dumps live / new ones are saved.
    max_workers : int, optional
        Maximum number of processes to use.
    silent : bool, optional
        Suppress progress output.
    force : bool, optional
        Overwrite an existing graph directory in output_path.
    """
    if not output_path.exists():
        output_path.mkdir()
        msg.good(f"Created output directory: {output_path}")
    graph_name = f"{wiki}wiki_core"
    graph_path = output_path.joinpath(graph_name)
    # Refuse to clobber a previous build unless --force was given
    if not force and graph_path.exists():
        msg.fail(
            f"Output path already contains {graph_name} directory",
            "Use --force to overwrite it",
            exits=1,
        )
    graph = WikiGraph.build(
        dumps_path=dumps_path,
        max_workers=max_workers,
        wiki=wiki,
        version=version,
        verbose=not silent,
    )
    if not graph_path.exists():
        graph_path.mkdir()
    with msg.loading("dump to disk..."):
        graph.dump(graph_path)
    # Pin compatibility to the current major.minor of spikex
    major_minor = ".".join(spikex_version.split(".")[:2])
    meta = get_meta()
    meta.update(
        {
            "name": graph_name,
            "wiki": wiki,
            "version": graph.version,
            "spikex_version": f">={major_minor}",
            "fullname": f"{graph_name}-{major_minor}",
        }
    )
    meta["sources"].append("Wikipedia")
    graph_path.joinpath("meta.json").write_text(json_dumps(meta, indent=2))
    msg.good(f"Successfully created {graph_name}.")
def init_model(
    lang,
    output_dir,
    freqs_loc=None,
    clusters_loc=None,
    jsonl_loc=None,
    vectors_loc=None,
    truncate_vectors=0,
    prune_vectors=-1,
    vectors_name=None,
    model_name=None,
    omit_extra_lookups=False,
    base_model=None,
):
    """
    Create a new model from raw data, like word frequencies, Brown clusters
    and word vectors. If vectors are provided in Word2Vec format, they can
    be either a .txt or zipped as a .zip or .tar.gz.

    Returns the created Language object after writing it to output_dir.
    """
    if jsonl_loc is not None:
        if freqs_loc is not None or clusters_loc is not None:
            # -j supersedes the deprecated -f / -c inputs; warn but continue
            # with the JSONL data.
            msg.warn(
                "Incompatible arguments",
                "The -f and -c arguments are deprecated, and not compatible "
                "with the -j argument, which should specify the same "
                "information. Either merge the frequencies and clusters data "
                "into the JSONL-formatted file (recommended), or use only the "
                "-f and -c files, without the other lexical attributes.",
            )
        jsonl_loc = ensure_path(jsonl_loc)
        lex_attrs = srsly.read_jsonl(jsonl_loc)
    else:
        clusters_loc = ensure_path(clusters_loc)
        freqs_loc = ensure_path(freqs_loc)
        if freqs_loc is not None and not freqs_loc.exists():
            msg.fail("Can't find words frequencies file", freqs_loc, exits=1)
        lex_attrs = read_attrs_from_deprecated(freqs_loc, clusters_loc)

    with msg.loading("Creating model..."):
        nlp = create_model(lang, lex_attrs, name=model_name, base_model=base_model)

    # Create empty extra lexeme tables so the data from spacy-lookups-data
    # isn't loaded if these features are accessed
    if omit_extra_lookups:
        nlp.vocab.lookups_extra = Lookups()
        nlp.vocab.lookups_extra.add_table("lexeme_cluster")
        nlp.vocab.lookups_extra.add_table("lexeme_prob")
        nlp.vocab.lookups_extra.add_table("lexeme_settings")

    msg.good("Successfully created model")
    if vectors_loc is not None:
        add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, vectors_name)
    vec_added = len(nlp.vocab.vectors)
    lex_added = len(nlp.vocab)
    # BUG FIX: corrected "Sucessfully" typo in the user-facing message; also
    # removed the dead `settings` list that was built but never used.
    msg.good(
        "Successfully compiled vocab",
        "{} entries, {} vectors".format(lex_added, vec_added),
    )
    if not output_dir.exists():
        output_dir.mkdir()
    nlp.to_disk(output_dir)
    return nlp
help='Path to save answers', default='answers.jsonl') return parser if __name__ == '__main__': parser = create_argparser() args = parser.parse_args() if args.pre_trained_model_path: with msg.loading("Loading pre trained model"): with open(args.pre_trained_model_path, "rb") as f: covid_q = pickle.load(f) msg.good(f"Loaded {args.pre_trained_model_path}") else: msg.text("Training new model") covid_q = train() msg.good("Trained") with open("models/covid_q.pkl", "wb") as f: pickle.dump(covid_q, f) challenge_tasks = [{ "task": "What is known about transmission, incubation, and environmental stability?", "questions": [ "Is the virus transmitted by aerosol, droplets, food, close contact, fecal matter, or water?", "How long is the incubation period for the virus?", "Can the virus be transmitted asymptomatically or during the incubation period?", "How does weather, heat, and humidity affect the tramsmission of 2019-nCoV?",
def link(origin, link_name, force=False, model_path=None):
    """
    Create a symlink for models within the spacy/data directory. Accepts
    either the name of a pip package, or the local path to the model data
    directory. Linking models allows loading them via spacy.load(link_name).

    origin: Package name or path to the model data directory.
    link_name: Name of the symlink to create under spacy/data.
    force: Overwrite an existing symlink of the same name.
    model_path: Explicit model path, used when origin is not a package.
    """
    if util.is_package(origin):
        model_path = util.get_package_path(origin)
    else:
        model_path = Path(origin) if model_path is None else Path(model_path)
    if not model_path.exists():
        msg.fail(
            "Can't locate model data",
            "The data should be located in {}".format(path2str(model_path)),
            exits=1,
        )
    data_path = util.get_data_path()
    if not data_path or not data_path.exists():
        spacy_loc = Path(__file__).parent.parent
        msg.fail(
            "Can't find the spaCy data path to create model symlink",
            # BUG FIX: the original string had no placeholder, so
            # .format(path=spacy_loc) was a no-op and the path was never shown.
            "Make sure a directory `/data` exists within your spaCy "
            "installation and try again. The data directory should be located "
            "here: {path}".format(path=path2str(spacy_loc)),
            exits=1,
        )
    link_path = util.get_data_path() / link_name
    if link_path.is_symlink() and not force:
        msg.fail(
            "Link '{}' already exists".format(link_name),
            "To overwrite an existing link, use the --force flag",
            exits=1,
        )
    elif link_path.is_symlink():  # does a symlink exist?
        # NB: It's important to check for is_symlink here and not for exists,
        # because invalid/outdated symlinks would return False otherwise.
        link_path.unlink()
    elif link_path.exists():  # does it exist otherwise?
        # NB: Check this last because valid symlinks also "exist".
        msg.fail(
            "Can't overwrite symlink '{}'".format(link_name),
            "This can happen if your data directory contains a directory or "
            "file of the same name.",
            exits=1,
        )
    details = "%s --> %s" % (path2str(model_path), path2str(link_path))
    try:
        symlink_to(link_path, model_path)
    except:  # noqa: E722
        # This is quite dirty, but just making sure other errors are caught.
        msg.fail(
            "Couldn't link model to '{}'".format(link_name),
            "Creating a symlink in spacy/data failed. Make sure you have the "
            "required permissions and try re-running the command as admin, or "
            "use a virtualenv. You can still import the model as a module and "
            "call its load() method, or create the symlink manually.",
        )
        msg.text(details)
        raise
    msg.good("Linking successful", details)
    msg.text("You can now load the model via spacy.load('{}')".format(link_name))
# Scrape the NIH ExPORTER page for links to the RePORTER CSV archives.
sess = requests.session()
base_url = "https://exporter.nih.gov/"

# Get the links from the target webpage
# NOTE(review): `url_exporter` is defined elsewhere in this file — presumably
# the ExPORTER catalog page; confirm against the surrounding module.
r = sess.get(url_exporter)
assert r.ok
soup = bs4.BeautifulSoup(r.content, "lxml")

# Filter for only our matches
target = "CSVs/final/RePORTER_"
save_links = [a["href"] for a in soup.find_all("a", href=True)]
save_links = [link for link in save_links if target in link]
msg.good(f"Found {len(save_links)} links in Exporter")


def download(f0, f1):
    """Download the file at relative URL `f0` and save it to local path `f1`.

    On any HTTP failure the response is printed and the whole script exits.
    """
    url = base_url + f0
    r = sess.get(url)
    if not r.ok:
        # Dump the response body/status to help diagnose the failure,
        # then abort the script entirely — partial downloads are useless here.
        print(r.content)
        print(r.status_code)
        msg.fail(f"Failed {url}")
        exit()
    with open(f1, "wb") as FOUT:
        FOUT.write(r.content)
def main(
    # fmt: off
    glove_dir: str = typer.Argument(..., help="Directory containing the GloVe build"),
    in_file: str = typer.Argument(..., help="Input file (shuffled cooccurrences)"),
    vocab_file: str = typer.Argument(..., help="Vocabulary file"),
    out_dir: str = typer.Argument(..., help="Path to output directory"),
    n_threads: int = typer.Option(8, "--n-threads", "-t", help="Number of threads"),
    n_iter: int = typer.Option(15, "--n-iter", "-n", help="Number of iterations"),
    x_max: int = typer.Option(10, "--x-max", "-x", help="Parameter specifying cutoff in weighting function"),
    vector_size: int = typer.Option(128, "--vector-size", "-s", help="Dimension of word vector representations"),
    verbose: int = typer.Option(2, "--verbose", "-v", help="Set verbosity: 0, 1, or 2"),
    # fmt: on
):
    """
    Step 4: Train the vectors

    Expects a file containing the shuffled cooccurrences and a vocab file and
    will output a plain-text vectors file. Note that this script will call
    into GloVe and expects you to pass in the GloVe build directory (/build
    if you run the Makefile). The commands will also be printed if you want
    to run them separately.
    """
    output_path = Path(out_dir)
    # Validate every required input path up front, in a single pass
    for required, error in (
        (glove_dir, "Can't find GloVe build directory"),
        (in_file, "Can't find input file"),
        (vocab_file, "Can't find vocab file"),
    ):
        if not Path(required).exists():
            msg.fail(error, required, exits=1)
    if not output_path.exists():
        output_path.mkdir(parents=True)
        msg.good(f"Created output directory {out_dir}")
    output_file = output_path / f"vectors_glove_{vector_size}dim"
    msg.info("Training vectors")
    # Assemble the GloVe invocation from its individual flags
    cmd = " ".join(
        [
            f"{glove_dir}/glove",
            f"-save-file {output_file}",
            f"-threads {n_threads}",
            f"-input-file {in_file}",
            f"-x-max {x_max}",
            f"-iter {n_iter}",
            f"-vector-size {vector_size}",
            "-binary 0",
            f"-vocab-file {vocab_file}",
            f"-verbose {verbose}",
        ]
    )
    print(cmd)
    if os.system(cmd) != 0:
        msg.fail("Failed training vectors", exits=1)
    msg.good("Successfully trained vectors")
stats["filtered_out"] += 1 continue stats["kept_articles"] += 1 # Concatenate item = {"meta": {}} item["text"] = "\n".join([row["title"], row["abstract"]]) # Build the meta information for k in meta_columns: item["meta"][k] = row[k] # PMID should always exist and be an integer item["meta"]["pmid"] = int(item["meta"]["pmid"]) # Save to the master file FOUT.write(item) P = Pipe(source="data/baseline/parsed/", input_suffix=".jsonl", shuffle=True) P(compute, 1) msg.good(f"Saved to {f_save}") msg.info( f"Saved {stats['kept_articles']:,}, filtered {stats['filtered_out']:,} articles that overlapped in PMC" ) filesize = Path(f_save).stat().st_size msg.info(f"Compressed filesize {filesize:,}")
def pretrain_cli(
    # fmt: off
    ctx: typer.Context,  # This is only used to read additional arguments
    config_path: Path = Arg(..., help="Path to config file", exists=True, dir_okay=False, allow_dash=True),
    output_dir: Path = Arg(..., help="Directory to write weights to on each epoch"),
    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
    resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
    epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using --resume-path. Prevents unintended overwriting of existing weight files."),
    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
    # fmt: on
):
    """
    Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components,
    using an approximate language-modelling objective. Two objective types
    are available, vector-based and character-based.

    In the vector-based objective, we load word vectors that have been trained
    using a word2vec-style distributional similarity algorithm, and train a
    component like a CNN, BiLSTM, etc to predict vectors which match the
    pretrained ones. The weights are saved to a directory after each epoch. You
    can then pass a path to one of these pretrained weights files to the
    'spacy train' command.

    This technique may be especially helpful if you have little labelled data.
    However, it's still quite experimental, so your mileage may vary.

    To load the weights back in during 'spacy train', you need to ensure
    all settings are the same between pretraining and training. Ideally,
    this is done by using the same config file for both commands.

    DOCS: https://spacy.io/api/cli#pretrain
    """
    # Extra CLI args become config overrides (e.g. --training.batch_size 128)
    config_overrides = parse_config_overrides(ctx.args)
    import_code(code_path)
    verify_cli_args(config_path, output_dir, resume_path, epoch_resume)
    setup_gpu(use_gpu)
    msg.info(f"Loading config from: {config_path}")
    with show_validation_error(config_path):
        # Load non-interpolated so the raw config (with variables intact)
        # can be saved alongside the weights below
        raw_config = load_config(
            config_path, overrides=config_overrides, interpolate=False
        )
    config = raw_config.interpolate()
    if not config.get("pretraining"):
        # TODO: What's the solution here? How do we handle optional blocks?
        msg.fail("The [pretraining] block in your config is empty", exits=1)
    if not output_dir.exists():
        output_dir.mkdir(parents=True)
        msg.good(f"Created output directory: {output_dir}")
    # Save non-interpolated config
    raw_config.to_disk(output_dir / "config.cfg")
    msg.good("Saved config file in the output directory")
    pretrain(
        config,
        output_dir,
        resume_path=resume_path,
        epoch_resume=epoch_resume,
        use_gpu=use_gpu,
        silent=False,
    )
    msg.good("Successfully finished pretrain")
def game_loop(num_players, ai_file=None):
    """Run an interactive dice game ('roll or hold') to a target score of 100.

    num_players: 1 plays against a loaded AI policy, otherwise two humans.
    ai_file: Path to the policy used for player 2 in single-player mode.
    """
    p1 = create_player(1)
    if num_players == 1:
        # Single-player: player 2 decides via the loaded policy
        with open_policy(ai_file) as policy:
            p2 = create_player(2, policy)
    else:
        p2 = create_player(2)
    # Coin flip for who goes first
    player_sequence = [p1, p2] if randrange(2) else [p2, p1]
    msg.text(f"{player_sequence[0].name} goes first.")
    # game = [current player's banked score, opponent's banked score,
    #         current round total]
    game = [0, 0, 0]
    state = None
    player, opp = player_sequence
    while 1:  # one iteration per turn
        while 1:  # one iteration per roll/hold decision within the turn
            msg.text(
                f"{player.name} total score is {game[0]} and current round total is {game[2]}. "
                + f"\n{opp.name} score is {game[1]}")
            if player.decision_function is None:
                # Human player: read the move from stdin
                msg.text(
                    f"{player.name}, please choose your move (R to roll or H to hold): "
                )
                move = input()
            else:
                # AI player: policy decides from the current scores
                move = player.decision_function(*game)
            if move in ('r', 'R', 'roll'):
                roll = randrange(1, 7)
                if roll > 1:
                    msg.text(
                        f"{player.name} rolled a {roll}. {player.name} current round total "
                        + f"is {roll + game[2]}")
                    game[2] += roll
                else:
                    # Rolling a 1 forfeits the round total and ends the turn
                    msg.text(
                        f"{player.name} rolled a {roll} and lost all his points!"
                    )
                    break
            elif move in ('h', 'H', 'hold'):
                # Holding banks the round total into the player's score
                msg.text(
                    f"{player.name} held. Adding {game[2]} to {game[0]} for "
                    + f"{player.name} total of {game[0]+game[2]}.")
                game[0] += game[2]
                break
            elif move in ('x', 'X', 'q', 'Q', 'quit', 'end'):
                state = const.EXIT
                break
        if state is const.EXIT:
            msg.text("Exiting...")
            break
        if game[0] >= 100:
            msg.good(f"{player.name} wins!")
            break
        # Swap perspective for the next turn: opponent becomes current player,
        # round total resets
        game[0], game[1], game[2] = game[1], game[0], 0
        player, opp = opp, player
def main(glove_dir, in_dir, out_dir, min_count=5, memory=4.0, window_size=15, verbose=2):
    """
    Step 3: Build vocabulary and frequency counts

    Expects a directory of preprocessed .s2v input files and will use GloVe to
    collect unigram counts and construct and shuffle cooccurrence data. See
    here for installation instructions:
    https://github.com/stanfordnlp/GloVe

    Note that this script will call into GloVe and expects you to pass in the
    GloVe build directory (/build if you run the Makefile). The commands will
    also be printed if you want to run them separately.
    """
    input_path = Path(in_dir)
    output_path = Path(out_dir)
    if not Path(glove_dir).exists():
        msg.fail("Can't find GloVe build directory", glove_dir, exits=1)
    if not input_path.exists() or not input_path.is_dir():
        msg.fail("Not a valid input directory", in_dir, exits=1)
    input_files = [str(fp) for fp in input_path.iterdir() if fp.suffix == ".s2v"]
    if not input_files:
        msg.fail("No .s2v files found in input directory", in_dir, exits=1)
    msg.info(f"Using {len(input_files)} input files")
    if not output_path.exists():
        output_path.mkdir(parents=True)
        msg.good(f"Created output directory {out_dir}")
    vocab_file = output_path / "vocab.txt"
    cooc_file = output_path / "cooccurrence.bin"
    cooc_shuffle_file = output_path / "cooccurrence.shuf.bin"
    msg.info("Creating vocabulary counts")
    cmd = (f"cat {' '.join(input_files)} | {glove_dir}/vocab_count "
           f"-min-count {min_count} -verbose {verbose} > {vocab_file}")
    print(cmd)
    vocab_cmd = os.system(cmd)
    if vocab_cmd != 0 or not Path(vocab_file).exists():
        msg.fail("Failed creating vocab counts", exits=1)
    msg.good("Created vocab counts", vocab_file)
    msg.info("Creating cooccurrence statistics")
    # BUG FIX: the final fragment of this command was a plain string, so
    # "{window_size}" and "{cooc_file}" were passed to the shell literally
    # instead of being interpolated — restored the missing f-prefix.
    cmd = (
        f"cat {' '.join(input_files)} | {glove_dir}/cooccur -memory {memory} "
        f"-vocab-file {vocab_file} -verbose {verbose} "
        f"-window-size {window_size} > {cooc_file}")
    print(cmd)
    cooccur_cmd = os.system(cmd)
    if cooccur_cmd != 0 or not Path(cooc_file).exists():
        msg.fail("Failed creating cooccurrence statistics", exits=1)
    msg.good("Created cooccurrence statistics", cooc_file)
    msg.info("Shuffling cooccurrence file")
    cmd = (f"{glove_dir}/shuffle -memory {memory} -verbose {verbose} "
           f"< {cooc_file} > {cooc_shuffle_file}")
    print(cmd)
    shuffle_cmd = os.system(cmd)
    if shuffle_cmd != 0 or not Path(cooc_shuffle_file).exists():
        msg.fail("Failed to shuffle cooccurrence file", exits=1)
    msg.good("Shuffled cooccurrence file", cooc_shuffle_file)
def train_cli(
    # fmt: off
    ctx: typer.Context,  # This is only used to read additional arguments
    config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
    output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store trained pipeline in"),
    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
    # fmt: on
):
    """
    Train or update a spaCy pipeline. Requires data in spaCy's binary format. To
    convert data from other formats, use the `spacy convert` command. The
    config file includes all settings and hyperparameters used during training.
    To override settings in the config, e.g. settings that point to local
    paths or that you want to experiment with, you can override them as
    command line options. For instance, --training.batch_size 128 overrides
    the value of "batch_size" in the block "[training]". The --code argument
    lets you pass in a Python file that's imported before training. It can be
    used to register custom functions and architectures that can then be
    referenced in the config.

    DOCS: https://spacy.io/api/cli#train
    """
    log_level = logging.DEBUG if verbose else logging.INFO
    util.logger.setLevel(log_level)
    # Make sure all files and paths exists if they are needed
    config_missing = str(config_path) != "-" and not config_path.exists()
    if not config_path or config_missing:
        msg.fail("Config file not found", config_path, exits=1)
    if output_path is not None and not output_path.exists():
        output_path.mkdir(parents=True)
        msg.good(f"Created output directory: {output_path}")
    # Extra CLI args are treated as config overrides
    config_overrides = parse_config_overrides(ctx.args)
    import_code(code_path)
    setup_gpu(use_gpu)
    with show_validation_error(config_path):
        config = util.load_config(
            config_path, overrides=config_overrides, interpolate=False
        )
    msg.divider("Initializing pipeline")
    with show_validation_error(config_path, hint_fill=False):
        nlp = init_nlp(config, use_gpu=use_gpu)
    msg.good("Initialized pipeline")
    msg.divider("Training pipeline")
    train(nlp, output_path, use_gpu=use_gpu, stdout=sys.stdout, stderr=sys.stderr)
def build(
    # fmt: off
    repo: str,
    commit: str,
    package_name: str = Option(None, help="Package name (if different from repo)"),
    py35: bool = Option(False, "--py35", help="Build wheels for Python 3.5"),
    llvm: bool = Option(False, "--llvm", help="Requires LLVM to be installed"),
    universal: bool = Option(
        False, "--universal",
        help="Build universal (pure Python) wheel and sdist"),
    skip_tests: bool = Option(
        False, "--skip-tests",
        help="Don't run tests (e.g. if package doesn't have any)"),
    build_constraints: bool = Option(
        False, "--build-constraints",
        help="Use build constraints for build requirements"),
    # fmt: on
):
    """Build wheels for a given repo and commit / tag."""
    print(LOGO)
    repo_id = get_repo_id()
    repo_id = repo_id.replace(".git", "")
    # "user/package" — package name may still be overridden via --package-name
    user, package = repo.lower().split("/", 1)
    if package_name is None:
        package_name = package.replace("-", "_")
    msg.info(f"Building in repo {repo_id}")
    msg.info(f"Building wheels for {user}/{package}\n")
    if universal:
        msg.warn(
            "Building only universal sdist and wheel, no cross-platform wheels"
        )
    if skip_tests:
        msg.warn("Not running any tests")
    clone_url = DEFAULT_CLONE_TEMPLATE.format(f"{user}/{package}")
    repo = get_gh().get_repo(repo_id)
    with msg.loading("Finding a unique name for this release..."):
        # Pick the release_name by finding an unused one
        i = 1
        while True:
            release_name = f"{package_name}-{commit}"
            if i > 1:
                release_name += f"-{i}"
            try:
                repo.get_release(release_name)
            except github.UnknownObjectException:
                # Name is free — use it
                break
            i += 1
    branch_name = f"branch-for-{release_name}"
    # Build spec consumed by the CI build; uploaded as build-spec.json below
    bs = {
        "clone-url": clone_url,
        "package-name": package_name,
        "commit": commit,
        "options": {
            "llvm": llvm,
            "py35": py35,
            "universal": universal,
            "skip_tests": skip_tests,
            "build_constraints": build_constraints,
        },
        "upload-to": {
            "type": "github-release",
            "repo-id": repo_id,
            "release-id": release_name,
        },
    }
    bs_json = json.dumps(bs)
    bs_json_formatted = json.dumps(bs, indent=4)
    msg.text(f"Creating release {release_name} to collect assets")
    release_text = f"https://github.com/{user}/{package}\n\n### Build spec\n\n```json\n{bs_json_formatted}\n```"
    release = repo.create_git_release(release_name, release_name, release_text)
    with msg.loading("Creating build branch..."):
        # 'master' is a 'Commit'. 'master.commit' is a 'GitCommit'. These are
        # different types that are mostly *not* interchangeable:
        # https://pygithub.readthedocs.io/en/latest/github_objects/Commit.html
        # https://pygithub.readthedocs.io/en/latest/github_objects/GitCommit.html
        master = repo.get_commit("master")
        master_gitcommit = master.commit
        patch = github.InputGitTreeElement(
            "build-spec.json",
            "100644",
            "blob",
            content=bs_json,
        )
        tree = repo.create_git_tree([patch], master_gitcommit.tree)
        our_gitcommit = repo.create_git_commit(
            f"Building: {release_name}", tree, [master_gitcommit]
        )
        repo.create_git_ref(f"refs/heads/{branch_name}", our_gitcommit.sha)
    msg.good(f"Commit is {our_gitcommit.sha[:8]} in branch {branch_name}")
    msg.text(f"Release: {release.html_url}")
    msg.text(
        f"Checks: https://github.com/{repo_id}/commit/{our_gitcommit.sha}/checks"
    )
def main(path_file: str = typer.Argument(..., help='Path to input file')):
    """
    Database cleaning.

    This script contains all the functions needed to clean and transform
    the database into the most suitable version for the project. The
    goals are:

    * Clean up the fields containing text with spelling errors or
      characters unsuitable for processing.
    * Set an appropriate data type for working with other files.
    * Compact/reorganize the columns into a format that allows better
      visualization and organization of the monthly-report information.
    """
    # Load data (latin1-encoded CSV; column dtypes come from SCHEMA,
    # defined elsewhere in this file).
    msg.info("Load data...")
    data = pd.read_csv(path_file,
                       encoding='latin1',
                       low_memory=False,
                       dtype=SCHEMA)
    msg.info("General Information:\n")
    # NOTE(review): DataFrame.info() prints to stdout and returns None,
    # so msg.info() here receives None — harmless but probably unintended.
    msg.info(data.info())
    # Remove rows with NIVEL == FID
    msg.good("Remove FID...")
    data = data[data.DESC_NIVEL != 'FID'].copy()
    # Clean the description fields (DESC_*) with the cln_txt helper.
    msg.info("Cleaning DESC_ ...")
    data.DESC_RAMO = data.DESC_RAMO.apply(lambda x: cln_txt(str(x)))
    data.DESC_UR = data.DESC_UR.apply(lambda x: cln_txt(str(x)))
    data.DESC_AI = data.DESC_AI.apply(lambda x: cln_txt(str(x)))
    data.DESC_PP = data.DESC_PP.apply(lambda x: cln_txt(str(x)))
    data.OBJETIVO_PND = data.OBJETIVO_PND.apply(lambda x: cln_txt(str(x)))
    data.DESC_OBJETIVO_PROGRAMA_PND = data.DESC_OBJETIVO_PROGRAMA_PND.apply(
        lambda x: cln_txt(str(x)))
    data.OBJETIVO_ESTRATEGICO = data.OBJETIVO_ESTRATEGICO.apply(
        lambda x: cln_txt(str(x)))
    data.DESC_MATRIZ = data.DESC_MATRIZ.apply(lambda x: cln_txt(str(x)))
    data.DESC_OBJETIVO = data.DESC_OBJETIVO.apply(lambda x: cln_txt(str(x)))
    # Normalize wrong category names via mapping dicts (values not mapped
    # by DTIPO_INDICADOR / DDIMENSION become NaN — Series.map semantics).
    msg.info("Changes names...")
    data.TIPO_INDICADOR = data.TIPO_INDICADOR.map(DTIPO_INDICADOR)
    data.DIMENSION = data.DIMENSION.map(DDIMENSION)
    # Fix data types; -1 is used as the sentinel for missing IDs/years.
    msg.info("Change data type...")
    data.ID_OBJETIVO = data.ID_OBJETIVO.astype('int')
    data.ID_OBJETIVO_PADRE = data.ID_OBJETIVO_PADRE.fillna(-1).astype('int')
    data.ID_INDICADOR_CICLO_ANTERIOR = data.ID_INDICADOR_CICLO_ANTERIOR.fillna(
        -1).astype('int')
    data.CICLO_LINEA_BASE = data.CICLO_LINEA_BASE.fillna(-1).astype('int')
    # Collect the per-month column groups (by prefix) so they can be
    # compacted into single columns and then dropped.
    msg.info("Create List of Columns...")
    META_MES_COL = data.columns[data.columns.str.startswith(
        'META_MES')].tolist()
    META_AJUSTADA_MES_COL = data.columns[data.columns.str.startswith(
        'META_AJUSTADA_MES')].tolist()
    AVANCE_MES_COL = data.columns[data.columns.str.startswith(
        'AVANCE_MES')].tolist()
    JUSTIFICACION_AJUSTE_MES_COL = data.columns[data.columns.str.startswith(
        'JUSTIFICACION_AJUSTE_MES')].tolist()
    AVANCE_CAUSA_MES_COL = data.columns[data.columns.str.startswith(
        'AVANCE_CAUSA_MES')].tolist()
    AVANCE_EFECTO_MES_COL = data.columns[data.columns.str.startswith(
        'AVANCE_EFECTO_MES')].tolist()
    AVANCE_OTROS_MOTIVOS_MES_COL = data.columns[data.columns.str.startswith(
        'AVANCE_OTROS_MOTIVOS_MES')].tolist()
    # META by months: pack value/numerator/denominator as "v:num:den".
    msg.info("Meta by months...")
    for i in range(12):
        data[f'RECORDS_META_MES{i+1}']=(data[f'META_MES{i+1}'].astype('string')+':'\
            +data[f'META_MES{i+1}_NUM'].astype('string')+':'+data[f'META_MES{i+1}_DEN']\
            .astype('string'))
    # META AJUSTADA by months.
    # NOTE(review): this loop reads META_MES{i} columns, not
    # META_AJUSTADA_MES{i} — looks like a copy-paste bug, so
    # RECORDS_META_AJUSTADA_* duplicates RECORDS_META_*. Confirm the
    # intended source columns (and that *_NUM/*_DEN variants exist)
    # before fixing.
    msg.info("Meta Ajustada by months...")
    for i in range(12):
        data[f'RECORDS_META_AJUSTADA_MES{i+1}']=(data[f'META_MES{i+1}'].astype('string')\
            +':'+data[f'META_MES{i+1}_NUM'].astype('string')+':'+data[f'META_MES{i+1}_DEN']\
            .astype('string'))
    # AVANCE by months.
    # NOTE(review): same suspected copy-paste bug — reads META_MES{i}
    # instead of AVANCE_MES{i}; verify against the real column layout.
    msg.info("AVANCE by months...")
    for i in range(12):
        data[f'RECORDS_AVANCE_MES{i+1}']=(data[f'META_MES{i+1}'].astype('string')+':'+\
            data[f'META_MES{i+1}_NUM'].astype('string')+':'+data[f'META_MES{i+1}_DEN']\
            .astype('string'))
    # JUSTIFICACION by months: join the 12 monthly columns with '|'
    # ('#' marks a missing month).
    msg.info("JUSTIFICACION by months...")
    func = '|'.join
    data['JUSTIFICACIONES_AJUSTE_POR_MES']=data[JUSTIFICACION_AJUSTE_MES_COL]\
        .fillna('#').astype('str').apply(lambda x:func(x),axis=1)
    # AVANCE CAUSA by months (same '|'-join compaction).
    msg.info("AVANCE CAUSA by months...")
    data['AVANCE_CAUSA_POR_MES']=data[AVANCE_CAUSA_MES_COL].fillna('#').astype('str')\
        .apply(lambda x:func(x),axis=1)
    # AVANCE EFECTO by months.
    msg.info("AVANCE EFECTO by months...")
    data['AVANCE_EFECTO_POR_MES']=data[AVANCE_EFECTO_MES_COL].fillna('#').astype('str')\
        .apply(lambda x:func(x),axis=1)
    # AVANCE OTROS MOTIVOS by months.
    msg.info("AVANCE OTROS MOTIVOs by months...")
    data['AVANCE_OTROS_MOTIVOS_POR_MES']=data[AVANCE_OTROS_MOTIVOS_MES_COL].fillna('#')\
        .astype('str').apply(lambda x:func(x),axis=1)
    # Drop the original per-month columns now that they are compacted.
    msg.info("delete columns")
    data.drop(labels=META_MES_COL + META_AJUSTADA_MES_COL + AVANCE_MES_COL,
              inplace=True,
              axis=1)
    data.drop(labels=JUSTIFICACION_AJUSTE_MES_COL + AVANCE_CAUSA_MES_COL +
              AVANCE_EFECTO_MES_COL + AVANCE_OTROS_MOTIVOS_MES_COL,
              inplace=True,
              axis=1)
    msg.info("General Information:\n")
    data.info()
    # Save the result.
    # NOTE(review): writes 'base' into the current working directory, not
    # next to the input file — confirm that is intended.
    msg.info("Save the Files...")
    data.reset_index().to_feather('base')  # For the feather version
    #data.to_csv('base.csv.zip',encoding='latin1', index=False,compression='zip')# To save a csv version
    msg.good("OK!!!")
def download(model, direct=False, *pip_args):
    """
    Download compatible model from default download path using pip. Model
    can be shortcut, model name or, if --direct flag is set, full model name
    with version. For direct downloads, the compatibility check will be
    skipped.

    model (str): shortcut (e.g. "en"), package name, or "name-version"
        when `direct` is True.
    direct (bool): skip shortcut resolution and compatibility check.
    *pip_args: extra arguments forwarded to pip.

    Exits the process with pip's return code if the download fails.
    """
    if not require_package("spacy") and "--no-deps" not in pip_args:
        msg.warn(
            "Skipping model package dependencies and setting `--no-deps`. "
            "You don't seem to have the spaCy package itself installed "
            "(maybe because you've built from source?), so installing the "
            "model dependencies would cause spaCy to be downloaded, which "
            "probably isn't what you want. If the model package has other "
            "dependencies, you'll have to install them manually.")
        pip_args = pip_args + ("--no-deps", )
    dl_tpl = "{m}-{v}/{m}-{v}.tar.gz#egg={m}=={v}"
    if direct:
        # "name-version" -> (name, version); no compatibility check.
        # NOTE(review): "".join(components[:-1]) concatenates without a
        # separator — only correct while model names themselves contain
        # no hyphens (they use underscores); verify if that ever changes.
        components = model.split("-")
        model_name = "".join(components[:-1])
        version = components[-1]
        dl = download_model(dl_tpl.format(m=model_name, v=version), pip_args)
    else:
        # Resolve shortcut (e.g. "en" -> "en_core_web_sm") and pick the
        # version compatible with the installed spaCy.
        shortcuts = get_json(about.__shortcuts__, "available shortcuts")
        model_name = shortcuts.get(model, model)
        compatibility = get_compatibility()
        version = get_version(model_name, compatibility)
        dl = download_model(dl_tpl.format(m=model_name, v=version), pip_args)
    if dl != 0:  # if download subprocess doesn't return 0, exit
        sys.exit(dl)
    msg.good(
        "Download and installation successful",
        "You can now load the model via spacy.load('{}')".format(model_name),
    )
    # Only create symlink if the model is installed via a shortcut like 'en'.
    # There's no real advantage over an additional symlink for en_core_web_sm
    # and if anything, it's more error prone and causes more confusion.
    # BUGFIX: `shortcuts` is only bound in the non-direct branch above, so
    # the original unconditional `model in shortcuts` raised NameError for
    # direct downloads. Guard on `not direct` (direct downloads never use
    # shortcuts anyway, so behavior for the shortcut path is unchanged).
    if not direct and model in shortcuts:
        try:
            # Get package path here because link uses
            # pip.get_installed_distributions() to check if model is a
            # package, which fails if model was just installed via
            # subprocess
            package_path = get_package_path(model_name)
            link(model_name, model, force=True, model_path=package_path)
        except:  # noqa: E722
            # Dirty, but since spacy.download and the auto-linking is
            # mostly a convenience wrapper, it's best to show a success
            # message and loading instructions, even if linking fails.
            msg.warn(
                "Download successful but linking failed",
                "Creating a shortcut link for '{}' didn't work (maybe you "
                "don't have admin permissions?), but you can still load "
                "the model via its full package name: "
                "nlp = spacy.load('{}')".format(model, model_name),
            )
    # If a model is downloaded and then loaded within the same process, our
    # is_package check currently fails, because pkg_resources.working_set
    # is not refreshed automatically (see #3923). We're trying to work
    # around this here be requiring the package explicitly.
    require_package(model_name)
def init_vectors_cli(
    # fmt: off
    lang: str = Arg(..., help="The language of the nlp object to create"),
    vectors_loc: Path = Arg(..., help="Vectors file in Word2Vec format", exists=True),
    output_dir: Path = Arg(..., help="Pipeline output directory"),
    prune: int = Opt(-1, "--prune", "-p", help="Optional number of vectors to prune to"),
    truncate: int = Opt(
        0, "--truncate", "-t",
        help=
        "Optional number of vectors to truncate to when reading in vectors file"
    ),
    mode: str = Opt("default", "--mode", "-m", help="Vectors mode: default or floret"),
    name: Optional[str] = Opt(
        None, "--name", "-n",
        help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"
    ),
    verbose: bool = Opt(
        False, "--verbose", "-V", "-VV",
        help="Display more information for debugging purposes"),
    jsonl_loc: Optional[Path] = Opt(
        None, "--lexemes-jsonl", "-j",
        help="Location of JSONL-formatted attributes file", hidden=True),
    # fmt: on
):
    """Convert word vectors for use with spaCy. Will export an nlp object that
    you can use in the [initialize] block of your config to initialize a model
    with vectors.
    """
    # Verbose flag controls the shared spaCy logger's level.
    log_level = logging.DEBUG if verbose else logging.INFO
    util.logger.setLevel(log_level)
    msg.info(f"Creating blank nlp object for language '{lang}'")
    # Start from a blank pipeline for the requested language.
    nlp = util.get_lang_class(lang)()
    # Optionally seed the vocab with lexeme attributes before converting.
    if jsonl_loc is not None:
        update_lexemes(nlp, jsonl_loc)
    convert_vectors(
        nlp,
        vectors_loc,
        truncate=truncate,
        prune=prune,
        name=name,
        mode=mode,
    )
    converted_count = len(nlp.vocab.vectors)
    msg.good(f"Successfully converted {converted_count} vectors")
    # Serialize the nlp object (with its vectors) to the output directory.
    nlp.to_disk(output_dir)
    msg.good(
        "Saved nlp object with vectors to output directory. You can now use the "
        "path to it in your config as the 'vectors' setting in [initialize].",
        output_dir.resolve(),
    )
def debug_data(
    config_path: Path,
    *,
    # NOTE(review): mutable default `{}` — safe only as long as no caller
    # mutates it; consider `None` + fallback. Left unchanged here.
    config_overrides: Dict[str, Any] = {},
    ignore_warnings: bool = False,
    verbose: bool = False,
    no_format: bool = True,
    silent: bool = True,
):
    """Analyze and validate the training and development data referenced by
    a training config.

    Loads the config and pipeline, compiles gold-standard stats from the
    train/dev corpora, and prints per-component diagnostics (NER, textcat,
    textcat_multilabel, tagger, morphologizer, parser). Exits with status 1
    if any check fails.
    """
    msg = Printer(
        no_print=silent, pretty=not no_format, ignore_warnings=ignore_warnings
    )
    # Make sure all files and paths exists if they are needed
    with show_validation_error(config_path):
        cfg = util.load_config(config_path, overrides=config_overrides)
        nlp = util.load_model_from_config(cfg)
        config = nlp.config.interpolate()
    T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
    # Use original config here, not resolved version
    sourced_components = get_sourced_components(cfg)
    frozen_components = T["frozen_components"]
    resume_components = [p for p in sourced_components if p not in frozen_components]
    pipeline = nlp.pipe_names
    factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names]
    msg.divider("Data file validation")
    # Create the gold corpus to be able to better analyze data
    dot_names = [T["train_corpus"], T["dev_corpus"]]
    train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
    nlp.initialize(lambda: train_corpus(nlp))
    msg.good("Pipeline can be initialized with data")
    train_dataset = list(train_corpus(nlp))
    dev_dataset = list(dev_corpus(nlp))
    msg.good("Corpus is loadable")
    # Create all gold data here to avoid iterating over the train_dataset constantly
    gold_train_data = _compile_gold(train_dataset, factory_names, nlp, make_proj=True)
    gold_train_unpreprocessed_data = _compile_gold(
        train_dataset, factory_names, nlp, make_proj=False
    )
    gold_dev_data = _compile_gold(dev_dataset, factory_names, nlp, make_proj=True)
    train_texts = gold_train_data["texts"]
    dev_texts = gold_dev_data["texts"]
    frozen_components = T["frozen_components"]
    msg.divider("Training stats")
    msg.text(f"Language: {nlp.lang}")
    msg.text(f"Training pipeline: {', '.join(pipeline)}")
    if resume_components:
        msg.text(f"Components from other pipelines: {', '.join(resume_components)}")
    if frozen_components:
        msg.text(f"Frozen components: {', '.join(frozen_components)}")
    msg.text(f"{len(train_dataset)} training docs")
    msg.text(f"{len(dev_dataset)} evaluation docs")
    if not len(gold_dev_data):
        msg.fail("No evaluation docs")
    # Train/dev leakage check: identical texts in both sets.
    overlap = len(train_texts.intersection(dev_texts))
    if overlap:
        msg.warn(f"{overlap} training examples also in evaluation data")
    else:
        msg.good("No overlap between training and evaluation data")
    # TODO: make this feedback more fine-grained and report on updated
    # components vs. blank components
    if not resume_components and len(train_dataset) < BLANK_MODEL_THRESHOLD:
        text = f"Low number of examples to train a new pipeline ({len(train_dataset)})"
        if len(train_dataset) < BLANK_MODEL_MIN_THRESHOLD:
            msg.fail(text)
        else:
            msg.warn(text)
        msg.text(
            f"It's recommended to use at least {BLANK_MODEL_THRESHOLD} examples "
            f"(minimum {BLANK_MODEL_MIN_THRESHOLD})",
            show=verbose,
        )
    msg.divider("Vocab & Vectors")
    n_words = gold_train_data["n_words"]
    msg.info(
        f"{n_words} total word(s) in the data ({len(gold_train_data['words'])} unique)"
    )
    if gold_train_data["n_misaligned_words"] > 0:
        n_misaligned = gold_train_data["n_misaligned_words"]
        msg.warn(f"{n_misaligned} misaligned tokens in the training data")
    if gold_dev_data["n_misaligned_words"] > 0:
        n_misaligned = gold_dev_data["n_misaligned_words"]
        msg.warn(f"{n_misaligned} misaligned tokens in the dev data")
    most_common_words = gold_train_data["words"].most_common(10)
    msg.text(
        f"10 most common words: {_format_labels(most_common_words, counts=True)}",
        show=verbose,
    )
    if len(nlp.vocab.vectors):
        msg.info(
            f"{len(nlp.vocab.vectors)} vectors ({nlp.vocab.vectors.n_keys} "
            f"unique keys, {nlp.vocab.vectors_length} dimensions)"
        )
        n_missing_vectors = sum(gold_train_data["words_missing_vectors"].values())
        msg.warn(
            "{} words in training data without vectors ({:.0f}%)".format(
                n_missing_vectors,
                100 * (n_missing_vectors / gold_train_data["n_words"]),
            ),
        )
        msg.text(
            "10 most common words without vectors: {}".format(
                _format_labels(
                    gold_train_data["words_missing_vectors"].most_common(10),
                    counts=True,
                )
            ),
            show=verbose,
        )
    else:
        msg.info("No word vectors present in the package")
    if "ner" in factory_names:
        # Get all unique NER labels present in the data
        labels = set(
            label for label in gold_train_data["ner"] if label not in ("O", "-", None)
        )
        label_counts = gold_train_data["ner"]
        model_labels = _get_labels_from_model(nlp, "ner")
        has_low_data_warning = False
        has_no_neg_warning = False
        has_ws_ents_error = False
        has_punct_ents_warning = False
        msg.divider("Named Entity Recognition")
        msg.info(f"{len(model_labels)} label(s)")
        missing_values = label_counts["-"]
        msg.text(f"{missing_values} missing value(s) (tokens with '-' label)")
        for label in labels:
            if len(label) == 0:
                msg.fail("Empty label found in train data")
        # NOTE(review): labels_with_counts is computed here but never used
        # below (the msg.text line formats `labels` instead) — possibly a
        # leftover; confirm intent before removing.
        labels_with_counts = [
            (label, count)
            for label, count in label_counts.most_common()
            if label != "-"
        ]
        labels_with_counts = _format_labels(labels_with_counts, counts=True)
        msg.text(f"Labels in train data: {_format_labels(labels)}", show=verbose)
        missing_labels = model_labels - labels
        if missing_labels:
            msg.warn(
                "Some model labels are not present in the train data. The "
                "model performance may be degraded for these labels after "
                f"training: {_format_labels(missing_labels)}."
            )
        if gold_train_data["ws_ents"]:
            msg.fail(f"{gold_train_data['ws_ents']} invalid whitespace entity spans")
            has_ws_ents_error = True
        if gold_train_data["punct_ents"]:
            msg.warn(f"{gold_train_data['punct_ents']} entity span(s) with punctuation")
            has_punct_ents_warning = True
        for label in labels:
            if label_counts[label] <= NEW_LABEL_THRESHOLD:
                msg.warn(
                    f"Low number of examples for label '{label}' ({label_counts[label]})"
                )
                has_low_data_warning = True
                with msg.loading("Analyzing label distribution..."):
                    neg_docs = _get_examples_without_label(train_dataset, label)
                if neg_docs == 0:
                    msg.warn(f"No examples for texts WITHOUT new label '{label}'")
                    has_no_neg_warning = True
        if not has_low_data_warning:
            msg.good("Good amount of examples for all labels")
        if not has_no_neg_warning:
            msg.good("Examples without occurrences available for all labels")
        if not has_ws_ents_error:
            msg.good("No entities consisting of or starting/ending with whitespace")
        if not has_punct_ents_warning:
            msg.good("No entities consisting of or starting/ending with punctuation")
        if has_low_data_warning:
            msg.text(
                f"To train a new entity type, your data should include at "
                f"least {NEW_LABEL_THRESHOLD} instances of the new label",
                show=verbose,
            )
        if has_no_neg_warning:
            msg.text(
                "Training data should always include examples of entities "
                "in context, as well as examples without a given entity "
                "type.",
                show=verbose,
            )
        if has_ws_ents_error:
            msg.text(
                "As of spaCy v2.1.0, entity spans consisting of or starting/ending "
                "with whitespace characters are considered invalid."
            )
        if has_punct_ents_warning:
            msg.text(
                "Entity spans consisting of or starting/ending "
                "with punctuation can not be trained with a noise level > 0."
            )
    if "textcat" in factory_names:
        msg.divider("Text Classification (Exclusive Classes)")
        labels = _get_labels_from_model(nlp, "textcat")
        msg.info(f"Text Classification: {len(labels)} label(s)")
        msg.text(f"Labels: {_format_labels(labels)}", show=verbose)
        missing_labels = labels - set(gold_train_data["cats"])
        if missing_labels:
            msg.warn(
                "Some model labels are not present in the train data. The "
                "model performance may be degraded for these labels after "
                f"training: {_format_labels(missing_labels)}."
            )
        if set(gold_train_data["cats"]) != set(gold_dev_data["cats"]):
            msg.warn(
                "Potential train/dev mismatch: the train and dev labels are "
                "not the same. "
                f"Train labels: {_format_labels(gold_train_data['cats'])}. "
                f"Dev labels: {_format_labels(gold_dev_data['cats'])}."
            )
        if len(labels) < 2:
            msg.fail(
                "The model does not have enough labels. 'textcat' requires at "
                "least two labels due to mutually-exclusive classes, e.g. "
                "LABEL/NOT_LABEL or POSITIVE/NEGATIVE for a binary "
                "classification task."
            )
        if (
            gold_train_data["n_cats_bad_values"] > 0
            or gold_dev_data["n_cats_bad_values"] > 0
        ):
            msg.fail(
                "Unsupported values for cats: the supported values are "
                "1.0/True and 0.0/False."
            )
        if gold_train_data["n_cats_multilabel"] > 0:
            # Note: you should never get here because you run into E895 on
            # initialization first.
            msg.fail(
                "The train data contains instances without mutually-exclusive "
                "classes. Use the component 'textcat_multilabel' instead of "
                "'textcat'."
            )
        if gold_dev_data["n_cats_multilabel"] > 0:
            msg.fail(
                "The dev data contains instances without mutually-exclusive "
                "classes. Use the component 'textcat_multilabel' instead of "
                "'textcat'."
            )
    if "textcat_multilabel" in factory_names:
        msg.divider("Text Classification (Multilabel)")
        labels = _get_labels_from_model(nlp, "textcat_multilabel")
        msg.info(f"Text Classification: {len(labels)} label(s)")
        msg.text(f"Labels: {_format_labels(labels)}", show=verbose)
        missing_labels = labels - set(gold_train_data["cats"])
        if missing_labels:
            msg.warn(
                "Some model labels are not present in the train data. The "
                "model performance may be degraded for these labels after "
                f"training: {_format_labels(missing_labels)}."
            )
        if set(gold_train_data["cats"]) != set(gold_dev_data["cats"]):
            msg.warn(
                "Potential train/dev mismatch: the train and dev labels are "
                "not the same. "
                f"Train labels: {_format_labels(gold_train_data['cats'])}. "
                f"Dev labels: {_format_labels(gold_dev_data['cats'])}."
            )
        if (
            gold_train_data["n_cats_bad_values"] > 0
            or gold_dev_data["n_cats_bad_values"] > 0
        ):
            msg.fail(
                "Unsupported values for cats: the supported values are "
                "1.0/True and 0.0/False."
            )
        if gold_train_data["n_cats_multilabel"] > 0:
            if gold_dev_data["n_cats_multilabel"] == 0:
                msg.warn(
                    "Potential train/dev mismatch: the train data contains "
                    "instances without mutually-exclusive classes while the "
                    "dev data contains only instances with mutually-exclusive "
                    "classes."
                )
        else:
            msg.warn(
                "The train data contains only instances with "
                "mutually-exclusive classes. You can potentially use the "
                "component 'textcat' instead of 'textcat_multilabel'."
            )
            if gold_dev_data["n_cats_multilabel"] > 0:
                msg.fail(
                    "Train/dev mismatch: the dev data contains instances "
                    "without mutually-exclusive classes while the train data "
                    "contains only instances with mutually-exclusive classes."
                )
    if "tagger" in factory_names:
        msg.divider("Part-of-speech Tagging")
        labels = [label for label in gold_train_data["tags"]]
        model_labels = _get_labels_from_model(nlp, "tagger")
        msg.info(f"{len(labels)} label(s) in train data")
        missing_labels = model_labels - set(labels)
        if missing_labels:
            msg.warn(
                "Some model labels are not present in the train data. The "
                "model performance may be degraded for these labels after "
                f"training: {_format_labels(missing_labels)}."
            )
        labels_with_counts = _format_labels(
            gold_train_data["tags"].most_common(), counts=True
        )
        msg.text(labels_with_counts, show=verbose)
    if "morphologizer" in factory_names:
        msg.divider("Morphologizer (POS+Morph)")
        labels = [label for label in gold_train_data["morphs"]]
        model_labels = _get_labels_from_model(nlp, "morphologizer")
        msg.info(f"{len(labels)} label(s) in train data")
        missing_labels = model_labels - set(labels)
        if missing_labels:
            msg.warn(
                "Some model labels are not present in the train data. The "
                "model performance may be degraded for these labels after "
                f"training: {_format_labels(missing_labels)}."
            )
        labels_with_counts = _format_labels(
            gold_train_data["morphs"].most_common(), counts=True
        )
        msg.text(labels_with_counts, show=verbose)
    if "parser" in factory_names:
        has_low_data_warning = False
        msg.divider("Dependency Parsing")
        # profile sentence length
        msg.info(
            f"Found {gold_train_data['n_sents']} sentence(s) with an average "
            f"length of {gold_train_data['n_words'] / gold_train_data['n_sents']:.1f} words."
        )
        # check for documents with multiple sentences
        sents_per_doc = gold_train_data["n_sents"] / len(gold_train_data["texts"])
        if sents_per_doc < 1.1:
            msg.warn(
                f"The training data contains {sents_per_doc:.2f} sentences per "
                f"document. When there are very few documents containing more "
                f"than one sentence, the parser will not learn how to segment "
                f"longer texts into sentences."
            )
        # profile labels
        labels_train = [label for label in gold_train_data["deps"]]
        labels_train_unpreprocessed = [
            label for label in gold_train_unpreprocessed_data["deps"]
        ]
        labels_dev = [label for label in gold_dev_data["deps"]]
        if gold_train_unpreprocessed_data["n_nonproj"] > 0:
            n_nonproj = gold_train_unpreprocessed_data["n_nonproj"]
            msg.info(f"Found {n_nonproj} nonprojective train sentence(s)")
        if gold_dev_data["n_nonproj"] > 0:
            n_nonproj = gold_dev_data["n_nonproj"]
            msg.info(f"Found {n_nonproj} nonprojective dev sentence(s)")
        msg.info(f"{len(labels_train_unpreprocessed)} label(s) in train data")
        msg.info(f"{len(labels_train)} label(s) in projectivized train data")
        labels_with_counts = _format_labels(
            gold_train_unpreprocessed_data["deps"].most_common(), counts=True
        )
        msg.text(labels_with_counts, show=verbose)
        # rare labels in train
        for label in gold_train_unpreprocessed_data["deps"]:
            if gold_train_unpreprocessed_data["deps"][label] <= DEP_LABEL_THRESHOLD:
                msg.warn(
                    f"Low number of examples for label '{label}' "
                    f"({gold_train_unpreprocessed_data['deps'][label]})"
                )
                has_low_data_warning = True
        # rare labels in projectivized train
        rare_projectivized_labels = []
        for label in gold_train_data["deps"]:
            if (
                gold_train_data["deps"][label] <= DEP_LABEL_THRESHOLD
                and DELIMITER in label
            ):
                rare_projectivized_labels.append(
                    f"{label}: {gold_train_data['deps'][label]}"
                )
        if len(rare_projectivized_labels) > 0:
            msg.warn(
                f"Low number of examples for {len(rare_projectivized_labels)} "
                "label(s) in the projectivized dependency trees used for "
                "training. You may want to projectivize labels such as punct "
                "before training in order to improve parser performance."
            )
            msg.warn(
                f"Projectivized labels with low numbers of examples: ",
                ", ".join(rare_projectivized_labels),
                show=verbose,
            )
            has_low_data_warning = True
        # labels only in train
        if set(labels_train) - set(labels_dev):
            msg.warn(
                "The following labels were found only in the train data:",
                ", ".join(set(labels_train) - set(labels_dev)),
                show=verbose,
            )
        # labels only in dev
        if set(labels_dev) - set(labels_train):
            msg.warn(
                "The following labels were found only in the dev data:",
                ", ".join(set(labels_dev) - set(labels_train)),
                show=verbose,
            )
        if has_low_data_warning:
            msg.text(
                f"To train a parser, your data should include at "
                f"least {DEP_LABEL_THRESHOLD} instances of each label.",
                show=verbose,
            )
        # multiple root labels
        if len(gold_train_unpreprocessed_data["roots"]) > 1:
            msg.warn(
                f"Multiple root labels "
                f"({', '.join(gold_train_unpreprocessed_data['roots'])}) "
                f"found in training data. spaCy's parser uses a single root "
                f"label ROOT so this distinction will not be available."
            )
        # these should not happen, but just in case
        if gold_train_data["n_nonproj"] > 0:
            msg.fail(
                f"Found {gold_train_data['n_nonproj']} nonprojective "
                f"projectivized train sentence(s)"
            )
        if gold_train_data["n_cycles"] > 0:
            msg.fail(
                f"Found {gold_train_data['n_cycles']} projectivized train sentence(s) with cycles"
            )
    # Summary: tally the printed checks and exit non-zero on any failure.
    msg.divider("Summary")
    good_counts = msg.counts[MESSAGES.GOOD]
    warn_counts = msg.counts[MESSAGES.WARN]
    fail_counts = msg.counts[MESSAGES.FAIL]
    if good_counts:
        msg.good(f"{good_counts} {'check' if good_counts == 1 else 'checks'} passed")
    if warn_counts:
        msg.warn(f"{warn_counts} {'warning' if warn_counts == 1 else 'warnings'}")
    if fail_counts:
        msg.fail(f"{fail_counts} {'error' if fail_counts == 1 else 'errors'}")
        sys.exit(1)
def main(
    in_file,
    out_dir=None,
    model_file=None,
    config_file=None,
    spacy_model=None,
    fine_tune=False,
):
    """Train CRF entity tagger."""
    # Optional component config loaded from a JSON file on disk.
    if config_file:
        msg.info("Loading config from disk")
        component_config = srsly.read_json(config_file)
        msg.good("Successfully loaded config from file.", config_file)
    else:
        component_config = None
    extractor = CRFExtractor(component_config=component_config)
    # Optionally resume from a previously saved model.
    if model_file is not None:
        msg.info(f"Loading model from disk.")
        extractor = extractor.from_disk(model_file)
        msg.good("Successfully loaded model from file.", model_file)
    msg.info("Loading training examples.")
    examples = read_file(in_file)
    msg.good(
        f"Successfully loaded {len(examples)} training examples from file.",
        in_file,
    )
    # Use a real spaCy pipeline if requested, otherwise a blank English one.
    if spacy_model is None:
        nlp = spacy.blank("en")
        msg.info(f"Using spaCy blank: 'en'")
    else:
        nlp = spacy.load(spacy_model)
        msg.info(f"Using spaCy model: {spacy_model}")
    tokenizer = SpacyTokenizer(nlp=nlp)
    dense = extractor.use_dense_features()
    # Convert gold examples into CRF token sequences.
    crf_dataset = [
        gold_example_to_crf_tokens(
            example, tokenizer=tokenizer, use_dense_features=dense
        )
        for example in examples
    ]
    # Optional randomized hyper-parameter search before the final fit.
    if fine_tune:
        msg.info("Fine-tuning hyper params.")
        search = extractor.fine_tune(crf_dataset, cv=5, n_iter=30, random_state=42)
        msg.good("Setting fine-tuned hyper params:", search.best_params_)
        extractor.component_config.update(search.best_params_)
    msg.info("Training entity tagger with CRF.")
    extractor.train(crf_dataset)
    # Persist the trained model as <out_dir or cwd>/model.pkl.
    model_path = pathlib.Path(out_dir or ".").resolve() / "model.pkl"
    msg.info("Saving model to disk")
    model_path.parent.mkdir(exist_ok=True)
    extractor.to_disk(model_path)
    msg.good(
        "Successfully saved model to file.",
        str(model_path.relative_to(os.getcwd())),
    )