def train_model(
    model,
    train_path,
    eval_path,
    n_iter=10,
    output=None,
    tok2vec=None,
):
    """
    Train an NER model from Prodigy annotations and optionally save out the
    best model to disk.

    model: Name/path of a model to load, or "blank:<lang>" for a blank one.
    train_path: JSONL file with training annotations.
    eval_path: JSONL file with evaluation annotations.
    n_iter: Number of passes over the training data.
    output: Directory to write the best-scoring model to (None: don't save).
    tok2vec: Optional path to pretrained tok2vec weights.
    """
    spacy.util.fix_random_seed(0)
    with msg.loading(f"Loading '{model}'..."):
        if model.startswith("blank:"):
            nlp = spacy.blank(model.replace("blank:", ""))
        else:
            nlp = spacy.load(model)
    msg.good(f"Loaded model '{model}'")
    train_data, labels = format_data(srsly.read_jsonl(train_path))
    eval_data, _ = format_data(srsly.read_jsonl(eval_path))
    # A loaded (non-blank) model may already ship an "ner" component; adding
    # a second pipe with the same name raises, so reuse the existing one.
    if "ner" in nlp.pipe_names:
        ner = nlp.get_pipe("ner")
    else:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner)
    for label in labels:
        ner.add_label(label)
    # Only used when pretrained tok2vec weights are supplied.
    t2v_cfg = {
        "embed_rows": 10000,
        "token_vector_width": 128,
        "conv_depth": 8,
        "nr_feature_tokens": 3,
    }
    optimizer = nlp.begin_training(
        component_cfg={"ner": t2v_cfg} if tok2vec else {})
    if tok2vec:
        _load_pretrained_tok2vec(nlp, Path(tok2vec))
    batch_size = spacy.util.compounding(1.0, 16.0, 1.001)
    best_acc = 0
    best_model = None
    row_widths = (2, 8, 8, 8, 8)
    msg.row(("#", "L", "P", "R", "F"), widths=row_widths)
    for i in range(n_iter):
        random.shuffle(train_data)
        losses = {}
        data = tqdm.tqdm(train_data, leave=False)
        for batch in spacy.util.minibatch(data, size=batch_size):
            texts, annots = zip(*batch)
            nlp.update(texts, annots, drop=0.2, losses=losses)
        # Evaluate (and snapshot) with the averaged parameters swapped in.
        with nlp.use_params(optimizer.averages):
            sc = nlp.evaluate(eval_data)
            if sc.ents_f > best_acc:
                best_acc = sc.ents_f
                if output:
                    best_model = nlp.to_bytes()
        acc = (f"{sc.ents_p:.3f}", f"{sc.ents_r:.3f}", f"{sc.ents_f:.3f}")
        msg.row((i + 1, f"{losses['ner']:.2f}", *acc), widths=row_widths)
    msg.text(f"Best F-Score: {best_acc:.3f}")
    if output and best_model:
        with msg.loading("Saving model..."):
            nlp.from_bytes(best_model).to_disk(output)
        msg.good("Saved model", output)
def profile(model, inputs=None, n_texts=10000):
    """
    Profile a spaCy pipeline to find out which functions take the most time.

    Input should be formatted as one JSON object per line with a key "text".
    If no input is given (or reading it yields nothing), the IMDB dataset is
    loaded via Thinc instead. Stats are written to "Profile.prof" and printed
    sorted by time.
    """
    if inputs is not None:
        inputs = _read_inputs(inputs, msg)
    if inputs is None:
        imdb_limit = 25000
        with msg.loading("Loading IMDB dataset via Thinc..."):
            imdb_train, _ = thinc.extra.datasets.imdb()
            inputs, _ = zip(*imdb_train)
        msg.info(f"Loaded IMDB dataset and using {imdb_limit} examples")
        inputs = inputs[:imdb_limit]
    with msg.loading(f"Loading model '{model}'..."):
        nlp = load_model(model)
    msg.good(f"Loaded model '{model}'")
    # `nlp` and `texts` must keep these exact names: the profiled statement
    # below looks them up in locals().
    texts = list(itertools.islice(inputs, n_texts))
    cProfile.runctx(
        "parse_texts(nlp, texts)", globals(), locals(), "Profile.prof"
    )
    stats = pstats.Stats("Profile.prof")
    msg.divider("Profile stats")
    stats.strip_dirs().sort_stats("time").print_stats()
def train():
    """Load BERT and the CORD-19 publications, then fit a QuestionCovid.

    Returns the fitted QuestionCovid instance.
    """
    with msg.loading(" Loading BERT"):
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        qa_model = BertForQuestionAnswering.from_pretrained(
            'bert-large-uncased-whole-word-masking-finetuned-squad')
    msg.good(" BERT loaded")

    articles_dir = os.path.join(
        SCRIPT_PATH, '../data/raw/CORD-19-research-challenge/')
    meta_path = articles_dir + 'metadata.csv'
    # Sub-folders (relative to articles_dir) that hold the parsed articles.
    articles_folders = [
        'biorxiv_medrxiv/biorxiv_medrxiv/pdf_json/',
        'comm_use_subset/comm_use_subset/pdf_json/',
        'comm_use_subset/comm_use_subset/pmc_json/',
        'noncomm_use_subset/noncomm_use_subset/pdf_json/',
        'noncomm_use_subset/noncomm_use_subset/pmc_json/',
        'custom_license/custom_license/pdf_json/',
        'custom_license/custom_license/pmc_json/',
    ]

    with msg.loading(" Loading publications"):
        started = time.time()
        data_text, index2paperID, index2paperPath = get_data_texts(
            articles_dir, articles_folders, meta_path)
    elapsed = time.time() - started
    msg.good(" Publications loaded - Took {:.2f}s".format(elapsed))

    covid_q = QuestionCovid(tokenizer, qa_model, index2paperID,
                            index2paperPath)
    covid_q.fit(data_text)
    return covid_q
def cfg():
    """Load the test config and try to download the model artefacts.

    Attempts to download each model artefact and the word embeddings named
    in the config. Download failures are reported via ``msg.fail`` and do not
    abort; the loaded config is returned either way.
    """
    config = get_config(TEST_CFG)
    artefacts = [
        "indices.pickle",
        "weights.h5",
    ]
    s3_slug = config["data"]["s3_slug"]
    output_path = config["build"]["output_path"]
    word_embeddings = config["build"]["word_embeddings"]

    for artefact in artefacts:
        with msg.loading(f"Could not find {artefact} locally, downloading..."):
            # Falls back to the bare name if building the path itself fails.
            target = artefact
            # `except Exception` (not a bare except) so Ctrl-C and
            # SystemExit still propagate.
            try:
                target = os.path.join(output_path, artefact)
                download_model_artefact(target, s3_slug)
                msg.good(f"Found {target}")
            except Exception:
                msg.fail(f"Could not download {s3_slug}{target}")

    # Check on word embedding and download if not exists
    with msg.loading(
            f"Could not find {word_embeddings} locally, downloading..."):
        try:
            download_model_artefact(word_embeddings, s3_slug)
            msg.good(f"Found {word_embeddings}")
        except Exception:
            msg.fail(f"Could not download {s3_slug}{word_embeddings}")

    return config
def profile(model: str, inputs: Optional[Path] = None, n_texts: int = 10000) -> None:
    """Profile a pipeline on up to `n_texts` texts and print the stats.

    Texts are read from `inputs` when given; otherwise the IMDB dataset is
    loaded through the optional `ml_datasets` package. Stats are written to
    "Profile.prof" and printed sorted by time.
    """
    if inputs is None:
        try:
            import ml_datasets
        except ImportError:
            msg.fail(
                "This command, when run without an input file, "
                "requires the ml_datasets library to be installed: "
                "pip install ml_datasets",
                exits=1,
            )
        with msg.loading("Loading IMDB dataset via ml_datasets..."):
            imdb_train, _ = ml_datasets.imdb(train_limit=n_texts, dev_limit=0)
            texts, _ = zip(*imdb_train)
        msg.info(f"Loaded IMDB dataset and using {n_texts} examples")
    else:
        texts = _read_inputs(inputs, msg)
        texts = list(itertools.islice(texts, n_texts))

    with msg.loading(f"Loading pipeline '{model}'..."):
        nlp = load_model(model)
    msg.good(f"Loaded pipeline '{model}'")

    # `nlp` and `texts` are looked up by name from locals() by the profiled
    # statement, so they must not be renamed.
    cProfile.runctx(
        "parse_texts(nlp, texts)", globals(), locals(), "Profile.prof"
    )
    stats = pstats.Stats("Profile.prof")
    msg.divider("Profile stats")
    stats.strip_dirs().sort_stats("time").print_stats()
def read_attrs_from_deprecated(freqs_loc, clusters_loc):
    """Build lexical attribute dicts from frequency and cluster files.

    Returns a list of dicts with keys "orth", "id", "prob" and "cluster",
    ordered by descending probability. Either location may be missing, in
    which case the corresponding data is treated as empty.
    """
    if freqs_loc is None:
        probs, _ = ({}, DEFAULT_OOV_PROB)  # noqa: F841
    else:
        with msg.loading("Counting frequencies..."):
            probs, _ = read_freqs(freqs_loc)
        msg.good("Counted frequencies")

    if not clusters_loc:
        clusters = {}
    else:
        with msg.loading("Reading clusters..."):
            clusters = read_clusters(clusters_loc)
        msg.good("Read clusters")

    lex_attrs = []
    by_prob = sorted(probs.items(), key=lambda pair: pair[1], reverse=True)
    if not by_prob:
        # Nothing to report; also avoids starting an empty progress bar.
        return lex_attrs
    for idx, (word, prob) in tqdm(enumerate(by_prob)):
        # Decode as a little-endian string, so that we can do & 15 to get
        # the first 4 bits. See _parse_features.pyx
        cluster = int(clusters[word][::-1], 2) if word in clusters else 0
        lex_attrs.append(
            {"orth": word, "id": idx, "prob": prob, "cluster": cluster}
        )
    return lex_attrs
def train(self, corpus: List[Fragment], verbose: bool = None):
    """Estimate the Naive Bayes feature probabilities from `corpus`.

    Counts "<feature>_<value>" occurrences per fragment label, smooths and
    normalizes them, and stores the results (plus a per-label prior under
    `self._PRIOR_FEAT`) in `self.feats`, keyed by `(label, feature)`.

    corpus: Non-empty list of fragments exposing `.label` and `.features`.
    verbose: When falsy, printing through the module-level `msg` is disabled.
    Raises ValueError if `corpus` is empty.
    """
    from collections import defaultdict

    if not corpus:
        raise ValueError("corpus must not be empty")
    msg.no_print = not verbose
    with msg.loading("setting things up..."):
        self._setup_training(corpus)
    msg.text("train Naive Bayes model")
    # Plain dicts would raise KeyError on the first `+=`; missing labels and
    # features must start counting from zero.
    feats = defaultdict(lambda: defaultdict(float))
    totals = defaultdict(float)
    for frag in corpus:
        for feat, val in frag.features.items():
            feats[frag.label][feat + "_" + val] += 1
        totals[frag.label] += len(frag.features)
    # additive smoothing and normalization
    with msg.loading("smoothing... "):
        smooth_inc = 0.1
        all_feat_names = set(feats[True]).union(feats[False])
        for label in [0, 1]:
            totals[label] += len(all_feat_names) * smooth_inc
            for feat in all_feat_names:
                feats[label][feat] += smooth_inc
                feats[label][feat] /= totals[label]
                self.feats[(label, feat)] = feats[label][feat]
            # The grand total is taken at this point in the loop, as the
            # original statement order did.
            feats[label][self._PRIOR_FEAT] = (
                totals[label] / sum(totals.values()))
            self.feats[(label, self._PRIOR_FEAT)] = (
                feats[label][self._PRIOR_FEAT])
    msg.good("done")
def train_model(model, train_path, eval_path, n_iter=10, output="./model2/", tok2vec=None):
    """
    Train a text classifier from Prodigy annotations and optionally save out
    the best model to disk.

    model: Name/path of a model to load, or "blank:<lang>" for a blank one.
    train_path: JSONL file with training annotations.
    eval_path: JSONL file with evaluation annotations.
    n_iter: Number of passes over the training data.
    output: Directory to write the best-scoring model to (falsy: don't save).
    tok2vec: Accepted for signature parity; not used by this function.
    """
    spacy.util.fix_random_seed(0)
    with msg.loading(f"Loading '{model}'..."):
        if model.startswith("blank:"):
            nlp = spacy.blank(model.replace("blank:", ""))
        else:
            nlp = spacy.load(model)
    msg.good(f"Loaded model '{model}'")
    train_data, labels = format_data(srsly.read_jsonl(train_path))
    eval_data, _ = format_data(srsly.read_jsonl(eval_path))
    textcat_cfg = {"exclusive_classes": True}
    if "textcat" not in nlp.pipe_names:
        # Component options have to be given when the pipe is created.
        textcat = nlp.create_pipe("textcat", config=textcat_cfg)
        nlp.add_pipe(textcat, last=True)
    else:
        textcat = nlp.get_pipe("textcat")
    for label in labels:
        textcat.add_label(label)
    # component_cfg is keyed by component name; a flat dict is ignored.
    optimizer = nlp.begin_training(component_cfg={"textcat": textcat_cfg})
    batch_size = spacy.util.compounding(1.0, 16.0, 1.001)
    best_acc = 0
    best_model = None
    row_widths = (2, 8, 8)
    msg.row(("#", "L", "F"), widths=row_widths)
    for i in range(n_iter):
        random.shuffle(train_data)
        losses = {}
        data = tqdm.tqdm(train_data, leave=False)
        for batch in spacy.util.minibatch(data, size=batch_size):
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, drop=0.2, losses=losses)
        # Evaluate (and snapshot) with the averaged parameters swapped in.
        with nlp.use_params(optimizer.averages):
            scorer = nlp.evaluate(eval_data)
            if scorer.textcat_score > best_acc:
                best_acc = scorer.textcat_score
                if output:
                    best_model = nlp.to_bytes()
        acc = f"{scorer.textcat_score:.3f}"
        msg.row((i + 1, f"{losses['textcat']:.2f}", acc), widths=row_widths)
    msg.text(f"Best F-Score: {best_acc:.3f}")
    if output and best_model:
        with msg.loading("Saving model..."):
            nlp.from_bytes(best_model).to_disk(output)
        msg.good("Saved model", output)
def fit(self, data_text):
    """Fit a TF-IDF vectorizer on the texts and build the article matrix.

    data_text: Mapping whose values are the article texts.
    Sets `self.TFIDF_VECTORIZER` and `self.ARTICLES_MATRIX`.
    """
    vectorizer = TfidfVectorizer()
    self.TFIDF_VECTORIZER = vectorizer

    with msg.loading(" Fitting TFIDF"):
        t0 = time.time()
        vectorizer.fit(data_text.values())
    fit_seconds = time.time() - t0
    msg.good(f" TFIDF fitted - Took {fit_seconds:.2f}s")

    with msg.loading(" Creating Articles matrix"):
        t0 = time.time()
        self.ARTICLES_MATRIX = vectorizer.transform(data_text.values())
    transform_seconds = time.time() - t0
    msg.good(f" Article matrix created - Took {transform_seconds:.2f}s")
def add_vectors(nlp, vectors_loc, prune_vectors, name=None):
    """Attach word vectors to `nlp.vocab`, name them and optionally prune.

    A ".npz" location is loaded with numpy and rows are mapped by lexeme
    rank; any other location is read with `read_vectors`. Pruning runs only
    when `prune_vectors` is at least 1.
    """
    vectors_loc = ensure_path(vectors_loc)
    vocab = nlp.vocab
    if vectors_loc and vectors_loc.parts[-1].endswith(".npz"):
        vocab.vectors = Vectors(data=numpy.load(vectors_loc.open("rb")))
        for lex in vocab:
            if lex.rank:
                vocab.vectors.add(lex.orth, row=lex.rank)
    else:
        vectors_data, vector_keys = (None, None)
        if vectors_loc:
            with msg.loading(f"Reading vectors from {vectors_loc}"):
                vectors_data, vector_keys = read_vectors(vectors_loc)
            msg.good(f"Loaded vectors from {vectors_loc}")
        if vector_keys is not None:
            for word in vector_keys:
                if word not in vocab:
                    # Looking the word up creates its lexeme.
                    vocab[word].is_oov = False
        if vectors_data is not None:
            vocab.vectors = Vectors(data=vectors_data, keys=vector_keys)

    if name is not None:
        vocab.vectors.name = name
    else:
        vocab.vectors.name = "%s_model.vectors" % nlp.meta["lang"]
    nlp.meta["vectors"]["name"] = vocab.vectors.name

    if prune_vectors >= 1:
        vocab.prune_vectors(prune_vectors)
def _load_file(file_path: Path, msg: Printer) -> None:
    """Load a .json or .jsonl file via srsly and return what was read.

    Any other extension is reported with `msg.fail(..., exits=1)`.

    NOTE(review): the return annotation says None, but the loaded data is
    returned for the supported extensions - confirm and widen the annotation.
    """
    file_name = file_path.parts[-1]
    readers = {
        ".json": srsly.read_json,
        ".jsonl": srsly.read_jsonl,
    }
    reader = readers.get(file_path.suffix)
    if reader is None:
        msg.fail(
            f"Can't load file extension {file_path.suffix}",
            "Expected .json or .jsonl",
            exits=1,
        )
        return None
    with msg.loading(f"Loading {file_name}..."):
        data = reader(file_path)
    msg.good(f"Loaded {file_name}")
    return data
def init_model(
    lang,
    output_dir,
    freqs_loc=None,
    clusters_loc=None,
    jsonl_loc=None,
    vectors_loc=None,
    truncate_vectors=0,
    prune_vectors=-1,
    vectors_name=None,
    model_name=None,
):
    """
    Create a new model from raw data, like word frequencies, Brown clusters
    and word vectors. If vectors are provided in Word2Vec format, they can
    be either a .txt or zipped as a .zip or .tar.gz.

    Lexical attributes come from `jsonl_loc` when given, otherwise from the
    deprecated `freqs_loc` / `clusters_loc` files. The model is written to
    `output_dir` (created if missing) and returned.
    """
    if jsonl_loc is not None:
        if freqs_loc is not None or clusters_loc is not None:
            msg.warn(
                "Incompatible arguments",
                "The -f and -c arguments are deprecated, and not compatible "
                "with the -j argument, which should specify the same "
                "information. Either merge the frequencies and clusters data "
                "into the JSONL-formatted file (recommended), or use only the "
                "-f and -c files, without the other lexical attributes.",
            )
        jsonl_loc = ensure_path(jsonl_loc)
        lex_attrs = srsly.read_jsonl(jsonl_loc)
    else:
        clusters_loc = ensure_path(clusters_loc)
        freqs_loc = ensure_path(freqs_loc)
        if freqs_loc is not None and not freqs_loc.exists():
            msg.fail("Can't find words frequencies file", freqs_loc, exits=1)
        lex_attrs = read_attrs_from_deprecated(freqs_loc, clusters_loc)

    with msg.loading("Creating model..."):
        nlp = create_model(lang, lex_attrs, name=model_name)
    msg.good("Successfully created model")
    if vectors_loc is not None:
        # NOTE(review): passes truncate_vectors positionally - confirm the
        # add_vectors in scope accepts five arguments.
        add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, vectors_name)
    vec_added = len(nlp.vocab.vectors)
    lex_added = len(nlp.vocab)
    msg.good(
        "Successfully compiled vocab",
        "{} entries, {} vectors".format(lex_added, vec_added),
    )
    # Normalize like the other path arguments so a plain string also works.
    output_dir = ensure_path(output_dir)
    if not output_dir.exists():
        output_dir.mkdir()
    nlp.to_disk(output_dir)
    return nlp
def evaluate_model(model, eval_path):
    """
    Evaluate a trained text classifier on Prodigy annotations and print its
    textcat score as a one-row table.
    """
    with msg.loading(f"Loading model '{model}'..."):
        nlp = spacy.load(model)
    examples, _ = format_data(srsly.read_jsonl(eval_path))
    scores = nlp.evaluate(examples)
    f_score = f"{scores.textcat_score:.3f}"
    msg.table([("F-Score", f_score)])
def swarm_solve(
    problems: Union[List[str], str],
    config: SwarmConfig,
    max_steps: Union[List[int], int] = 128,
    silent: bool = False,
) -> Swarm:
    """Run a swarm over one or more problems, one after another.

    problems: A single problem text or a list of them.
    config: Swarm configuration forwarded to `mathy_swarm`.
    max_steps: Step limit shared by all problems (int) or one per problem.
    silent: When True, nothing is printed.

    Returns the swarm after the last run. The caller's `problems` and
    `max_steps` lists are not modified.
    """
    single_problem: bool = isinstance(problems, str)
    if single_problem:
        problems = [problems]
    if isinstance(max_steps, int):
        max_steps = [max_steps] * len(problems)
    assert len(problems) > 0, "no problems to solve"
    assert len(problems) == len(max_steps)
    assert isinstance(problems, list)

    # Work on copies: the queues below are consumed with pop(0), which would
    # otherwise drain the lists owned by the caller.
    problems = list(problems)
    max_steps = list(max_steps)
    current_problem: str = problems.pop(0)
    current_max_moves: int = max_steps.pop(0)

    def env_callable():
        # Reads the enclosing variables at call time, so environments built
        # later pick up the then-current problem and step limit.
        return FragileMathyEnv(
            name="mathy_v0",
            problem=current_problem,
            repeat_problem=True,
            max_steps=current_max_moves,
        )

    mathy_env: MathyEnv = env_callable()._env._env.mathy
    swarm: Swarm = mathy_swarm(config, env_callable)
    while True:
        if not silent:
            with msg.loading(f"Solving {current_problem} ..."):
                swarm.run()
        else:
            swarm.run()
        if not silent:
            if swarm.walkers.best_reward > EnvRewards.WIN:
                last_state = MathyEnvState.from_np(
                    swarm.walkers.states.best_state)
                msg.good(
                    f"Solved! {current_problem} = {last_state.agent.problem}")
                mathy_env.print_history(last_state)
            else:
                msg.fail("Failed to find a solution :(")
        if not max_steps:
            break
        current_max_moves = max_steps.pop(0)
        current_problem = problems.pop(0)
    return swarm
def fitted_ann_kb(nlp, entities, aliases):
    """Build an AnnKnowledgeBase from entities and aliases and fit its index.

    entities: Dicts with an "id" and an optional "description".
    aliases: Dicts with an "alias" and a list of candidate "entities".
    Each alias spreads its prior probability evenly over the candidates that
    are present in the knowledge base.
    """
    kb = AnnKnowledgeBase(nlp.vocab, entity_vector_length=300)
    print(vars(kb))

    # Single pass over `entities`: collect (id, description) pairs.
    records = [(ent["id"], ent.get("description", "")) for ent in entities]
    entity_ids = [ent_id for ent_id, _ in records]
    descriptions = [desc for _, desc in records]
    freqs = [100] * len(records)

    msg.divider("Apply EntityEncoder")
    with msg.loading("Applying EntityEncoder to descriptions"):
        # get the pretrained entity vectors
        embeddings = [nlp.make_doc(text).vector for text in descriptions]
    msg.good("Finished, embeddings created")

    with msg.loading("Setting kb entities and aliases"):
        # set the entities, can also be done by calling `kb.add_entity`
        # for each entity
        for ent_id, freq, vector in zip(entity_ids, freqs, embeddings):
            if not kb.contains_entity(ent_id):
                kb.add_entity(ent_id, freq, vector)
        for alias in aliases:
            known = [e for e in alias["entities"] if kb.contains_entity(e)]
            count = len(known)
            if count == 0:
                continue
            kb.add_alias(
                alias=alias["alias"],
                entities=known,
                probabilities=[1.0 / count] * count,
            )
    kb.fit_index(verbose=True)
    return kb
def _make_graph_components(**kwargs):
    """Collect the page/category lookup tables and the category adjacency.

    Pages flagged "hiddencat" or "noindex" are skipped. Redirect targets and
    disambiguation pages are removed from the page tables afterwards.
    Printing through `msg` is enabled only when `verbose` is passed truthy;
    the previous `msg.no_print` value is restored on exit, even on error.

    Returns (page2id, redirects, disambiguations, cat2id, adjacency).
    """
    cat2id = {}
    page2id = {}
    id2page = {}
    disambiguations = {}
    pprops = _get_pprops(**kwargs)
    verbose = kwargs.get("verbose", False)
    msg_no_print = msg.no_print
    msg.no_print = not verbose
    # try/finally so a failure below cannot leave the shared printer muted
    # (or unmuted) for the rest of the program.
    try:
        iter_page_data = dt.iter_page_dump_data(**kwargs)
        for ns_kind, pageid, title in iter_page_data:
            disambi = False
            if pageid in pprops:
                page_props = pprops[pageid]
                if "hiddencat" in page_props or "noindex" in page_props:
                    continue
                if "disambiguation" in page_props:
                    disambi = True
            if disambi:
                disambiguations.setdefault(title, pageid)
            if ns_kind == dt.WIKI_NS_KIND_PAGE:
                page2id[title] = pageid
                id2page[pageid] = title
            elif ns_kind == dt.WIKI_NS_KIND_CATEGORY:
                cat2id[f"Category:{title}"] = pageid
        category_links = _get_category_links(cat2id, id2page, **kwargs)
        redirects = _get_redirects(page2id, id2page, **kwargs)
        with msg.loading("Removing duplicates..."):
            for title in redirects.values():
                source_id = page2id.pop(title, None)
                id2page.pop(source_id, None)
            for title, pageid in disambiguations.items():
                page2id.pop(title, None)
                id2page.pop(pageid, None)
        with msg.loading("Building graph..."):
            adjacency = _edgelist2adjacency(category_links)
    finally:
        msg.no_print = msg_no_print
    return page2id, redirects, disambiguations, cat2id, adjacency
def get_model_pkgs(silent: bool = False) -> Tuple[dict, dict]:
    """Fetch the compatibility table and describe the installed models.

    silent: Suppress printed output.

    Returns (pkgs, compat): `pkgs` maps each installed package name to its
    name, version, spaCy version and a compatibility flag; `compat` is the
    "spacy" section of the downloaded table with versions reformatted.
    Exits with status 1 if the table can't be fetched.
    """
    msg = Printer(no_print=silent, pretty=not silent)
    with msg.loading("Loading compatibility table..."):
        r = requests.get(about.__compatibility__)
        if r.status_code != 200:
            msg.fail(
                f"Server error ({r.status_code})",
                "Couldn't fetch compatibility table.",
                exits=1,
            )
    msg.good("Loaded compatibility table")
    compat = r.json()["spacy"]
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", message="\\[W09[45]")
        installed_models = get_installed_models()
    # Iterate a shallow copy; only the inner lists are replaced.
    for spacy_v, models in dict(compat).items():
        for model, model_vs in models.items():
            compat[spacy_v][model] = [reformat_version(v) for v in model_vs]
    pkgs = {}
    for pkg_name in installed_models:
        package = pkg_name.replace("-", "_")
        version = get_package_version(pkg_name)
        if package in compat:
            is_compat = version in compat[package]
            spacy_version = about.__version__
        else:
            model_path = get_package_path(package)
            with warnings.catch_warnings():
                warnings.filterwarnings("ignore", message="\\[W09[45]")
                model_meta = get_model_meta(model_path)
            spacy_version = model_meta.get("spacy_version", "n/a")
            is_compat = is_compatible_version(  # type: ignore[assignment]
                about.__version__, spacy_version)
        pkgs[pkg_name] = {
            "name": package,
            "version": version,
            "spacy": spacy_version,
            "compat": is_compat,
        }
    return pkgs, compat
def _add_category_links(g, pages, cat2id, **kwargs):
    """Add category-link edges to graph `g`.

    A link is kept only when its target title is in `pages`, its source
    vertex can be found, and the source is not a category whose own title is
    missing from `pages`. Links whose lookups fail are silently dropped.
    """
    edges = []
    links = dt.iter_categorylinks_dump_data(**kwargs)
    for _, source_id, target_title in links:
        if target_title in pages:
            try:
                vertex = g.vs.find(source_id)
                unlisted_category = (
                    vertex["kind"] == dt.WIKI_NS_KIND_CATEGORY
                    and vertex["title"] not in pages
                )
                if unlisted_category:
                    continue
                target_id = cat2id[target_title]
            except (KeyError, ValueError):
                continue
            else:
                # Kept outside the try so a missing "name" still surfaces.
                edges.append((vertex["name"], target_id))
    with msg.loading("adding category edges..."):
        g.add_edges(edges)
def cli_print_problems(environment: str, difficulty: str, number: int):
    """Print a set of generated problems from a given environment.

    This is useful when developing new environment types, for verifying
    that the problems you're generating take the form you expect.

    Each problem is parsed once; parse failures are shown in the table
    rather than raised.
    """
    import gym
    from mathy_envs.gym import MathyGymEnv

    env_name = f"mathy-{environment}-{difficulty}-v0"
    env: MathyGymEnv = gym.make(env_name)  # type:ignore
    msg.divider(env_name)
    with msg.loading(f"Generating {number} problems..."):
        header = ("Complexity", "Is Valid", "Text")
        widths = (10, 8, 62)
        aligns = ("c", "c", "l")
        data = []
        for _ in range(number):
            _, problem = env.mathy.get_initial_state(env.env_problem_args,
                                                     print_problem=False)
            valid = False
            text = problem.text
            # `Exception`, not `BaseException`: Ctrl-C and SystemExit must
            # still be able to stop the command.
            try:
                env.mathy.parser.parse(problem.text)
                valid = True
            except Exception as error:
                text = f"parse failed for '{problem.text}' with error: {error}"
            data.append((
                problem.complexity,
                "✔" if valid else "✘",
                text,
            ))
    msg.good(f"\nGenerated {number} problems!")
    print(
        msg.table(data,
                  header=header,
                  divider=True,
                  widths=widths,
                  aligns=aligns))
def wps(model, data):
    """
    Measure the processing speed in words per second and print a summary
    table. A larger corpus of raw text (e.g. a few million words) is
    recommended for a meaningful number.
    """
    with msg.loading(f"Loading model '{model}'..."):
        nlp = spacy.load(model)
    texts = (record["text"] for record in srsly.read_jsonl(data))
    doc_count = 0
    word_count = 0
    started = timer()
    for doc in nlp.pipe(texts):
        doc_count += 1
        word_count += len(doc)
    elapsed = timer() - started
    words_per_second = int(word_count / elapsed)
    rows = [
        ("Docs", f"{doc_count:,}"),
        ("Words", f"{word_count:,}"),
        ("Words/s", f"{words_per_second:,}"),
    ]
    msg.table(rows, widths=(7, 12), aligns=("l", "r"))
def build(
    # fmt: off
    repo: str,
    commit: str,
    package_name: str = Option(None, help="Package name (if different from repo)"),
    py35: bool = Option(False, "--py35", help="Build wheels for Python 3.5"),
    llvm: bool = Option(False, "--llvm", help="Requires LLVM to be installed"),
    rust: bool = Option(False, "--rust", help="Requires Rust to be installed"),
    universal: bool = Option(False, "--universal", help="Build universal (pure Python) wheel and sdist"),
    skip_tests: bool = Option(False, "--skip-tests", help="Don't run tests (e.g. if package doesn't have any)"),
    build_constraints: bool = Option(False, "--build-constraints", help="Use build constraints for build requirements"),
    # fmt: on
):
    """Build wheels for a given repo and commit / tag."""
    print(LOGO)
    repo_id = get_repo_id()
    user, package = repo.lower().split("/", 1)
    if package_name is None:
        package_name = package.replace("-", "_")
    msg.info(f"Building in repo {repo_id}")
    msg.info(f"Building wheels for {user}/{package}\n")
    if universal:
        msg.warn(
            "Building only universal sdist and wheel, no cross-platform wheels"
        )
    if skip_tests:
        msg.warn("Not running any tests")
    clone_url = DEFAULT_CLONE_TEMPLATE.format(f"{user}/{package}")
    gh_repo = get_gh().get_repo(repo_id)

    with msg.loading("Finding a unique name for this release..."):
        # Try "<package>-<commit>", then "-2", "-3", ... until the release
        # lookup reports that the name is unknown (i.e. still free).
        base_name = f"{package_name}-{commit}"
        attempt = 1
        while True:
            release_name = base_name if attempt == 1 else f"{base_name}-{attempt}"
            try:
                gh_repo.get_release(release_name)
            except github.UnknownObjectException:
                break
            attempt += 1
    branch_name = f"branch-for-{release_name}"

    build_options = {
        "llvm": llvm,
        "rust": rust,
        "py35": py35,
        "universal": universal,
        "skip_tests": skip_tests,
        "build_constraints": build_constraints,
    }
    upload_target = {
        "type": "github-release",
        "repo-id": repo_id,
        "release-id": release_name,
    }
    build_spec = {
        "clone-url": clone_url,
        "package-name": package_name,
        "commit": commit,
        "options": build_options,
        "upload-to": upload_target,
    }
    spec_json = json.dumps(build_spec)
    spec_json_pretty = json.dumps(build_spec, indent=4)

    msg.text(f"Creating release {release_name} to collect assets")
    release_text = f"https://github.com/{user}/{package}\n\n### Build spec\n\n```json\n{spec_json_pretty}\n```"
    release = gh_repo.create_git_release(release_name, release_name, release_text)

    with msg.loading("Creating build branch..."):
        # 'master' is a 'Commit'. 'master.commit' is a 'GitCommit'. These are
        # different types that are mostly *not* interchangeable:
        #   https://pygithub.readthedocs.io/en/latest/github_objects/Commit.html
        #   https://pygithub.readthedocs.io/en/latest/github_objects/GitCommit.html
        base_gitcommit = gh_repo.get_commit("master").commit
        spec_element = github.InputGitTreeElement(
            "build-spec.json",
            "100644",
            "blob",
            content=spec_json,
        )
        tree = gh_repo.create_git_tree([spec_element], base_gitcommit.tree)
        build_gitcommit = gh_repo.create_git_commit(
            f"Building: {release_name}", tree, [base_gitcommit]
        )
        gh_repo.create_git_ref(f"refs/heads/{branch_name}", build_gitcommit.sha)
    msg.good(f"Commit is {build_gitcommit.sha[:8]} in branch {branch_name}")
    msg.text(f"Release: {release.html_url}")
    msg.text(
        f"Checks: https://github.com/{repo_id}/commit/{build_gitcommit.sha}/checks"
    )
def train( lang, output_path, train_path, dev_path, raw_text=None, base_model=None, pipeline="tagger,parser,ner", replace_components=False, vectors=None, width=96, conv_depth=4, cnn_window=1, cnn_pieces=3, bilstm_depth=0, embed_rows=2000, n_iter=30, n_early_stopping=None, n_examples=0, use_gpu=-1, version="0.0.0", meta_path=None, init_tok2vec=None, parser_multitasks="", entity_multitasks="", noise_level=0.0, orth_variant_level=0.0, eval_beam_widths="", gold_preproc=False, learn_tokens=False, textcat_multilabel=False, textcat_arch="bow", textcat_positive_label=None, tag_map_path=None, omit_extra_lookups=False, verbose=False, debug=False, ): """ Train or update a spaCy model. Requires data to be formatted in spaCy's JSON format. To convert data from other formats, use the `spacy convert` command. """ util.fix_random_seed() util.set_env_log(verbose) # Make sure all files and paths exists if they are needed train_path = util.ensure_path(train_path) dev_path = util.ensure_path(dev_path) meta_path = util.ensure_path(meta_path) output_path = util.ensure_path(output_path) if raw_text is not None: raw_text = list(srsly.read_jsonl(raw_text)) if not train_path or not train_path.exists(): msg.fail("Training data not found", train_path, exits=1) if not dev_path or not dev_path.exists(): msg.fail("Development data not found", dev_path, exits=1) if meta_path is not None and not meta_path.exists(): msg.fail("Can't find model meta.json", meta_path, exits=1) meta = srsly.read_json(meta_path) if meta_path else {} if output_path.exists() and [ p for p in output_path.iterdir() if p.is_dir() ]: msg.warn( "Output directory is not empty", "This can lead to unintended side effects when saving the model. " "Please use an empty directory or a different path instead. 
If " "the specified output path doesn't exist, the directory will be " "created for you.", ) if not output_path.exists(): output_path.mkdir() msg.good("Created output directory: {}".format(output_path)) # Take dropout and batch size as generators of values -- dropout # starts high and decays sharply, to force the optimizer to explore. # Batch size starts at 1 and grows, so that we make updates quickly # at the beginning of training. dropout_rates = util.decaying( util.env_opt("dropout_from", 0.2), util.env_opt("dropout_to", 0.2), util.env_opt("dropout_decay", 0.0), ) batch_sizes = util.compounding( util.env_opt("batch_from", 100.0), util.env_opt("batch_to", 1000.0), util.env_opt("batch_compound", 1.001), ) if not eval_beam_widths: eval_beam_widths = [1] else: eval_beam_widths = [int(bw) for bw in eval_beam_widths.split(",")] if 1 not in eval_beam_widths: eval_beam_widths.append(1) eval_beam_widths.sort() has_beam_widths = eval_beam_widths != [1] # Set up the base model and pipeline. If a base model is specified, load # the model and make sure the pipeline matches the pipeline setting. If # training starts from a blank model, intitalize the language class. 
pipeline = [p.strip() for p in pipeline.split(",")] disabled_pipes = None pipes_added = False msg.text("Training pipeline: {}".format(pipeline)) if use_gpu >= 0: activated_gpu = None try: activated_gpu = set_gpu(use_gpu) except Exception as e: msg.warn("Exception: {}".format(e)) if activated_gpu is not None: msg.text("Using GPU: {}".format(use_gpu)) else: msg.warn("Unable to activate GPU: {}".format(use_gpu)) msg.text("Using CPU only") use_gpu = -1 base_components = [] if base_model: msg.text("Starting with base model '{}'".format(base_model)) nlp = util.load_model(base_model) if nlp.lang != lang: msg.fail( "Model language ('{}') doesn't match language specified as " "`lang` argument ('{}') ".format(nlp.lang, lang), exits=1, ) for pipe in pipeline: pipe_cfg = {} if pipe == "parser": pipe_cfg = {"learn_tokens": learn_tokens} elif pipe == "textcat": pipe_cfg = { "exclusive_classes": not textcat_multilabel, "architecture": textcat_arch, "positive_label": textcat_positive_label, } if pipe not in nlp.pipe_names: msg.text("Adding component to base model: '{}'".format(pipe)) nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg)) pipes_added = True elif replace_components: msg.text( "Replacing component from base model '{}'".format(pipe)) nlp.replace_pipe(pipe, nlp.create_pipe(pipe, config=pipe_cfg)) pipes_added = True else: if pipe == "textcat": textcat_cfg = nlp.get_pipe("textcat").cfg base_cfg = { "exclusive_classes": textcat_cfg["exclusive_classes"], "architecture": textcat_cfg["architecture"], "positive_label": textcat_cfg["positive_label"], } if base_cfg != pipe_cfg: msg.fail( "The base textcat model configuration does" "not match the provided training options. 
" "Existing cfg: {}, provided cfg: {}".format( base_cfg, pipe_cfg), exits=1, ) msg.text( "Extending component from base model '{}'".format(pipe)) base_components.append(pipe) disabled_pipes = nlp.disable_pipes( [p for p in nlp.pipe_names if p not in pipeline]) else: msg.text("Starting with blank model '{}'".format(lang)) lang_cls = util.get_lang_class(lang) nlp = lang_cls() for pipe in pipeline: if pipe == "parser": pipe_cfg = {"learn_tokens": learn_tokens} elif pipe == "textcat": pipe_cfg = { "exclusive_classes": not textcat_multilabel, "architecture": textcat_arch, "positive_label": textcat_positive_label, } else: pipe_cfg = {} nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg)) if tag_map_path is not None: tag_map = srsly.read_json(tag_map_path) # Replace tag map with provided mapping nlp.vocab.morphology.load_tag_map(tag_map) # Create empty extra lexeme tables so the data from spacy-lookups-data # isn't loaded if these features are accessed if omit_extra_lookups: nlp.vocab.lookups_extra = Lookups() nlp.vocab.lookups_extra.add_table("lexeme_cluster") nlp.vocab.lookups_extra.add_table("lexeme_prob") nlp.vocab.lookups_extra.add_table("lexeme_settings") if vectors: msg.text("Loading vector from model '{}'".format(vectors)) _load_vectors(nlp, vectors) # Multitask objectives multitask_options = [("parser", parser_multitasks), ("ner", entity_multitasks)] for pipe_name, multitasks in multitask_options: if multitasks: if pipe_name not in pipeline: msg.fail("Can't use multitask objective without '{}' in the " "pipeline".format(pipe_name)) pipe = nlp.get_pipe(pipe_name) for objective in multitasks.split(","): pipe.add_multitask_objective(objective) # Prepare training corpus msg.text("Counting training words (limit={})".format(n_examples)) corpus = GoldCorpus(train_path, dev_path, limit=n_examples) n_train_words = corpus.count_train() if base_model and not pipes_added: # Start with an existing model, use default optimizer optimizer = nlp.resume_training(device=use_gpu) 
else: # Start with a blank model, call begin_training cfg = {"device": use_gpu} cfg["conv_depth"] = conv_depth cfg["token_vector_width"] = width cfg["bilstm_depth"] = bilstm_depth cfg["cnn_maxout_pieces"] = cnn_pieces cfg["embed_size"] = embed_rows cfg["conv_window"] = cnn_window optimizer = nlp.begin_training(lambda: corpus.train_tuples, **cfg) nlp._optimizer = None # Load in pretrained weights if init_tok2vec is not None: components = _load_pretrained_tok2vec(nlp, init_tok2vec, base_components) msg.text("Loaded pretrained tok2vec for: {}".format(components)) # Verify textcat config if "textcat" in pipeline: textcat_labels = nlp.get_pipe("textcat").cfg.get("labels", []) if textcat_positive_label and textcat_positive_label not in textcat_labels: msg.fail( "The textcat_positive_label (tpl) '{}' does not match any " "label in the training data.".format(textcat_positive_label), exits=1, ) if textcat_positive_label and len(textcat_labels) != 2: msg.fail( "A textcat_positive_label (tpl) '{}' was provided for training " "data that does not appear to be a binary classification " "problem with two labels.".format(textcat_positive_label), exits=1, ) train_docs = corpus.train_docs( nlp, noise_level=noise_level, gold_preproc=gold_preproc, max_length=0, ignore_misaligned=True, ) train_labels = set() if textcat_multilabel: multilabel_found = False for text, gold in train_docs: train_labels.update(gold.cats.keys()) if list(gold.cats.values()).count(1.0) != 1: multilabel_found = True if not multilabel_found and not base_model: msg.warn("The textcat training instances look like they have " "mutually-exclusive classes. Remove the flag " "'--textcat-multilabel' to train a classifier with " "mutually-exclusive classes.") if not textcat_multilabel: for text, gold in train_docs: train_labels.update(gold.cats.keys()) if list(gold.cats.values()).count(1.0) != 1 and not base_model: msg.warn( "Some textcat training instances do not have exactly " "one positive label. 
Modifying training options to " "include the flag '--textcat-multilabel' for classes " "that are not mutually exclusive.") nlp.get_pipe("textcat").cfg["exclusive_classes"] = False textcat_multilabel = True break if base_model and set(textcat_labels) != train_labels: msg.fail( "Cannot extend textcat model using data with different " "labels. Base model labels: {}, training data labels: " "{}.".format(textcat_labels, list(train_labels)), exits=1, ) if textcat_multilabel: msg.text( "Textcat evaluation score: ROC AUC score macro-averaged across " "the labels '{}'".format(", ".join(textcat_labels))) elif textcat_positive_label and len(textcat_labels) == 2: msg.text("Textcat evaluation score: F1-score for the " "label '{}'".format(textcat_positive_label)) elif len(textcat_labels) > 1: if len(textcat_labels) == 2: msg.warn( "If the textcat component is a binary classifier with " "exclusive classes, provide '--textcat-positive-label' for " "an evaluation on the positive class.") msg.text( "Textcat evaluation score: F1-score macro-averaged across " "the labels '{}'".format(", ".join(textcat_labels))) else: msg.fail( "Unsupported textcat configuration. 
Use `spacy debug-data` " "for more information.") # fmt: off row_head, output_stats = _configure_training_output( pipeline, use_gpu, has_beam_widths) row_widths = [len(w) for w in row_head] row_settings = { "widths": row_widths, "aligns": tuple(["r" for i in row_head]), "spacing": 2 } # fmt: on print("") msg.row(row_head, **row_settings) msg.row(["-" * width for width in row_settings["widths"]], **row_settings) try: iter_since_best = 0 best_score = 0.0 for i in range(n_iter): train_docs = corpus.train_docs( nlp, noise_level=noise_level, orth_variant_level=orth_variant_level, gold_preproc=gold_preproc, max_length=0, ignore_misaligned=True, ) if raw_text: random.shuffle(raw_text) raw_batches = util.minibatch( (nlp.make_doc(rt["text"]) for rt in raw_text), size=8) words_seen = 0 with tqdm.tqdm(total=n_train_words, leave=False) as pbar: losses = {} for batch in util.minibatch_by_words(train_docs, size=batch_sizes): if not batch: continue docs, golds = zip(*batch) try: nlp.update( docs, golds, sgd=optimizer, drop=next(dropout_rates), losses=losses, ) except ValueError as e: err = "Error during training" if init_tok2vec: err += " Did you provide the same parameters during 'train' as during 'pretrain'?" msg.fail(err, "Original error message: {}".format(e), exits=1) if raw_text: # If raw text is available, perform 'rehearsal' updates, # which use unlabelled data to reduce overfitting. 
raw_batch = list(next(raw_batches)) nlp.rehearse(raw_batch, sgd=optimizer, losses=losses) if not int(os.environ.get("LOG_FRIENDLY", 0)): pbar.update(sum(len(doc) for doc in docs)) words_seen += sum(len(doc) for doc in docs) with nlp.use_params(optimizer.averages): util.set_env_log(False) epoch_model_path = output_path / ("model%d" % i) nlp.to_disk(epoch_model_path) nlp_loaded = util.load_model_from_path(epoch_model_path) for beam_width in eval_beam_widths: for name, component in nlp_loaded.pipeline: if hasattr(component, "cfg"): component.cfg["beam_width"] = beam_width dev_docs = list( corpus.dev_docs( nlp_loaded, gold_preproc=gold_preproc, ignore_misaligned=True, )) nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs) start_time = timer() scorer = nlp_loaded.evaluate(dev_docs, verbose=verbose) end_time = timer() if use_gpu < 0: gpu_wps = None cpu_wps = nwords / (end_time - start_time) else: gpu_wps = nwords / (end_time - start_time) # Only evaluate on CPU in the first iteration (for # timing) if GPU is enabled if i == 0: with Model.use_device("cpu"): nlp_loaded = util.load_model_from_path( epoch_model_path) for name, component in nlp_loaded.pipeline: if hasattr(component, "cfg"): component.cfg[ "beam_width"] = beam_width dev_docs = list( corpus.dev_docs( nlp_loaded, gold_preproc=gold_preproc, ignore_misaligned=True, )) start_time = timer() scorer = nlp_loaded.evaluate(dev_docs, verbose=verbose) end_time = timer() cpu_wps = nwords / (end_time - start_time) acc_loc = output_path / ("model%d" % i) / "accuracy.json" srsly.write_json(acc_loc, scorer.scores) # Update model meta.json meta["lang"] = nlp.lang meta["pipeline"] = nlp.pipe_names meta["spacy_version"] = ">=%s" % about.__version__ if beam_width == 1: meta["speed"] = { "nwords": nwords, "cpu": cpu_wps, "gpu": gpu_wps, } meta.setdefault("accuracy", {}) for component in nlp.pipe_names: for metric in _get_metrics(component): meta["accuracy"][metric] = scorer.scores[ metric] else: meta.setdefault("beam_accuracy", 
{}) meta.setdefault("beam_speed", {}) for component in nlp.pipe_names: for metric in _get_metrics(component): meta["beam_accuracy"][metric] = scorer.scores[ metric] meta["beam_speed"][beam_width] = { "nwords": nwords, "cpu": cpu_wps, "gpu": gpu_wps, } meta["vectors"] = { "width": nlp.vocab.vectors_length, "vectors": len(nlp.vocab.vectors), "keys": nlp.vocab.vectors.n_keys, "name": nlp.vocab.vectors.name, } meta.setdefault("name", "model%d" % i) meta.setdefault("version", version) meta["labels"] = nlp.meta["labels"] meta_loc = output_path / ("model%d" % i) / "meta.json" srsly.write_json(meta_loc, meta) util.set_env_log(verbose) progress = _get_progress( i, losses, scorer.scores, output_stats, beam_width=beam_width if has_beam_widths else None, cpu_wps=cpu_wps, gpu_wps=gpu_wps, ) if i == 0 and "textcat" in pipeline: textcats_per_cat = scorer.scores.get( "textcats_per_cat", {}) for cat, cat_score in textcats_per_cat.items(): if cat_score.get("roc_auc_score", 0) < 0: msg.warn( "Textcat ROC AUC score is undefined due to " "only one value in label '{}'.".format( cat)) msg.row(progress, **row_settings) # Early stopping if n_early_stopping is not None: current_score = _score_for_model(meta) if current_score < best_score: iter_since_best += 1 else: iter_since_best = 0 best_score = current_score if iter_since_best >= n_early_stopping: iter_current = i + 1 msg.text("Early stopping, best iteration " "is: {}".format(iter_current - iter_since_best)) msg.text("Best score = {}; Final iteration " "score = {}".format(best_score, current_score)) break except Exception as e: msg.warn( "Aborting and saving the final best model. 
" "Encountered exception: {}".format(e), exits=1, ) finally: best_pipes = nlp.pipe_names if disabled_pipes: disabled_pipes.restore() meta["pipeline"] = nlp.pipe_names with nlp.use_params(optimizer.averages): final_model_path = output_path / "model-final" nlp.to_disk(final_model_path) srsly.write_json(final_model_path / "meta.json", meta) meta_loc = output_path / "model-final" / "meta.json" final_meta = srsly.read_json(meta_loc) final_meta.setdefault("accuracy", {}) final_meta["accuracy"].update(meta.get("accuracy", {})) final_meta.setdefault("speed", {}) final_meta["speed"].setdefault("cpu", None) final_meta["speed"].setdefault("gpu", None) meta.setdefault("speed", {}) meta["speed"].setdefault("cpu", None) meta["speed"].setdefault("gpu", None) # combine cpu and gpu speeds with the base model speeds if final_meta["speed"]["cpu"] and meta["speed"]["cpu"]: speed = _get_total_speed( [final_meta["speed"]["cpu"], meta["speed"]["cpu"]]) final_meta["speed"]["cpu"] = speed if final_meta["speed"]["gpu"] and meta["speed"]["gpu"]: speed = _get_total_speed( [final_meta["speed"]["gpu"], meta["speed"]["gpu"]]) final_meta["speed"]["gpu"] = speed # if there were no speeds to update, overwrite with meta if (final_meta["speed"]["cpu"] is None and final_meta["speed"]["gpu"] is None): final_meta["speed"].update(meta["speed"]) # note: beam speeds are not combined with the base model if has_beam_widths: final_meta.setdefault("beam_accuracy", {}) final_meta["beam_accuracy"].update( meta.get("beam_accuracy", {})) final_meta.setdefault("beam_speed", {}) final_meta["beam_speed"].update(meta.get("beam_speed", {})) srsly.write_json(meta_loc, final_meta) msg.good("Saved model to output directory", final_model_path) with msg.loading("Creating best model..."): best_model_path = _collate_best_model(final_meta, output_path, best_pipes) msg.good("Created best model", best_model_path)
def debug_data(
    config_path: Path,
    *,
    config_overrides: Dict[str, Any] = {},
    ignore_warnings: bool = False,
    verbose: bool = False,
    no_format: bool = True,
    silent: bool = True,
):
    """
    Analyze the training and dev corpora referenced by a config and report
    statistics, warnings and errors for each supported pipeline component.

    The pipeline is loaded from the config, initialized with the training
    corpus, and both corpora are compiled into summary statistics once. The
    function then prints a general section (corpus sizes, train/dev overlap,
    vocab and vectors) followed by one section per component found in the
    pipeline: "ner", "textcat", "textcat_multilabel", "tagger",
    "morphologizer" and "parser".

    config_path: Path of the config file to load.
    config_overrides: Overrides passed on to util.load_config.
    ignore_warnings: Passed to the Printer as ignore_warnings.
    verbose: Show the additional detail messages (those printed with
        show=verbose).
    no_format: Disable pretty printing (the Printer gets pretty=not no_format).
    silent: Passed to the Printer as no_print.

    Calls sys.exit(1) at the end if at least one failure message was recorded.
    """
    msg = Printer(
        no_print=silent, pretty=not no_format, ignore_warnings=ignore_warnings
    )
    # Make sure all files and paths exists if they are needed
    with show_validation_error(config_path):
        # Hand over a copy so the shared default dict in the signature can
        # never be modified through this call.
        cfg = util.load_config(config_path, overrides=dict(config_overrides))
        nlp = util.load_model_from_config(cfg)
        config = nlp.config.interpolate()
        T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
    # Use original config here, not resolved version
    sourced_components = get_sourced_components(cfg)
    frozen_components = T["frozen_components"]
    resume_components = [p for p in sourced_components if p not in frozen_components]
    pipeline = nlp.pipe_names
    factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names]
    msg.divider("Data file validation")

    # Create the gold corpus to be able to better analyze data
    dot_names = [T["train_corpus"], T["dev_corpus"]]
    train_corpus, dev_corpus = resolve_dot_names(config, dot_names)

    nlp.initialize(lambda: train_corpus(nlp))
    msg.good("Pipeline can be initialized with data")

    train_dataset = list(train_corpus(nlp))
    dev_dataset = list(dev_corpus(nlp))
    msg.good("Corpus is loadable")

    # Create all gold data here to avoid iterating over the train_dataset constantly
    gold_train_data = _compile_gold(train_dataset, factory_names, nlp, make_proj=True)
    gold_train_unpreprocessed_data = _compile_gold(
        train_dataset, factory_names, nlp, make_proj=False
    )
    gold_dev_data = _compile_gold(dev_dataset, factory_names, nlp, make_proj=True)

    train_texts = gold_train_data["texts"]
    dev_texts = gold_dev_data["texts"]

    msg.divider("Training stats")
    msg.text(f"Language: {nlp.lang}")
    msg.text(f"Training pipeline: {', '.join(pipeline)}")
    if resume_components:
        msg.text(f"Components from other pipelines: {', '.join(resume_components)}")
    if frozen_components:
        msg.text(f"Frozen components: {', '.join(frozen_components)}")
    msg.text(f"{len(train_dataset)} training docs")
    msg.text(f"{len(dev_dataset)} evaluation docs")

    # Check the dataset itself: the compiled stats dict always has keys, so
    # testing its length could never detect an empty dev corpus.
    if not dev_dataset:
        msg.fail("No evaluation docs")
    overlap = len(train_texts.intersection(dev_texts))
    if overlap:
        msg.warn(f"{overlap} training examples also in evaluation data")
    else:
        msg.good("No overlap between training and evaluation data")
    # TODO: make this feedback more fine-grained and report on updated
    # components vs. blank components
    if not resume_components and len(train_dataset) < BLANK_MODEL_THRESHOLD:
        text = f"Low number of examples to train a new pipeline ({len(train_dataset)})"
        if len(train_dataset) < BLANK_MODEL_MIN_THRESHOLD:
            msg.fail(text)
        else:
            msg.warn(text)
        msg.text(
            f"It's recommended to use at least {BLANK_MODEL_THRESHOLD} examples "
            f"(minimum {BLANK_MODEL_MIN_THRESHOLD})",
            show=verbose,
        )

    msg.divider("Vocab & Vectors")
    n_words = gold_train_data["n_words"]
    msg.info(
        f"{n_words} total word(s) in the data ({len(gold_train_data['words'])} unique)"
    )
    if gold_train_data["n_misaligned_words"] > 0:
        n_misaligned = gold_train_data["n_misaligned_words"]
        msg.warn(f"{n_misaligned} misaligned tokens in the training data")
    if gold_dev_data["n_misaligned_words"] > 0:
        n_misaligned = gold_dev_data["n_misaligned_words"]
        msg.warn(f"{n_misaligned} misaligned tokens in the dev data")
    most_common_words = gold_train_data["words"].most_common(10)
    msg.text(
        f"10 most common words: {_format_labels(most_common_words, counts=True)}",
        show=verbose,
    )
    if len(nlp.vocab.vectors):
        msg.info(
            f"{len(nlp.vocab.vectors)} vectors ({nlp.vocab.vectors.n_keys} "
            f"unique keys, {nlp.vocab.vectors_length} dimensions)"
        )
        n_missing_vectors = sum(gold_train_data["words_missing_vectors"].values())
        # Guard against a corpus without any words.
        missing_pct = 100 * (n_missing_vectors / n_words) if n_words else 0.0
        msg.warn(
            "{} words in training data without vectors ({:.0f}%)".format(
                n_missing_vectors,
                missing_pct,
            ),
        )
        msg.text(
            "10 most common words without vectors: {}".format(
                _format_labels(
                    gold_train_data["words_missing_vectors"].most_common(10),
                    counts=True,
                )
            ),
            show=verbose,
        )
    else:
        msg.info("No word vectors present in the package")

    if "ner" in factory_names:
        # Get all unique NER labels present in the data
        labels = {
            label for label in gold_train_data["ner"] if label not in ("O", "-", None)
        }
        label_counts = gold_train_data["ner"]
        model_labels = _get_labels_from_model(nlp, "ner")
        has_low_data_warning = False
        has_no_neg_warning = False
        has_ws_ents_error = False
        has_boundary_cross_ents_warning = False

        msg.divider("Named Entity Recognition")
        msg.info(f"{len(model_labels)} label(s)")
        missing_values = label_counts["-"]
        msg.text(f"{missing_values} missing value(s) (tokens with '-' label)")
        for label in labels:
            if len(label) == 0:
                msg.fail("Empty label found in train data")
        msg.text(f"Labels in train data: {_format_labels(labels)}", show=verbose)
        missing_labels = model_labels - labels
        if missing_labels:
            msg.warn(
                "Some model labels are not present in the train data. The "
                "model performance may be degraded for these labels after "
                f"training: {_format_labels(missing_labels)}."
            )
        if gold_train_data["ws_ents"]:
            msg.fail(f"{gold_train_data['ws_ents']} invalid whitespace entity spans")
            has_ws_ents_error = True

        for label in labels:
            if label_counts[label] <= NEW_LABEL_THRESHOLD:
                msg.warn(
                    f"Low number of examples for label '{label}' ({label_counts[label]})"
                )
                has_low_data_warning = True

                with msg.loading("Analyzing label distribution..."):
                    neg_docs = _get_examples_without_label(train_dataset, label)
                if neg_docs == 0:
                    msg.warn(f"No examples for texts WITHOUT new label '{label}'")
                    has_no_neg_warning = True

        if gold_train_data["boundary_cross_ents"]:
            msg.warn(
                f"{gold_train_data['boundary_cross_ents']} entity span(s) crossing sentence boundaries"
            )
            has_boundary_cross_ents_warning = True

        if not has_low_data_warning:
            msg.good("Good amount of examples for all labels")
        if not has_no_neg_warning:
            msg.good("Examples without occurrences available for all labels")
        if not has_ws_ents_error:
            msg.good("No entities consisting of or starting/ending with whitespace")
        if not has_boundary_cross_ents_warning:
            msg.good("No entities crossing sentence boundaries")

        if has_low_data_warning:
            msg.text(
                f"To train a new entity type, your data should include at "
                f"least {NEW_LABEL_THRESHOLD} instances of the new label",
                show=verbose,
            )
        if has_no_neg_warning:
            msg.text(
                "Training data should always include examples of entities "
                "in context, as well as examples without a given entity "
                "type.",
                show=verbose,
            )
        if has_ws_ents_error:
            msg.text(
                "Entity spans consisting of or starting/ending "
                "with whitespace characters are considered invalid."
            )

    if "textcat" in factory_names:
        msg.divider("Text Classification (Exclusive Classes)")
        labels = _get_labels_from_model(nlp, "textcat")
        msg.info(f"Text Classification: {len(labels)} label(s)")
        msg.text(f"Labels: {_format_labels(labels)}", show=verbose)
        missing_labels = labels - set(gold_train_data["cats"])
        if missing_labels:
            msg.warn(
                "Some model labels are not present in the train data. The "
                "model performance may be degraded for these labels after "
                f"training: {_format_labels(missing_labels)}."
            )
        if set(gold_train_data["cats"]) != set(gold_dev_data["cats"]):
            msg.warn(
                "Potential train/dev mismatch: the train and dev labels are "
                "not the same. "
                f"Train labels: {_format_labels(gold_train_data['cats'])}. "
                f"Dev labels: {_format_labels(gold_dev_data['cats'])}."
            )
        if len(labels) < 2:
            msg.fail(
                "The model does not have enough labels. 'textcat' requires at "
                "least two labels due to mutually-exclusive classes, e.g. "
                "LABEL/NOT_LABEL or POSITIVE/NEGATIVE for a binary "
                "classification task."
            )
        if (
            gold_train_data["n_cats_bad_values"] > 0
            or gold_dev_data["n_cats_bad_values"] > 0
        ):
            msg.fail(
                "Unsupported values for cats: the supported values are "
                "1.0/True and 0.0/False."
            )
        if gold_train_data["n_cats_multilabel"] > 0:
            # Note: you should never get here because you run into E895 on
            # initialization first.
            msg.fail(
                "The train data contains instances without mutually-exclusive "
                "classes. Use the component 'textcat_multilabel' instead of "
                "'textcat'."
            )
        if gold_dev_data["n_cats_multilabel"] > 0:
            msg.fail(
                "The dev data contains instances without mutually-exclusive "
                "classes. Use the component 'textcat_multilabel' instead of "
                "'textcat'."
            )

    if "textcat_multilabel" in factory_names:
        msg.divider("Text Classification (Multilabel)")
        labels = _get_labels_from_model(nlp, "textcat_multilabel")
        msg.info(f"Text Classification: {len(labels)} label(s)")
        msg.text(f"Labels: {_format_labels(labels)}", show=verbose)
        missing_labels = labels - set(gold_train_data["cats"])
        if missing_labels:
            msg.warn(
                "Some model labels are not present in the train data. The "
                "model performance may be degraded for these labels after "
                f"training: {_format_labels(missing_labels)}."
            )
        if set(gold_train_data["cats"]) != set(gold_dev_data["cats"]):
            msg.warn(
                "Potential train/dev mismatch: the train and dev labels are "
                "not the same. "
                f"Train labels: {_format_labels(gold_train_data['cats'])}. "
                f"Dev labels: {_format_labels(gold_dev_data['cats'])}."
            )
        if (
            gold_train_data["n_cats_bad_values"] > 0
            or gold_dev_data["n_cats_bad_values"] > 0
        ):
            msg.fail(
                "Unsupported values for cats: the supported values are "
                "1.0/True and 0.0/False."
            )
        if gold_train_data["n_cats_multilabel"] > 0:
            if gold_dev_data["n_cats_multilabel"] == 0:
                msg.warn(
                    "Potential train/dev mismatch: the train data contains "
                    "instances without mutually-exclusive classes while the "
                    "dev data contains only instances with mutually-exclusive "
                    "classes."
                )
        else:
            msg.warn(
                "The train data contains only instances with "
                "mutually-exclusive classes. You can potentially use the "
                "component 'textcat' instead of 'textcat_multilabel'."
            )
            if gold_dev_data["n_cats_multilabel"] > 0:
                msg.fail(
                    "Train/dev mismatch: the dev data contains instances "
                    "without mutually-exclusive classes while the train data "
                    "contains only instances with mutually-exclusive classes."
                )

    if "tagger" in factory_names:
        msg.divider("Part-of-speech Tagging")
        label_list = list(gold_train_data["tags"])
        model_labels = _get_labels_from_model(nlp, "tagger")
        msg.info(f"{len(label_list)} label(s) in train data")
        labels = set(label_list)
        missing_labels = model_labels - labels
        if missing_labels:
            msg.warn(
                "Some model labels are not present in the train data. The "
                "model performance may be degraded for these labels after "
                f"training: {_format_labels(missing_labels)}."
            )
        labels_with_counts = _format_labels(
            gold_train_data["tags"].most_common(), counts=True
        )
        msg.text(labels_with_counts, show=verbose)

    if "morphologizer" in factory_names:
        msg.divider("Morphologizer (POS+Morph)")
        label_list = list(gold_train_data["morphs"])
        model_labels = _get_labels_from_model(nlp, "morphologizer")
        msg.info(f"{len(label_list)} label(s) in train data")
        labels = set(label_list)
        missing_labels = model_labels - labels
        if missing_labels:
            msg.warn(
                "Some model labels are not present in the train data. The "
                "model performance may be degraded for these labels after "
                f"training: {_format_labels(missing_labels)}."
            )
        labels_with_counts = _format_labels(
            gold_train_data["morphs"].most_common(), counts=True
        )
        msg.text(labels_with_counts, show=verbose)

    if "parser" in factory_names:
        has_low_data_warning = False
        msg.divider("Dependency Parsing")

        # profile sentence length (guarded so data without any sentences
        # reports 0.0 instead of raising ZeroDivisionError)
        n_sents = gold_train_data["n_sents"]
        avg_sent_length = gold_train_data["n_words"] / n_sents if n_sents else 0.0
        msg.info(
            f"Found {n_sents} sentence(s) with an average "
            f"length of {avg_sent_length:.1f} words."
        )

        # check for documents with multiple sentences
        n_texts = len(gold_train_data["texts"])
        sents_per_doc = n_sents / n_texts if n_texts else 0.0
        if sents_per_doc < 1.1:
            msg.warn(
                f"The training data contains {sents_per_doc:.2f} sentences per "
                f"document. When there are very few documents containing more "
                f"than one sentence, the parser will not learn how to segment "
                f"longer texts into sentences."
            )

        # profile labels
        labels_train = list(gold_train_data["deps"])
        labels_train_unpreprocessed = list(gold_train_unpreprocessed_data["deps"])
        labels_dev = list(gold_dev_data["deps"])

        if gold_train_unpreprocessed_data["n_nonproj"] > 0:
            n_nonproj = gold_train_unpreprocessed_data["n_nonproj"]
            msg.info(f"Found {n_nonproj} nonprojective train sentence(s)")
        if gold_dev_data["n_nonproj"] > 0:
            n_nonproj = gold_dev_data["n_nonproj"]
            msg.info(f"Found {n_nonproj} nonprojective dev sentence(s)")
        msg.info(f"{len(labels_train_unpreprocessed)} label(s) in train data")
        msg.info(f"{len(labels_train)} label(s) in projectivized train data")
        labels_with_counts = _format_labels(
            gold_train_unpreprocessed_data["deps"].most_common(), counts=True
        )
        msg.text(labels_with_counts, show=verbose)

        # rare labels in train
        for label, count in gold_train_unpreprocessed_data["deps"].items():
            if count <= DEP_LABEL_THRESHOLD:
                msg.warn(
                    f"Low number of examples for label '{label}' "
                    f"({count})"
                )
                has_low_data_warning = True

        # rare labels in projectivized train
        rare_projectivized_labels = [
            f"{label}: {count}"
            for label, count in gold_train_data["deps"].items()
            if count <= DEP_LABEL_THRESHOLD and DELIMITER in label
        ]

        if rare_projectivized_labels:
            msg.warn(
                f"Low number of examples for {len(rare_projectivized_labels)} "
                "label(s) in the projectivized dependency trees used for "
                "training. You may want to projectivize labels such as punct "
                "before training in order to improve parser performance."
            )
            msg.warn(
                "Projectivized labels with low numbers of examples: ",
                ", ".join(rare_projectivized_labels),
                show=verbose,
            )
            has_low_data_warning = True

        # labels only in train
        if set(labels_train) - set(labels_dev):
            msg.warn(
                "The following labels were found only in the train data:",
                ", ".join(set(labels_train) - set(labels_dev)),
                show=verbose,
            )

        # labels only in dev
        if set(labels_dev) - set(labels_train):
            msg.warn(
                "The following labels were found only in the dev data:",
                ", ".join(set(labels_dev) - set(labels_train)),
                show=verbose,
            )

        if has_low_data_warning:
            msg.text(
                f"To train a parser, your data should include at "
                f"least {DEP_LABEL_THRESHOLD} instances of each label.",
                show=verbose,
            )

        # multiple root labels
        if len(gold_train_unpreprocessed_data["roots"]) > 1:
            msg.warn(
                f"Multiple root labels "
                f"({', '.join(gold_train_unpreprocessed_data['roots'])}) "
                f"found in training data. spaCy's parser uses a single root "
                f"label ROOT so this distinction will not be available."
            )

        # these should not happen, but just in case
        if gold_train_data["n_nonproj"] > 0:
            msg.fail(
                f"Found {gold_train_data['n_nonproj']} nonprojective "
                f"projectivized train sentence(s)"
            )
        if gold_train_data["n_cycles"] > 0:
            msg.fail(
                f"Found {gold_train_data['n_cycles']} projectivized train sentence(s) with cycles"
            )

    msg.divider("Summary")
    good_counts = msg.counts[MESSAGES.GOOD]
    warn_counts = msg.counts[MESSAGES.WARN]
    fail_counts = msg.counts[MESSAGES.FAIL]
    if good_counts:
        msg.good(f"{good_counts} {'check' if good_counts == 1 else 'checks'} passed")
    if warn_counts:
        msg.warn(f"{warn_counts} {'warning' if warn_counts == 1 else 'warnings'}")
    if fail_counts:
        msg.fail(f"{fail_counts} {'error' if fail_counts == 1 else 'errors'}")
        sys.exit(1)
def main(vectors, gpu_id=-1, n_neighbors=100, batch_size=1024, cutoff=0, start=0, end=None):
    """
    Step 6: Precompute nearest-neighbor queries (optional)

    Precompute nearest-neighbor queries for every entry in the vocab to make
    Sense2Vec.most_similar faster. The --cutoff option lets you define the
    number of earliest rows to limit the neighbors to. For instance, if cutoff
    is 100000, no word will have a nearest neighbor outside of the top 100k
    vectors.

    vectors: Directory containing a "vectors" file; the result is written to a
        "cache" file in the same directory.
    gpu_id: -1 to compute with numpy, otherwise the cupy device ID to use.
    n_neighbors: Number of neighbors to keep per row (capped by the number of
        candidate rows).
    batch_size: Number of rows whose similarities are computed at once.
    cutoff: Only the first `cutoff` rows are neighbor candidates; values
        below 1 mean all rows.
    start: First row to compute neighbors for.
    end: Row to stop at (exclusive); defaults to, and is capped at, the number
        of vectors.
    """
    if gpu_id == -1:
        xp = numpy
    else:
        import cupy as xp
        import cupy.cuda.device

        # Patch in the module-level take_along_axis helper so that
        # xp.take_along_axis below resolves for cupy as well.
        cupy.take_along_axis = take_along_axis
        device = cupy.cuda.device.Device(gpu_id)
        device.use()
    vectors_dir = Path(vectors)
    vectors_file = vectors_dir / "vectors"
    if not vectors_dir.is_dir() or not vectors_file.exists():
        err = "Are you passing in the exported sense2vec directory containing a vectors file?"
        msg.fail(f"Can't load vectors from {vectors}", err, exits=1)
    with msg.loading(f"Loading vectors from {vectors}"):
        vectors = xp.load(str(vectors_file))
    msg.good(f"Loaded {vectors.shape[0]:,} vectors with dimension {vectors.shape[1]}")
    norms = xp.linalg.norm(vectors, axis=1, keepdims=True)
    # Avoid dividing all-zero rows by zero
    norms[norms == 0] = 1
    # Normalize to unit norm
    vectors /= norms
    n_rows = vectors.shape[0]
    if cutoff < 1:
        cutoff = n_rows
    if end is None:
        end = n_rows
    # Rows past the end of the table don't exist: without the cap the last
    # batch would be shorter than the output slice reserved for it.
    end = min(end, n_rows)
    mean = float(norms.mean())
    var = float(norms.var())
    msg.good(f"Normalized (mean {mean:,.2f}, variance {var:,.2f})")
    msg.info(f"Finding {n_neighbors:,} neighbors among {cutoff:,} most frequent")
    subset = vectors[:cutoff]
    # Only rows of the subset can be returned as neighbors, so the subset size
    # (not the full table size) bounds the number of neighbors per row.
    n = min(n_neighbors, subset.shape[0])
    best_rows = xp.zeros((end - start, n), dtype="i")
    scores = xp.zeros((end - start, n), dtype="f")
    for i in tqdm.tqdm(list(range(start, end, batch_size))):
        size = min(batch_size, end - i)
        batch = vectors[i : i + size]
        sims = xp.dot(batch, subset.T)
        # Set self-similarities to -inf, so that we don't return them.
        for j in range(size):
            if i + j < sims.shape[1]:
                sims[j, i + j] = -xp.inf
        # A full sort is used instead of a partial sort (argpartition): it
        # isn't really slower and is much simpler to read.
        ranks = xp.argsort(sims, axis=1)
        # Take the n highest-scoring columns and reverse them so the best
        # neighbor comes first.
        batch_rows = ranks[:, -n:][:, ::-1]
        batch_scores = xp.take_along_axis(sims, batch_rows, axis=1)
        # The output arrays only cover rows start..end, so positions are
        # relative to `start`, not absolute row numbers.
        offset = i - start
        best_rows[offset : offset + size] = batch_rows
        scores[offset : offset + size] = batch_scores
    msg.info("Saving output")
    # cupy arrays have to be copied back to host memory before serializing
    if not isinstance(best_rows, numpy.ndarray):
        best_rows = best_rows.get()
    if not isinstance(scores, numpy.ndarray):
        scores = scores.get()
    output = {
        "indices": best_rows,
        "scores": scores.astype("float16"),
        "start": start,
        "end": end,
        "cutoff": cutoff,
    }
    output_file = vectors_dir / "cache"
    with msg.loading("Saving output..."):
        srsly.write_msgpack(output_file, output)
    msg.good(f"Saved cache to {output_file}")
def main(
    model="./zh_vectors_web_ud_lg/model-final",
    new_model_name="zh_vectors_web_ud_clue_lg",
    output_dir="./zh_vectors_web_ud_clue_lg",
    train_path="./clue_spacy_train.jsonl",
    dev_path="./clue_spacy_dev.jsonl",
    meta_path="./meta.json",
    use_gpu=0,
    n_iter=50,
):
    """Set up the pipeline and entity recognizer, and train the new entity.

    model: Name or path of the spaCy model to load; None starts from a blank
        pipeline.
    new_model_name: Currently not used by this function.
    output_dir: Directory the per-epoch models ("model0", "model1", ...) and
        "model-final" are written to; created if it doesn't exist.
    train_path: Location of the training data.
    dev_path: Location of the development data.
    meta_path: Optional meta.json whose content is used as the base meta.
    use_gpu: Values below 0 record CPU timing only; otherwise the evaluation
        is timed a second time under Model.use_device("cpu").
    n_iter: Number of training iterations.
    """
    import tqdm

    random.seed(0)
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        # Without this branch `nlp` was unbound for model=None, although the
        # optimizer setup below explicitly handles that case.
        # NOTE(review): the language is inferred from the default model names
        # ("zh_...") - confirm.
        nlp = spacy.blank("zh")
        print("Created blank 'zh' model")
    # Add entity recognizer to model if it's not in the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner)
    # otherwise, get it, so we can add labels to it
    else:
        ner = nlp.get_pipe("ner")

    for label in LABEL:
        if label not in ner.labels:
            ner.add_label(label)  # add new entity label to entity recognizer

    train_path = ensure_path(train_path)
    dev_path = ensure_path(dev_path)
    # The default is a plain string, but everything below relies on Path
    # methods (exists, iterdir, mkdir, "/").
    output_dir = ensure_path(output_dir)
    if not train_path or not train_path.exists():
        msg.fail("Training data not found", train_path, exits=1)
    if not dev_path or not dev_path.exists():
        msg.fail("Development data not found", dev_path, exits=1)
    if output_dir.exists() and any(p.is_dir() for p in output_dir.iterdir()):
        msg.warn(
            "Output directory is not empty",
            "This can lead to unintended side effects when saving the model. "
            "Please use an empty directory or a different path instead. If "
            "the specified output path doesn't exist, the directory will be "
            "created for you.",
        )
    if not output_dir.exists():
        output_dir.mkdir()
    meta = srsly.read_json(meta_path) if meta_path else {}

    # Prepare training corpus
    msg.text("Counting training words (limit={})".format(0))
    corpus = GoldCorpus(train_path, dev_path, limit=0)
    n_train_words = corpus.count_train()

    if model is None:
        optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu)
    else:
        optimizer = create_default_optimizer(Model.ops)
    # Todo: gpu train?

    dropout_rates = decaying(0.2, 0.2, 0.0)
    batch_sizes = compounding(100.0, 1000.0, 1.001)

    # get names of other pipes to disable them during training
    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

    # Beam widths are not configurable here: only the greedy width 1 is
    # evaluated, so no beam-specific columns or meta entries are produced.
    eval_beam_widths = [1]
    has_beam_widths = False
    row_head, output_stats = _configure_training_output(
        ["ner"], use_gpu, has_beam_widths
    )
    row_widths = [len(w) for w in row_head]
    row_settings = {
        "widths": row_widths,
        "aligns": tuple(["r" for i in row_head]),
        "spacing": 2,
    }
    print("")
    msg.row(row_head, **row_settings)
    msg.row(["-" * width for width in row_settings["widths"]], **row_settings)
    try:
        noise_level = 0.0
        orth_variant_level = 0.0
        gold_preproc = False
        verbose = False

        with nlp.disable_pipes(*other_pipes):  # only train NER
            for itn in range(n_iter):
                train_docs = corpus.train_docs(
                    nlp,
                    noise_level=noise_level,
                    orth_variant_level=orth_variant_level,
                    gold_preproc=gold_preproc,
                    max_length=0,
                    ignore_misaligned=True,
                )
                with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
                    losses = {}
                    for batch in minibatch_by_words(train_docs, size=batch_sizes):
                        if not batch:
                            continue
                        docs, golds = zip(*batch)
                        nlp.update(
                            docs,
                            golds,
                            sgd=optimizer,
                            drop=next(dropout_rates),
                            losses=losses,
                        )
                        if not int(os.environ.get("LOG_FRIENDLY", 0)):
                            pbar.update(sum(len(doc) for doc in docs))

                # Save the averaged parameters as this epoch's model, then
                # evaluate a freshly loaded copy of it.
                with nlp.use_params(optimizer.averages):
                    set_env_log(False)
                    epoch_model_path = output_dir / ("model%d" % itn)
                    nlp.to_disk(epoch_model_path)
                    nlp_loaded = load_model_from_path(epoch_model_path)
                    for beam_width in eval_beam_widths:
                        for _, component in nlp_loaded.pipeline:
                            if hasattr(component, "cfg"):
                                component.cfg["beam_width"] = beam_width
                        dev_docs = list(
                            corpus.dev_docs(
                                nlp_loaded,
                                gold_preproc=gold_preproc,
                                ignore_misaligned=True,
                            )
                        )
                        nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
                        start_time = timer()
                        scorer = nlp_loaded.evaluate(dev_docs, verbose=verbose)
                        end_time = timer()
                        if use_gpu < 0:
                            gpu_wps = None
                            cpu_wps = nwords / (end_time - start_time)
                        else:
                            gpu_wps = nwords / (end_time - start_time)
                            # Evaluate once more on CPU to get the CPU speed
                            with Model.use_device("cpu"):
                                nlp_loaded = load_model_from_path(epoch_model_path)
                                for _, component in nlp_loaded.pipeline:
                                    if hasattr(component, "cfg"):
                                        component.cfg["beam_width"] = beam_width
                                dev_docs = list(
                                    corpus.dev_docs(
                                        nlp_loaded,
                                        gold_preproc=gold_preproc,
                                        ignore_misaligned=True,
                                    )
                                )
                                start_time = timer()
                                scorer = nlp_loaded.evaluate(dev_docs, verbose=verbose)
                                end_time = timer()
                                cpu_wps = nwords / (end_time - start_time)
                        acc_loc = output_dir / ("model%d" % itn) / "accuracy.json"
                        srsly.write_json(acc_loc, scorer.scores)

                        # Update model meta.json
                        meta["lang"] = nlp.lang
                        meta["pipeline"] = nlp.pipe_names
                        meta["spacy_version"] = ">=%s" % spacy.__version__
                        if beam_width == 1:
                            meta["speed"] = {
                                "nwords": nwords,
                                "cpu": cpu_wps,
                                "gpu": gpu_wps,
                            }
                            meta["accuracy"] = scorer.scores
                        else:
                            meta.setdefault("beam_accuracy", {})
                            meta.setdefault("beam_speed", {})
                            meta["beam_accuracy"][beam_width] = scorer.scores
                            meta["beam_speed"][beam_width] = {
                                "nwords": nwords,
                                "cpu": cpu_wps,
                                "gpu": gpu_wps,
                            }
                        meta["vectors"] = {
                            "width": nlp.vocab.vectors_length,
                            "vectors": len(nlp.vocab.vectors),
                            "keys": nlp.vocab.vectors.n_keys,
                            "name": nlp.vocab.vectors.name,
                        }
                        meta.setdefault("name", "model%d" % itn)
                        meta.setdefault("version", "0.0.1")
                        meta["labels"] = nlp.meta["labels"]
                        meta_loc = output_dir / ("model%d" % itn) / "meta.json"
                        srsly.write_json(meta_loc, meta)
                        set_env_log(verbose)

                        progress = _get_progress(
                            itn,
                            losses,
                            scorer.scores,
                            output_stats,
                            beam_width=beam_width if has_beam_widths else None,
                            cpu_wps=cpu_wps,
                            gpu_wps=gpu_wps,
                        )
                        msg.row(progress, **row_settings)
    finally:
        # Runs on normal completion as well as on errors/interrupts, so the
        # current state is always saved as "model-final".
        with nlp.use_params(optimizer.averages):
            final_model_path = output_dir / "model-final"
            nlp.to_disk(final_model_path)
        msg.good("Saved model to output directory", final_model_path)
        meta["pipeline"] = nlp.pipe_names
        meta["labels"] = nlp.meta["labels"]
        meta["factories"] = nlp.meta["factories"]
        with msg.loading("Creating best model..."):
            best_model_path = _collate_best_model(meta, output_dir, nlp.pipe_names)
        msg.good("Created best model", best_model_path)
def validate():
    """
    Check that the installed models are compatible with the currently
    installed version of spaCy. Should be run after `pip install -U spacy`.

    Downloads the compatibility table, prints one table row per model package
    and model link found, and lists download commands for incompatible
    packages that have a release for the current version. Exits with status 1
    if the table can't be fetched, if the current spaCy version is missing
    from it, or if any incompatible model or link was found.
    """
    table_url = about.__compatibility__
    with msg.loading("Loading compatibility table..."):
        response = requests.get(table_url)
        if response.status_code != 200:
            msg.fail(
                "Server error ({})".format(response.status_code),
                "Couldn't fetch compatibility table.",
                exits=1,
            )
    msg.good("Loaded compatibility table")
    compat = response.json()["spacy"]
    # Development builds are looked up under their base release version.
    version = about.__version__.rsplit(".dev", 1)[0]
    current_compat = compat.get(version)
    if not current_compat:
        msg.fail(
            "Can't find spaCy v{} in compatibility table".format(version),
            table_url,
            exits=1,
        )

    # Gather every model name in the table and reformat the version strings
    # in place, for all spaCy versions and not only the current one.
    all_models = set()
    for models in compat.values():
        all_models.update(models)
        for model_name, model_versions in list(models.items()):
            models[model_name] = [reformat_version(v) for v in model_versions]

    model_links = get_model_links(current_compat)
    model_pkgs = get_model_pkgs(current_compat, all_models)
    incompat_links = {
        link for link, data in model_links.items() if not data["compat"]
    }
    incompat_models = {
        data["name"] for data in model_pkgs.values() if not data["compat"]
    }
    incompat_models.update(
        data["name"] for data in model_links.values() if not data["compat"]
    )

    # Split the incompatible models into those that can be updated for this
    # spaCy version and those that have no entry for it at all.
    na_models = []
    update_models = []
    for model_name in incompat_models:
        if model_name in current_compat:
            update_models.append(model_name)
        else:
            na_models.append(model_name)

    spacy_dir = Path(__file__).parents[1]
    msg.divider("Installed models (spaCy v{})".format(about.__version__))
    msg.info("spaCy installation: {}".format(path2str(spacy_dir)))

    if not (model_links or model_pkgs):
        msg.text("No models found in your current environment.", exits=0)
    else:
        rows = [
            get_model_row(current_compat, name, data, msg)
            for name, data in model_pkgs.items()
        ]
        rows.extend(
            get_model_row(current_compat, name, data, msg, "link")
            for name, data in model_links.items()
        )
        msg.table(rows, header=("TYPE", "NAME", "MODEL", "VERSION", ""))

    if update_models:
        msg.divider("Install updates")
        msg.text("Use the following commands to update the model packages:")
        cmd = "python -m spacy download {}"
        commands = [cmd.format(pkg) for pkg in update_models]
        print("\n".join(commands) + "\n")
    if na_models:
        msg.text("The following models are not available for spaCy "
                 "v{}: {}".format(about.__version__, ", ".join(na_models)))
    if incompat_links:
        msg.text(
            "You may also want to overwrite the incompatible links using the "
            "`python -m spacy link` command with `--force`, or remove them "
            "from the data directory. "
            "Data path: {path}".format(path=path2str(get_data_path())))
    # Signal the incompatibility to the calling shell.
    if incompat_models or incompat_links:
        sys.exit(1)
def create_wikigraph(
    output_path: Path,
    wiki="en",
    version="latest",
    dumps_path: Path = None,
    max_workers: int = None,
    silent: bool = None,
    force: bool = None,
):
    """
    Create a `WikiGraph` from a specific dump.
    It can then be used by directly loading it, or
    it can be packaged with the `package-wikigraph` command.

    The graph is written to `<output_path>/<wiki>wiki_core`, together with
    a `meta.json` file describing it.

    Parameters
    ----------
    output_path : Path
        Path in which to store the `WikiGraph`. It is created, including
        any missing parent directories, if it does not exist.
    wiki : str, optional
        Wikipedia dump type to use, by default "en".
    version : str, optional
        Wikipedia dump version to use, by default "latest".
    dumps_path : Path, optional
        Path in which to find previously downloaded dumps,
        or where to save dumps downloaded in this call, by default None.
    max_workers : int, optional
        Maximum number of processes to use, by default None.
    silent : bool, optional
        Do not print anything in stdout, by default None.
    force : bool, optional
        Overwrite content in output_path, if any, by default None.
    """
    if not output_path.exists():
        # parents=True lets a nested, not-yet-existing output path work;
        # exist_ok=True tolerates the directory appearing in the meantime.
        output_path.mkdir(parents=True, exist_ok=True)
        msg.good(f"Created output directory: {output_path}")
    graph_name = f"{wiki}wiki_core"
    graph_path = output_path.joinpath(graph_name)
    # Refuse to touch an existing graph directory unless explicitly forced.
    if not force and graph_path.exists():
        msg.fail(
            f"Output path already contains {graph_name} directory",
            "Use --force to overwrite it",
            exits=1,
        )
    kwargs = {
        "dumps_path": dumps_path,
        "max_workers": max_workers,
        "wiki": wiki,
        "version": version,
        "verbose": not silent,
    }
    wg = WikiGraph.build(**kwargs)
    # The directory is only created once the graph has been built.
    graph_path.mkdir(exist_ok=True)
    graph_format = "picklez"
    with msg.loading("dump to disk..."):
        wg.dump(graph_path, graph_format=graph_format)
    meta = get_meta()
    meta["name"] = graph_name
    meta["version"] = wg.version
    meta["graph_format"] = graph_format
    meta["spikex_version"] = f">={spikex_version}"
    meta["fullname"] = f"{graph_name}-{spikex_version}"
    meta["sources"].append("Wikipedia")
    meta_path = graph_path.joinpath("meta.json")
    meta_path.write_text(json_dumps(meta, indent=2))
    msg.good(f"Successfully created {graph_name}.")
def pretrain(
    texts_loc,
    vectors_model,
    output_dir,
    width=96,
    conv_depth=4,
    bilstm_depth=0,
    cnn_pieces=3,
    sa_depth=0,
    use_chars=False,
    cnn_window=1,
    embed_rows=2000,
    loss_func="cosine",
    use_vectors=False,
    dropout=0.2,
    n_iter=1000,
    batch_size=3000,
    max_length=500,
    min_length=5,
    seed=0,
    n_save_every=None,
    init_tok2vec=None,
    epoch_start=None,
):
    """
    Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components,
    using an approximate language-modelling objective. Specifically, we load
    pretrained vectors, and train a component like a CNN, BiLSTM, etc to predict
    vectors which match the pretrained ones. The weights are saved to a directory
    after each epoch. You can then pass a path to one of these pretrained weights
    files to the 'spacy train' command.

    This technique may be especially helpful if you have little labelled data.
    However, it's still quite experimental, so your mileage may vary.

    To load the weights back in during 'spacy train', you need to ensure
    all settings are the same between pretraining and training. The API and
    errors around this need some improvement.

    texts_loc: Path to a JSONL file with the input texts, or "-" to read
        the texts from stdin.
    vectors_model: Name or path handed to `util.load_model`.
    output_dir: Directory that receives `config.json`, the per-epoch
        `model<N>.bin` weight files and `log.jsonl`.
    width, embed_rows, conv_depth, bilstm_depth, cnn_pieces, use_chars:
        Settings forwarded to `Tok2Vec`.
    sa_depth, cnn_window: Saved to `config.json`; not otherwise used in
        this function.
    loss_func: Objective forwarded to `make_update`.
    use_vectors: If true, the name of the loaded model's vectors is passed
        to `Tok2Vec` as `pretrained_vectors`.
    dropout: Dropout rate forwarded to `make_update`.
    n_iter: Number of epochs to run.
    batch_size: Batch size forwarded to `util.minibatch_by_words`.
    max_length, min_length: Length limits forwarded to `make_docs`.
    seed: Random seed.
    n_save_every: If set, a temporary weight file is also written every
        `n_save_every` batches within an epoch.
    init_tok2vec: Optional path to pretrained weights to continue from.
    epoch_start: Epoch to start counting from. Only used with
        `init_tok2vec`, and required (>= 0) if that file's name does not
        match `model<N>.bin`; otherwise it is derived from the file name.
    """
    # Must stay the first statement: at this point locals() holds exactly the
    # call arguments, which are saved as the run's configuration.
    config = dict(locals())
    for key in config:
        if isinstance(config[key], Path):
            config[key] = str(config[key])
    util.fix_random_seed(seed)

    has_gpu = prefer_gpu()
    if has_gpu:
        import torch

        torch.set_default_tensor_type("torch.cuda.FloatTensor")
    msg.info("Using GPU" if has_gpu else "Not using GPU")

    output_dir = Path(output_dir)
    if output_dir.exists() and any(output_dir.iterdir()):
        msg.warn(
            "Output directory is not empty",
            "It is better to use an empty directory or refer to a new output path, "
            "then the new directory will be created for you.",
        )
    if not output_dir.exists():
        output_dir.mkdir()
        msg.good("Created output directory: {}".format(output_dir))
    srsly.write_json(output_dir / "config.json", config)
    msg.good("Saved settings to config.json")

    # Load texts from file or stdin
    if texts_loc != "-":  # reading from a file
        texts_loc = Path(texts_loc)
        if not texts_loc.exists():
            msg.fail("Input text file doesn't exist", texts_loc, exits=1)
        with msg.loading("Loading input texts..."):
            texts = list(srsly.read_jsonl(texts_loc))
        if not texts:
            msg.fail("Input file is empty", texts_loc, exits=1)
        msg.good("Loaded input texts")
        random.shuffle(texts)
    else:  # reading from stdin
        msg.text("Reading input text from stdin...")
        texts = srsly.read_jsonl("-")

    with msg.loading("Loading model '{}'...".format(vectors_model)):
        nlp = util.load_model(vectors_model)
    msg.good("Loaded model '{}'".format(vectors_model))
    pretrained_vectors = None if not use_vectors else nlp.vocab.vectors.name
    model = create_pretraining_model(
        nlp,
        Tok2Vec(
            width,
            embed_rows,
            conv_depth=conv_depth,
            pretrained_vectors=pretrained_vectors,
            bilstm_depth=bilstm_depth,  # Requires PyTorch. Experimental.
            subword_features=not use_chars,  # Set to False for Chinese etc
            cnn_maxout_pieces=cnn_pieces,  # If set to 1, use Mish activation.
        ),
    )
    # Load in pretrained weights
    if init_tok2vec is not None:
        components = _load_pretrained_tok2vec(nlp, init_tok2vec)
        msg.text("Loaded pretrained tok2vec for: {}".format(components))
        # Parse the epoch number from the given weight file
        model_name = re.search(r"model\d+\.bin", str(init_tok2vec))
        if model_name:
            # Default weight file name so read epoch_start from it by cutting off 'model' and '.bin'
            epoch_start = int(model_name.group(0)[5:][:-4]) + 1
        else:
            # Compare against None rather than truthiness: 0 is a valid
            # starting epoch and must not be reported as a missing argument.
            if epoch_start is None:
                msg.fail(
                    "You have to use the '--epoch-start' argument when using a renamed weight file for "
                    "'--init-tok2vec'",
                    exits=True,
                )
            elif epoch_start < 0:
                msg.fail(
                    "The argument '--epoch-start' has to be greater or equal to 0. '%d' is invalid"
                    % epoch_start,
                    exits=True,
                )
    else:
        # Without '--init-tok2vec' the '--epoch-start' argument is ignored
        epoch_start = 0

    optimizer = create_default_optimizer(model.ops)
    tracker = ProgressTracker(frequency=10000)
    msg.divider("Pre-training tok2vec layer - starting at epoch %d" % epoch_start)
    row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")}
    msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings)

    def _save_model(epoch, is_temp=False):
        # Write the tok2vec weights (using the optimizer's averaged
        # parameters) and append one progress record to log.jsonl.
        is_temp_str = ".temp" if is_temp else ""
        with model.use_params(optimizer.averages):
            with (output_dir / ("model%d%s.bin" % (epoch, is_temp_str))).open(
                "wb"
            ) as file_:
                file_.write(model.tok2vec.to_bytes())
            log = {
                "nr_word": tracker.nr_word,
                "loss": tracker.loss,
                "epoch_loss": tracker.epoch_loss,
                "epoch": epoch,
            }
            with (output_dir / "log.jsonl").open("a") as file_:
                file_.write(srsly.json_dumps(log) + "\n")

    skip_counter = 0
    for epoch in range(epoch_start, n_iter + epoch_start):
        for batch_id, batch in enumerate(
            util.minibatch_by_words(((text, None) for text in texts), size=batch_size)
        ):
            docs, count = make_docs(
                nlp,
                [text for (text, _) in batch],
                max_length=max_length,
                min_length=min_length,
            )
            skip_counter += count
            loss = make_update(
                model, docs, optimizer, objective=loss_func, drop=dropout
            )
            progress = tracker.update(epoch, loss, docs)
            if progress:
                msg.row(progress, **row_settings)
                # When reading from stdin, end the epoch once 10 ** 7 words
                # have been seen in it.
                if texts_loc == "-" and tracker.words_per_epoch[epoch] >= 10 ** 7:
                    break
            if n_save_every and (batch_id % n_save_every == 0):
                _save_model(epoch, is_temp=True)
        _save_model(epoch)
        tracker.epoch_loss = 0.0
        if texts_loc != "-":
            # Reshuffle the texts if texts were loaded from a file
            random.shuffle(texts)
    if skip_counter > 0:
        msg.warn("Skipped {count} empty values".format(count=str(skip_counter)))
    msg.good("Successfully finished pretrain")
def run(self):
    """Announce the extraction, record the start time and run `self.main()`.

    The start time is stored on `self.start_time` as a
    `time.perf_counter()` reading. The result of `self.main()` is driven to
    completion with `asyncio.run` while a loading message is shown.
    """
    msg.info("Extracción de datos de los artículos del corpus.")
    self.start_time = time.perf_counter()
    with msg.loading("Analizando..."):
        main_task = self.main()
        asyncio.run(main_task)
def speech_to_text_demo(asr: "ASROnlineAudioFrame") -> None:
    """Speech to Text (ASR) Microphone Demo.

    Lists the available audio input devices, asks for a device ID on
    stdin (re-prompting until a listed ID is entered), then feeds the
    microphone audio to ``asr.transcribe`` and prints the returned text
    until the stream stops or is interrupted. The audio stream and the
    PyAudio instance are released on every exit path.

    Interrupt the notebook's kernel to stop the app from recording.
    """
    asr.reset()
    audio = pyaudio.PyAudio()
    stop_reason = None
    try:
        # Keep only devices that report at least one input channel.
        columns = []
        for idx in range(audio.get_device_count()):
            device = audio.get_device_info_by_index(idx)
            if device.get("maxInputChannels"):
                columns.append((idx, device.get("name")))
        if not columns:
            msg.fail("ERROR", "No audio input device found.")
            return
        devices = [idx for idx, _ in columns]
        msg.good("Found the following input devices!")
        msg.table(columns, header=("ID", "Devices"), divider=True)

        device_index = None
        while device_index not in devices:
            msg.info("Please enter the device ID")
            try:
                device_index = int(input())
            except ValueError:
                # Non-numeric input: ask again instead of crashing.
                device_index = None

        # Mutable holder so the callback can update the counter in place.
        offset = {"count": 0}

        def callback(in_data, frame_count, time_info, status):
            signal = np.frombuffer(in_data, dtype=np.int16)
            text = asr.transcribe(signal)
            if text:
                print(text, end="")
                offset["count"] = asr.params.offset
            elif offset["count"] > 0:
                # Count down the chunks without text; print a single space
                # once the counter reaches zero.
                offset["count"] -= 1
                if offset["count"] == 0:
                    print(" ", end="")
            return (in_data, pyaudio.paContinue)

        stream = audio.open(
            input=True,
            format=pyaudio.paInt16,
            input_device_index=device_index,
            stream_callback=callback,
            channels=asr.params.channels,
            rate=asr.params.sample_rate,
            frames_per_buffer=asr.chunk_size,
        )
        try:
            # `msg.loading` is only used as a context manager elsewhere in
            # this file; called bare it would not display anything, so a
            # plain info line is printed instead.
            msg.info("Listening...")
            stream.start_stream()
            try:
                while stream.is_active():
                    time.sleep(0.1)
            except (KeyboardInterrupt, Exception) as e:
                # Remember why the loop ended; it is reported after cleanup.
                stop_reason = e
        finally:
            stream.stop_stream()
            stream.close()
    finally:
        audio.terminate()
    if stop_reason is not None:
        msg.warn("WARNING: ASR stream stopped.", stop_reason)