def train_model(
    model,
    train_path,
    eval_path,
    n_iter=10,
    output=None,
    tok2vec=None,
):
    """
    Train an NER model from Prodigy annotations and optionally save the
    best-scoring model to disk.

    model (str): Name/path of a model to load, or "blank:<lang>" to start
        from a blank pipeline.
    train_path / eval_path: JSONL files passed through `format_data`.
    n_iter (int): Number of passes over the training data.
    output: Location for the best model. Nothing is saved if falsy.
    tok2vec: Optional path to pretrained tok2vec weights. When given, the
        NER component is initialised with a fixed tok2vec configuration
        and the weights are loaded via `_load_pretrained_tok2vec`.
    """
    spacy.util.fix_random_seed(0)
    with msg.loading(f"Loading '{model}'..."):
        prefix = "blank:"
        if model.startswith(prefix):
            nlp = spacy.blank(model.replace(prefix, ""))
        else:
            nlp = spacy.load(model)
    msg.good(f"Loaded model '{model}'")

    train_data, labels = format_data(srsly.read_jsonl(train_path))
    eval_data, _ = format_data(srsly.read_jsonl(eval_path))

    ner = nlp.create_pipe("ner")
    for label in labels:
        ner.add_label(label)
    nlp.add_pipe(ner)

    # Only used when pretrained tok2vec weights are supplied.
    t2v_cfg = {
        "embed_rows": 10000,
        "token_vector_width": 128,
        "conv_depth": 8,
        "nr_feature_tokens": 3,
    }
    component_cfg = {"ner": t2v_cfg} if tok2vec else {}
    optimizer = nlp.begin_training(component_cfg=component_cfg)
    if tok2vec:
        _load_pretrained_tok2vec(nlp, Path(tok2vec))

    batch_size = spacy.util.compounding(1.0, 16.0, 1.001)
    best_acc = 0
    best_model = None
    row_widths = (2, 8, 8, 8, 8)
    msg.row(("#", "L", "P", "R", "F"), widths=row_widths)
    for epoch in range(1, n_iter + 1):
        random.shuffle(train_data)
        losses = {}
        progress = tqdm.tqdm(train_data, leave=False)
        for batch in spacy.util.minibatch(progress, size=batch_size):
            texts, annots = zip(*batch)
            nlp.update(texts, annots, drop=0.2, losses=losses)
        # Evaluate (and snapshot) with the averaged parameters swapped in.
        with nlp.use_params(optimizer.averages):
            scores = nlp.evaluate(eval_data)
            if scores.ents_f > best_acc:
                best_acc = scores.ents_f
                if output:
                    best_model = nlp.to_bytes()
        prf = tuple(
            f"{value:.3f}"
            for value in (scores.ents_p, scores.ents_r, scores.ents_f)
        )
        msg.row((epoch, f"{losses['ner']:.2f}", *prf), widths=row_widths)
    msg.text(f"Best F-Score: {best_acc:.3f}")
    if output and best_model:
        with msg.loading("Saving model..."):
            nlp.from_bytes(best_model).to_disk(output)
        msg.good("Saved model", output)
def generate_meta(model_path, existing_meta, msg):
    """Interactively build the meta dict for a model package.

    model_path: Path to the model directory, loaded to read pipeline and
        vector information.
    existing_meta (dict or None): Existing meta values, used as prompt
        defaults and updated in place when truthy.
    msg: Printer used for the divider and intro text.
    RETURNS (dict): The meta with "pipeline", "vectors" and every prompted
        setting filled in.
    """
    meta = existing_meta or {}
    # (key, prompt description, default) triples, asked in this order.
    settings = [
        ("lang", "Model language", meta.get("lang", "en")),
        ("name", "Model name", meta.get("name", "model")),
        ("version", "Model version", meta.get("version", "0.0.0")),
        ("spacy_version", "Required spaCy version", ">=%s,<3.0.0" % about.__version__),
        ("description", "Model description", meta.get("description", False)),
        ("author", "Author", meta.get("author", False)),
        ("email", "Author email", meta.get("email", False)),
        ("url", "Author website", meta.get("url", False)),
        ("license", "License", meta.get("license", "CC BY-SA 3.0")),
    ]
    nlp = util.load_model_from_path(Path(model_path))
    meta["pipeline"] = nlp.pipe_names
    meta["vectors"] = {
        "width": nlp.vocab.vectors_length,
        "vectors": len(nlp.vocab.vectors),
        "keys": nlp.vocab.vectors.n_keys,
        "name": nlp.vocab.vectors.name,
    }
    msg.divider("Generating meta.json")
    msg.text(
        "Enter the package settings for your model. The following information "
        "will be read from your model data: pipeline, vectors."
    )
    for key, description, default in settings:
        answer = get_raw_input(description, default)
        # An empty answer falls back to the default, but only a truthy one.
        if answer == "" and default:
            meta[key] = default
        else:
            meta[key] = answer
    if about.__title__ != "spacy":
        meta["parent_package"] = about.__title__
    return meta
def main(in_file, out_dir, spacy_model="en_core_web_sm", n_process=4):
    """
    Step 1: Parse raw text with spaCy

    Reads the input file line by line, runs every line through the spaCy
    pipeline and writes the resulting Doc objects as a single DocBin to
    `<out_dir>/<input stem>.spacy`.
    """
    input_path = Path(in_file)
    output_path = Path(out_dir)
    if not input_path.exists():
        msg.fail("Can't find input file", in_file, exits=1)
    if not output_path.exists():
        output_path.mkdir(parents=True)
        msg.good(f"Created output directory {out_dir}")

    nlp = spacy.load(spacy_model)
    msg.info(f"Using spaCy model {spacy_model}")

    doc_bin = DocBin(attrs=["POS", "TAG", "DEP", "ENT_TYPE", "ENT_IOB"])
    msg.text("Preprocessing text...")
    with input_path.open("r", encoding="utf8") as texts:
        parsed = nlp.pipe(texts, n_process=n_process)
        for doc in tqdm.tqdm(parsed, desc="Docs", unit=""):
            doc_bin.add(doc)
    msg.good(f"Processed {len(doc_bin)} docs")

    serialized = doc_bin.to_bytes()
    output_file = output_path / f"{input_path.stem}.spacy"
    with output_file.open("wb") as file_:
        file_.write(serialized)
    msg.good("Saved parsed docs to file", output_file.resolve())
def eval_dataset(set_id):
    """Summarize which option the annotator preferred in an A/B dataset.

    Loads the dataset `set_id`, counts how often each option id was
    selected in accepted examples and prints the preferred option, the
    win/loss ratio and the number of rejected/ignored examples.

    If there are neither accepted nor rejected answers, a warning is
    issued with `exits=1`. If answers exist but nothing was selected
    (e.g. only rejections), "No preference" is reported instead of
    failing on an empty counter.
    """
    DB = connect()
    data = DB.get_dataset(set_id)
    accepted = [
        eg for eg in data if eg["answer"] == "accept" and eg.get("accept")
    ]
    rejected = [eg for eg in data if eg["answer"] == "reject"]
    ignored = [eg for eg in data if eg["answer"] == "ignore"]
    if not accepted and not rejected:
        msg.warn("No annotations collected", exits=1)

    # One vote per selected option id in every accepted example.
    counts = Counter(model_id for eg in accepted for model_id in eg["accept"])

    msg.info(f"Evaluating data from '{set_id}'")
    msg.text(
        f"You rejected {len(rejected)} and ignored {len(ignored)} pair(s)")

    if not counts:
        # Only rejections were collected: there is no winner to look up.
        msg.warn("No preference (0 / 0)")
        return

    preference, preferred_count = counts.most_common(1)[0]
    total = sum(counts.values())
    ratio = f"{preferred_count} / {total - preferred_count}"
    if counts["A"] == counts["B"]:
        msg.warn(f"No preference ({ratio})")
    else:
        pc = preferred_count / total
        msg.good(
            f"You preferred vectors {preference} with {ratio} ({pc:.0%})")
        # NOTE(review): `mapping` is not defined in this function; it is
        # presumably provided by the enclosing scope -- confirm.
        msg.text(mapping[preference])
def train(self, corpus: List[Fragment], verbose: bool = None):
    """Estimate Naive Bayes feature probabilities from `corpus`.

    Every fragment contributes one count per `"<feature>_<value>"` pair
    under its label. Counts are then smoothed by adding 0.1 to every
    feature seen under either label, normalised by the per-label total,
    and stored in `self.feats[(label, feature)]`. A per-label prior is
    stored under `self._PRIOR_FEAT`.

    corpus: Non-empty list of fragments exposing `.features` (a mapping
        of feature name to string value) and `.label`.
    verbose: When falsy, printing through `msg` is disabled.
    RAISES: ValueError if `corpus` is empty.
    """
    from collections import Counter, defaultdict

    if not corpus:
        raise ValueError("corpus must contain at least one fragment")
    msg.no_print = not verbose
    with msg.loading("setting things up..."):
        self._setup_training(corpus)

    msg.text("train Naive Bayes model")
    # Missing labels/features must start at zero, so plain dicts (which
    # raise KeyError on `+=`) cannot be used here.
    feats = defaultdict(Counter)
    totals = Counter()
    for frag in corpus:
        for feat, val in frag.features.items():
            feats[frag.label][feat + "_" + val] += 1
        totals[frag.label] += len(frag.features)

    # additive smoothing and normalization
    with msg.loading("smoothing... "):
        smooth_inc = 0.1
        # True/False hash and compare equal to 1/0, so these are the same
        # entries the loop below addresses as 1 and 0.
        all_feat_names = set(feats[True]) | set(feats[False])
        for label in [0, 1]:
            totals[label] += len(all_feat_names) * smooth_inc
            for feat in all_feat_names:
                feats[label][feat] += smooth_inc
                feats[label][feat] /= totals[label]
                self.feats[(label, feat)] = feats[label][feat]
            # NOTE(review): the grand total is taken inside the loop, so
            # label 0's prior is computed before label 1's total has been
            # smoothed. Kept as written; confirm this is intended.
            feats[label][self._PRIOR_FEAT] = (
                totals[label] / sum(totals.values()))
            self.feats[(label, self._PRIOR_FEAT)] = feats[label][self._PRIOR_FEAT]
    msg.good("done")
def main(name: ("模型名称", "positional", None, None, trf_list),
         make_cache_dir: (" 创建缓存文件夹", "flag", "mk"),
         use_local_class: ("不使用网络读取", "flag", "local")):
    """Print the download URLs for a pretrained model's files.

    name: Key into the pretrained config/model archive maps.
    make_cache_dir: When set, create the cache folder `cache_path + name`
        first (a warning is shown if it already exists).
    use_local_class: Forwarded to `get_tokenizer`.

    The config URL, model URL and every vocab file URL registered for
    `name` are printed between two banner lines.
    """
    if make_cache_dir:
        target_dir = ensure_path(f"{cache_path + name}")
        if not target_dir.exists():
            target_dir.mkdir()
            msg.good(f" 缓存文件夹已创建:\t{cache_path}{name}")
        else:
            msg.warn(f"{cache_path + name} already exists")

    banner = "\n================url================\n"
    msg.warn(banner)
    config_file = ALL_PRETRAINED_CONFIG_ARCHIVE_MAP[name]
    model_file = ALL_PRETRAINED_MODEL_ARCHIVE_MAP[name]
    msg.text(f"{config_file}\n{model_file}\n")

    tokenizer = get_tokenizer(name, use_local_class)
    vocab_files_map = tokenizer.pretrained_vocab_files_map
    for files_by_model in vocab_files_map.values():
        msg.text(f"{files_by_model[name]}\n")
    msg.warn(banner)
    msg.good("\n使用下载工具下载后,将模型文件放入缓存文件夹中。")
def get_model_row(compat, name, data, msg, model_type="package"):
    """Build one table row describing an installed model.

    compat (dict): Maps model names to a list of versions; the first
        entry is shown as the suggested version for incompatible models.
    name: Display name of the package/link.
    data (dict): Needs the keys "compat", "version" and "name".
    msg: Printer whose `text(..., no_print=True)` returns formatted text.
    model_type (str): Value of the first column.
    RETURNS (tuple): (model_type, name, model name, version, status).
    """
    if not data["compat"]:
        version = msg.text(data["version"], color="red", no_print=True)
        suggested = compat.get(data["name"], ["n/a"])[0]
        status = "--> {}".format(suggested)
    else:
        status = msg.text("", color="green", icon="good", no_print=True)
        version = msg.text(data["version"], color="green", no_print=True)
    return (model_type, name, data["name"], version, status)
def _save_doc_bin(doc_bin, output_file):
    """Serialize `doc_bin` and write the bytes to `output_file`."""
    with output_file.open("wb") as f:
        f.write(doc_bin.to_bytes())


def main(
    # fmt: off
    in_file: str = typer.Argument(..., help="Path to input file"),
    out_dir: str = typer.Argument(..., help="Path to output directory"),
    spacy_model: str = typer.Argument("en_core_web_sm", help="Name of spaCy model to use"),
    n_process: int = typer.Option(1, "--n-process", "-n", help="Number of processes (multiprocessing)"),
    max_docs: int = typer.Option(10**6, "--max-docs", "-m", help="Maximum docs per batch"),
    # fmt: on
):
    """
    Step 1: Parse raw text with spaCy

    Expects an input file with one sentence per line and will output one
    or more .spacy files of the parsed collection of Doc objects (DocBin).
    Docs are written in batches of at most `max_docs`, named
    `<input stem>-<batch number>.spacy` with batch numbers starting at 1.
    """
    input_path = Path(in_file)
    output_path = Path(out_dir)
    if not input_path.exists():
        msg.fail("Can't find input file", in_file, exits=1)
    if not output_path.exists():
        output_path.mkdir(parents=True)
        msg.good(f"Created output directory {out_dir}")
    nlp = spacy.load(spacy_model)
    msg.info(f"Using spaCy model {spacy_model}")
    doc_attrs = ["POS", "TAG", "DEP", "ENT_TYPE", "ENT_IOB"]
    doc_bin = DocBin(attrs=doc_attrs)
    msg.text("Preprocessing text...")
    count = 0
    batch_num = 0
    with input_path.open("r", encoding="utf8") as texts:
        docs = nlp.pipe(texts, n_process=n_process)
        for doc in tqdm.tqdm(docs, desc="Docs", unit=""):
            # Flush a full batch *before* adding the current doc, so the
            # doc that crosses the limit starts the next batch instead of
            # being dropped.
            if count >= max_docs:
                batch_num += 1
                msg.good(f"Processed {len(doc_bin)} docs")
                output_file = output_path / f"{input_path.stem}-{batch_num}.spacy"
                _save_doc_bin(doc_bin, output_file)
                msg.good("Saved parsed docs to file", output_file.resolve())
                doc_bin = DocBin(attrs=doc_attrs)
                count = 0
            doc_bin.add(doc)
            count += 1
    # Whatever is left over becomes the final batch.
    batch_num += 1
    output_file = output_path / f"{input_path.stem}-{batch_num}.spacy"
    _save_doc_bin(doc_bin, output_file)
    msg.good("Complete. Saved final parsed docs to file", output_file.resolve())
def package(input_dir, output_dir, meta_path=None, create_meta=False, force=False):
    """
    Generate Python package for model data, including meta and required
    installation files. A new directory will be created in the specified
    output directory, and model data will be copied over.

    The meta is read from `meta_path`, or from `meta.json` in the input
    directory if no path is given. If `create_meta` is set, the values
    found there (if any) are used as the defaults in the command-line
    prompt. Without a readable meta.json and without `create_meta`, the
    required-settings check below reports the missing settings.

    input_dir: Directory containing the model data.
    output_dir: Existing directory in which the package is created.
    meta_path: Optional path to a meta.json.
    create_meta (bool): Prompt for the meta settings.
    force (bool): Remove an existing package directory instead of failing.
    """
    input_path = util.ensure_path(input_dir)
    output_path = util.ensure_path(output_dir)
    meta_path = util.ensure_path(meta_path)
    if not input_path or not input_path.exists():
        msg.fail("Can't locate model data", input_path, exits=1)
    if not output_path or not output_path.exists():
        msg.fail("Output directory not found", output_path, exits=1)
    if meta_path and not meta_path.exists():
        msg.fail("Can't find model meta.json", meta_path, exits=1)

    meta_path = meta_path or input_path / "meta.json"
    # Start from an empty meta so a missing meta.json no longer leaves
    # `meta` unbound further down.
    meta = {}
    if meta_path.is_file():
        meta = srsly.read_json(meta_path)
        if not create_meta:  # only print if user doesn't want to overwrite
            msg.good("Loaded meta.json from file", meta_path)
    if create_meta:
        meta = generate_meta(input_dir, meta, msg)

    for key in ("lang", "name", "version"):
        if key not in meta or meta[key] == "":
            msg.fail(
                "No '{}' setting found in meta.json".format(key),
                "This setting is required to build your package.",
                exits=1,
            )
    model_name = meta["lang"] + "_" + meta["name"]
    model_name_v = model_name + "-" + meta["version"]
    main_path = output_path / model_name_v
    package_path = main_path / model_name

    if package_path.exists():
        if force:
            shutil.rmtree(path2str(package_path))
        else:
            msg.fail(
                "Package directory already exists",
                "Please delete the directory and try again, or use the "
                "`--force` flag to overwrite existing "
                "directories.",
                exits=1,
            )
    Path.mkdir(package_path, parents=True)
    shutil.copytree(path2str(input_path), path2str(package_path / model_name_v))
    create_file(main_path / "meta.json", srsly.json_dumps(meta, indent=2))
    create_file(main_path / "setup.py", TEMPLATE_SETUP)
    create_file(main_path / "MANIFEST.in", TEMPLATE_MANIFEST)
    # NOTE(review): the init template is written to textClassifications.py
    # rather than __init__.py -- confirm this file name is intentional.
    create_file(package_path / "textClassifications.py", TEMPLATE_INIT)
    msg.good("Successfully created package '{}'".format(model_name_v), main_path)
    msg.text("To build the package, run `python setup.py sdist` in this directory.")
def create_player(i, decision_function=None):
    """Create player number `i`.

    With a `decision_function` the player is named "PigMachine".
    Otherwise the name is read from standard input, and an empty answer
    falls back to "P<i>".
    """
    if decision_function:
        return Player("PigMachine", decision_function)
    msg.text(f"Player {i} name (Press <Enter> to accept default): ")
    p_name = input() or f"P{i}"
    return Player(p_name, decision_function)
def main(self, args: BaseArgumentParser) -> int:
    """List the registered devices and whether each one is available.

    Unavailable devices are followed by their error type and message.
    Always returns 0.
    """
    devices = self.get_client().list_devices().devices
    msg.divider("Registered Devices")
    for device in devices:
        if not device.is_available:
            msg.fail(f"{device.name}:")
            error_label = color(device.error_type, bold=True)
            msg.text(f" {error_label}: {device.error_message}")
            continue
        msg.good(f"{device.name}")
    return 0
def train_model(model, train_path, eval_path, n_iter=10, output="./model2/", tok2vec=None):
    """Train a text classifier and save the best-scoring model.

    model (str): Name/path of a model to load, or "blank:<lang>".
    train_path / eval_path: JSONL files passed through `format_data`.
    n_iter (int): Number of passes over the training data.
    output: Location for the best model. Nothing is saved if falsy.
    tok2vec: Accepted but not used by this function.
    """
    spacy.util.fix_random_seed(0)
    with msg.loading(f"Loading '{model}'..."):
        prefix = "blank:"
        if model.startswith(prefix):
            nlp = spacy.blank(model.replace(prefix, ""))
        else:
            nlp = spacy.load(model)
    msg.good(f"Loaded model '{model}'")

    train_data, labels = format_data(srsly.read_jsonl(train_path))
    eval_data, _ = format_data(srsly.read_jsonl(eval_path))

    # Reuse an existing text classifier, otherwise append a new one.
    if "textcat" in nlp.pipe_names:
        textcat = nlp.get_pipe("textcat")
    else:
        textcat = nlp.create_pipe("textcat")
        nlp.add_pipe(textcat, last=True)
    for label in labels:
        textcat.add_label(label)

    # NOTE(review): component_cfg is presumably keyed by component name;
    # a top-level "exclusive_classes" key may have no effect -- confirm.
    optimizer = nlp.begin_training(component_cfg={"exclusive_classes": True})
    batch_size = spacy.util.compounding(1.0, 16.0, 1.001)
    best_acc = 0
    best_model = None
    row_widths = (2, 8, 8)
    msg.row(("#", "L", "F"), widths=row_widths)
    for epoch in range(1, n_iter + 1):
        random.shuffle(train_data)
        losses = {}
        progress = tqdm.tqdm(train_data, leave=False)
        for batch in spacy.util.minibatch(progress, size=batch_size):
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, drop=0.2, losses=losses)
        # Evaluate (and snapshot) with the averaged parameters swapped in.
        with nlp.use_params(optimizer.averages):
            scorer = nlp.evaluate(eval_data)
            score = scorer.textcat_score
            if score > best_acc:
                best_acc = score
                if output:
                    best_model = nlp.to_bytes()
        msg.row(
            (epoch, f"{losses['textcat']:.2f}", f"{scorer.textcat_score:.3f}"),
            widths=row_widths,
        )
    msg.text(f"Best F-Score: {best_acc:.3f}")
    if output and best_model:
        with msg.loading("Saving model..."):
            nlp.from_bytes(best_model).to_disk(output)
        msg.good("Saved model", output)
def _parse_wiki_sql_dump(wiki_sql_dump_url, parse_fx, **kwargs):
    """Yield the parsed records of a compressed dump, optionally caching it.

    Settings are taken from the module-level `config`, overridden by
    `kwargs`; the keys read are "dumps_path", "max_workers" and "verbose".

    When "dumps_path" is set, the raw dump is first stored there (unless a
    non-empty file of the same name already exists). After a download the
    stream is re-opened from that local file before parsing. Parsing is
    distributed over a process pool: every item read from the decompressed
    stream is handed to `_parsing_task`, and the zlib-compressed result is
    unpacked with `pickle_loads` and yielded item by item.

    This is a generator, so nothing runs until it is iterated.
    """
    settings = {**config, **kwargs}
    dumps_path = settings["dumps_path"]
    max_workers = settings["max_workers"]
    verbose = settings["verbose"]

    dump_name = wiki_sql_dump_url.name
    msg.text(f"-> {dump_name}", show=verbose)
    tqdm_kwargs = dict(
        unit="B",
        unit_scale=True,
        unit_divisor=1024,
        disable=not verbose,
    )

    compress_obj, content_len = _get_wiki_dump_obj(wiki_sql_dump_url, verbose)
    downloaded = False
    if dumps_path is not None:
        if not dumps_path.exists():
            dumps_path.mkdir()
        dump_filepath = dumps_path.joinpath(dump_name)
        is_cached = dump_filepath.exists() and dump_filepath.stat().st_size != 0
        if not is_cached:
            with tqdm(
                desc="download to disk", total=content_len, **tqdm_kwargs
            ) as pbar, dump_filepath.open("wb") as fd:
                last_pos = 0
                for chunk in compress_obj:
                    fd.write(chunk)
                    # Advance the bar by the bytes consumed from the source.
                    current_pos = compress_obj.tell()
                    pbar.update(current_pos - last_pos)
                    last_pos = current_pos
            # The source stream is exhausted; parse from the local copy.
            compress_obj.close()
            downloaded = True
        wiki_sql_dump_url = dump_filepath
    if downloaded:
        compress_obj, content_len = _get_wiki_dump_obj(wiki_sql_dump_url)

    with tqdm(
        desc="parse", total=content_len, **tqdm_kwargs
    ) as pbar, compression_wrapper(compress_obj, "rb") as decompress_obj:
        last_pos = 0
        with closing(Pool(max_workers)) as pool:
            task = partial(_parsing_task, parse_fx=parse_fx)
            results = pool.imap_unordered(task, decompress_obj, chunksize=10)
            for packed in results:
                # Progress is measured on the compressed stream position.
                current_pos = compress_obj.tell()
                pbar.update(current_pos - last_pos)
                last_pos = current_pos
                yield from pickle_loads(zlib.decompress(packed))
    msg.good(dump_name, show=verbose)
def package_wikigraph(input_path: Path, output_path: Path, force: bool = None):
    """
    Generate an installable Python package for a `WikiGraph`.

    After packaging, "python setup.py sdist" must be run in the package
    directory, which will create a .tar.gz archive that can be installed
    via "pip install".

    Parameters
    ----------
    input_path : Path
        Directory with the graph data. It must contain a `meta.json`
        providing the keys "fullname" and "name".
    output_path : Path
        Directory in which the package directory (named after the meta's
        "fullname") is created. It is created if it does not exist.
    force : bool, optional
        If truthy, an existing package directory is removed and rebuilt;
        otherwise an existing directory is reported as a failure.
        By default None.
    """
    if not input_path or not input_path.exists():
        msg.fail("Can't locate graph data", input_path, exits=1)
    if not output_path:
        msg.fail("Output directory is missing", output_path, exits=1)
    if not output_path.exists():
        output_path.mkdir()
        msg.good("Created output directory: {}".format(output_path))

    meta_path = input_path / "meta.json"
    if not meta_path.exists():
        msg.fail("Can't find graph meta.json", meta_path, exits=1)
    meta = json_loads(meta_path.read_text())
    graph_fullname = meta["fullname"]

    package_path = output_path / graph_fullname
    if package_path.exists():
        if not force:
            msg.fail(
                "Package directory already exists",
                "Please delete the directory and try again, or use the "
                "`--force` flag to overwrite existing "
                "directories.",
                exits=1,
            )
        shutil.rmtree(package_path)
    package_path.mkdir()
    shutil.copy(meta_path, package_path)

    # Copy the package skeleton (`pkg_path` comes from outside this
    # function), then rename its placeholder module after the graph.
    copy_tree(str(pkg_path), str(package_path))
    graph_name = meta["name"]
    module_path = package_path / graph_name
    rename(package_path / "graph-name", module_path)
    copy_tree(str(input_path), str(module_path / graph_fullname))

    msg.good("Successfully created package {}".format(graph_name), package_path)
    msg.text(
        "To build the package, run `python setup.py sdist` in this directory.")
def run_test(command, directory):
    """Run a test command from inside `directory` and report the result.

    The command is executed through the shell as `cd <directory> && <command>`.
    Returns the process exit code; 0 is reported as a pass, anything else
    as a failure.
    """
    msg.text("RUNNING " + command)
    # NOTE(review): both values are placed into the shell string unquoted,
    # so a directory containing spaces or shell metacharacters will not
    # work as expected.
    process = subprocess.Popen(f"cd {directory} && {command}", shell=True)
    exit_code = process.wait()
    if exit_code != 0:
        msg.fail("TEST FAILED")
    else:
        msg.good("TEST PASSED")
    msg.text('')
    return exit_code
def _get_wiki_dump_obj(wiki_sql_dump_url, verbose=None):
    """Open a dump from a local path or a URL.

    wiki_sql_dump_url: A `Path` to a local dump file or a `URL`.
    verbose: Passed as `show` to the "from: ..." message.

    Returns a tuple `(compress_obj, content_len)`: the opened binary
    stream and its size in bytes. For URLs the size is read from the
    "content-length" response header and is `None` if the server does
    not send one.

    Raises FileNotFoundError if a local file is missing or empty, and
    ValueError if the argument is neither a `Path` nor a `URL`.
    """
    if isinstance(wiki_sql_dump_url, Path):
        if not wiki_sql_dump_url.exists():
            raise FileNotFoundError(f"Dump file not found: {wiki_sql_dump_url}")
        # Check the size before opening so no handle is leaked when the
        # file turns out to be empty.
        content_len = wiki_sql_dump_url.stat().st_size
        if content_len == 0:
            raise FileNotFoundError(f"Dump file is empty: {wiki_sql_dump_url}")
        compress_obj = wiki_sql_dump_url.open("rb")
    elif isinstance(wiki_sql_dump_url, URL):
        compress_obj = http_open(str(wiki_sql_dump_url), mode="rb")
        raw_len = compress_obj.response.headers.get("content-length")
        # The header is optional; int(None) would raise TypeError.
        content_len = int(raw_len) if raw_len is not None else None
    else:
        raise ValueError(
            f"Expected a Path or URL, got {type(wiki_sql_dump_url).__name__}")
    msg.text(f"from: {wiki_sql_dump_url}", show=verbose)
    return compress_obj, content_len
def main(in_file, out_dir, spacy_model="en_core_web_sm", n_process=1):
    """
    Step 1: Parse raw text with spaCy and write sense2vec-style keys

    Expects an input file with one sentence per line. Every line is
    parsed, noun and adjective phrases are merged into single tokens, and
    one line of space-separated keys (built with `make_spacy_key` /
    `make_key`) is written per doc to `<out_dir>/<input stem>.s2v`.
    """
    input_path = Path(in_file)
    output_path = Path(out_dir)
    if not input_path.exists():
        msg.fail("Can't find input file", in_file, exits=1)
    if not output_path.exists():
        output_path.mkdir(parents=True)
        msg.good(f"Created output directory {out_dir}")
    nlp = spacy.load(spacy_model)
    msg.info(f"Using spaCy model {spacy_model}")
    msg.text("Preprocessing text...")
    # Read with an explicit encoding (the output is written as utf8 too)
    # and close the file once all lines are loaded.
    with input_path.open("r", encoding="utf8") as in_f:
        texts = [line.rstrip() for line in in_f]
    docs = nlp.pipe(texts, n_process=n_process)
    output_file = output_path / f"{input_path.stem}.s2v"
    lines_count = 0
    words_count = 0
    wn_lemmas = set(wordnet.all_lemma_names())
    with output_file.open("w", encoding="utf8") as f:
        for doc in tqdm.tqdm(docs, desc="Docs", unit=""):
            # First pass: merge noun phrases.
            spans = filter_spans(get_phrases(doc, wn_lemmas))
            doc = merge_phrases(doc, spans)
            # Second pass: merge adjective phrases on the merged doc.
            spans = filter_spans(get_adjective_phrases(doc))
            doc = merge_phrases(doc, spans)
            words = []
            for token in doc:
                if not token.is_space:
                    word, sense = make_spacy_key(token, prefer_ents=True)
                    words.append(make_key(word, sense))
            f.write(" ".join(words) + "\n")
            lines_count += 1
            words_count += len(words)
    msg.good(
        f"Successfully preprocessed {lines_count} docs ({words_count} words)",
        output_file.resolve(),
    )
def main(self, args: BaseArgumentParser) -> int:
    """Print the state of every experiment in the current sequence.

    Finished experiments are shown as good, running ones with their
    progress percentage and not-started ones in grey. Always returns 0.
    """
    response: ExperimentStatusResponse = self.get_client().experiment_status()
    sequence_status = response.sequence_status
    if sequence_status is None:
        print("There are no experiments running.")
        return 0

    for experiment in sequence_status.experiments:
        state = experiment.state
        if state == ExperimentState.FINISHED:
            msg.good(experiment.name)
        elif state == ExperimentState.RUNNING:
            progress = round(experiment.progress * 100)
            msg.text(f"\u25b6 {experiment.name} ({progress}%)")
        elif state == ExperimentState.NOT_STARTED:
            msg.text(f" {experiment.name}", color="grey")
    return 0
def main(self, args: ServerArgumentParser) -> int:
    """Handle the `start`, `status` and `stop` server commands.

    Returns 0 on success and 1 if the server answers the status check
    with an unexpected response. Any other command raises an Exception.
    """
    command = args.command
    if command == "start":
        Server(self.config).start()
        return 0

    if command == "status":
        if self.get_client().hello() != "Hello world":
            msg.fail("Invalid response")
            msg.text(
                "Server replied with an invalid response. This is probably a bug."
            )
            return 1
        msg.good("Active")
        return 0

    if command == "stop":
        self.get_client().halt()
        return 0

    raise Exception(f"Unknown server command {args.command}")
def run(cls, trigger: str, argv: Sequence[str]) -> int:
    """Look up, configure and execute the command registered as `trigger`.

    The argument class is taken from the command class's first generic
    base, `argv` is parsed with it, drivers are discovered and the config
    file named by the parsed arguments is loaded. Returns the command's
    exit code, or 1 if the server request times out.
    """
    try:
        command_cls = ALL_COMMANDS[trigger]
        # pyre-ignore[16]: command_klass has no __orig_bases__ attribute
        args_cls = get_args(command_cls.__orig_bases__[0])[0]
        parser = args_cls(prog=f"labby {trigger}")
        args = parser.parse_args(argv)
        auto_discover_drivers()

        with open(args.config, "r") as config_file:
            config = Config(config_file.read())

        # pyre-ignore[45]: cannot instantiate Command with abstract method
        command = command_cls(config)
        return command.main(args)
    except pynng.exceptions.Timeout:
        # this had to be an inline import so the tests would use the
        # WASABI_LOG_FRIENDLY env variable correctly
        from wasabi import msg

        msg.fail("Timeout")
        msg.text("The labby server did not respond. Are you sure it is started?")
        return 1
def show_validation_error(
    file_path: Optional[Union[str, Path]] = None,
    *,
    title: Optional[str] = None,
    desc: str = "",
    show_config: Optional[bool] = None,
    hint_fill: bool = True,
):
    """Helper to show custom config validation errors on the CLI.

    Yields once; config errors raised at the yield point are printed in a
    custom format and the process exits with status 1.

    file_path (str / Path): Optional file path of config file, used in hints.
    title (str): Override title of custom formatted error.
    desc (str): Override description of custom formatted error.
    show_config (bool): Whether to output the config the error refers to.
    hint_fill (bool): Show hint about filling config.
    """
    try:
        yield
    except ConfigValidationError as e:
        if title is None:
            title = e.title
        if e.desc:
            # The error's own description comes first, ours is appended.
            desc = f"{e.desc}\n\n{desc}" if desc else f"{e.desc}"
        # Re-generate a new error object with overrides
        err = e.from_error(e, title="", desc=desc, show_config=show_config)
        msg.fail(title)
        print(err.text.strip())
        if hint_fill and "value_error.missing" in err.error_types:
            has_real_path = file_path is not None and str(file_path) != "-"
            config_path = file_path if has_real_path else "config.cfg"
            msg.text(
                "If your config contains missing values, you can run the 'init "
                "fill-config' command to fill in all the defaults, if possible:",
                spaced=True,
            )
            print(f"{COMMAND} init fill-config {config_path} {config_path} \n")
        sys.exit(1)
    except InterpolationError as e:
        msg.fail("Config validation error", e, exits=1)
def main(self, args: DeviceInfoArguments) -> int:
    """Show connection status and details for a single device.

    Returns 1 if the device is unknown (its `device_type` is None) and
    0 otherwise. For a connected device the rows produced by
    `self._render_device_info` are listed; for a disconnected one the
    error type and message are printed.
    """
    device_info = self.get_client().device_info(args.device_name)
    if device_info.device_type is None:
        msg.fail(
            f"Unknown device {args.device_name}",
            text="See `labby devices` for a list of available devices.",
        )
        return 1

    # The friendly name has to be interpolated; without the braces the
    # attribute path itself was printed verbatim.
    msg.divider(
        f"{args.device_name} ({device_info.device_type.friendly_name})")
    if device_info.is_connected:
        msg.table([
            ("Connection", render.good("OK")),
            *self._render_device_info(device_info),
        ])
    else:
        msg.table([("Connection", render.fail("Error"))])
        msg.text(f"{color(device_info.error_type, bold=True)}: " +
                 f"{device_info.error_message}")
    return 0
def ner_translate(
    in_sets: List[str],
    out_set: str,
    model_name_or_path: str,
    source_lang: str,
    target_lang: str,
    dry: bool = False,
) -> None:
    """Translate NER datasets and merge the results into one dataset.

    Every example of every dataset in `in_sets` is translated with a
    `TransformersMarianTranslator`. Translated examples whose number of
    spans still matches the source example are collected and stored in
    `out_set`; the others are only counted.

    in_sets: Names of existing datasets to translate.
    out_set: Name of the target dataset. It must not already contain
        examples and is created if missing.
    model_name_or_path, source_lang, target_lang: Translator settings.
    dry: If True, nothing is written to the database (the output dataset
        is neither created nor filled).
    """
    translator = TransformersMarianTranslator(
        model_name_or_path, source_lang=source_lang, target_lang=target_lang
    )
    DB = connect()
    for set_id in in_sets:
        if set_id not in DB:
            msg.fail(f"Can't find dataset '{set_id}' in database", exits=1)
    if out_set in DB and len(DB.get_dataset(out_set)):
        msg.fail(
            f"Output dataset '{out_set}' already exists and includes examples",
            "This can lead to unexpected results. Please use a new dataset.",
            exits=1,
        )
    if out_set not in DB and not dry:
        DB.add_dataset(out_set)
        msg.good(f"Created dataset '{out_set}'")

    matched_examples_t = []
    mismatched_examples_t = []
    for set_id in in_sets:
        msg.text(f"RECIPE: Translating and merging examples from '{set_id}'")
        raw_examples = DB.get_dataset(set_id)
        examples = [Example(**e) for e in raw_examples]
        examples_t = translate_ner_batch(
            examples, translate_f=translator.pipe, target_lang=target_lang
        )
        for e, e_t in zip(examples, examples_t):
            # A changed span count means spans were lost or split.
            if len(e.spans) != len(e_t.spans):
                mismatched_examples_t.append(e_t)
            else:
                matched_examples_t.append(e_t)
        # Both counters are running totals over all sets processed so far.
        msg.text(f"RECIPE: Translated {len(matched_examples_t)} examples from '{set_id}'")
        msg.text(
            f"RECIPE: Found {len(mismatched_examples_t)} examples with mismatched spans after translation from '{set_id}'"
        )

    matched_examples_t = set_hashes(matched_examples_t)
    # Honour the caller's `dry` flag; it used to be reset to False here,
    # which made every dry run write to the database.
    if dry:
        msg.info(
            f"Dry run: translated {len(matched_examples_t)} examples from {len(in_sets)} datasets",
            f"Nothing was saved to dataset '{out_set}'",
        )
        return
    DB.add_examples(matched_examples_t, datasets=[out_set])
    msg.good(
        f"Translated and merged {len(matched_examples_t)} examples from {len(in_sets)} datasets",
        f"Created translated and merged dataset '{out_set}'",
    )
def eval_dataset(set_id):
    """Report how many of the offered options the annotator accepted.

    For every accepted example, the number of offered options and the
    number of selected options are summed; the ratio is printed as good
    when above 50% and as a failure otherwise. If there are neither
    accepted nor rejected answers, a warning is issued with `exits=1`.
    """
    DB = connect()
    data = DB.get_dataset(set_id)
    accepted = [
        eg for eg in data if eg["answer"] == "accept" and eg.get("accept")
    ]
    rejected = [eg for eg in data if eg["answer"] == "reject"]
    ignored = [eg for eg in data if eg["answer"] == "ignore"]
    if not accepted and not rejected:
        msg.warn("No annotations collected", exits=1)

    total_count = sum(len(eg.get("options", [])) for eg in accepted)
    agree_count = sum(len(eg.get("accept", [])) for eg in accepted)

    msg.info(f"Evaluating data from '{set_id}'")
    msg.text(
        f"You rejected {len(rejected)} and ignored {len(ignored)} pair(s)")
    # With only rejections (or examples without options) there is nothing
    # to divide by; report 0% rather than raising ZeroDivisionError.
    pc = agree_count / total_count if total_count else 0.0
    text = f"You agreed {agree_count} / {total_count} times ({pc:.0%})"
    if pc > 0.5:
        msg.good(text)
    else:
        msg.fail(text)
def eval_dataset(set_id):
    """Output summary about user agreement with the model.

    An accepted example counts as agreement when the option the user
    picked has a higher score than the other option. Rejected examples,
    and disagreements on accepted ones, are counted as "high confidence"
    disagreements when the example's confidence exceeds 0.8.

    The agreement rate is computed over accepted plus rejected examples,
    and the same total is shown in the printed fraction.
    """
    DB = connect()
    data = DB.get_dataset(set_id)
    accepted = [
        eg for eg in data if eg["answer"] == "accept" and eg.get("accept")
    ]
    rejected = [eg for eg in data if eg["answer"] == "reject"]
    if not accepted and not rejected:
        msg.warn("No annotations collected", exits=1)

    high_conf = 0.8
    agree_count = 0
    disagree_high_conf = sum(
        1 for eg in rejected if eg["confidence"] > high_conf)
    for eg in accepted:
        choice = eg["accept"][0]
        # First option matching the choice vs. first option that does not.
        score_choice = [
            o["score"] for o in eg["options"] if o["id"] == choice
        ][0]
        score_other = [
            o["score"] for o in eg["options"] if o["id"] != choice
        ][0]
        if score_choice > score_other:
            agree_count += 1
        elif eg["confidence"] > high_conf:
            disagree_high_conf += 1

    # Use one denominator for both the fraction and the percentage; the
    # text previously showed len(data), which also counts ignored
    # examples and therefore did not match the percentage next to it.
    answered = len(accepted) + len(rejected)
    pc = agree_count / answered
    text = f"You agreed {agree_count} / {answered} times ({pc:.0%})"
    msg.info(f"Evaluating data from '{set_id}'")
    if pc > 0.5:
        msg.good(text)
    else:
        msg.fail(text)
    msg.text(
        f"You disagreed on {disagree_high_conf} high confidence scores")
    msg.text(f"You rejected {len(rejected)} suggestions as not similar")
def validate() -> None:
    """List installed pipeline packages and their compatibility status.

    Prints a table of the installed packages, the download commands for
    incompatible packages that have a compatible version, and a note on
    those that do not. Calls `sys.exit(1)` if any installed package is
    incompatible.
    """
    model_pkgs, compat = get_model_pkgs()
    spacy_version = get_minor_version(about.__version__)
    current_compat = compat.get(spacy_version, {})
    if not current_compat:
        msg.warn(f"No compatible packages found for v{spacy_version} of spaCy")

    incompat_models = {d["name"] for d in model_pkgs.values() if not d["compat"]}
    # Split by whether a compatible version is known for this spaCy version.
    na_models = [m for m in incompat_models if m not in current_compat]
    update_models = [m for m in incompat_models if m in current_compat]

    spacy_dir = Path(__file__).parent.parent
    msg.divider(f"Installed pipeline packages (spaCy v{about.__version__})")
    msg.info(f"spaCy installation: {spacy_dir}")

    if not model_pkgs:
        msg.text("No pipeline packages found in your current environment.", exits=0)
    else:
        header = ("NAME", "SPACY", "VERSION", "")
        rows = []
        for data in model_pkgs.values():
            if data["compat"]:
                comp = msg.text("", color="green", icon="good", no_print=True)
                version = msg.text(data["version"], color="green", no_print=True)
            else:
                version = msg.text(data["version"], color="yellow", no_print=True)
                comp = f"--> {current_compat.get(data['name'], ['n/a'])[0]}"
            rows.append((data["name"], data["spacy"], version, comp))
        msg.table(rows, header=header)

    if update_models:
        msg.divider("Install updates")
        msg.text("Use the following commands to update the packages:")
        cmd = "python -m spacy download {}"
        commands = [cmd.format(pkg) for pkg in update_models]
        print("\n".join(commands) + "\n")
    if na_models:
        msg.info(
            f"The following packages are custom spaCy pipelines or not "
            f"available for spaCy v{about.__version__}:",
            ", ".join(na_models),
        )
    if incompat_models:
        sys.exit(1)
def train(
    lang,
    output_path,
    train_path,
    dev_path,
    raw_text=None,
    base_model=None,
    pipeline="tagger,parser,ner",
    replace_components=False,
    vectors=None,
    width=96,
    conv_depth=4,
    cnn_window=1,
    cnn_pieces=3,
    bilstm_depth=0,
    embed_rows=2000,
    n_iter=30,
    n_early_stopping=None,
    n_examples=0,
    use_gpu=-1,
    version="0.0.0",
    meta_path=None,
    init_tok2vec=None,
    parser_multitasks="",
    entity_multitasks="",
    noise_level=0.0,
    orth_variant_level=0.0,
    eval_beam_widths="",
    gold_preproc=False,
    learn_tokens=False,
    textcat_multilabel=False,
    textcat_arch="bow",
    textcat_positive_label=None,
    tag_map_path=None,
    omit_extra_lookups=False,
    verbose=False,
    debug=False,
):
    """
    Train or update a spaCy model. Requires data to be formatted in spaCy's
    JSON format. To convert data from other formats, use the `spacy convert`
    command.
    """
    # Overview of the stages below:
    #   1. Validate paths and create the output directory.
    #   2. Build the pipeline, either on top of `base_model` or from a blank
    #      `lang` class, then apply tag map / lookups / vectors options.
    #   3. Create the optimizer and optionally load pretrained tok2vec weights.
    #   4. Sanity-check the textcat configuration against the training data.
    #   5. For each of `n_iter` iterations: update on the training corpus,
    #      save "model<i>" under `output_path`, evaluate it on the dev corpus
    #      for every beam width and write accuracy.json / meta.json.
    #   6. Always (also after an error) save "model-final" and collate the
    #      best model.
    #
    # Notes on arguments whose format is not obvious from the signature:
    #   pipeline          comma-separated component names.
    #   raw_text          path to a JSONL file whose records have a "text"
    #                     key; enables rehearsal updates.
    #   eval_beam_widths  comma-separated integers; width 1 is always added.
    #   use_gpu           device id; a negative value means CPU only.
    #   n_early_stopping  stop once this many iterations in a row scored
    #                     below the best score so far (None disables it).
    #   parser_multitasks / entity_multitasks
    #                     comma-separated objective names.
    #   debug             not referenced in this function.
    util.fix_random_seed()
    util.set_env_log(verbose)

    # Make sure all files and paths exist if they are needed
    train_path = util.ensure_path(train_path)
    dev_path = util.ensure_path(dev_path)
    meta_path = util.ensure_path(meta_path)
    output_path = util.ensure_path(output_path)
    if raw_text is not None:
        raw_text = list(srsly.read_jsonl(raw_text))
    if not train_path or not train_path.exists():
        msg.fail("Training data not found", train_path, exits=1)
    if not dev_path or not dev_path.exists():
        msg.fail("Development data not found", dev_path, exits=1)
    if meta_path is not None and not meta_path.exists():
        msg.fail("Can't find model meta.json", meta_path, exits=1)
    meta = srsly.read_json(meta_path) if meta_path else {}
    # Only sub-directories count as "not empty" here; loose files are ignored.
    if output_path.exists() and [p for p in output_path.iterdir() if p.is_dir()]:
        msg.warn(
            "Output directory is not empty",
            "This can lead to unintended side effects when saving the model. "
            "Please use an empty directory or a different path instead. If "
            "the specified output path doesn't exist, the directory will be "
            "created for you.",
        )
    if not output_path.exists():
        # parents=True so that a nested, not-yet-existing path is created as
        # promised by the message above instead of raising FileNotFoundError.
        output_path.mkdir(parents=True)
        msg.good("Created output directory: {}".format(output_path))

    # Dropout and batch size are taken as generators of values, one value
    # per update. The from/to/rate settings can be overridden through
    # util.env_opt; with the defaults below dropout goes from 0.2 to 0.2
    # and the batch size from 100 to 1000.
    dropout_rates = util.decaying(
        util.env_opt("dropout_from", 0.2),
        util.env_opt("dropout_to", 0.2),
        util.env_opt("dropout_decay", 0.0),
    )
    batch_sizes = util.compounding(
        util.env_opt("batch_from", 100.0),
        util.env_opt("batch_to", 1000.0),
        util.env_opt("batch_compound", 1.001),
    )

    if not eval_beam_widths:
        eval_beam_widths = [1]
    else:
        eval_beam_widths = [int(bw) for bw in eval_beam_widths.split(",")]
        if 1 not in eval_beam_widths:
            eval_beam_widths.append(1)
        eval_beam_widths.sort()
    has_beam_widths = eval_beam_widths != [1]

    # Set up the base model and pipeline. If a base model is specified, load
    # the model and make sure the pipeline matches the pipeline setting. If
    # training starts from a blank model, initialize the language class.
    pipeline = [p.strip() for p in pipeline.split(",")]
    disabled_pipes = None
    pipes_added = False
    msg.text("Training pipeline: {}".format(pipeline))
    if use_gpu >= 0:
        activated_gpu = None
        try:
            activated_gpu = set_gpu(use_gpu)
        except Exception as e:
            msg.warn("Exception: {}".format(e))
        if activated_gpu is not None:
            msg.text("Using GPU: {}".format(use_gpu))
        else:
            # Fall back to CPU rather than aborting the run.
            msg.warn("Unable to activate GPU: {}".format(use_gpu))
            msg.text("Using CPU only")
            use_gpu = -1
    base_components = []
    if base_model:
        msg.text("Starting with base model '{}'".format(base_model))
        nlp = util.load_model(base_model)
        if nlp.lang != lang:
            msg.fail(
                "Model language ('{}') doesn't match language specified as "
                "`lang` argument ('{}') ".format(nlp.lang, lang),
                exits=1,
            )
        for pipe in pipeline:
            pipe_cfg = {}
            if pipe == "parser":
                pipe_cfg = {"learn_tokens": learn_tokens}
            elif pipe == "textcat":
                pipe_cfg = {
                    "exclusive_classes": not textcat_multilabel,
                    "architecture": textcat_arch,
                    "positive_label": textcat_positive_label,
                }

            if pipe not in nlp.pipe_names:
                msg.text("Adding component to base model: '{}'".format(pipe))
                nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg))
                pipes_added = True
            elif replace_components:
                msg.text("Replacing component from base model '{}'".format(pipe))
                nlp.replace_pipe(pipe, nlp.create_pipe(pipe, config=pipe_cfg))
                pipes_added = True
            else:
                if pipe == "textcat":
                    # An existing textcat can only be extended when its
                    # stored settings equal the requested ones.
                    textcat_cfg = nlp.get_pipe("textcat").cfg
                    base_cfg = {
                        "exclusive_classes": textcat_cfg["exclusive_classes"],
                        "architecture": textcat_cfg["architecture"],
                        "positive_label": textcat_cfg["positive_label"],
                    }
                    if base_cfg != pipe_cfg:
                        msg.fail(
                            "The base textcat model configuration does "
                            "not match the provided training options. "
                            "Existing cfg: {}, provided cfg: {}".format(
                                base_cfg, pipe_cfg
                            ),
                            exits=1,
                        )
                msg.text("Extending component from base model '{}'".format(pipe))
                base_components.append(pipe)
        # Components of the base model that are not being trained are
        # disabled for the duration of training and restored at the end.
        disabled_pipes = nlp.disable_pipes(
            [p for p in nlp.pipe_names if p not in pipeline]
        )
    else:
        msg.text("Starting with blank model '{}'".format(lang))
        lang_cls = util.get_lang_class(lang)
        nlp = lang_cls()
        for pipe in pipeline:
            if pipe == "parser":
                pipe_cfg = {"learn_tokens": learn_tokens}
            elif pipe == "textcat":
                pipe_cfg = {
                    "exclusive_classes": not textcat_multilabel,
                    "architecture": textcat_arch,
                    "positive_label": textcat_positive_label,
                }
            else:
                pipe_cfg = {}
            nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg))

    if tag_map_path is not None:
        tag_map = srsly.read_json(tag_map_path)
        # Replace tag map with provided mapping
        nlp.vocab.morphology.load_tag_map(tag_map)

    # Create empty extra lexeme tables so the data from spacy-lookups-data
    # isn't loaded if these features are accessed
    if omit_extra_lookups:
        nlp.vocab.lookups_extra = Lookups()
        nlp.vocab.lookups_extra.add_table("lexeme_cluster")
        nlp.vocab.lookups_extra.add_table("lexeme_prob")
        nlp.vocab.lookups_extra.add_table("lexeme_settings")

    if vectors:
        msg.text("Loading vector from model '{}'".format(vectors))
        _load_vectors(nlp, vectors)

    # Multitask objectives
    multitask_options = [("parser", parser_multitasks), ("ner", entity_multitasks)]
    for pipe_name, multitasks in multitask_options:
        if multitasks:
            if pipe_name not in pipeline:
                # Exit here: continuing would only fail on the get_pipe()
                # lookup below for a component that is not in the pipeline.
                msg.fail(
                    "Can't use multitask objective without '{}' in the "
                    "pipeline".format(pipe_name),
                    exits=1,
                )
            pipe = nlp.get_pipe(pipe_name)
            for objective in multitasks.split(","):
                pipe.add_multitask_objective(objective)

    # Prepare training corpus
    msg.text("Counting training words (limit={})".format(n_examples))
    corpus = GoldCorpus(train_path, dev_path, limit=n_examples)
    n_train_words = corpus.count_train()

    if base_model and not pipes_added:
        # Start with an existing model, use default optimizer
        optimizer = nlp.resume_training(device=use_gpu)
    else:
        # Start with a blank model, call begin_training
        cfg = {"device": use_gpu}
        cfg["conv_depth"] = conv_depth
        cfg["token_vector_width"] = width
        cfg["bilstm_depth"] = bilstm_depth
        cfg["cnn_maxout_pieces"] = cnn_pieces
        cfg["embed_size"] = embed_rows
        cfg["conv_window"] = cnn_window
        optimizer = nlp.begin_training(lambda: corpus.train_tuples, **cfg)

    nlp._optimizer = None

    # Load in pretrained weights
    if init_tok2vec is not None:
        components = _load_pretrained_tok2vec(nlp, init_tok2vec, base_components)
        msg.text("Loaded pretrained tok2vec for: {}".format(components))

    # Verify textcat config
    if "textcat" in pipeline:
        textcat_labels = nlp.get_pipe("textcat").cfg.get("labels", [])
        if textcat_positive_label and textcat_positive_label not in textcat_labels:
            msg.fail(
                "The textcat_positive_label (tpl) '{}' does not match any "
                "label in the training data.".format(textcat_positive_label),
                exits=1,
            )
        if textcat_positive_label and len(textcat_labels) != 2:
            msg.fail(
                "A textcat_positive_label (tpl) '{}' was provided for training "
                "data that does not appear to be a binary classification "
                "problem with two labels.".format(textcat_positive_label),
                exits=1,
            )
        train_docs = corpus.train_docs(
            nlp,
            noise_level=noise_level,
            gold_preproc=gold_preproc,
            max_length=0,
            ignore_misaligned=True,
        )
        train_labels = set()
        if textcat_multilabel:
            multilabel_found = False
            for text, gold in train_docs:
                train_labels.update(gold.cats.keys())
                # Anything other than exactly one positive category means
                # the classes are not mutually exclusive.
                if list(gold.cats.values()).count(1.0) != 1:
                    multilabel_found = True
            if not multilabel_found and not base_model:
                msg.warn(
                    "The textcat training instances look like they have "
                    "mutually-exclusive classes. Remove the flag "
                    "'--textcat-multilabel' to train a classifier with "
                    "mutually-exclusive classes."
                )
        if not textcat_multilabel:
            for text, gold in train_docs:
                train_labels.update(gold.cats.keys())
                if list(gold.cats.values()).count(1.0) != 1 and not base_model:
                    msg.warn(
                        "Some textcat training instances do not have exactly "
                        "one positive label. Modifying training options to "
                        "include the flag '--textcat-multilabel' for classes "
                        "that are not mutually exclusive."
                    )
                    nlp.get_pipe("textcat").cfg["exclusive_classes"] = False
                    textcat_multilabel = True
                    break
        if base_model and set(textcat_labels) != train_labels:
            msg.fail(
                "Cannot extend textcat model using data with different "
                "labels. Base model labels: {}, training data labels: "
                "{}.".format(textcat_labels, list(train_labels)),
                exits=1,
            )
        # Tell the user which textcat score will be reported.
        if textcat_multilabel:
            msg.text(
                "Textcat evaluation score: ROC AUC score macro-averaged across "
                "the labels '{}'".format(", ".join(textcat_labels))
            )
        elif textcat_positive_label and len(textcat_labels) == 2:
            msg.text(
                "Textcat evaluation score: F1-score for the "
                "label '{}'".format(textcat_positive_label)
            )
        elif len(textcat_labels) > 1:
            if len(textcat_labels) == 2:
                msg.warn(
                    "If the textcat component is a binary classifier with "
                    "exclusive classes, provide '--textcat-positive-label' for "
                    "an evaluation on the positive class."
                )
            msg.text(
                "Textcat evaluation score: F1-score macro-averaged across "
                "the labels '{}'".format(", ".join(textcat_labels))
            )
        else:
            msg.fail(
                "Unsupported textcat configuration. Use `spacy debug-data` "
                "for more information."
            )

    # fmt: off
    row_head, output_stats = _configure_training_output(pipeline, use_gpu, has_beam_widths)
    row_widths = [len(w) for w in row_head]
    row_settings = {"widths": row_widths, "aligns": tuple(["r" for i in row_head]), "spacing": 2}
    # fmt: on
    print("")
    msg.row(row_head, **row_settings)
    msg.row(["-" * width for width in row_settings["widths"]], **row_settings)
    try:
        iter_since_best = 0
        best_score = 0.0
        for i in range(n_iter):
            train_docs = corpus.train_docs(
                nlp,
                noise_level=noise_level,
                orth_variant_level=orth_variant_level,
                gold_preproc=gold_preproc,
                max_length=0,
                ignore_misaligned=True,
            )
            if raw_text:
                random.shuffle(raw_text)
                raw_batches = util.minibatch(
                    (nlp.make_doc(rt["text"]) for rt in raw_text), size=8
                )
            words_seen = 0
            with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
                losses = {}
                for batch in util.minibatch_by_words(train_docs, size=batch_sizes):
                    if not batch:
                        continue
                    docs, golds = zip(*batch)
                    try:
                        nlp.update(
                            docs,
                            golds,
                            sgd=optimizer,
                            drop=next(dropout_rates),
                            losses=losses,
                        )
                    except ValueError as e:
                        err = "Error during training"
                        if init_tok2vec:
                            err += " Did you provide the same parameters during 'train' as during 'pretrain'?"
                        msg.fail(err, "Original error message: {}".format(e), exits=1)
                    if raw_text:
                        # If raw text is available, perform 'rehearsal' updates,
                        # which use unlabelled data to reduce overfitting.
                        raw_batch = list(next(raw_batches))
                        nlp.rehearse(raw_batch, sgd=optimizer, losses=losses)
                    # The progress bar is skipped when LOG_FRIENDLY is set
                    # to a non-zero integer in the environment.
                    if not int(os.environ.get("LOG_FRIENDLY", 0)):
                        pbar.update(sum(len(doc) for doc in docs))
                    words_seen += sum(len(doc) for doc in docs)
            # Save and evaluate with the optimizer's averaged parameters.
            with nlp.use_params(optimizer.averages):
                util.set_env_log(False)
                epoch_model_path = output_path / ("model%d" % i)
                nlp.to_disk(epoch_model_path)
                nlp_loaded = util.load_model_from_path(epoch_model_path)
                for beam_width in eval_beam_widths:
                    for name, component in nlp_loaded.pipeline:
                        if hasattr(component, "cfg"):
                            component.cfg["beam_width"] = beam_width
                    dev_docs = list(
                        corpus.dev_docs(
                            nlp_loaded,
                            gold_preproc=gold_preproc,
                            ignore_misaligned=True,
                        )
                    )
                    nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
                    start_time = timer()
                    scorer = nlp_loaded.evaluate(dev_docs, verbose=verbose)
                    end_time = timer()
                    if use_gpu < 0:
                        gpu_wps = None
                        cpu_wps = nwords / (end_time - start_time)
                    else:
                        gpu_wps = nwords / (end_time - start_time)
                        # Only evaluate on CPU in the first iteration (for
                        # timing) if GPU is enabled
                        if i == 0:
                            with Model.use_device("cpu"):
                                nlp_loaded = util.load_model_from_path(
                                    epoch_model_path
                                )
                                for name, component in nlp_loaded.pipeline:
                                    if hasattr(component, "cfg"):
                                        component.cfg["beam_width"] = beam_width
                                dev_docs = list(
                                    corpus.dev_docs(
                                        nlp_loaded,
                                        gold_preproc=gold_preproc,
                                        ignore_misaligned=True,
                                    )
                                )
                                start_time = timer()
                                scorer = nlp_loaded.evaluate(dev_docs, verbose=verbose)
                                end_time = timer()
                                cpu_wps = nwords / (end_time - start_time)
                    acc_loc = output_path / ("model%d" % i) / "accuracy.json"
                    srsly.write_json(acc_loc, scorer.scores)

                    # Update model meta.json
                    meta["lang"] = nlp.lang
                    meta["pipeline"] = nlp.pipe_names
                    meta["spacy_version"] = ">=%s" % about.__version__
                    if beam_width == 1:
                        meta["speed"] = {
                            "nwords": nwords,
                            "cpu": cpu_wps,
                            "gpu": gpu_wps,
                        }
                        meta.setdefault("accuracy", {})
                        for component in nlp.pipe_names:
                            for metric in _get_metrics(component):
                                meta["accuracy"][metric] = scorer.scores[metric]
                    else:
                        meta.setdefault("beam_accuracy", {})
                        meta.setdefault("beam_speed", {})
                        for component in nlp.pipe_names:
                            for metric in _get_metrics(component):
                                meta["beam_accuracy"][metric] = scorer.scores[metric]
                        meta["beam_speed"][beam_width] = {
                            "nwords": nwords,
                            "cpu": cpu_wps,
                            "gpu": gpu_wps,
                        }
                    meta["vectors"] = {
                        "width": nlp.vocab.vectors_length,
                        "vectors": len(nlp.vocab.vectors),
                        "keys": nlp.vocab.vectors.n_keys,
                        "name": nlp.vocab.vectors.name,
                    }
                    meta.setdefault("name", "model%d" % i)
                    meta.setdefault("version", version)
                    meta["labels"] = nlp.meta["labels"]
                    meta_loc = output_path / ("model%d" % i) / "meta.json"
                    srsly.write_json(meta_loc, meta)
                    util.set_env_log(verbose)

                    progress = _get_progress(
                        i,
                        losses,
                        scorer.scores,
                        output_stats,
                        beam_width=beam_width if has_beam_widths else None,
                        cpu_wps=cpu_wps,
                        gpu_wps=gpu_wps,
                    )
                    if i == 0 and "textcat" in pipeline:
                        textcats_per_cat = scorer.scores.get("textcats_per_cat", {})
                        for cat, cat_score in textcats_per_cat.items():
                            if cat_score.get("roc_auc_score", 0) < 0:
                                msg.warn(
                                    "Textcat ROC AUC score is undefined due to "
                                    "only one value in label '{}'.".format(cat)
                                )
                    msg.row(progress, **row_settings)
                # Early stopping
                if n_early_stopping is not None:
                    current_score = _score_for_model(meta)
                    if current_score < best_score:
                        iter_since_best += 1
                    else:
                        iter_since_best = 0
                        best_score = current_score
                    if iter_since_best >= n_early_stopping:
                        iter_current = i + 1
                        msg.text(
                            "Early stopping, best iteration "
                            "is: {}".format(iter_current - iter_since_best)
                        )
                        msg.text(
                            "Best score = {}; Final iteration "
                            "score = {}".format(best_score, current_score)
                        )
                        break
    except Exception as e:
        msg.warn(
            "Aborting and saving the final best model. "
            "Encountered exception: {}".format(e),
            exits=1,
        )
    finally:
        # Runs on normal completion, on early stopping and on the exit paths
        # above, so a final model is written in all of these cases.
        best_pipes = nlp.pipe_names
        if disabled_pipes:
            disabled_pipes.restore()
            meta["pipeline"] = nlp.pipe_names
        with nlp.use_params(optimizer.averages):
            final_model_path = output_path / "model-final"
            nlp.to_disk(final_model_path)
            srsly.write_json(final_model_path / "meta.json", meta)

            meta_loc = output_path / "model-final" / "meta.json"
            final_meta = srsly.read_json(meta_loc)
            final_meta.setdefault("accuracy", {})
            final_meta["accuracy"].update(meta.get("accuracy", {}))
            final_meta.setdefault("speed", {})
            final_meta["speed"].setdefault("cpu", None)
            final_meta["speed"].setdefault("gpu", None)
            meta.setdefault("speed", {})
            meta["speed"].setdefault("cpu", None)
            meta["speed"].setdefault("gpu", None)
            # combine cpu and gpu speeds with the base model speeds
            if final_meta["speed"]["cpu"] and meta["speed"]["cpu"]:
                speed = _get_total_speed(
                    [final_meta["speed"]["cpu"], meta["speed"]["cpu"]]
                )
                final_meta["speed"]["cpu"] = speed
            if final_meta["speed"]["gpu"] and meta["speed"]["gpu"]:
                speed = _get_total_speed(
                    [final_meta["speed"]["gpu"], meta["speed"]["gpu"]]
                )
                final_meta["speed"]["gpu"] = speed
            # if there were no speeds to update, overwrite with meta
            if (
                final_meta["speed"]["cpu"] is None
                and final_meta["speed"]["gpu"] is None
            ):
                final_meta["speed"].update(meta["speed"])
            # note: beam speeds are not combined with the base model
            if has_beam_widths:
                final_meta.setdefault("beam_accuracy", {})
                final_meta["beam_accuracy"].update(meta.get("beam_accuracy", {}))
                final_meta.setdefault("beam_speed", {})
                final_meta["beam_speed"].update(meta.get("beam_speed", {}))
            srsly.write_json(meta_loc, final_meta)
        msg.good("Saved model to output directory", final_model_path)
        with msg.loading("Creating best model..."):
            best_model_path = _collate_best_model(final_meta, output_path, best_pipes)
        msg.good("Created best model", best_model_path)
def build(
    # fmt: off
    repo: str,
    commit: str,
    package_name: str = Option(None, help="Package name (if different from repo)"),
    py35: bool = Option(False, "--py35", help="Build wheels for Python 3.5"),
    llvm: bool = Option(False, "--llvm", help="Requires LLVM to be installed"),
    rust: bool = Option(False, "--rust", help="Requires Rust to be installed"),
    universal: bool = Option(False, "--universal", help="Build universal (pure Python) wheel and sdist"),
    skip_tests: bool = Option(False, "--skip-tests", help="Don't run tests (e.g. if package doesn't have any)"),
    build_constraints: bool = Option(False, "--build-constraints", help="Use build constraints for build requirements"),
    # fmt: on
):
    """Build wheels for a given repo and commit / tag."""
    # Steps: derive names from "<user>/<package>", pick a release name that
    # is not taken yet, create that release with the build spec in its
    # description, then push a branch whose only change on top of master is
    # a build-spec.json file.
    print(LOGO)
    repo_id = get_repo_id()
    user, package = repo.lower().split("/", 1)
    if package_name is None:
        # Default package name: the repo name with dashes as underscores.
        package_name = package.replace("-", "_")
    msg.info(f"Building in repo {repo_id}")
    msg.info(f"Building wheels for {user}/{package}\n")
    if universal:
        msg.warn("Building only universal sdist and wheel, no cross-platform wheels")
    if skip_tests:
        msg.warn("Not running any tests")
    clone_url = DEFAULT_CLONE_TEMPLATE.format(f"{user}/{package}")
    gh_repo = get_gh().get_repo(repo_id)

    with msg.loading("Finding a unique name for this release..."):
        # Try "<package>-<commit>" first, then "-2", "-3", ... until the
        # release lookup reports that the name does not exist.
        attempt = 1
        release_name = f"{package_name}-{commit}"
        while True:
            try:
                gh_repo.get_release(release_name)
            except github.UnknownObjectException:
                break
            attempt += 1
            release_name = f"{package_name}-{commit}-{attempt}"

    branch_name = f"branch-for-{release_name}"
    build_options = {
        "llvm": llvm,
        "rust": rust,
        "py35": py35,
        "universal": universal,
        "skip_tests": skip_tests,
        "build_constraints": build_constraints,
    }
    upload_target = {
        "type": "github-release",
        "repo-id": repo_id,
        "release-id": release_name,
    }
    build_spec = {
        "clone-url": clone_url,
        "package-name": package_name,
        "commit": commit,
        "options": build_options,
        "upload-to": upload_target,
    }
    # Compact form goes into the committed file, indented form into the
    # release description.
    spec_compact = json.dumps(build_spec)
    spec_pretty = json.dumps(build_spec, indent=4)

    msg.text(f"Creating release {release_name} to collect assets")
    release_text = f"https://github.com/{user}/{package}\n\n### Build spec\n\n```json\n{spec_pretty}\n```"
    release = gh_repo.create_git_release(release_name, release_name, release_text)

    with msg.loading("Creating build branch..."):
        # 'master' is a 'Commit'. 'master.commit' is a 'GitCommit'. These are
        # different types that are mostly *not* interchangeable:
        # https://pygithub.readthedocs.io/en/latest/github_objects/Commit.html
        # https://pygithub.readthedocs.io/en/latest/github_objects/GitCommit.html
        master_gitcommit = gh_repo.get_commit("master").commit
        spec_file = github.InputGitTreeElement(
            "build-spec.json",
            "100644",
            "blob",
            content=spec_compact,
        )
        tree = gh_repo.create_git_tree([spec_file], master_gitcommit.tree)
        our_gitcommit = gh_repo.create_git_commit(
            f"Building: {release_name}", tree, [master_gitcommit]
        )
        gh_repo.create_git_ref(f"refs/heads/{branch_name}", our_gitcommit.sha)
    msg.good(f"Commit is {our_gitcommit.sha[:8]} in branch {branch_name}")
    msg.text(f"Release: {release.html_url}")
    msg.text(f"Checks: https://github.com/{repo_id}/commit/{our_gitcommit.sha}/checks")
def debug_data(
    config_path: Path,
    *,
    config_overrides: Dict[str, Any] = {},
    ignore_warnings: bool = False,
    verbose: bool = False,
    no_format: bool = True,
    silent: bool = True,
):
    """Analyze the training and dev data of a config and report problems.

    Loads the config and pipeline, initializes the pipeline with the training
    corpus, then prints statistics and warnings per component type (NER,
    textcat, textcat_multilabel, tagger, morphologizer, parser) followed by
    a summary. Calls ``sys.exit(1)`` if any check was reported as failed.

    config_path (Path): Path to the training config file.
    config_overrides (Dict[str, Any]): Overrides passed to the config loader.
    ignore_warnings (bool): Passed on to the printer as `ignore_warnings`.
    verbose (bool): Also show the detail messages marked `show=verbose`.
    no_format (bool): Disables pretty formatting of the printer.
    silent (bool): Passed to the printer as `no_print`.
    """
    msg = Printer(
        no_print=silent, pretty=not no_format, ignore_warnings=ignore_warnings
    )
    # Make sure all files and paths exists if they are needed
    with show_validation_error(config_path):
        cfg = util.load_config(config_path, overrides=config_overrides)
        nlp = util.load_model_from_config(cfg)
        config = nlp.config.interpolate()
        T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
    # Use original config here, not resolved version
    sourced_components = get_sourced_components(cfg)
    frozen_components = T["frozen_components"]
    resume_components = [p for p in sourced_components if p not in frozen_components]
    pipeline = nlp.pipe_names
    factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names]
    msg.divider("Data file validation")

    # Create the gold corpus to be able to better analyze data
    dot_names = [T["train_corpus"], T["dev_corpus"]]
    train_corpus, dev_corpus = resolve_dot_names(config, dot_names)

    nlp.initialize(lambda: train_corpus(nlp))
    msg.good("Pipeline can be initialized with data")

    train_dataset = list(train_corpus(nlp))
    dev_dataset = list(dev_corpus(nlp))
    msg.good("Corpus is loadable")

    # Create all gold data here to avoid iterating over the train_dataset constantly
    gold_train_data = _compile_gold(train_dataset, factory_names, nlp, make_proj=True)
    gold_train_unpreprocessed_data = _compile_gold(
        train_dataset, factory_names, nlp, make_proj=False
    )
    gold_dev_data = _compile_gold(dev_dataset, factory_names, nlp, make_proj=True)

    train_texts = gold_train_data["texts"]
    dev_texts = gold_dev_data["texts"]

    msg.divider("Training stats")
    msg.text(f"Language: {nlp.lang}")
    msg.text(f"Training pipeline: {', '.join(pipeline)}")
    if resume_components:
        msg.text(f"Components from other pipelines: {', '.join(resume_components)}")
    if frozen_components:
        msg.text(f"Frozen components: {', '.join(frozen_components)}")
    msg.text(f"{len(train_dataset)} training docs")
    msg.text(f"{len(dev_dataset)} evaluation docs")

    # Check the list of dev examples itself: the compiled gold dict is
    # indexed by fixed keys regardless of the data, so its length cannot be
    # used to detect a missing dev set.
    if not dev_dataset:
        msg.fail("No evaluation docs")
    overlap = len(train_texts.intersection(dev_texts))
    if overlap:
        msg.warn(f"{overlap} training examples also in evaluation data")
    else:
        msg.good("No overlap between training and evaluation data")
    # TODO: make this feedback more fine-grained and report on updated
    # components vs. blank components
    if not resume_components and len(train_dataset) < BLANK_MODEL_THRESHOLD:
        text = f"Low number of examples to train a new pipeline ({len(train_dataset)})"
        if len(train_dataset) < BLANK_MODEL_MIN_THRESHOLD:
            msg.fail(text)
        else:
            msg.warn(text)
        msg.text(
            f"It's recommended to use at least {BLANK_MODEL_THRESHOLD} examples "
            f"(minimum {BLANK_MODEL_MIN_THRESHOLD})",
            show=verbose,
        )

    msg.divider("Vocab & Vectors")
    n_words = gold_train_data["n_words"]
    msg.info(
        f"{n_words} total word(s) in the data ({len(gold_train_data['words'])} unique)"
    )
    if gold_train_data["n_misaligned_words"] > 0:
        n_misaligned = gold_train_data["n_misaligned_words"]
        msg.warn(f"{n_misaligned} misaligned tokens in the training data")
    if gold_dev_data["n_misaligned_words"] > 0:
        n_misaligned = gold_dev_data["n_misaligned_words"]
        msg.warn(f"{n_misaligned} misaligned tokens in the dev data")
    most_common_words = gold_train_data["words"].most_common(10)
    msg.text(
        f"10 most common words: {_format_labels(most_common_words, counts=True)}",
        show=verbose,
    )
    if len(nlp.vocab.vectors):
        msg.info(
            f"{len(nlp.vocab.vectors)} vectors ({nlp.vocab.vectors.n_keys} "
            f"unique keys, {nlp.vocab.vectors_length} dimensions)"
        )
        n_missing_vectors = sum(gold_train_data["words_missing_vectors"].values())
        msg.warn(
            "{} words in training data without vectors ({:.0f}%)".format(
                n_missing_vectors,
                100 * (n_missing_vectors / gold_train_data["n_words"]),
            ),
        )
        msg.text(
            "10 most common words without vectors: {}".format(
                _format_labels(
                    gold_train_data["words_missing_vectors"].most_common(10),
                    counts=True,
                )
            ),
            show=verbose,
        )
    else:
        msg.info("No word vectors present in the package")

    if "ner" in factory_names:
        # Get all unique NER labels present in the data
        labels = set(
            label for label in gold_train_data["ner"] if label not in ("O", "-", None)
        )
        label_counts = gold_train_data["ner"]
        model_labels = _get_labels_from_model(nlp, "ner")
        has_low_data_warning = False
        has_no_neg_warning = False
        has_ws_ents_error = False
        has_boundary_cross_ents_warning = False

        msg.divider("Named Entity Recognition")
        msg.info(f"{len(model_labels)} label(s)")
        missing_values = label_counts["-"]
        msg.text(f"{missing_values} missing value(s) (tokens with '-' label)")
        for label in labels:
            if len(label) == 0:
                msg.fail("Empty label found in train data")
        # NOTE(review): labels_with_counts is computed but not used by the
        # message below, which prints the labels without counts.
        labels_with_counts = [
            (label, count)
            for label, count in label_counts.most_common()
            if label != "-"
        ]
        labels_with_counts = _format_labels(labels_with_counts, counts=True)
        msg.text(f"Labels in train data: {_format_labels(labels)}", show=verbose)
        missing_labels = model_labels - labels
        if missing_labels:
            msg.warn(
                "Some model labels are not present in the train data. The "
                "model performance may be degraded for these labels after "
                f"training: {_format_labels(missing_labels)}."
            )
        if gold_train_data["ws_ents"]:
            msg.fail(f"{gold_train_data['ws_ents']} invalid whitespace entity spans")
            has_ws_ents_error = True

        for label in labels:
            if label_counts[label] <= NEW_LABEL_THRESHOLD:
                msg.warn(
                    f"Low number of examples for label '{label}' ({label_counts[label]})"
                )
                has_low_data_warning = True

                with msg.loading("Analyzing label distribution..."):
                    neg_docs = _get_examples_without_label(train_dataset, label)
                if neg_docs == 0:
                    msg.warn(f"No examples for texts WITHOUT new label '{label}'")
                    has_no_neg_warning = True

        if gold_train_data["boundary_cross_ents"]:
            msg.warn(
                f"{gold_train_data['boundary_cross_ents']} entity span(s) crossing sentence boundaries"
            )
            has_boundary_cross_ents_warning = True

        if not has_low_data_warning:
            msg.good("Good amount of examples for all labels")
        if not has_no_neg_warning:
            msg.good("Examples without occurrences available for all labels")
        if not has_ws_ents_error:
            msg.good("No entities consisting of or starting/ending with whitespace")
        if not has_boundary_cross_ents_warning:
            msg.good("No entities crossing sentence boundaries")

        if has_low_data_warning:
            msg.text(
                f"To train a new entity type, your data should include at "
                f"least {NEW_LABEL_THRESHOLD} instances of the new label",
                show=verbose,
            )
        if has_no_neg_warning:
            msg.text(
                "Training data should always include examples of entities "
                "in context, as well as examples without a given entity "
                "type.",
                show=verbose,
            )
        if has_ws_ents_error:
            msg.text(
                "Entity spans consisting of or starting/ending "
                "with whitespace characters are considered invalid."
            )

    if "textcat" in factory_names:
        msg.divider("Text Classification (Exclusive Classes)")
        labels = _get_labels_from_model(nlp, "textcat")
        msg.info(f"Text Classification: {len(labels)} label(s)")
        msg.text(f"Labels: {_format_labels(labels)}", show=verbose)
        missing_labels = labels - set(gold_train_data["cats"])
        if missing_labels:
            msg.warn(
                "Some model labels are not present in the train data. The "
                "model performance may be degraded for these labels after "
                f"training: {_format_labels(missing_labels)}."
            )
        if set(gold_train_data["cats"]) != set(gold_dev_data["cats"]):
            msg.warn(
                "Potential train/dev mismatch: the train and dev labels are "
                "not the same. "
                f"Train labels: {_format_labels(gold_train_data['cats'])}. "
                f"Dev labels: {_format_labels(gold_dev_data['cats'])}."
            )
        if len(labels) < 2:
            msg.fail(
                "The model does not have enough labels. 'textcat' requires at "
                "least two labels due to mutually-exclusive classes, e.g. "
                "LABEL/NOT_LABEL or POSITIVE/NEGATIVE for a binary "
                "classification task."
            )
        if (
            gold_train_data["n_cats_bad_values"] > 0
            or gold_dev_data["n_cats_bad_values"] > 0
        ):
            msg.fail(
                "Unsupported values for cats: the supported values are "
                "1.0/True and 0.0/False."
            )
        if gold_train_data["n_cats_multilabel"] > 0:
            # Note: you should never get here because you run into E895 on
            # initialization first.
            msg.fail(
                "The train data contains instances without mutually-exclusive "
                "classes. Use the component 'textcat_multilabel' instead of "
                "'textcat'."
            )
        if gold_dev_data["n_cats_multilabel"] > 0:
            msg.fail(
                "The dev data contains instances without mutually-exclusive "
                "classes. Use the component 'textcat_multilabel' instead of "
                "'textcat'."
            )

    if "textcat_multilabel" in factory_names:
        msg.divider("Text Classification (Multilabel)")
        labels = _get_labels_from_model(nlp, "textcat_multilabel")
        msg.info(f"Text Classification: {len(labels)} label(s)")
        msg.text(f"Labels: {_format_labels(labels)}", show=verbose)
        missing_labels = labels - set(gold_train_data["cats"])
        if missing_labels:
            msg.warn(
                "Some model labels are not present in the train data. The "
                "model performance may be degraded for these labels after "
                f"training: {_format_labels(missing_labels)}."
            )
        if set(gold_train_data["cats"]) != set(gold_dev_data["cats"]):
            msg.warn(
                "Potential train/dev mismatch: the train and dev labels are "
                "not the same. "
                f"Train labels: {_format_labels(gold_train_data['cats'])}. "
                f"Dev labels: {_format_labels(gold_dev_data['cats'])}."
            )
        if (
            gold_train_data["n_cats_bad_values"] > 0
            or gold_dev_data["n_cats_bad_values"] > 0
        ):
            msg.fail(
                "Unsupported values for cats: the supported values are "
                "1.0/True and 0.0/False."
            )
        if gold_train_data["n_cats_multilabel"] > 0:
            if gold_dev_data["n_cats_multilabel"] == 0:
                msg.warn(
                    "Potential train/dev mismatch: the train data contains "
                    "instances without mutually-exclusive classes while the "
                    "dev data contains only instances with mutually-exclusive "
                    "classes."
                )
        else:
            msg.warn(
                "The train data contains only instances with "
                "mutually-exclusive classes. You can potentially use the "
                "component 'textcat' instead of 'textcat_multilabel'."
            )
            if gold_dev_data["n_cats_multilabel"] > 0:
                msg.fail(
                    "Train/dev mismatch: the dev data contains instances "
                    "without mutually-exclusive classes while the train data "
                    "contains only instances with mutually-exclusive classes."
                )

    if "tagger" in factory_names:
        msg.divider("Part-of-speech Tagging")
        label_list = [label for label in gold_train_data["tags"]]
        model_labels = _get_labels_from_model(nlp, "tagger")
        msg.info(f"{len(label_list)} label(s) in train data")
        labels = set(label_list)
        missing_labels = model_labels - labels
        if missing_labels:
            msg.warn(
                "Some model labels are not present in the train data. The "
                "model performance may be degraded for these labels after "
                f"training: {_format_labels(missing_labels)}."
            )
        labels_with_counts = _format_labels(
            gold_train_data["tags"].most_common(), counts=True
        )
        msg.text(labels_with_counts, show=verbose)

    if "morphologizer" in factory_names:
        msg.divider("Morphologizer (POS+Morph)")
        label_list = [label for label in gold_train_data["morphs"]]
        model_labels = _get_labels_from_model(nlp, "morphologizer")
        msg.info(f"{len(label_list)} label(s) in train data")
        labels = set(label_list)
        missing_labels = model_labels - labels
        if missing_labels:
            msg.warn(
                "Some model labels are not present in the train data. The "
                "model performance may be degraded for these labels after "
                f"training: {_format_labels(missing_labels)}."
            )
        labels_with_counts = _format_labels(
            gold_train_data["morphs"].most_common(), counts=True
        )
        msg.text(labels_with_counts, show=verbose)

    if "parser" in factory_names:
        has_low_data_warning = False
        msg.divider("Dependency Parsing")

        # profile sentence length
        msg.info(
            f"Found {gold_train_data['n_sents']} sentence(s) with an average "
            f"length of {gold_train_data['n_words'] / gold_train_data['n_sents']:.1f} words."
        )

        # check for documents with multiple sentences
        sents_per_doc = gold_train_data["n_sents"] / len(gold_train_data["texts"])
        if sents_per_doc < 1.1:
            msg.warn(
                f"The training data contains {sents_per_doc:.2f} sentences per "
                f"document. When there are very few documents containing more "
                f"than one sentence, the parser will not learn how to segment "
                f"longer texts into sentences."
            )

        # profile labels
        labels_train = [label for label in gold_train_data["deps"]]
        labels_train_unpreprocessed = [
            label for label in gold_train_unpreprocessed_data["deps"]
        ]
        labels_dev = [label for label in gold_dev_data["deps"]]

        if gold_train_unpreprocessed_data["n_nonproj"] > 0:
            n_nonproj = gold_train_unpreprocessed_data["n_nonproj"]
            msg.info(f"Found {n_nonproj} nonprojective train sentence(s)")
        if gold_dev_data["n_nonproj"] > 0:
            n_nonproj = gold_dev_data["n_nonproj"]
            msg.info(f"Found {n_nonproj} nonprojective dev sentence(s)")
        msg.info(f"{len(labels_train_unpreprocessed)} label(s) in train data")
        msg.info(f"{len(labels_train)} label(s) in projectivized train data")
        labels_with_counts = _format_labels(
            gold_train_unpreprocessed_data["deps"].most_common(), counts=True
        )
        msg.text(labels_with_counts, show=verbose)

        # rare labels in train
        for label in gold_train_unpreprocessed_data["deps"]:
            if gold_train_unpreprocessed_data["deps"][label] <= DEP_LABEL_THRESHOLD:
                msg.warn(
                    f"Low number of examples for label '{label}' "
                    f"({gold_train_unpreprocessed_data['deps'][label]})"
                )
                has_low_data_warning = True

        # rare labels in projectivized train
        rare_projectivized_labels = []
        for label in gold_train_data["deps"]:
            if (
                gold_train_data["deps"][label] <= DEP_LABEL_THRESHOLD
                and DELIMITER in label
            ):
                rare_projectivized_labels.append(
                    f"{label}: {gold_train_data['deps'][label]}"
                )

        if len(rare_projectivized_labels) > 0:
            msg.warn(
                f"Low number of examples for {len(rare_projectivized_labels)} "
                "label(s) in the projectivized dependency trees used for "
                "training. You may want to projectivize labels such as punct "
                "before training in order to improve parser performance."
            )
            msg.warn(
                "Projectivized labels with low numbers of examples: ",
                ", ".join(rare_projectivized_labels),
                show=verbose,
            )
            has_low_data_warning = True

        # labels only in train
        if set(labels_train) - set(labels_dev):
            msg.warn(
                "The following labels were found only in the train data:",
                ", ".join(set(labels_train) - set(labels_dev)),
                show=verbose,
            )

        # labels only in dev
        if set(labels_dev) - set(labels_train):
            msg.warn(
                "The following labels were found only in the dev data:",
                ", ".join(set(labels_dev) - set(labels_train)),
                show=verbose,
            )

        if has_low_data_warning:
            msg.text(
                f"To train a parser, your data should include at "
                f"least {DEP_LABEL_THRESHOLD} instances of each label.",
                show=verbose,
            )

        # multiple root labels
        if len(gold_train_unpreprocessed_data["roots"]) > 1:
            msg.warn(
                f"Multiple root labels "
                f"({', '.join(gold_train_unpreprocessed_data['roots'])}) "
                f"found in training data. spaCy's parser uses a single root "
                f"label ROOT so this distinction will not be available."
            )

        # these should not happen, but just in case
        if gold_train_data["n_nonproj"] > 0:
            msg.fail(
                f"Found {gold_train_data['n_nonproj']} nonprojective "
                f"projectivized train sentence(s)"
            )
        if gold_train_data["n_cycles"] > 0:
            msg.fail(
                f"Found {gold_train_data['n_cycles']} projectivized train sentence(s) with cycles"
            )

    # The printer keeps a count per message type; the summary is built from
    # those counts and any failure makes the process exit with status 1.
    msg.divider("Summary")
    good_counts = msg.counts[MESSAGES.GOOD]
    warn_counts = msg.counts[MESSAGES.WARN]
    fail_counts = msg.counts[MESSAGES.FAIL]
    if good_counts:
        msg.good(f"{good_counts} {'check' if good_counts == 1 else 'checks'} passed")
    if warn_counts:
        msg.warn(f"{warn_counts} {'warning' if warn_counts == 1 else 'warnings'}")
    if fail_counts:
        msg.fail(f"{fail_counts} {'error' if fail_counts == 1 else 'errors'}")
        sys.exit(1)
def fix_annotations(
    example: Example,
    corrections: List[Correction],
    case_sensitive: bool = False,
    dryrun: bool = False,
) -> Example:
    """Fix the span annotations of an Example according to a list of corrections.

    This function will NOT add annotations to your data.
    It will only remove erroneous annotations and fix the
    labels for specific spans. Unless `dryrun` is set, the spans of the
    given example are modified in place and the same object is returned.
    The corrections themselves are never modified.

    Args:
        example (Example): Input Example
        corrections (List[Correction]): Corrections to apply. A correction
            matches a span when the span text equals its `annotation` and the
            span label is in its `from_labels` (or `from_labels` contains
            "ANY"). The span is relabelled to `to_label`, or removed if
            `to_label` is None.
        case_sensitive (bool, optional): Consider case of text for each correction
        dryrun (bool, optional): Treat corrections as a dryrun and just print all changes to be made

    Returns:
        Example: Example with fixed annotations
    """
    # Normalise the lookup keys instead of lowercasing the caller's
    # Correction objects in place.
    corrections_map: Dict[str, Correction] = {
        (c.annotation if case_sensitive else c.annotation.lower()): c
        for c in corrections
    }
    prints: List[str] = []
    ents_to_remove: List[int] = []

    for i, s in enumerate(example.spans):
        t = s.text if case_sensitive else s.text.lower()
        c = corrections_map.get(t)
        if c is None:
            continue
        if s.label not in c.from_labels and "ANY" not in c.from_labels:
            continue
        if c.to_label is None:
            # A missing target label means "delete". This also covers the
            # "ANY" wildcard, which must not relabel the span to None.
            if dryrun:
                prints.append(f"Deleting span: {s.text}")
            else:
                ents_to_remove.append(i)
        elif dryrun:
            prints.append(
                f"Correction span: {s.text} from labels: {c.from_labels} to label: {c.to_label}"
            )
        else:
            s.label = cast(str, c.to_label)

    # Delete from the back so the remaining indices stay valid.
    for idx in reversed(ents_to_remove):
        del example.spans[idx]

    if dryrun:
        msg.divider("Example Text")
        msg.text(example.text)
        for line in prints:
            msg.text(line)

    return example