def train(
    config_path: Union[str, Path],
    output_path: Optional[Union[str, Path]] = None,
    *,
    use_gpu: int = -1,
    overrides: Dict[str, Any] = util.SimpleFrozenDict(),
):
    config_path = util.ensure_path(config_path)
    output_path = util.ensure_path(output_path)
    # Make sure all files and paths exist if they are needed
    if not config_path or (str(config_path) != "-" and not config_path.exists()):
        msg.fail("Config file not found", config_path, exits=1)
    if not output_path:
        msg.info("No output directory provided")
    else:
        if not output_path.exists():
            output_path.mkdir(parents=True)
            msg.good(f"Created output directory: {output_path}")
        msg.info(f"Saving to output directory: {output_path}")
    setup_gpu(use_gpu)
    with show_validation_error(config_path):
        config = util.load_config(config_path, overrides=overrides, interpolate=False)
    msg.divider("Initializing pipeline")
    with show_validation_error(config_path, hint_fill=False):
        nlp = init_nlp(config, use_gpu=use_gpu)
    msg.good("Initialized pipeline")
    msg.divider("Training pipeline")
    train_nlp(nlp, output_path, use_gpu=use_gpu, stdout=sys.stdout, stderr=sys.stderr)
def process_data(self, row, header_id, data=None):
    if data is None:
        data = row
    msg.info(f"id: {row['qid']}")
    msg.good(f"{data}")
    try:
        self.insert(data, row["id"], header_id)
        self.export()
    except pwb.exceptions.InvalidTitle:
        msg.warn(f"Título invalido: {row['qid']}")
        logging.error(f"Título invalido: {row['qid']}")
        self.utils.should_continue()
    except pwb.exceptions.NoPage:
        msg.warn(f"No tiene página en eswiki: {row['qid']}")
        logging.error(f"No tiene página en eswiki: {row['qid']}")
    except pwb.exceptions.IsRedirectPage:
        # TODO: Add a mechanism to detect that the page is a redirect, resolve
        # the target page, and then work with that instead.
        msg.warn(f"Es una redirección: {row['qid']}")
        logging.error(f"Es una redirección: {row['qid']}")
    # TODO: replace this bare except with specific exception types
    except:
        msg.warn(f"Error inesperado: {sys.exc_info()[0]}")
        logging.error(f"Error inesperado: {sys.exc_info()[0]}")
def load(self, source: Union[os.PathLike[str], str]) -> None:
    """
    Load parameters from file.

    Parameters
    ----------
    source : path
        Location of file to load parameters from.

    Raises
    ------
    FileNotFoundError
        If the path does not exist.
    """
    if not Path(source).exists():
        raise FileNotFoundError(f"File '{source}' does not exist.")
    conn = sqlite3.connect(Path(source))
    with conn as c:
        ser = c.execute(
            "SELECT rowid, * FROM params ORDER BY rowid DESC LIMIT 1"
        ).fetchone()[1]
    params = json.loads(ser)
    self.update(params)
    msg.info(f"Updated global parameters with values loaded from '{source}'.")
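# Round-trip sketch of the storage layout load() expects, inferred from the query
# above: a 'params' table whose newest row holds a JSON blob in its first data
# column. The file and column names here are assumptions for illustration only.
import json
import sqlite3

conn = sqlite3.connect("example_params.db")
with conn:
    conn.execute("CREATE TABLE IF NOT EXISTS params (data TEXT)")
    conn.execute("INSERT INTO params VALUES (?)", (json.dumps({"lr": 0.05, "dim": 300}),))
row = conn.execute("SELECT rowid, * FROM params ORDER BY rowid DESC LIMIT 1").fetchone()
print(json.loads(row[1]))  # -> {'lr': 0.05, 'dim': 300}
conn.close()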
def main(in_file, out_dir, spacy_model="en_core_web_sm", n_process=4):
    """
    Step 1: Parse raw text with spaCy

    Expects an input file with one sentence per line and will output a .spacy
    file of the parsed collection of Doc objects (DocBin).
    """
    input_path = Path(in_file)
    output_path = Path(out_dir)
    if not input_path.exists():
        msg.fail("Can't find input file", in_file, exits=1)
    if not output_path.exists():
        output_path.mkdir(parents=True)
        msg.good(f"Created output directory {out_dir}")
    nlp = spacy.load(spacy_model)
    msg.info(f"Using spaCy model {spacy_model}")
    doc_bin = DocBin(attrs=["POS", "TAG", "DEP", "ENT_TYPE", "ENT_IOB"])
    msg.text("Preprocessing text...")
    with input_path.open("r", encoding="utf8") as texts:
        docs = nlp.pipe(texts, n_process=n_process)
        for doc in tqdm.tqdm(docs, desc="Docs", unit=""):
            doc_bin.add(doc)
    msg.good(f"Processed {len(doc_bin)} docs")
    doc_bin_bytes = doc_bin.to_bytes()
    output_file = output_path / f"{input_path.stem}.spacy"
    with output_file.open("wb") as f:
        f.write(doc_bin_bytes)
    msg.good("Saved parsed docs to file", output_file.resolve())
def format_data_to_jsonl(data, file_path, print_label=False):
    result = []
    labels = set()
    i = 0
    data = tqdm.tqdm(data, leave=False)
    with file_path.open("w", encoding="utf-8") as f:
        for d in data:
            text = d["text"]
            ents = []
            label_data = d["label"]
            for l, label_l in label_data.items():
                labels.update([l])
                label_ent_array = []
                for text_labeled, ent_arrays in label_l.items():
                    start_char, end_char = ent_arrays[0]
                    label_ent_array.append((start_char, end_char + 1, l))
                ents.append(label_ent_array[0])
            if diff_contain_overlapping(ents):
                i += 1
                doc = nlp(text)
                tags = biluo_tags_from_offsets(doc, ents)
                doc.ents = spans_from_biluo_tags(doc, tags)
                line = docs_to_json([doc])
                f.write(json_dumps(line) + "\n")
    msg.good(f"Finished {file_path} :: {i} rows")
    if print_label:
        msg.info(f"{labels}")
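# Shape of one input record expected by format_data_to_jsonl(), inferred from the
# lookups d["text"] and d["label"] above; the concrete labels and offsets below are
# illustrative only (end_char is treated as inclusive, hence the `end_char + 1`).
example_record = {
    "text": "Apple is looking at buying a U.K. startup.",
    "label": {
        "ORG": {"Apple": [[0, 4]]},   # label -> {labeled text: [[start_char, end_char]]}
        "GPE": {"U.K.": [[29, 32]]},
    },
}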
def eval_dataset(set_id):
    DB = connect()
    data = DB.get_dataset(set_id)
    accepted = [eg for eg in data if eg["answer"] == "accept" and eg.get("accept")]
    rejected = [eg for eg in data if eg["answer"] == "reject"]
    ignored = [eg for eg in data if eg["answer"] == "ignore"]
    if not accepted and not rejected:
        msg.warn("No annotations collected", exits=1)
    counts = Counter()
    for eg in accepted:
        for model_id in eg["accept"]:
            counts[model_id] += 1
    preference, _ = counts.most_common(1)[0]
    ratio = f"{counts[preference]} / {sum(counts.values()) - counts[preference]}"
    msg.info(f"Evaluating data from '{set_id}'")
    msg.text(f"You rejected {len(rejected)} and ignored {len(ignored)} pair(s)")
    if counts["A"] == counts["B"]:
        msg.warn(f"No preference ({ratio})")
    else:
        pc = counts[preference] / sum(counts.values())
        msg.good(f"You preferred vectors {preference} with {ratio} ({pc:.0%})")
        msg.text(mapping[preference])
def project_update_dvc(
    project_dir: Path,
    workflow: Optional[str] = None,
    *,
    verbose: bool = False,
    force: bool = False,
) -> None:
    """Update the auto-generated Data Version Control (DVC) config file. A DVC
    project can only define one pipeline, so you need to specify one workflow
    defined in the project.yml. Will only update the file if the checksum changed.

    project_dir (Path): The project directory.
    workflow (Optional[str]): Optional name of workflow defined in project.yml.
        If not set, the first workflow will be used.
    verbose (bool): Print more info.
    force (bool): Force update DVC config.
    """
    config = load_project_config(project_dir)
    updated = update_dvc_config(project_dir, config, workflow, verbose=verbose, force=force)
    help_msg = "To execute the workflow with DVC, run: dvc repro"
    if updated:
        msg.good(f"Updated DVC config from {PROJECT_FILE}", help_msg)
    else:
        msg.info(f"No changes found in {PROJECT_FILE}, no update needed", help_msg)
def setup_gpu(use_gpu: int) -> None:
    """Configure the GPU and log info."""
    if use_gpu >= 0:
        msg.info(f"Using GPU: {use_gpu}")
        require_gpu(use_gpu)
    else:
        msg.info("Using CPU")
def create_optimizer(config_path):
    msg.info(f"Loading config from: {config_path}")
    config = util.load_config(config_path, create_objects=False)
    util.fix_random_seed(config["training"]["seed"])
    config = util.load_config(config_path, create_objects=True)
    training = config["training"]
    return training["optimizer"]
def pytest_sessionstart(session):
    lang = session.config.getoption(LANG_CLI_ARG)
    if lang:
        lang = [lang_code.strip() for lang_code in lang.split(",")]
        msg.info(f"Running only tests for {lang}")
    test_dir = Path(TESTS_DIR)
    if test_dir.exists():
        shutil.rmtree(str(test_dir))
        msg.info(f"Deleted existing test directory {TESTS_DIR}")
    test_dir.mkdir()
    msg.good(f"Created test directory {TESTS_DIR}")
    meta = srsly.read_json(META_FILE)
    n_files = 0
    for test_lang, test_file, solution_file in get_source_files(lang):
        test_root = test_dir / test_lang
        if not test_root.exists():
            test_root.mkdir()
            init_path = test_root / "__init__.py"
            init_path.touch()
        if not solution_file:
            # General test file, just copy it over
            shutil.copy(str(test_file), str(test_root / test_file.name))
            n_files += 1
            continue
        with test_file.open("r", encoding="utf8") as f:
            test_code = f.read()
        with solution_file.open("r", encoding="utf8") as f:
            solution_code = f.read()
        full_code = format_test(test_file.stem, meta[PYTEST_TEMPLATE], test_code, solution_code)
        test_path = test_root / test_file.name
        with test_path.open("w", encoding="utf8") as f:
            f.write(full_code)
        n_files += 1
    msg.good(f"Created {n_files} files for pytest in {TESTS_DIR}")
def main():
    import sys

    import typer
    from wasabi import msg

    from . import cli

    commands = {
        "create-wikigraph": cli.create_wikigraph,
        "download-wikigraph": cli.download_wikigraph,
        "package-wikigraph": cli.package_wikigraph,
        "profile-matcher": cli.profile_matcher,
        "profile-wikigraph-load": cli.profile_wikigraph_load,
        "profile-wikigraph-exec": cli.profile_wikigraph_exec,
    }
    if len(sys.argv) == 1:
        msg.info("Available commands", ", ".join(commands), exits=1)
    command = sys.argv.pop(1)
    sys.argv[0] = "spikex %s" % command
    if command in commands:
        typer.run(commands[command])
    else:
        available = "Available: {}".format(", ".join(commands))
        msg.fail("Unknown command: {}".format(command), available, exits=1)
def evaluate(self, dev_loader, verbose=1):
    """
    Evaluate the neural network against a dev set.
    """
    self.nn.eval()
    true = []
    pred = []
    for loaded_input, loaded_output, _idx in dev_loader:
        _input_tensor = loaded_input.float()
        _output_tensor = loaded_output.float()
        _logits = self.nn(_input_tensor)
        _true_batch = _output_tensor.argmax(dim=1).detach().numpy()
        _pred_batch = F.softmax(_logits, dim=1).argmax(dim=1).detach().numpy()
        true.append(_true_batch)
        pred.append(_pred_batch)
    true = np.concatenate(true)
    pred = np.concatenate(pred)
    accuracy = classification_accuracy(true, pred)
    conf_mat = confusion_matrix(true, pred)
    if verbose > 0:
        log_info = dict(self._dynamic_params)
        log_info["performance"] = "Acc {0:.3f}".format(accuracy)
        logger.info(
            "{0: <80}".format("Eval: Epoch {epoch} {performance}".format(**log_info))
        )
    return accuracy, conf_mat
def download_collection(row):
    collectionCode, packageCount = row
    ITR = [(collectionCode, n) for n in range(0, packageCount, 100)]
    msg.info(f"{collectionCode} {packageCount}")
    for offset in range(0, packageCount, 100):
        f_save = save_dest / f"{collectionCode}_{offset:08d}.json"
        if f_save.exists():
            continue
        try:
            js = get_collection_page(collectionCode, offset)
        except:
            print(f"ERROR ON {row} {offset}")
            break
        js = json.dumps(js, indent=2)
        with open(f_save, "w") as FOUT:
            FOUT.write(js)
        msg.good(f"Saved {f_save}")
        time.sleep(0)
def profile(model, inputs=None, n_texts=10000):
    """
    Profile a spaCy pipeline, to find out which functions take the most time.
    Input should be formatted as one JSON object per line with a key "text".
    It can either be provided as a JSONL file, or be read from sys.stdin.
    If no input file is specified, the IMDB dataset is loaded via Thinc.
    """
    if inputs is not None:
        inputs = _read_inputs(inputs, msg)
    if inputs is None:
        n_inputs = 25000
        with msg.loading("Loading IMDB dataset via Thinc..."):
            imdb_train, _ = thinc.extra.datasets.imdb()
            inputs, _ = zip(*imdb_train)
        msg.info("Loaded IMDB dataset and using {} examples".format(n_inputs))
        inputs = inputs[:n_inputs]
    with msg.loading("Loading model '{}'...".format(model)):
        nlp = load_model(model)
    msg.good("Loaded model '{}'".format(model))
    texts = list(itertools.islice(inputs, n_texts))
    cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(), "Profile.prof")
    s = pstats.Stats("Profile.prof")
    msg.divider("Profile stats")
    s.strip_dirs().sort_stats("time").print_stats()
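# Minimal sketch of the JSONL input format described in the docstring above (one
# JSON object per line with a "text" key); the file name is illustrative.
import json

samples = ["This movie was surprisingly good.", "Terrible pacing, great soundtrack."]
with open("profile_inputs.jsonl", "w", encoding="utf8") as f:
    for text in samples:
        f.write(json.dumps({"text": text}) + "\n")
# profile("en_core_web_sm", inputs="profile_inputs.jsonl", n_texts=1000)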
def retrain_model():
    """
    Callback function.
    """
    model_retrainer.disabled = True
    logger.info("Start training... button will be disabled temporarily.")
    dataset.setup_label_coding()
    model = vecnet_callback()

    train_loader = dataset.loader("train", vectorizer, smoothing_coeff=0.2)
    dev_loader = dataset.loader("dev", vectorizer)

    _ = model.train(train_loader, dev_loader, epochs=epochs_slider.value)
    logger.good("-- 1/2: retrained model")

    for _key in ["raw", "train", "dev"]:
        _probs = model.predict_proba(dataset.dfs[_key]["text"].tolist())
        _labels = [dataset.label_decoder[_val] for _val in _probs.argmax(axis=-1)]
        _scores = _probs.max(axis=-1).tolist()
        dataset.dfs[_key]["pred_label"] = pd.Series(_labels)
        dataset.dfs[_key]["pred_score"] = pd.Series(_scores)

    softlabel._update_sources()
    softlabel.plot()
    model_retrainer.disabled = False
    logger.good("-- 2/2: updated predictions. Training button is re-enabled.")
def profile(model: str, inputs: Optional[Path] = None, n_texts: int = 10000) -> None:
    if inputs is not None:
        texts = _read_inputs(inputs, msg)
        texts = list(itertools.islice(texts, n_texts))
    if inputs is None:
        try:
            import ml_datasets
        except ImportError:
            msg.fail(
                "This command, when run without an input file, "
                "requires the ml_datasets library to be installed: "
                "pip install ml_datasets",
                exits=1,
            )
        with msg.loading("Loading IMDB dataset via ml_datasets..."):
            imdb_train, _ = ml_datasets.imdb(train_limit=n_texts, dev_limit=0)
            texts, _ = zip(*imdb_train)
        msg.info(f"Loaded IMDB dataset and using {n_texts} examples")
    with msg.loading(f"Loading pipeline '{model}'..."):
        nlp = load_model(model)
    msg.good(f"Loaded pipeline '{model}'")
    cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(), "Profile.prof")
    s = pstats.Stats("Profile.prof")
    msg.divider("Profile stats")
    s.strip_dirs().sort_stats("time").print_stats()
def main(
    # fmt: off
    in_file: str = typer.Argument(..., help="Vectors file (text-based)"),
    vocab_file: str = typer.Argument(..., help="Vocabulary file"),
    out_dir: str = typer.Argument(..., help="Path to output directory"),
    min_freq_ratio: float = typer.Option(0.0, "--min-freq-ratio", "-r",
        help="Frequency ratio threshold for discarding minority senses or casings"),
    min_distance: float = typer.Option(0.0, "--min-distance", "-s",
        help="Similarity threshold for discarding redundant keys"),
    # fmt: on
):
    """
    Step 5: Export a sense2vec component

    Expects a vectors.txt and a vocab file trained with GloVe and exports
    a component that can be loaded with Sense2vec.from_disk.
    """
    input_path = Path(in_file)
    vocab_path = Path(vocab_file)
    output_path = Path(out_dir)
    if not input_path.exists():
        msg.fail("Can't find input file", in_file, exits=1)
    if input_path.suffix == ".bin":
        msg.fail("Need text-based vectors file, not binary", in_file, exits=1)
    if not vocab_path.exists():
        msg.fail("Can't find vocab file", vocab_file, exits=1)
    if not output_path.exists():
        output_path.mkdir(parents=True)
        msg.good(f"Created output directory {out_dir}")
    with input_path.open("r", encoding="utf8") as f:
        (n_vectors, vector_size), f = _get_shape(f)
        vectors_data = f.readlines()
    with vocab_path.open("r", encoding="utf8") as f:
        vocab = read_vocab(f)
    vectors = {}
    all_senses = set()
    for item in vectors_data:
        item = item.rstrip().rsplit(" ", vector_size)
        key = item[0]
        try:
            _, sense = split_key(key)
        except ValueError:
            continue
        vec = item[1:]
        if len(vec) != vector_size:
            msg.fail(f"Wrong vector size: {len(vec)} (expected {vector_size})", exits=1)
        all_senses.add(sense)
        vectors[key] = numpy.asarray(vec, dtype=numpy.float32)
    discarded = set()
    discarded.update(get_minority_keys(vocab, min_freq_ratio))
    discarded.update(get_redundant_keys(vocab, vectors, min_distance))
    n_vectors = len(vectors) - len(discarded)
    s2v = Sense2Vec(shape=(n_vectors, vector_size), senses=all_senses)
    for key, vector in vectors.items():
        if key not in discarded:
            s2v.add(key, vector)
            s2v.set_freq(key, vocab[key])
    msg.good("Created the sense2vec model")
    msg.info(f"{n_vectors} vectors, {len(all_senses)} total senses")
    s2v.to_disk(output_path)
    msg.good("Saved model to directory", out_dir)
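# Loading the exported component afterwards; Sense2Vec.from_disk is the loader
# named in the docstring above, while the directory name and the example key are
# illustrative assumptions.
from sense2vec import Sense2Vec

s2v = Sense2Vec().from_disk("./s2v_output")
# Keys follow the "phrase|SENSE" format produced by make_key()/split_key(), e.g.:
# s2v.get_freq("refugee_housing|NOUN")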
def init_vectors_cli(
    # fmt: off
    lang: str = Arg(..., help="The language of the nlp object to create"),
    vectors_loc: Path = Arg(..., help="Vectors file in Word2Vec format", exists=True),
    output_dir: Path = Arg(..., help="Pipeline output directory"),
    prune: int = Opt(-1, "--prune", "-p", help="Optional number of vectors to prune to"),
    truncate: int = Opt(0, "--truncate", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
    name: Optional[str] = Opt(None, "--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
    jsonl_loc: Optional[Path] = Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file", hidden=True),
    # fmt: on
):
    """Convert word vectors for use with spaCy. Will export an nlp object that
    you can use in the [initialize] block of your config to initialize a model
    with vectors.
    """
    util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
    msg.info(f"Creating blank nlp object for language '{lang}'")
    nlp = util.get_lang_class(lang)()
    if jsonl_loc is not None:
        update_lexemes(nlp, jsonl_loc)
    convert_vectors(nlp, vectors_loc, truncate=truncate, prune=prune, name=name)
    msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors")
    nlp.to_disk(output_dir)
    msg.good(
        "Saved nlp object with vectors to output directory. You can now use the "
        "path to it in your config as the 'vectors' setting in [initialize].",
        output_dir.resolve(),
    )
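# Typical CLI invocation of the command above (paths are illustrative); the
# resulting directory is then referenced via the 'vectors' setting in the
# [initialize] block of a training config, as the docstring notes:
#
#   python -m spacy init vectors en word2vec_vectors.txt ./vectors_model --prune 20000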
def setup_and_train(use_gpu, train_args, rank):
    if use_gpu >= 0:
        gpu_id = os.environ.get("CUDA_VISIBLE_DEVICES")
        msg.info(f"Using GPU (isolated): {gpu_id}")
        util.use_gpu(0)
    else:
        msg.info("Using CPU")
    train(randomization_index=rank, **train_args)
def project_run(
    project_dir: Path,
    subcommand: str,
    *,
    overrides: Dict[str, Any] = SimpleFrozenDict(),
    force: bool = False,
    dry: bool = False,
    capture: bool = False,
) -> None:
    """Run a named script defined in the project.yml. If the script is part
    of the default pipeline (defined in the "run" section), DVC is used to
    execute the command, so it can determine whether to rerun it. It then
    calls into "exec" to execute it.

    project_dir (Path): Path to project directory.
    subcommand (str): Name of command to run.
    overrides (Dict[str, Any]): Optional config overrides.
    force (bool): Force re-running, even if nothing changed.
    dry (bool): Perform a dry run and don't execute commands.
    capture (bool): Whether to capture the output and errors of individual commands.
        If False, the stdout and stderr will not be redirected, and if there's an
        error, sys.exit will be called with the return code. You should use
        capture=False when you want to turn over execution to the command, and
        capture=True when you want to run the command more like a function.
    """
    config = load_project_config(project_dir, overrides=overrides)
    commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
    workflows = config.get("workflows", {})
    validate_subcommand(list(commands.keys()), list(workflows.keys()), subcommand)
    if subcommand in workflows:
        msg.info(f"Running workflow '{subcommand}'")
        for cmd in workflows[subcommand]:
            project_run(
                project_dir,
                cmd,
                overrides=overrides,
                force=force,
                dry=dry,
                capture=capture,
            )
    else:
        cmd = commands[subcommand]
        for dep in cmd.get("deps", []):
            if not (project_dir / dep).exists():
                err = f"Missing dependency specified by command '{subcommand}': {dep}"
                err_help = "Maybe you forgot to run the 'project assets' command or a previous step?"
                err_kwargs = {"exits": 1} if not dry else {}
                msg.fail(err, err_help, **err_kwargs)
        check_spacy_commit = check_bool_env_var(ENV_VARS.PROJECT_USE_GIT_VERSION)
        with working_dir(project_dir) as current_dir:
            msg.divider(subcommand)
            rerun = check_rerun(current_dir, cmd, check_spacy_commit=check_spacy_commit)
            if not rerun and not force:
                msg.info(f"Skipping '{cmd['name']}': nothing changed")
            else:
                run_commands(cmd["script"], dry=dry, capture=capture)
                if not dry:
                    update_lockfile(current_dir, cmd)
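# Minimal usage sketch for project_run(); assumes a project.yml in the current
# working directory that defines a "train" command or workflow (name illustrative).
from pathlib import Path

project_run(Path.cwd(), "train", dry=True)  # dry run: walk the steps without executing the scripts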
def project_assets(project_dir: Path, *, sparse_checkout: bool = False) -> None:
    """Fetch assets for a project using DVC if possible.

    project_dir (Path): Path to project directory.
    """
    project_path = ensure_path(project_dir)
    config = load_project_config(project_path)
    assets = config.get("assets", {})
    if not assets:
        msg.warn(f"No assets specified in {PROJECT_FILE}", exits=0)
    msg.info(f"Fetching {len(assets)} asset(s)")
    for asset in assets:
        dest = (project_dir / asset["dest"]).resolve()
        checksum = asset.get("checksum")
        if "git" in asset:
            git_err = (
                f"Cloning spaCy project templates requires Git and the 'git' command. "
                f"Make sure it's installed and that the executable is available."
            )
            get_git_version(error=git_err)
            if dest.exists():
                # If there's already a file, check for checksum
                if checksum and checksum == get_checksum(dest):
                    msg.good(f"Skipping download with matching checksum: {asset['dest']}")
                    continue
                else:
                    if dest.is_dir():
                        shutil.rmtree(dest)
                    else:
                        dest.unlink()
            if "repo" not in asset["git"] or asset["git"]["repo"] is None:
                msg.fail("A git asset must include 'repo', the repository address.", exits=1)
            if "path" not in asset["git"] or asset["git"]["path"] is None:
                msg.fail(
                    "A git asset must include 'path' - use \"\" to get the entire repository.",
                    exits=1,
                )
            git_checkout(
                asset["git"]["repo"],
                asset["git"]["path"],
                dest,
                branch=asset["git"].get("branch"),
                sparse=sparse_checkout,
            )
            msg.good(f"Downloaded asset {dest}")
        else:
            url = asset.get("url")
            if not url:
                # project.yml defines asset without URL that the user has to place
                check_private_asset(dest, checksum)
                continue
            fetch_asset(project_path, url, dest, checksum)
def main(
    # fmt: off
    in_file: str = typer.Argument(..., help="Path to input file"),
    out_dir: str = typer.Argument(..., help="Path to output directory"),
    spacy_model: str = typer.Argument("en_core_web_sm", help="Name of spaCy model to use"),
    n_process: int = typer.Option(1, "--n-process", "-n", help="Number of processes (multiprocessing)"),
    max_docs: int = typer.Option(10**6, "--max-docs", "-m", help="Maximum docs per batch"),
    # fmt: on
):
    """
    Step 1: Parse raw text with spaCy

    Expects an input file with one sentence per line and will output a .spacy
    file of the parsed collection of Doc objects (DocBin).
    """
    input_path = Path(in_file)
    output_path = Path(out_dir)
    if not input_path.exists():
        msg.fail("Can't find input file", in_file, exits=1)
    if not output_path.exists():
        output_path.mkdir(parents=True)
        msg.good(f"Created output directory {out_dir}")
    nlp = spacy.load(spacy_model)
    msg.info(f"Using spaCy model {spacy_model}")
    doc_bin = DocBin(attrs=["POS", "TAG", "DEP", "ENT_TYPE", "ENT_IOB"])
    msg.text("Preprocessing text...")
    count = 0
    batch_num = 0
    with input_path.open("r", encoding="utf8") as texts:
        docs = nlp.pipe(texts, n_process=n_process)
        for doc in tqdm.tqdm(docs, desc="Docs", unit=""):
            if count < max_docs:
                doc_bin.add(doc)
                count += 1
            else:
                # Flush the full batch to disk, then start a new DocBin with the
                # current doc so it isn't dropped
                batch_num += 1
                msg.good(f"Processed {len(doc_bin)} docs")
                doc_bin_bytes = doc_bin.to_bytes()
                output_file = output_path / f"{input_path.stem}-{batch_num}.spacy"
                with output_file.open("wb") as f:
                    f.write(doc_bin_bytes)
                msg.good("Saved parsed docs to file", output_file.resolve())
                doc_bin = DocBin(attrs=["POS", "TAG", "DEP", "ENT_TYPE", "ENT_IOB"])
                doc_bin.add(doc)
                count = 1
    batch_num += 1
    output_file = output_path / f"{input_path.stem}-{batch_num}.spacy"
    with output_file.open("wb") as f:
        doc_bin_bytes = doc_bin.to_bytes()
        f.write(doc_bin_bytes)
    msg.good("Complete. Saved final parsed docs to file", output_file.resolve())
def main(
    # fmt: off
    in_file: str = typer.Argument(..., help="Path to input file"),
    out_dir: str = typer.Argument(..., help="Path to output directory"),
    spacy_model: str = typer.Argument("en_core_web_sm", help="Name of spaCy model to use"),
    n_process: int = typer.Option(1, "--n-process", "-n", help="Number of processes (multiprocessing)"),
    # fmt: on
):
    """
    Step 2: Preprocess text in sense2vec's format

    Expects a binary .spacy input file consisting of the parsed Docs (DocBin)
    and outputs a text file with one sentence per line in the expected sense2vec
    format (merged noun phrases, concatenated phrases with underscores and
    added "senses").

    Example input:
    Rats, mould and broken furniture: the scandal of the UK's refugee housing

    Example output:
    Rats|NOUN ,|PUNCT mould|NOUN and|CCONJ broken_furniture|NOUN :|PUNCT
    the|DET scandal|NOUN of|ADP the|DET UK|GPE 's|PART refugee_housing|NOUN
    """
    input_path = Path(in_file)
    output_path = Path(out_dir)
    if not input_path.exists():
        msg.fail("Can't find input file", in_file, exits=1)
    if not output_path.exists():
        output_path.mkdir(parents=True)
        msg.good(f"Created output directory {out_dir}")
    nlp = spacy.load(spacy_model)
    msg.info(f"Using spaCy model {spacy_model}")
    with input_path.open("rb") as f:
        doc_bin_bytes = f.read()
    doc_bin = DocBin().from_bytes(doc_bin_bytes)
    msg.good(f"Loaded {len(doc_bin)} parsed docs")
    docs = doc_bin.get_docs(nlp.vocab)
    output_file = output_path / f"{input_path.stem}.s2v"
    lines_count = 0
    words_count = 0
    with output_file.open("w", encoding="utf8") as f:
        for doc in tqdm.tqdm(docs, desc="Docs", unit=""):
            doc = merge_phrases(doc)
            words = []
            for token in doc:
                if not token.is_space:
                    word, sense = make_spacy_key(token, prefer_ents=True)
                    words.append(make_key(word, sense))
            f.write(" ".join(words) + "\n")
            lines_count += 1
            words_count += len(words)
    msg.good(
        f"Successfully preprocessed {lines_count} docs ({words_count} words)",
        output_file.resolve(),
    )
def _init_labels(nlp, output_path):
    for name, component in nlp.pipeline:
        if getattr(component, "label_data", None) is not None:
            output_file = output_path / f"{name}.json"
            srsly.write_json(output_file, component.label_data)
            msg.good(f"Saving label data for component '{name}' to {output_file}")
        else:
            msg.info(f"No label data found for component '{name}'")
def main(in_file, vocab_file, out_dir):
    """
    Step 5: Export a sense2vec component

    Expects a vectors.txt and a vocab file trained with GloVe and exports
    a component that can be loaded with Sense2vec.from_disk.
    """
    input_path = Path(in_file)
    vocab_path = Path(vocab_file)
    output_path = Path(out_dir)
    if not input_path.exists():
        msg.fail("Can't find input file", in_file, exits=1)
    if input_path.suffix == ".bin":
        msg.fail("Need text-based vectors file, not binary", in_file, exits=1)
    if not vocab_path.exists():
        msg.fail("Can't find vocab file", vocab_file, exits=1)
    if not output_path.exists():
        output_path.mkdir(parents=True)
        msg.good(f"Created output directory {out_dir}")
    with input_path.open("r", encoding="utf8") as f:
        (n_vectors, vector_size), f = _get_shape(f)
        vectors_data = f.readlines()
    with vocab_path.open("r", encoding="utf8") as f:
        vocab_data = f.readlines()
    data = []
    all_senses = set()
    for item in vectors_data:
        item = item.rstrip().rsplit(" ", vector_size)
        key = item[0]
        try:
            _, sense = split_key(key)
        except ValueError:
            continue
        vec = item[1:]
        if len(vec) != vector_size:
            msg.fail(f"Wrong vector size: {len(vec)} (expected {vector_size})", exits=1)
        all_senses.add(sense)
        data.append((key, numpy.asarray(vec, dtype=numpy.float32)))
    s2v = Sense2Vec(shape=(len(data), vector_size), senses=all_senses)
    for key, vector in data:
        s2v.add(key, vector)
    for item in vocab_data:
        item = item.rstrip()
        if item.endswith(" word"):  # for fastText vocabs
            item = item[:-5]
        try:
            key, freq = item.rsplit(" ", 1)
        except ValueError:
            continue
        s2v.set_freq(key, int(freq))
    msg.good("Created the sense2vec model")
    msg.info(f"{len(data)} vectors, {len(all_senses)} total senses")
    s2v.to_disk(output_path)
    msg.good("Saved model to directory", out_dir)
def download_model_and_get_path(lang, model_id):
    save_path = "itranlit-models"
    os.makedirs(save_path, exist_ok=True)
    model_path = save_path + "/" + lang + ".pth"
    msg.info(f"{lang} model downloading inside {save_path}..")
    try:
        download_file_from_google_drive(id=model_id, destination=model_path)
        msg.good(f"{lang} model download successful. Model path: {model_path}")
    except Exception as e:
        print(e)
        msg.fail(f"Failed to download {lang} model. Please check the exception.")
def get_blacklisted_sense_keys(freqs):
    """Collect keys whose sense is blacklisted (i.e. not in the sense whitelist)."""
    discarded = []
    msg.info("Collecting blacklisted sense keys")
    for key, freq in freqs.items():
        try:
            term, sense = split_key(key)
        except ValueError:
            continue
        if sense and sense not in sense_whitelist:
            discarded.append(key)
    return discarded
def check_rerun(
    project_dir: Path,
    command: Dict[str, Any],
    *,
    check_spacy_version: bool = True,
    check_spacy_commit: bool = False,
) -> bool:
    """Check if a command should be rerun because its settings or inputs/outputs
    changed.

    project_dir (Path): The current project directory.
    command (Dict[str, Any]): The command, as defined in the project.yml.
    check_spacy_version (bool): Whether to rerun if the spaCy minor version changed.
    check_spacy_commit (bool): Whether to rerun if the spaCy commit hash changed.
    RETURNS (bool): Whether to re-run the command.
    """
    # Always rerun if no-skip is set
    if command.get("no_skip", False):
        return True
    lock_path = project_dir / PROJECT_LOCK
    if not lock_path.exists():  # We don't have a lockfile, run command
        return True
    data = srsly.read_yaml(lock_path)
    if command["name"] not in data:  # We don't have info about this command
        return True
    entry = data[command["name"]]
    # Always run commands with no outputs (otherwise they'd always be skipped)
    if not entry.get("outs", []):
        return True
    # Always rerun if spaCy version or commit hash changed
    spacy_v = entry.get("spacy_version")
    commit = entry.get("spacy_git_version")
    if check_spacy_version and not is_minor_version_match(spacy_v, about.__version__):
        info = f"({spacy_v} in {PROJECT_LOCK}, {about.__version__} current)"
        msg.info(f"Re-running '{command['name']}': spaCy minor version changed {info}")
        return True
    if check_spacy_commit and commit != GIT_VERSION:
        info = f"({commit} in {PROJECT_LOCK}, {GIT_VERSION} current)"
        msg.info(f"Re-running '{command['name']}': spaCy commit changed {info}")
        return True
    # If the entry in the lockfile matches the lockfile entry that would be
    # generated from the current command, we don't rerun because it means that
    # all inputs/outputs, hashes and scripts are the same and nothing changed
    lock_entry = get_lock_entry(project_dir, command)
    exclude = ["spacy_version", "spacy_git_version"]
    return get_hash(lock_entry, exclude=exclude) != get_hash(entry, exclude=exclude)
def train(self, data, model_name, epoch, lr=0.05, dim=300, ws=5, minCount=5,
          minn=3, maxn=6, neg=5, wordNgrams=1, loss="ns", bucket=2000000,
          thread=multiprocessing.cpu_count() - 1):
    """Train fastText on raw text data.

    Args:
        data (str): raw text data path
        model_name (str): name of output trained model with extension
        epoch (int): number of training iterations
        lr (float, optional): learning rate. Defaults to 0.05.
        dim (int, optional): vector size or dimension. Defaults to 300.
        ws (int, optional): window size. Defaults to 5.
        minCount (int, optional): minimum number of word occurrences; rarer words
            are ignored during training. Defaults to 5.
        minn (int, optional): minimum character n-gram length. Defaults to 3.
        maxn (int, optional): maximum character n-gram length. Defaults to 6.
        neg (int, optional): number of negative samples. Defaults to 5.
        wordNgrams (int, optional): maximum word n-gram length. Defaults to 1.
        loss (str, optional): loss type. Defaults to "ns".
        bucket (int, optional): number of hash buckets for n-grams. Defaults to 2000000.
        thread (int, optional): number of training threads.
            Defaults to multiprocessing.cpu_count() - 1.
    """
    msg.info("Training started...")
    model = fasttext.train_unsupervised(
        data,
        model="skipgram",
        epoch=epoch,
        lr=lr,
        dim=dim,
        ws=ws,
        minCount=minCount,
        minn=minn,
        maxn=maxn,
        neg=neg,
        wordNgrams=wordNgrams,
        loss=loss,
        bucket=bucket,
        thread=thread,
    )
    msg.good(f"Training done! Saving as {model_name}")
    model.save_model(model_name)
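# Usage sketch for the trainer above; the surrounding class is not shown in this
# snippet, so `trainer` stands in for whatever object defines train(), and the
# file names are illustrative:
#
# trainer.train(data="corpus.txt", model_name="skipgram_300d.bin", epoch=5, dim=300)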
def check_spacy_model(model) -> bool:
    spacy_info = spacy.info()
    pipelines = spacy_info.get("pipelines", spacy_info.get("models", None))
    if pipelines is None:
        raise ValueError("Unable to detect spaCy models.")
    models = list(pipelines.keys())
    if model not in models:
        msg.info("Downloading spaCy model {}".format(model))
        spacy.cli.download(model)
        # spacy.info() doesn't update after spacy.cli.download, so there's no
        # point checking it again
        models.append(model)
    # Always returns True: if the download fails, spaCy calls sys.exit()
    return model in models
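# Usage sketch: make sure a pipeline is available before loading it (the model
# name is illustrative; the download only happens if the model is missing).
if check_spacy_model("en_core_web_sm"):
    nlp = spacy.load("en_core_web_sm")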