Example 1
def train(
    config_path: Union[str, Path],
    output_path: Optional[Union[str, Path]] = None,
    *,
    use_gpu: int = -1,
    overrides: Dict[str, Any] = util.SimpleFrozenDict(),
):
    config_path = util.ensure_path(config_path)
    output_path = util.ensure_path(output_path)
    # Make sure all files and paths exists if they are needed
    if not config_path or (str(config_path) != "-" and not config_path.exists()):
        msg.fail("Config file not found", config_path, exits=1)
    if not output_path:
        msg.info("No output directory provided")
    else:
        if not output_path.exists():
            output_path.mkdir(parents=True)
            msg.good(f"Created output directory: {output_path}")
        msg.info(f"Saving to output directory: {output_path}")
    setup_gpu(use_gpu)
    with show_validation_error(config_path):
        config = util.load_config(config_path, overrides=overrides, interpolate=False)
    msg.divider("Initializing pipeline")
    with show_validation_error(config_path, hint_fill=False):
        nlp = init_nlp(config, use_gpu=use_gpu)
    msg.good("Initialized pipeline")
    msg.divider("Training pipeline")
    train_nlp(nlp, output_path, use_gpu=use_gpu, stdout=sys.stdout, stderr=sys.stderr)
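A hedged usage sketch of the entry point above; the config path, output directory, and the dotted override key are placeholders, not taken from the original project.
# Hypothetical invocation: paths and the override key are illustrative only.
train(
    "config.cfg",
    "training_output",
    use_gpu=-1,                               # CPU
    overrides={"training.max_epochs": 10},    # dotted keys override config values
)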
Example 2
    def process_data(self, row, header_id, data=None):
        if data is None:
            data = row

        msg.info(f"id: {row['qid']}")
        msg.good(f"{data}")

        try:
            self.insert(data, row["id"], header_id)
            self.export()
        except pwb.exceptions.InvalidTitle:
            msg.warn("Título invalido: {row['qid']}")
            logging.error("Título invalido: {row['qid']}")
            self.utils.should_continue()
        except pwb.exceptions.NoPage:
            msg.warn("No tiene página en eswiki: {row['qid']}")
            logging.error("No tiene página en eswiki: {row['qid']}")

        except pwb.exceptions.IsRedirectPage:
            # TODO: Se debe añadir un sistema por el que detectar que es una redirección y obtener
            # la página de destino, y luego trabajar con ella.
            msg.warn(f"Es una redirección: {row['qid']}")
            logging.error(f"Es una redirección: {row['qid']}")
            pass
        # TODO: redefinir este "bare except"
        except:
            msg.warn(f"Error inesperado: {sys.exc_info()[0]}")
            logging.error(f"Error inesperado: {sys.exc_info()[0]}")
            pass
Example 3
    def load(self, source: Union[os.PathLike[str], str]) -> None:
        """
        Load parameters from file.

        Parameters
        ----------
        source : path
            Location of file to load parameters from.

        Raises
        ------
        FileNotFoundError
            If the path does not exist.
        """
        if not Path(source).exists():
            raise FileNotFoundError(f"File '{source}' does not exist.")
        conn = sqlite3.connect(Path(source))
        with conn as c:
            ser = c.execute(
                "SELECT rowid, * FROM params ORDER BY rowid DESC LIMIT 1"
            ).fetchone()[1]
        params = json.loads(ser)
        self.update(params)
        msg.info(
            f"Updated global parameters with values loaded from '{source}'.")
Example 4
def main(in_file, out_dir, spacy_model="en_core_web_sm", n_process=4):
    """
    Step 1: Parse raw text with spaCy

    Expects an input file with one sentence per line and will output a .spacy
    file of the parsed collection of Doc objects (DocBin).
    """
    input_path = Path(in_file)
    output_path = Path(out_dir)
    if not input_path.exists():
        msg.fail("Can't find input file", in_file, exits=1)
    if not output_path.exists():
        output_path.mkdir(parents=True)
        msg.good(f"Created output directory {out_dir}")
    nlp = spacy.load(spacy_model)
    msg.info(f"Using spaCy model {spacy_model}")
    doc_bin = DocBin(attrs=["POS", "TAG", "DEP", "ENT_TYPE", "ENT_IOB"])
    msg.text("Preprocessing text...")
    with input_path.open("r", encoding="utf8") as texts:
        docs = nlp.pipe(texts, n_process=n_process)
        for doc in tqdm.tqdm(docs, desc="Docs", unit=""):
            doc_bin.add(doc)
    msg.good(f"Processed {len(doc_bin)} docs")
    doc_bin_bytes = doc_bin.to_bytes()
    output_file = output_path / f"{input_path.stem}.spacy"
    with output_file.open("wb") as f:
        f.write(doc_bin_bytes)
    msg.good(f"Saved parsed docs to file", output_file.resolve())
Example 5
def format_data_to_jsonl(data, file_path, print_label=False):
    result = []
    labels = set()
    i = 0

    data = tqdm.tqdm(data, leave=False)

    with file_path.open("w", encoding="utf-8") as f:
        for d in data:
            text = d['text']
            ents = []
            label_data = d["label"]
            for l, label_l in label_data.items():
                labels.update([l])
                label_ent_array = []
                for text_labeled, ent_arrays in label_l.items():
                    start_char, end_char = ent_arrays[0]
                    label_ent_array.append((start_char, end_char + 1, l))
                ents.append(label_ent_array[0])

            if diff_contain_overlapping(ents):
                i += 1

                doc = nlp(text)
                tags = biluo_tags_from_offsets(doc, ents)
                doc.ents = spans_from_biluo_tags(doc, tags)

                line = docs_to_json([doc])
                f.write(json_dumps(line) + "\n")

    msg.good(f"Finished {file_path} :: {i} rows")
    if print_label:
        msg.info(f"{labels}")
Example 6
def eval_dataset(set_id):
    DB = connect()
    data = DB.get_dataset(set_id)
    accepted = [
        eg for eg in data if eg["answer"] == "accept" and eg.get("accept")
    ]
    rejected = [eg for eg in data if eg["answer"] == "reject"]
    ignored = [eg for eg in data if eg["answer"] == "ignore"]
    if not accepted and not rejected:
        msg.warn("No annotations collected", exits=1)
    counts = Counter()
    for eg in accepted:
        for model_id in eg["accept"]:
            counts[model_id] += 1
    preference, _ = counts.most_common(1)[0]
    ratio = f"{counts[preference]} / {sum(counts.values()) - counts[preference]}"
    msg.info(f"Evaluating data from '{set_id}'")
    msg.text(
        f"You rejected {len(rejected)} and ignored {len(ignored)} pair(s)")
    if counts["A"] == counts["B"]:
        msg.warn(f"No preference ({ratio})")
    else:
        pc = counts[preference] / sum(counts.values())
        msg.good(
            f"You preferred vectors {preference} with {ratio} ({pc:.0%})")
        msg.text(mapping[preference])
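An illustration of the preference arithmetic above with made-up counts.
from collections import Counter

counts = Counter({"A": 7, "B": 3})                 # invented numbers
preference, _ = counts.most_common(1)[0]           # "A"
ratio = f"{counts[preference]} / {sum(counts.values()) - counts[preference]}"  # "7 / 3"
pc = counts[preference] / sum(counts.values())     # 0.7, displayed as 70%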
Example 7
def project_update_dvc(
    project_dir: Path,
    workflow: Optional[str] = None,
    *,
    verbose: bool = False,
    force: bool = False,
) -> None:
    """Update the auto-generated Data Version Control (DVC) config file. A DVC
    project can only define one pipeline, so you need to specify one workflow
    defined in the project.yml. Will only update the file if the checksum changed.

    project_dir (Path): The project directory.
    workflow (Optional[str]): Optional name of workflow defined in project.yml.
        If not set, the first workflow will be used.
    verbose (bool): Print more info.
    force (bool): Force update DVC config.
    """
    config = load_project_config(project_dir)
    updated = update_dvc_config(project_dir,
                                config,
                                workflow,
                                verbose=verbose,
                                force=force)
    help_msg = "To execute the workflow with DVC, run: dvc repro"
    if updated:
        msg.good(f"Updated DVC config from {PROJECT_FILE}", help_msg)
    else:
        msg.info(f"No changes found in {PROJECT_FILE}, no update needed",
                 help_msg)
Example 8
def setup_gpu(use_gpu: int) -> None:
    """Configure the GPU and log info."""
    if use_gpu >= 0:
        msg.info(f"Using GPU: {use_gpu}")
        require_gpu(use_gpu)
    else:
        msg.info("Using CPU")
Example 9
def create_optimizer(config_path):
    msg.info(f"Loading config from: {config_path}")
    config = util.load_config(config_path, create_objects=False)
    util.fix_random_seed(config["training"]["seed"])
    config = util.load_config(config_path, create_objects=True)
    training = config["training"]
    return training["optimizer"]
Example 10
def pytest_sessionstart(session):
    lang = session.config.getoption(LANG_CLI_ARG)
    if lang:
        lang = [lang_code.strip() for lang_code in lang.split(",")]
        msg.info(f"Running only tests for {lang}")
    test_dir = Path(TESTS_DIR)
    if test_dir.exists():
        shutil.rmtree(str(test_dir))
        msg.info(f"Deleted existing test directory {TESTS_DIR}")
    test_dir.mkdir()
    msg.good(f"Created test directory {TESTS_DIR}")
    meta = srsly.read_json(META_FILE)
    n_files = 0
    for test_lang, test_file, solution_file in get_source_files(lang):
        test_root = test_dir / test_lang
        if not test_root.exists():
            test_root.mkdir()
            init_path = test_root / "__init__.py"
            init_path.touch()
        if not solution_file:  # general test file, just copy it over
            shutil.copy(str(test_file), str(test_root / test_file.name))
            n_files += 1
            continue
        with test_file.open("r", encoding="utf8") as f:
            test_code = f.read()
        with solution_file.open("r", encoding="utf8") as f:
            solution_code = f.read()
        full_code = format_test(test_file.stem, meta[PYTEST_TEMPLATE],
                                test_code, solution_code)
        test_path = test_root / test_file.name
        with test_path.open("w", encoding="utf8") as f:
            f.write(full_code)
        n_files += 1
    msg.good(f"Created {n_files} files for pytest in {TESTS_DIR}")
Example 11
def main():
    import sys

    import typer
    from wasabi import msg

    from . import cli

    commands = {
        "create-wikigraph": cli.create_wikigraph,
        "download-wikigraph": cli.download_wikigraph,
        "package-wikigraph": cli.package_wikigraph,
        "profile-matcher": cli.profile_matcher,
        "profile-wikigraph-load": cli.profile_wikigraph_load,
        "profile-wikigraph-exec": cli.profile_wikigraph_exec,
    }

    if len(sys.argv) == 1:
        msg.info("Available commands", ", ".join(commands), exits=1)
    command = sys.argv.pop(1)
    sys.argv[0] = "spikex %s" % command
    if command in commands:
        typer.run(commands[command])
    else:
        available = "Available: {}".format(", ".join(commands))
        msg.fail("Unknown command: {}".format(command), available, exits=1)
Example 12
    def evaluate(self, dev_loader, verbose=1):
        """
        Evaluate the neural network against a dev set.
        """
        self.nn.eval()
        true = []
        pred = []
        for loaded_input, loaded_output, _idx in dev_loader:
            _input_tensor = loaded_input.float()
            _output_tensor = loaded_output.float()

            _logits = self.nn(_input_tensor)
            _true_batch = _output_tensor.argmax(dim=1).detach().numpy()
            _pred_batch = F.softmax(_logits, dim=1).argmax(dim=1).detach().numpy()
            true.append(_true_batch)
            pred.append(_pred_batch)
        true = np.concatenate(true)
        pred = np.concatenate(pred)
        accuracy = classification_accuracy(true, pred)
        conf_mat = confusion_matrix(true, pred)

        if verbose > 0:
            log_info = dict(self._dynamic_params)
            log_info["performance"] = "Acc {0:.3f}".format(accuracy)
            logger.info(
                "{0: <80}".format(
                    "Eval: Epoch {epoch} {performance}".format(**log_info)
                )
            )

        return accuracy, conf_mat
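classification_accuracy is not shown; here is a hedged sketch of what the call above expects, assuming it simply measures label agreement (the name suffix marks it as illustrative).
import numpy as np

def classification_accuracy_sketch(true, pred):
    # Fraction of positions where predicted and true labels agree.
    true = np.asarray(true)
    pred = np.asarray(pred)
    return float((true == pred).mean())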
Example 13
def download_collection(row):

    collectionCode, packageCount = row

    msg.info(f"{collectionCode} {packageCount}")

    for offset in range(0, packageCount, 100):

        f_save = save_dest / f"{collectionCode}_{offset:08d}.json"

        if f_save.exists():
            continue

        try:
            js = get_collection_page(collectionCode, offset)
        except Exception:
            print(f"ERROR ON {row} {offset}")
            break

        js = json.dumps(js, indent=2)

        with open(f_save, "w") as FOUT:
            FOUT.write(js)

        msg.good(f"Saved {f_save}")

        time.sleep(0)
Example 14
def profile(model, inputs=None, n_texts=10000):
    """
    Profile a spaCy pipeline, to find out which functions take the most time.
    Input should be formatted as one JSON object per line with a key "text".
    It can either be provided as a JSONL file, or be read from sys.stdin.
    If no input file is specified, the IMDB dataset is loaded via Thinc.
    """
    if inputs is not None:
        inputs = _read_inputs(inputs, msg)
    if inputs is None:
        n_inputs = 25000
        with msg.loading("Loading IMDB dataset via Thinc..."):
            imdb_train, _ = thinc.extra.datasets.imdb()
            inputs, _ = zip(*imdb_train)
        msg.info("Loaded IMDB dataset and using {} examples".format(n_inputs))
        inputs = inputs[:n_inputs]
    with msg.loading("Loading model '{}'...".format(model)):
        nlp = load_model(model)
    msg.good("Loaded model '{}'".format(model))
    texts = list(itertools.islice(inputs, n_texts))
    cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(),
                    "Profile.prof")
    s = pstats.Stats("Profile.prof")
    msg.divider("Profile stats")
    s.strip_dirs().sort_stats("time").print_stats()
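cProfile.runctx above assumes a parse_texts helper in scope; the sketch below is a minimal stand-in under the assumption that it only needs to push all texts through the pipeline (the batch size is an arbitrary choice, and the name suffix marks it as illustrative).
def parse_texts_sketch(nlp, texts):
    # Iterate the pipeline so cProfile can attribute time to its components.
    for _doc in nlp.pipe(texts, batch_size=16):
        pass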
Example 15
        def retrain_model():
            """
            Callback function.
            """
            model_retrainer.disabled = True
            logger.info("Start training... button will be disabled temporarily.")
            dataset.setup_label_coding()
            model = vecnet_callback()

            train_loader = dataset.loader("train", vectorizer, smoothing_coeff=0.2)
            dev_loader = dataset.loader("dev", vectorizer)

            _ = model.train(train_loader, dev_loader, epochs=epochs_slider.value)
            logger.good("-- 1/2: retrained model")

            for _key in ["raw", "train", "dev"]:
                _probs = model.predict_proba(dataset.dfs[_key]["text"].tolist())
                _labels = [
                    dataset.label_decoder[_val] for _val in _probs.argmax(axis=-1)
                ]
                _scores = _probs.max(axis=-1).tolist()
                dataset.dfs[_key]["pred_label"] = pd.Series(_labels)
                dataset.dfs[_key]["pred_score"] = pd.Series(_scores)

            softlabel._update_sources()
            softlabel.plot()
            model_retrainer.disabled = False
            logger.good("-- 2/2: updated predictions. Training button is re-enabled.")
Example 16
def profile(model: str,
            inputs: Optional[Path] = None,
            n_texts: int = 10000) -> None:
    if inputs is not None:
        texts = _read_inputs(inputs, msg)
        texts = list(itertools.islice(texts, n_texts))
    if inputs is None:
        try:
            import ml_datasets
        except ImportError:
            msg.fail(
                "This command, when run without an input file, "
                "requires the ml_datasets library to be installed: "
                "pip install ml_datasets",
                exits=1,
            )

        with msg.loading("Loading IMDB dataset via ml_datasets..."):
            imdb_train, _ = ml_datasets.imdb(train_limit=n_texts, dev_limit=0)
            texts, _ = zip(*imdb_train)
        msg.info(f"Loaded IMDB dataset and using {n_texts} examples")
    with msg.loading(f"Loading pipeline '{model}'..."):
        nlp = load_model(model)
    msg.good(f"Loaded pipeline '{model}'")
    cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(),
                    "Profile.prof")
    s = pstats.Stats("Profile.prof")
    msg.divider("Profile stats")
    s.strip_dirs().sort_stats("time").print_stats()
Example 17
def main(
    # fmt: off
    in_file: str = typer.Argument(..., help="Vectors file (text-based)"),
    vocab_file: str = typer.Argument(..., help="Vocabulary file"),
    out_dir: str = typer.Argument(..., help="Path to output directory"),
    min_freq_ratio: float = typer.Option(0.0, "--min-freq-ratio", "-r", help="Frequency ratio threshold for discarding minority senses or casings"),
    min_distance: float = typer.Option(0.0, "--min-distance", "-s", help="Similarity threshold for discarding redundant keys"),
    # fmt: on
):
    """
    Step 5: Export a sense2vec component

    Expects a vectors.txt and a vocab file trained with GloVe and exports
    a component that can be loaded with Sense2vec.from_disk.
    """
    input_path = Path(in_file)
    vocab_path = Path(vocab_file)
    output_path = Path(out_dir)
    if not input_path.exists():
        msg.fail("Can't find input file", in_file, exits=1)
    if input_path.suffix == ".bin":
        msg.fail("Need text-based vectors file, not binary", in_file, exits=1)
    if not vocab_path.exists():
        msg.fail("Can't find vocab file", vocab_file, exits=1)
    if not output_path.exists():
        output_path.mkdir(parents=True)
        msg.good(f"Created output directory {out_dir}")
    with input_path.open("r", encoding="utf8") as f:
        (n_vectors, vector_size), f = _get_shape(f)
        vectors_data = f.readlines()
    with vocab_path.open("r", encoding="utf8") as f:
        vocab = read_vocab(f)
    vectors = {}
    all_senses = set()
    for item in vectors_data:
        item = item.rstrip().rsplit(" ", vector_size)
        key = item[0]
        try:
            _, sense = split_key(key)
        except ValueError:
            continue
        vec = item[1:]
        if len(vec) != vector_size:
            msg.fail(f"Wrong vector size: {len(vec)} (expected {vector_size})", exits=1)
        all_senses.add(sense)
        vectors[key] = numpy.asarray(vec, dtype=numpy.float32)
    discarded = set()
    discarded.update(get_minority_keys(vocab, min_freq_ratio))
    discarded.update(get_redundant_keys(vocab, vectors, min_distance))
    n_vectors = len(vectors) - len(discarded)
    s2v = Sense2Vec(shape=(n_vectors, vector_size), senses=all_senses)
    for key, vector in vectors.items():
        if key not in discarded:
            s2v.add(key, vector)
            s2v.set_freq(key, vocab[key])
    msg.good("Created the sense2vec model")
    msg.info(f"{n_vectors} vectors, {len(all_senses)} total senses")
    s2v.to_disk(output_path)
    msg.good("Saved model to directory", out_dir)
Example 18
def init_vectors_cli(
    # fmt: off
    lang: str = Arg(..., help="The language of the nlp object to create"),
    vectors_loc: Path = Arg(...,
                            help="Vectors file in Word2Vec format",
                            exists=True),
    output_dir: Path = Arg(..., help="Pipeline output directory"),
    prune: int = Opt(-1,
                     "--prune",
                     "-p",
                     help="Optional number of vectors to prune to"),
    truncate: int = Opt(
        0,
        "--truncate",
        "-t",
        help=
        "Optional number of vectors to truncate to when reading in vectors file"
    ),
    name: Optional[str] = Opt(
        None,
        "--name",
        "-n",
        help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"
    ),
    verbose: bool = Opt(
        False,
        "--verbose",
        "-V",
        "-VV",
        help="Display more information for debugging purposes"),
    jsonl_loc: Optional[Path] = Opt(
        None,
        "--lexemes-jsonl",
        "-j",
        help="Location of JSONL-formatted attributes file",
        hidden=True),
    # fmt: on
):
    """Convert word vectors for use with spaCy. Will export an nlp object that
    you can use in the [initialize] block of your config to initialize
    a model with vectors.
    """
    util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
    msg.info(f"Creating blank nlp object for language '{lang}'")
    nlp = util.get_lang_class(lang)()
    if jsonl_loc is not None:
        update_lexemes(nlp, jsonl_loc)
    convert_vectors(nlp,
                    vectors_loc,
                    truncate=truncate,
                    prune=prune,
                    name=name)
    msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors")
    nlp.to_disk(output_dir)
    msg.good(
        "Saved nlp object with vectors to output directory. You can now use the "
        "path to it in your config as the 'vectors' setting in [initialize].",
        output_dir.resolve(),
    )
Example 19
def setup_and_train(use_gpu, train_args, rank):
    if use_gpu >= 0:
        gpu_id = os.environ.get("CUDA_VISIBLE_DEVICES")
        msg.info(f"Using GPU (isolated): {gpu_id}")
        util.use_gpu(0)
    else:
        msg.info("Using CPU")
    train(randomization_index=rank, **train_args)
Example 20
def project_run(
    project_dir: Path,
    subcommand: str,
    *,
    overrides: Dict[str, Any] = SimpleFrozenDict(),
    force: bool = False,
    dry: bool = False,
    capture: bool = False,
) -> None:
    """Run a named script defined in the project.yml. If the script is part
    of the default pipeline (defined in the "run" section), DVC is used to
    execute the command, so it can determine whether to rerun it. It then
    calls into "exec" to execute it.

    project_dir (Path): Path to project directory.
    subcommand (str): Name of command to run.
    overrides (Dict[str, Any]): Optional config overrides.
    force (bool): Force re-running, even if nothing changed.
    dry (bool): Perform a dry run and don't execute commands.
    capture (bool): Whether to capture the output and errors of individual commands.
        If False, the stdout and stderr will not be redirected, and if there's an error,
        sys.exit will be called with the return code. You should use capture=False
        when you want to turn over execution to the command, and capture=True
        when you want to run the command more like a function.
    """
    config = load_project_config(project_dir, overrides=overrides)
    commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
    workflows = config.get("workflows", {})
    validate_subcommand(list(commands.keys()), list(workflows.keys()), subcommand)
    if subcommand in workflows:
        msg.info(f"Running workflow '{subcommand}'")
        for cmd in workflows[subcommand]:
            project_run(
                project_dir,
                cmd,
                overrides=overrides,
                force=force,
                dry=dry,
                capture=capture,
            )
    else:
        cmd = commands[subcommand]
        for dep in cmd.get("deps", []):
            if not (project_dir / dep).exists():
                err = f"Missing dependency specified by command '{subcommand}': {dep}"
                err_help = "Maybe you forgot to run the 'project assets' command or a previous step?"
                err_kwargs = {"exits": 1} if not dry else {}
                msg.fail(err, err_help, **err_kwargs)
        check_spacy_commit = check_bool_env_var(ENV_VARS.PROJECT_USE_GIT_VERSION)
        with working_dir(project_dir) as current_dir:
            msg.divider(subcommand)
            rerun = check_rerun(current_dir, cmd, check_spacy_commit=check_spacy_commit)
            if not rerun and not force:
                msg.info(f"Skipping '{cmd['name']}': nothing changed")
            else:
                run_commands(cmd["script"], dry=dry, capture=capture)
                if not dry:
                    update_lockfile(current_dir, cmd)
Example 21
def project_assets(project_dir: Path,
                   *,
                   sparse_checkout: bool = False) -> None:
    """Fetch assets for a project using DVC if possible.

    project_dir (Path): Path to project directory.
    """
    project_path = ensure_path(project_dir)
    config = load_project_config(project_path)
    assets = config.get("assets", {})
    if not assets:
        msg.warn(f"No assets specified in {PROJECT_FILE}", exits=0)
    msg.info(f"Fetching {len(assets)} asset(s)")
    for asset in assets:
        dest = (project_dir / asset["dest"]).resolve()
        checksum = asset.get("checksum")
        if "git" in asset:
            git_err = (
                f"Cloning spaCy project templates requires Git and the 'git' command. "
                f"Make sure it's installed and that the executable is available."
            )
            get_git_version(error=git_err)
            if dest.exists():
                # If there's already a file, check for checksum
                if checksum and checksum == get_checksum(dest):
                    msg.good(
                        f"Skipping download with matching checksum: {asset['dest']}"
                    )
                    continue
                else:
                    if dest.is_dir():
                        shutil.rmtree(dest)
                    else:
                        dest.unlink()
            if "repo" not in asset["git"] or asset["git"]["repo"] is None:
                msg.fail(
                    "A git asset must include 'repo', the repository address.",
                    exits=1)
            if "path" not in asset["git"] or asset["git"]["path"] is None:
                msg.fail(
                    "A git asset must include 'path' - use \"\" to get the entire repository.",
                    exits=1,
                )
            git_checkout(
                asset["git"]["repo"],
                asset["git"]["path"],
                dest,
                branch=asset["git"].get("branch"),
                sparse=sparse_checkout,
            )
            msg.good(f"Downloaded asset {dest}")
        else:
            url = asset.get("url")
            if not url:
                # project.yml defines asset without URL that the user has to place
                check_private_asset(dest, checksum)
                continue
            fetch_asset(project_path, url, dest, checksum)
Example 22
def main(
    # fmt: off
    in_file: str = typer.Argument(..., help="Path to input file"),
    out_dir: str = typer.Argument(..., help="Path to output directory"),
    spacy_model: str = typer.Argument("en_core_web_sm",
                                      help="Name of spaCy model to use"),
    n_process: int = typer.Option(
        1, "--n-process", "-n", help="Number of processes (multiprocessing)"),
    max_docs: int = typer.Option(10**6,
                                 "--max-docs",
                                 "-m",
                                 help="Maximum docs per batch"),
    # fmt: on
):
    """
    Step 1: Parse raw text with spaCy

    Expects an input file with one sentence per line and will output a .spacy
    file of the parsed collection of Doc objects (DocBin).
    """
    input_path = Path(in_file)
    output_path = Path(out_dir)
    if not input_path.exists():
        msg.fail("Can't find input file", in_file, exits=1)
    if not output_path.exists():
        output_path.mkdir(parents=True)
        msg.good(f"Created output directory {out_dir}")
    nlp = spacy.load(spacy_model)
    msg.info(f"Using spaCy model {spacy_model}")
    doc_bin = DocBin(attrs=["POS", "TAG", "DEP", "ENT_TYPE", "ENT_IOB"])
    msg.text("Preprocessing text...")
    count = 0
    batch_num = 0
    with input_path.open("r", encoding="utf8") as texts:
        docs = nlp.pipe(texts, n_process=n_process)
        for doc in tqdm.tqdm(docs, desc="Docs", unit=""):
            if count < max_docs:
                doc_bin.add(doc)
                count += 1
            else:
                batch_num += 1
                msg.good(f"Processed {len(doc_bin)} docs")
                doc_bin_bytes = doc_bin.to_bytes()
                output_file = output_path / f"{input_path.stem}-{batch_num}.spacy"
                with output_file.open("wb") as f:
                    f.write(doc_bin_bytes)
                msg.good(f"Saved parsed docs to file", output_file.resolve())
                doc_bin = DocBin(
                    attrs=["POS", "TAG", "DEP", "ENT_TYPE", "ENT_IOB"])
                # Start the next batch with the current doc instead of dropping it
                doc_bin.add(doc)
                count = 1
        batch_num += 1
        output_file = output_path / f"{input_path.stem}-{batch_num}.spacy"
        with output_file.open("wb") as f:
            doc_bin_bytes = doc_bin.to_bytes()
            f.write(doc_bin_bytes)
            msg.good(f"Complete. Saved final parsed docs to file",
                     output_file.resolve())
Example 23
def main(
    # fmt: off
    in_file: str = typer.Argument(..., help="Path to input file"),
    out_dir: str = typer.Argument(..., help="Path to output directory"),
    spacy_model: str = typer.Argument("en_core_web_sm",
                                      help="Name of spaCy model to use"),
    n_process: int = typer.Option(
        1, "--n-process", "-n", help="Number of processes (multiprocessing)"),
    # fmt: on
):
    """
    Step 2: Preprocess text in sense2vec's format

    Expects a binary .spacy input file consisting of the parsed Docs (DocBin)
    and outputs a text file with one sentence per line in the expected sense2vec
    format (merged noun phrases, concatenated phrases with underscores and
    added "senses").

    Example input:
    Rats, mould and broken furniture: the scandal of the UK's refugee housing

    Example output:
    Rats|NOUN ,|PUNCT mould|NOUN and|CCONJ broken_furniture|NOUN :|PUNCT
    the|DET scandal|NOUN of|ADP the|DET UK|GPE 's|PART refugee_housing|NOUN
    """
    input_path = Path(in_file)
    output_path = Path(out_dir)
    if not input_path.exists():
        msg.fail("Can't find input file", in_file, exits=1)
    if not output_path.exists():
        output_path.mkdir(parents=True)
        msg.good(f"Created output directory {out_dir}")
    nlp = spacy.load(spacy_model)
    msg.info(f"Using spaCy model {spacy_model}")
    with input_path.open("rb") as f:
        doc_bin_bytes = f.read()
    doc_bin = DocBin().from_bytes(doc_bin_bytes)
    msg.good(f"Loaded {len(doc_bin)} parsed docs")
    docs = doc_bin.get_docs(nlp.vocab)
    output_file = output_path / f"{input_path.stem}.s2v"
    lines_count = 0
    words_count = 0
    with output_file.open("w", encoding="utf8") as f:
        for doc in tqdm.tqdm(docs, desc="Docs", unit=""):
            doc = merge_phrases(doc)
            words = []
            for token in doc:
                if not token.is_space:
                    word, sense = make_spacy_key(token, prefer_ents=True)
                    words.append(make_key(word, sense))
            f.write(" ".join(words) + "\n")
            lines_count += 1
            words_count += len(words)
    msg.good(
        f"Successfully preprocessed {lines_count} docs ({words_count} words)",
        output_file.resolve(),
    )
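make_key and make_spacy_key are not shown. Judging by the docstring's example output, keys join the underscore-joined token text and a sense tag with a pipe; the sketch below is a hedged guess under that assumption (the name suffix marks it as illustrative).
def make_key_sketch(word: str, sense: str) -> str:
    # Assumed key format: spaces become underscores, then "word|SENSE".
    return word.replace(" ", "_") + "|" + sense

make_key_sketch("broken furniture", "NOUN")   # -> "broken_furniture|NOUN"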
Example 24
def _init_labels(nlp, output_path):
    for name, component in nlp.pipeline:
        if getattr(component, "label_data", None) is not None:
            output_file = output_path / f"{name}.json"
            srsly.write_json(output_file, component.label_data)
            msg.good(
                f"Saving label data for component '{name}' to {output_file}")
        else:
            msg.info(f"No label data found for component '{name}'")
Example 25
def main(in_file, vocab_file, out_dir):
    """
    Step 5: Export a sense2vec component

    Expects a vectors.txt and a vocab file trained with GloVe and exports
    a component that can be loaded with Sense2vec.from_disk.
    """
    input_path = Path(in_file)
    vocab_path = Path(vocab_file)
    output_path = Path(out_dir)
    if not input_path.exists():
        msg.fail("Can't find input file", in_file, exits=1)
    if input_path.suffix == ".bin":
        msg.fail("Need text-based vectors file, not binary", in_file, exits=1)
    if not vocab_path.exists():
        msg.fail("Can't find vocab file", vocab_file, exits=1)
    if not output_path.exists():
        output_path.mkdir(parents=True)
        msg.good(f"Created output directory {out_dir}")
    with input_path.open("r", encoding="utf8") as f:
        (n_vectors, vector_size), f = _get_shape(f)
        vectors_data = f.readlines()
    with vocab_path.open("r", encoding="utf8") as f:
        vocab_data = f.readlines()
    data = []
    all_senses = set()
    for item in vectors_data:
        item = item.rstrip().rsplit(" ", vector_size)
        key = item[0]
        try:
            _, sense = split_key(key)
        except ValueError:
            continue
        vec = item[1:]
        if len(vec) != vector_size:
            msg.fail(f"Wrong vector size: {len(vec)} (expected {vector_size})",
                     exits=1)
        all_senses.add(sense)
        data.append((key, numpy.asarray(vec, dtype=numpy.float32)))
    s2v = Sense2Vec(shape=(len(data), vector_size), senses=all_senses)
    for key, vector in data:
        s2v.add(key, vector)
    for item in vocab_data:
        item = item.rstrip()
        if item.endswith(" word"):  # for fastText vocabs
            item = item[:-5]
        try:
            key, freq = item.rsplit(" ", 1)
        except ValueError:
            continue
        s2v.set_freq(key, int(freq))
    msg.good("Created the sense2vec model")
    msg.info(f"{len(data)} vectors, {len(all_senses)} total senses")
    s2v.to_disk(output_path)
    msg.good("Saved model to directory", out_dir)
Example 26
def download_model_and_get_path(lang, model_id):
    save_path = "itranlit-models"
    os.makedirs(save_path, exist_ok=True)
    model_path = save_path + "/" + lang + ".pth"
    msg.info(f"{lang} model downloading inside {save_path}..")
    try:
        download_file_from_google_drive(id=model_id, destination=model_path)
        msg.good(f"{lang} model download successfull. model path {model_path}")
    except Exception as e:
        print(e)
        msg.fail(f"Fail to download {lang} model. please check exception")
Example 27
def get_blacklisted_sense_keys(freqs):
    """Remove keys with sense that is blacklisted"""
    discarded = []
    msg.info('collecting blacklisted sense keys')
    for key, freq in freqs.items():
        try:
            term, sense = split_key(key)
        except ValueError:
            continue
        if sense and sense not in sense_whitelist:
            discarded.append(key)
    return discarded
Example 28
def check_rerun(
    project_dir: Path,
    command: Dict[str, Any],
    *,
    check_spacy_version: bool = True,
    check_spacy_commit: bool = False,
) -> bool:
    """Check if a command should be rerun because its settings or inputs/outputs
    changed.

    project_dir (Path): The current project directory.
    command (Dict[str, Any]): The command, as defined in the project.yml.
    check_spacy_version (bool): Whether to rerun if the spaCy minor version changed.
    check_spacy_commit (bool): Whether to rerun if the spaCy commit hash changed.
    RETURNS (bool): Whether to re-run the command.
    """
    # Always rerun if no-skip is set
    if command.get("no_skip", False):
        return True
    lock_path = project_dir / PROJECT_LOCK
    if not lock_path.exists():  # We don't have a lockfile, run command
        return True
    data = srsly.read_yaml(lock_path)
    if command["name"] not in data:  # We don't have info about this command
        return True
    entry = data[command["name"]]
    # Always run commands with no outputs (otherwise they'd always be skipped)
    if not entry.get("outs", []):
        return True
    # Always rerun if spaCy version or commit hash changed
    spacy_v = entry.get("spacy_version")
    commit = entry.get("spacy_git_version")
    if check_spacy_version and not is_minor_version_match(
            spacy_v, about.__version__):
        info = f"({spacy_v} in {PROJECT_LOCK}, {about.__version__} current)"
        msg.info(
            f"Re-running '{command['name']}': spaCy minor version changed {info}"
        )
        return True
    if check_spacy_commit and commit != GIT_VERSION:
        info = f"({commit} in {PROJECT_LOCK}, {GIT_VERSION} current)"
        msg.info(
            f"Re-running '{command['name']}': spaCy commit changed {info}")
        return True
    # If the entry in the lockfile matches the lockfile entry that would be
    # generated from the current command, we don't rerun because it means that
    # all inputs/outputs, hashes and scripts are the same and nothing changed
    lock_entry = get_lock_entry(project_dir, command)
    exclude = ["spacy_version", "spacy_git_version"]
    return get_hash(lock_entry, exclude=exclude) != get_hash(entry,
                                                             exclude=exclude)
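get_hash and get_lock_entry are assumed helpers here. The sketch below covers the hashing side only, which just needs to be deterministic and to ignore the excluded version fields; it is a hedged stand-in, not the project's actual implementation.
import hashlib
import json

def get_hash_sketch(data: dict, exclude=()) -> str:
    # Drop excluded keys, serialize deterministically, hash the result so that
    # equal lock entries compare equal regardless of key order.
    filtered = {k: v for k, v in data.items() if k not in exclude}
    dumped = json.dumps(filtered, sort_keys=True, default=str)
    return hashlib.md5(dumped.encode("utf8")).hexdigest()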
Example 29
    def train(self,
              data,
              model_name,
              epoch,
              lr=0.05,
              dim=300,
              ws=5,
              minCount=5,
              minn=3,
              maxn=6,
              neg=5,
              wordNgrams=1,
              loss="ns",
              bucket=2000000,
              thread=multiprocessing.cpu_count() - 1):
        """train fasttext with raw text data

        Args:
            data (str): raw text data path
            model_name (str): name of output trained model with extension
            epoch (int): number of training iteration
            lr (float, optional): learning rate. Defaults to 0.05.
            dim (int, optional): vector size or dimension. Defaults to 300.
            ws (int, optional): window size. Defaults to 5.
            minCount (int, optional): minimal number of word occurrences; rarer words are ignored. Defaults to 5.
            minn (int, optional): minimum length of character n-grams. Defaults to 3.
            maxn (int, optional): maximum length of character n-grams. Defaults to 6.
            neg (int, optional): number of negatives sampled. Defaults to 5.
            wordNgrams (int, optional): maximum length of word n-grams. Defaults to 1.
            loss (str, optional): loss function ("ns", "hs" or "softmax"). Defaults to "ns".
            bucket (int, optional): number of hash buckets for n-grams. Defaults to 2000000.
            thread (int, optional): number of training threads. Defaults to multiprocessing.cpu_count()-1.
        """
        msg.info('training started.....')
        model = fasttext.train_unsupervised(data,
                                            model='skipgram',
                                            epoch=epoch,
                                            lr=lr,
                                            dim=dim,
                                            ws=ws,
                                            minCount=minCount,
                                            minn=minn,
                                            maxn=maxn,
                                            neg=neg,
                                            wordNgrams=wordNgrams,
                                            loss=loss,
                                            bucket=bucket,
                                            thread=thread)
        msg.good(f'training done! saving as {model_name}')
        model.save_model(model_name)
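A hedged call example for the method above; the instance name and file paths are placeholders, not taken from the original project.
# Hypothetical usage; "embedder", "corpus.txt" and the model name are placeholders.
embedder.train(data="corpus.txt", model_name="corpus_skipgram.bin", epoch=5)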
Example 30
    def check_spacy_model(model) -> bool:
        spacy_info = spacy.info()
        pipelines = spacy_info.get('pipelines', spacy_info.get('models'))
        if pipelines is None:
            raise ValueError('Unable to detect spacy models.')
        models = list(pipelines.keys())

        if model not in models:
            msg.info("Downloading spacy model {}".format(model))
            spacy.cli.download(model)
            # spacy.info() doesn't update after spacy.cli.download, so there's no point checking it again
            models.append(model)

        # Always returns True; if the download fails, spacy calls sys.exit()
        return model in models