Esempio n. 1
0
def print_summary(nlp, pretty=True, no_print=False):
    """Print a formatted summary of the pipeline of an nlp object.

    Shows a table listing each pipeline component together with what it
    requires and assigns and whether it retokenizes, followed by any
    problems reported by `analyze_pipes`.

    nlp (Language): The nlp object.
    pretty (bool): Pretty-print the results (color etc).
    no_print (bool): Don't print anything, just return the data.
    RETURNS (dict): A dict with "overview" and "problems". Only returned
        when `no_print` is true; otherwise the function returns None.
    """
    msg = Printer(pretty=pretty, no_print=no_print)
    overview = []
    problems = {}
    for index, (name, component) in enumerate(nlp.pipeline):
        # Components that don't declare these attributes get empty defaults.
        row = (
            index,
            name,
            getattr(component, "requires", []),
            getattr(component, "assigns", []),
            getattr(component, "retokenizes", False),
        )
        overview.append(row)
        problems[name] = analyze_pipes(
            nlp.pipeline, name, component, index, warn=False
        )
    msg.divider("Pipeline Overview")
    msg.table(
        overview,
        header=("#", "Component", "Requires", "Assigns", "Retokenizes"),
        divider=True,
        multiline=True,
    )
    n_problems = sum(len(found) for found in problems.values())
    if not any(problems.values()):
        msg.good("No problems found.")
    else:
        msg.divider("Problems ({})".format(n_problems))
        for name, found in problems.items():
            if not found:
                continue
            joined = ", ".join(found)
            msg.warn("'{}' requirements not met: {}".format(name, joined))
    if no_print:
        return {"overview": overview, "problems": problems}
Esempio n. 2
0
def profile(model, inputs=None, n_texts=10000):
    """
    Profile a spaCy pipeline, to find out which functions take the most time.

    Input should be formatted as one JSON object per line with a key "text".
    A non-None `inputs` value is handed to `_read_inputs` (presumably a JSONL
    file path or sys.stdin -- confirm against that helper). If no input is
    available, the IMDB dataset is loaded via Thinc and its first 25000
    training texts are used.

    At most `n_texts` texts are parsed under cProfile; the raw stats are
    written to "Profile.prof" and printed sorted by time.
    """
    msg = Printer()
    if inputs is not None:
        inputs = _read_inputs(inputs, msg)
    if inputs is None:
        imdb_limit = 25000
        with msg.loading("Loading IMDB dataset via Thinc..."):
            train_pairs, _dev = thinc.extra.datasets.imdb()
            inputs, _labels = zip(*train_pairs)
        msg.info("Loaded IMDB dataset and using {} examples".format(imdb_limit))
        inputs = inputs[:imdb_limit]
    with msg.loading("Loading model '{}'...".format(model)):
        nlp = load_model(model)
    msg.good("Loaded model '{}'".format(model))
    texts = list(itertools.islice(inputs, n_texts))
    stats_file = "Profile.prof"
    # The profiled statement resolves `nlp` and `texts` through locals(), so
    # those two local names must not be renamed.
    cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(), stats_file)
    stats = pstats.Stats(stats_file)
    msg.divider("Profile stats")
    stats.strip_dirs()
    stats.sort_stats("time")
    stats.print_stats()
Esempio n. 3
0
def fill_config(
    output_file: Path,
    base_path: Path,
    *,
    pretraining: bool = False,
    diff: bool = False,
    silent: bool = False,
) -> Tuple[Config, Config]:
    """Auto-fill a base config and save the completed version.

    output_file (Path): Where to save the filled config; "-" means stdout,
        which also suppresses all status messages.
    base_path (Path): Path of the base config to load.
    pretraining (bool): Also validate for pretraining and merge the filled
        config into the default pretraining config.
    diff (bool): Print a diff between the base and the filled config
        (skipped whenever printing is suppressed).
    silent (bool): Suppress status messages.
    RETURNS (Tuple[Config, Config]): The base config and the filled config.
    """
    is_stdout = str(output_file) == "-"
    no_print = is_stdout or silent
    msg = Printer(no_print=no_print)
    with show_validation_error(hint_fill=False):
        config = util.load_config(base_path)
        nlp = util.load_model_from_config(config, auto_fill=True, validate=False)
    # Second load, this time validated, to be sure the auto-filled result is
    # itself a valid config.
    nlp = util.load_model_from_config(nlp.config)
    filled = nlp.config
    # Sourced components were replaced by their actual config during loading,
    # so the original "source" entries have to be put back.
    filled["components"].update(util.get_sourced_components(config))
    if pretraining:
        validate_config_for_pretrain(filled, msg)
        pretrain_defaults = util.load_config(DEFAULT_CONFIG_PRETRAIN_PATH)
        filled = pretrain_defaults.merge(filled)
    before = config.to_str()
    after = filled.to_str()
    unchanged = before == after
    if unchanged:
        msg.warn("Nothing to auto-fill: base config is already complete")
    else:
        msg.good("Auto-filled config with all values")
    if diff and not no_print:
        if unchanged:
            msg.warn("No diff to show: nothing was auto-filled")
        else:
            msg.divider("START CONFIG DIFF")
            print("")
            print(diff_strings(before, after))
            msg.divider("END CONFIG DIFF")
            print("")
    save_config(filled, output_file, is_stdout=is_stdout, silent=silent)
    return config, filled
Esempio n. 4
0
    def evaluate(self, data: List[Example]):
        """Evaluate the recognizer on `data` and print the entity scores.

        Prints one table with the overall precision, recall and F-score and
        a second table with the same metrics per entity label, sorted by
        label.

        data (List[Example]): Examples to evaluate on; they are converted
            with `self._format_data` before being passed to
            `self.nlp.evaluate`.
        RETURNS: The scorer object returned by `self.nlp.evaluate`. The
            previous `-> None` annotation contradicted this return value
            and has been dropped.
        """
        msg = Printer()
        formatted_data, _ = self._format_data(data)
        sc = self.nlp.evaluate(formatted_data, batch_size=64)

        msg.divider("Recognizer Results")
        overall_rows = [
            ("Precision", f"{sc.ents_p:.3f}"),
            ("Recall", f"{sc.ents_r:.3f}"),
            ("F-Score", f"{sc.ents_f:.3f}"),
        ]
        msg.table(overall_rows)

        per_label_rows = [
            (label, f"{scores['p']:.3f}", f"{scores['r']:.3f}", f"{scores['f']:.3f}")
            for label, scores in sorted(
                sc.ents_per_type.items(), key=lambda item: item[0]
            )
        ]
        header = ("Label", "Precision", "Recall", "F-Score")
        msg.table(per_label_rows, header=header, divider=True)
        return sc
Esempio n. 5
0
def generate_meta(existing_meta: Dict[str, Any],
                  msg: Printer) -> Dict[str, Any]:
    """Interactively collect the package settings for a meta.json.

    For each setting the user is prompted via `get_raw_input`, with the
    value from `existing_meta` (or a built-in fallback) offered as default.
    An empty response keeps a truthy default; otherwise the response is
    stored as given.

    existing_meta (Dict[str, Any]): Meta to start from. When non-empty it
        is updated in place and returned.
    msg (Printer): Printer used for the divider and the intro text.
    RETURNS (Dict[str, Any]): The meta dict with all settings filled in.
    """
    meta = existing_meta or {}
    # (meta key, prompt text, fallback when the key is missing from meta)
    fields = (
        ("lang", "Pipeline language", "en"),
        ("name", "Pipeline name", "pipeline"),
        ("version", "Package version", "0.0.0"),
        ("description", "Package description", None),
        ("author", "Author", None),
        ("email", "Author email", None),
        ("url", "Author website", None),
        ("license", "License", "MIT"),
    )
    # Resolve all defaults up front, before anything is printed or prompted.
    settings = [
        (key, prompt, meta.get(key, fallback)) for key, prompt, fallback in fields
    ]
    msg.divider("Generating meta.json")
    msg.text(
        "Enter the package settings for your pipeline. The following information "
        "will be read from your pipeline data: pipeline, vectors.")
    for key, prompt, default in settings:
        answer = get_raw_input(prompt, default)
        if answer == "" and default:
            meta[key] = default
        else:
            meta[key] = answer
    return meta
Esempio n. 6
0
def example_data(output_dir: Path, verbose: bool = False):
    """Write the bundled example knowledge-base data to disk.

    Nothing is downloaded: the entity and alias records are hard-coded below
    and written as `entities.jsonl` and `aliases.jsonl`.

    output_dir (Path): Directory that receives entities.jsonl and
        aliases.jsonl. It is created (including parents) if missing.
    verbose (bool): Show the loading animation while writing.
    """
    msg = Printer(hide_animation=not verbose)

    msg.divider("Example Data")
    with msg.loading(f"Writing Example data to {output_dir}"):

        aliases_data = [
            {
                "alias": "ML",
                "entities": ["a1", "a2"],
                "probabilities": [0.5, 0.5],
            },
            {
                "alias": "Machine learning",
                "entities": ["a1"],
                "probabilities": [1.0],
            },
            {
                "alias": "Meta Language",
                "entities": ["a2"],
                "probabilities": [1.0],
            },
            {
                "alias": "NLP",
                "entities": ["a3", "a4"],
                "probabilities": [0.5, 0.5],
            },
            {
                "alias": "Natural language processing",
                "entities": ["a3"],
                "probabilities": [1.0],
            },
            {
                "alias": "Neuro-linguistic programming",
                "entities": ["a4"],
                "probabilities": [1.0],
            },
            {
                "alias": "Operating system",
                "entities": ["a5"],
                "probabilities": [1.0],
            },
            {
                "alias": "OS",
                "entities": ["a5"],
                "probabilities": [1.0],
            },
            {
                "alias": "Statistics",
                "entities": ["a6"],
                "probabilities": [1.0],
            },
            {
                "alias": "Audience segmentation",
                "entities": ["a7"],
                "probabilities": [1.0],
            },
            {
                "alias": "Decision analysis",
                "entities": ["a8"],
                "probabilities": [1.0],
            },
            {
                "alias": "Computer science",
                "entities": ["a9"],
                "probabilities": [1.0],
            },
            {
                "alias": "Photochemistry",
                "entities": ["a10"],
                "probabilities": [1.0],
            },
            {
                "alias": "Mineralogy",
                "entities": ["a11"],
                "probabilities": [1.0],
            },
            {
                "alias": "Stereochemistry",
                "entities": ["a12"],
                "probabilities": [1.0],
            },
            {
                "alias": "Environmental chemistry",
                "entities": ["a13"],
                "probabilities": [1.0],
            },
            {
                "alias": "Agronomy",
                "entities": ["a14"],
                "probabilities": [1.0],
            },
            {
                "alias": "Research",
                "entities": ["a15"],
                "probabilities": [1.0],
            },
        ]

        # Only some entities carry a "label"; the others are written without
        # that key.
        entities_data = [
            {
                "id": "a1",
                "name": "Machine learning (ML)",
                "description":
                "Machine learning (ML) is the scientific study of algorithms and statistical models...",
            },
            {
                "id": "a2",
                "name": 'ML ("Meta Language")',
                "description":
                'ML ("Meta Language") is a general-purpose functional programming language. It has roots in Lisp, and has been characterized as "Lisp with types".',
            },
            {
                "id": "a3",
                "name": "Natural language processing (NLP)",
                "description":
                "Natural language processing (NLP) is a subfield of linguistics, computer science, information engineering, and artificial intelligence concerned with the interactions between computers and human (natural) languages, in particular how to program computers to process and analyze large amounts of natural language data.",
            },
            {
                "id": "a4",
                "name": "Neuro-linguistic programming (NLP)",
                "description":
                "Neuro-linguistic programming (NLP) is a pseudoscientific approach to communication, personal development, and psychotherapy created by Richard Bandler and John Grinder in California, United States in the 1970s.",
            },
            {
                "id": "a5",
                "name": "Operating system",
                "description":
                "Operating Systems consists of building system software that provides common services for other types of computer programs.",
                "label": "SKILL",
            },
            {
                "id": "a6",
                "name": "Statistics",
                "description":
                "Statistics deals with all aspects of data collection, organization, analysis, interpretation, and presentation.",
                "label": "SKILL",
            },
            {
                "id": "a7",
                "name": "Audience segmentation",
                "description":
                "Audience segmentation is a process of dividing people into homogeneous subgroups based upon defined criterion such as product usage, demographics, psychographics, communication behaviors and media use. Audience segmentation is used in commercial marketing so advertisers can design and tailor products and services that satisfy the targeted groups. In social marketing, audiences are segmented into subgroups and assumed to have similar interests, needs and behavioral patterns and this assumption allows social marketers to design relevant health or social messages that influence the people to adopt recommended behaviors. Audience segmentation is widely accepted as a fundamental strategy in communication campaigns to influence health and social change. Audience segmentation makes campaign efforts more effective when messages are tailored to the distinct subgroups and more efficient when the target audience is selected based on their susceptibility and receptivity.",
                "label": "SKILL",
            },
            {
                "id": "a8",
                "name": "Decision analysis",
                "description":
                "Decision analysis (DA) is the discipline comprising the philosophy, methodology, and professional practice necessary to address important decisions in a formal manner. Decision analysis includes many procedures, methods, and tools for identifying, clearly representing, and formally assessing important aspects of a decision, for prescribing a recommended course of action by applying the maximum expected utility action axiom to a well-formed representation of the decision, and for translating the formal representation of a decision and its corresponding recommendation into insight for the decision maker and other stakeholders.",
                "label": "SKILL",
            },
            {
                "id": "a9",
                "name": "Computer science",
                "description":
                "Computer science is the study of processes that interact with data and that can be represented as data in the form of programs. It enables the use of algorithms to manipulate, store, and communicate digital information. A computer scientist studies the theory of computation and the practice of designing software systems.",
            },
            {
                "id": "a10",
                "name": "Photochemistry",
                "description":
                "Photochemistry is the branch of chemistry concerned with the chemical effects of light. Generally, this term is used to describe a chemical reaction caused by absorption of ultraviolet (wavelength from 100 to 400 nm), visible light (400\u2013750 nm) or infrared radiation (750\u20132500 nm).",
                "label": "SKILL",
            },
            {
                "id": "a11",
                "name": "Mineralogy",
                "description":
                "Mineralogy is a subject of geology specializing in the scientific study of the chemistry, crystal structure, and physical (including optical) properties of minerals and mineralized artifacts. Specific studies within mineralogy include the processes of mineral origin and formation, classification of minerals, their geographical distribution, as well as their utilization.",
                "label": "SKILL",
            },
            {
                "id": "a12",
                "name": "Stereochemistry",
                "description":
                'Stereochemistry, a subdiscipline of chemistry, involves the study of the relative spatial arrangement of atoms that form the structure of molecules and their manipulation. The study of stereochemistry focuses on stereoisomers, which by definition have the same molecular formula and sequence of bonded atoms (constitution), but differ in the three-dimensional orientations of their atoms in space. For this reason, it is also known as 3D chemistry\u2014the prefix "stereo-" means "three-dimensionality".',
                "label": "SKILL",
            },
            {
                "id": "a13",
                "name": "Environmental chemistry",
                "description":
                "Environmental chemistry is the scientific study of the chemical and biochemical phenomena that occur in natural places. It should not be confused with green chemistry, which seeks to reduce potential pollution at its source. It can be defined as the study of the sources, reactions, transport, effects, and fates of chemical species in the air, soil, and water environments; and the effect of human activity and biological activity on these. Environmental chemistry is an interdisciplinary science that includes atmospheric, aquatic and soil chemistry, as well as heavily relying on analytical chemistry and being related to environmental and other areas of science.",
                "label": "SKILL",
            },
            {
                "id": "a14",
                "name": "Agronomy",
                "description":
                "Agronomy is the science and technology of producing and using plants for food, fuel, fiber, and land restoration. Agronomy has come to encompass work in the areas of plant genetics, plant physiology, meteorology, and soil science. It is the application of a combination of sciences like biology, chemistry, economics, ecology, earth science, and genetics. Agronomists of today are involved with many issues, including producing food, creating healthier food, managing the environmental impact of agriculture, and extracting energy from plants. Agronomists often specialise in areas such as crop rotation, irrigation and drainage, plant breeding, plant physiology, soil classification, soil fertility, weed control, and insect and pest control.",
                "label": "SKILL",
            },
            {
                "id": "a15",
                "name": "Research",
                "description":
                'Research is "creative and systematic work undertaken to increase the stock of knowledge, including knowledge of humans, culture and society, and the use of this stock of knowledge to devise new applications." or in other hand Research is a process of steps used to collect and analyze information to increase our understanding of a topic or issue. At a general level, research consists of three steps: 1. Pose a question. 2. Collect data to answer the question. 3. Present an answer to the question. This should be a familiar process. You engage in solving problems every day and you start with a question, collect some information, and then form an answer\nResearch is important for three reasons.1. Research adds to our knowledge: Adding to knowledge means that educators undertake research to contribute to existing information about issues 2.Research improves practice: Research is also important because it suggests improvements for practice. Armed with research results, teachers and other educators become more effective professionals. 3. Research informs policy debates: research also provides information to policy makers when they research and debate educational topics.',
                "label": "SKILL",
            },
        ]

        # exist_ok avoids the check-then-create race of a separate exists()
        # test and is a no-op when the directory is already there.
        output_dir.mkdir(parents=True, exist_ok=True)

        srsly.write_jsonl(output_dir / "entities.jsonl", entities_data)
        srsly.write_jsonl(output_dir / "aliases.jsonl", aliases_data)
        msg.good("Done.")
def pretrain(
    config: Config,
    output_dir: Path,
    resume_path: Optional[Path] = None,
    epoch_resume: Optional[int] = None,
    use_gpu: int = -1,
    silent: bool = True,
):
    """Run the pretraining loop described by the "pretraining" config block.

    After every epoch the tok2vec weights are written to
    `output_dir/model{epoch}.bin` and a line is appended to
    `output_dir/log.jsonl`; temporary checkpoints (`.temp.bin`) are written
    every `n_save_every` batches when that setting is truthy.

    config (Config): The full config; must contain "training" and
        "pretraining" sections.
    output_dir (Path): Directory that receives the weights and the log.
    resume_path (Optional[Path]): Weights to resume from, if any.
    epoch_resume (Optional[int]): Epoch to resume from. Ignored (reset to 0)
        when `resume_path` is not given.
    use_gpu (int): GPU id; the configured GPU allocator is only set when
        this is >= 0.
    silent (bool): Suppress all printed output.
    """
    msg = Printer(no_print=silent)
    if config["training"]["seed"] is not None:
        fix_random_seed(config["training"]["seed"])
    gpu_allocator = config["training"]["gpu_allocator"]
    if gpu_allocator and use_gpu >= 0:
        set_gpu_allocator(gpu_allocator)
    nlp = load_model_from_config(config)
    interpolated = nlp.config.interpolate()
    settings = registry.resolve(
        interpolated["pretraining"], schema=ConfigSchemaPretrain
    )
    corpus_ref = dot_to_object(interpolated, settings["corpus"])
    corpus = registry.resolve({"corpus": corpus_ref})["corpus"]
    batcher = settings["batcher"]
    model = create_pretraining_model(nlp, settings)
    optimizer = settings["optimizer"]
    if resume_path is None:
        # Without '--resume-path' the '--epoch-resume' argument is ignored
        epoch_resume = 0
    else:
        # Load in pretrained weights to resume from
        _resume_model(model, resume_path, epoch_resume, silent=silent)
    objective = model.attrs["loss"]
    # TODO: move this to logger function?
    tracker = ProgressTracker(frequency=10000)
    msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_resume}")
    row_format = {"widths": (3, 10, 10, 6, 4), "aligns": ("r",) * 5}
    msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_format)

    def _write_checkpoint(epoch, is_temp=False):
        # Save the tok2vec weights (with averaged parameters applied) and
        # append the tracker state to the JSONL log.
        is_temp_str = ".temp" if is_temp else ""
        weights_path = output_dir / f"model{epoch}{is_temp_str}.bin"
        with model.use_params(optimizer.averages):
            with weights_path.open("wb") as weights_file:
                weights_file.write(model.get_ref("tok2vec").to_bytes())
            log_entry = {
                "nr_word": tracker.nr_word,
                "loss": tracker.loss,
                "epoch_loss": tracker.epoch_loss,
                "epoch": epoch,
            }
            with (output_dir / "log.jsonl").open("a") as log_file:
                log_file.write(srsly.json_dumps(log_entry) + "\n")

    # TODO: I think we probably want this to look more like the
    # 'create_train_batches' function?
    for epoch in range(epoch_resume, settings["max_epochs"]):
        batches = batcher(corpus(nlp))
        for batch_id, batch in enumerate(batches):
            docs = ensure_docs(batch)
            loss = make_update(model, docs, optimizer, objective)
            progress = tracker.update(epoch, loss, docs)
            if progress:
                msg.row(progress, **row_format)
            save_every = settings["n_save_every"]
            if save_every and batch_id % save_every == 0:
                _write_checkpoint(epoch, is_temp=True)
        _write_checkpoint(epoch)
        tracker.epoch_loss = 0.0
Esempio n. 8
0
def validate():
    """
    Validate that the currently installed version of spaCy is compatible
    with the installed models. Should be run after `pip install -U spacy`.

    Fetches the compatibility table from `about.__compatibility__`, prints
    a table of the installed model packages and links, and lists update
    commands for incompatible models. Exits with status 1 when the table
    cannot be fetched, when the current version is missing from it, or
    when any incompatible model or link is found.
    """
    msg = Printer()
    with msg.loading("Loading compatibility table..."):
        response = requests.get(about.__compatibility__)
        if response.status_code != 200:
            msg.fail(
                "Server error ({})".format(response.status_code),
                "Couldn't fetch compatibility table.",
                exits=1,
            )
    msg.good("Loaded compatibility table")
    compat = response.json()["spacy"]
    # Development builds are looked up under their base version.
    version = about.__version__.rsplit(".dev", 1)[0]
    current_compat = compat.get(version)
    if not current_compat:
        msg.fail(
            "Can't find spaCy v{} in compatibility table".format(version),
            about.__compatibility__,
            exits=1,
        )
    all_models = set()
    # Iterate over a shallow copy of the outer dict while the per-version
    # entries are rewritten with reformatted version strings.
    for spacy_version, version_models in dict(compat).items():
        all_models.update(version_models.keys())
        for model_name, model_versions in version_models.items():
            compat[spacy_version][model_name] = [
                reformat_version(v) for v in model_versions
            ]
    model_links = get_model_links(current_compat)
    model_pkgs = get_model_pkgs(current_compat, all_models)
    incompat_links = {
        link for link, data in model_links.items() if not data["compat"]
    }
    incompat_models = {
        data["name"] for data in model_pkgs.values() if not data["compat"]
    }
    incompat_models.update(
        [data["name"] for data in model_links.values() if not data["compat"]]
    )
    na_models = [m for m in incompat_models if m not in current_compat]
    update_models = [m for m in incompat_models if m in current_compat]
    spacy_dir = Path(__file__).parent.parent

    msg.divider("Installed models (spaCy v{})".format(about.__version__))
    msg.info("spaCy installation: {}".format(path2str(spacy_dir)))

    if not (model_links or model_pkgs):
        msg.text("No models found in your current environment.", exits=0)
    else:
        header = ("TYPE", "NAME", "MODEL", "VERSION", "")
        rows = [
            get_model_row(current_compat, name, data, msg)
            for name, data in model_pkgs.items()
        ]
        rows.extend(
            get_model_row(current_compat, name, data, msg, "link")
            for name, data in model_links.items()
        )
        msg.table(rows, header=header)
    if update_models:
        msg.divider("Install updates")
        msg.text("Use the following commands to update the model packages:")
        cmd = "python -m spacy download {}"
        commands = [cmd.format(pkg) for pkg in update_models]
        print("\n".join(commands) + "\n")
    if na_models:
        msg.text("The following models are not available for spaCy "
                 "v{}: {}".format(about.__version__, ", ".join(na_models)))
    if incompat_links:
        msg.text(
            "You may also want to overwrite the incompatible links using the "
            "`python -m spacy link` command with `--force`, or remove them "
            "from the data directory. "
            "Data path: {path}".format(path=path2str(get_data_path())))
    if incompat_models or incompat_links:
        sys.exit(1)
Esempio n. 9
0
def create_index(
    model: str,
    kb_dir: Path,
    output_dir: Path,
    new_model_name: str = "ann_linker",
    cg_threshold: float = 0.8,
    n_iter: int = 5,
    verbose: bool = True,
):
    """Create an AnnLinker based on the Character N-Gram
    TF-IDF vectors for aliases in a KnowledgeBase

    model (str): spaCy language model directory or name to load
    kb_dir (Path): path to the directory with kb entities.jsonl and aliases.jsonl files
    output_dir (Path): path to output_dir for spaCy model with ann_linker pipe;
        the model is saved to `output_dir / new_model_name`
    new_model_name (str): name stored in the model meta and used as the
        sub-directory of output_dir
    cg_threshold (float): currently unused by this function
    n_iter (int): currently unused by this function
    verbose (bool): show the loading animation while the model loads


    kb File Formats

    e.g. entities.jsonl

    {"id": "a1", "description": "Machine learning (ML) is the scientific study of algorithms and statistical models..."}
    {"id": "a2", "description": "ML (\"Meta Language\") is a general-purpose functional programming language. It has roots in Lisp, and has been characterized as \"Lisp with types\"."}

    An entity may also carry a "label", which is mapped through
    `kb_type_vs_index`; "description" is optional.

    e.g. aliases.jsonl
    {"alias": "ML", "entities": ["a1", "a2"], "probabilities": [0.5, 0.5]}
    """
    msg = Printer(hide_animation=not verbose)

    msg.divider("Load Model")
    with msg.loading(f"Loading model {model}"):
        nlp = spacy.load(model)
        msg.good("Done.")

    # NOTE(review): output_dir is used unconditionally by to_disk() below, so
    # a None value is not actually supported despite this guard.
    if output_dir is not None:
        output_dir = Path(output_dir / new_model_name)
        # exist_ok avoids the race of a separate exists() check.
        output_dir.mkdir(parents=True, exist_ok=True)

    # Both files are read fully up front so that tqdm can show a total; this
    # replaces counting through itertools.tee, which buffered every record
    # in memory anyway.
    entity_records = list(srsly.read_jsonl(kb_dir / "entities.jsonl"))
    alias_records = list(srsly.read_jsonl(kb_dir / "aliases.jsonl"))

    kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=INPUT_DIM)

    # Vector used for entities that come without a description.
    empty_vector = nlp.make_doc('').vector

    for entity in tqdm(entity_records,
                       desc='Adding entities to KB',
                       total=len(entity_records)):
        entity_id = entity['id']
        if kb.contains_entity(entity_id):
            # Keep the first occurrence of a duplicated id.
            continue
        if 'description' in entity:
            embedding = nlp.make_doc(entity['description']).vector
        else:
            embedding = empty_vector
        label = entity.get('label', 0)
        if label:
            label = kb_type_vs_index[label]
        # TODO: Add a proper "label" field (repurposed freq field as the
        # type label)
        kb.add_entity(entity=entity_id, freq=label, entity_vector=embedding)

    for alias in tqdm(alias_records,
                      desc="Setting kb entities and aliases",
                      total=len(alias_records)):
        # Only link to entities that actually made it into the KB.
        known_entities = [
            e for e in alias["entities"] if kb.contains_entity(e)
        ]
        num_entities = len(known_entities)
        if num_entities == 0:
            continue
        if len(alias['probabilities']) == num_entities:
            prior_probabilities = alias['probabilities']
        else:
            # Some entities were dropped: fall back to a uniform prior.
            prior_probabilities = [1.0 / num_entities] * num_entities
        kb.add_alias(alias=alias["alias"],
                     entities=known_entities,
                     probabilities=prior_probabilities)

    msg.divider("Create ANN Index")
    alias_strings = kb.get_alias_strings()
    cg = CandidateGenerator().fit(alias_strings, verbose=True)

    ann_linker = nlp.create_pipe("ann_linker")
    ann_linker.set_kb(kb)
    ann_linker.set_cg(cg)

    nlp.add_pipe(ann_linker, last=True)

    nlp.meta["name"] = new_model_name
    nlp.to_disk(output_dir)
    nlp.from_disk(output_dir)
Esempio n. 10
0
class Vocab:
    """Token vocabulary mapping every token to a ``(frequency, index)`` pair.

    The vocab is built from a list of tokenized instances, optionally
    clipped by minimum frequency and maximum size, and can be stored to /
    restored from a JSON file.
    """

    def __init__(
        self,
        instances: Optional[List[List[str]]] = None,
        max_num_tokens: Optional[int] = None,
        min_count: int = 1,
        unk_token: str = "<UNK>",
        pad_token: str = "<PAD>",
        start_token: str = "<SOS>",
        end_token: str = "<EOS>",
        special_token_freq: float = 1e10,
        store_location: Optional[str] = None,
        max_instance_length: int = 100,
        include_special_vocab: bool = True,
        preprocessing_pipeline: Optional[List[Callable]] = None,
    ):
        """

        Parameters
        ----------
        instances : Optional[List[List[str]]]
            A list of tokenized instances
        max_num_tokens : int
            The maximum number of tokens to be used in the vocab
            All the other tokens above this number will be replaced
            by UNK.
            If this is not passed then the maximum possible number
            will be used
        min_count : int
            All words that do not have min count will be mapped to `unk_token`
        unk_token : str
            This token will be used for unknown words
        pad_token : str
            This token will be used for <PAD> words
        start_token : str
            This token will be used for start of line indicator
        end_token : str
            This token will be used for end of sentence indicator
        special_token_freq : float
            special tokens should have high frequency.
        store_location : str
            The users can provide a store location optionally.
            The vocab will be stored in the location
            If the file exists then, the vocab will be restored from the file, rather than building it.
        max_instance_length : int
            Every vocab is related to a namespace. Every instance
            in that namespace will be clipped or padded to this
            length
        include_special_vocab : bool
            Boolean value to indicate whether special vocab should be included or no
            If this is false, you will have to set add_start_end_token to False
            and you cannot pad your instances. This is mostly set for labels -
            such as for classification that require no padding. For such
            cases please make sure that min_count is always 1 and max_num_tokens
            is always None. Otherwise some of the labels will be missed and it
            might result in error
        preprocessing_pipeline: List[Callable]
            You can add a set of callables that take in a list of
            str and return a list of str for pre-processing. For
            example methods look at instance_preprocessing module in sciwing.preprocessing
        """

        self.instances = instances
        self.max_num_tokens = max_num_tokens
        self.min_count = min_count
        self.unk_token = unk_token
        self.pad_token = pad_token
        self.start_token = start_token
        self.end_token = end_token
        self.special_token_freq = special_token_freq
        self.vocab = None
        self.orig_vocab = None
        self.idx2token = None
        self.token2idx = None
        self.store_location = store_location
        self.max_instance_length = max_instance_length
        self.include_special_vocab = include_special_vocab
        self.preprocessing_pipeline = preprocessing_pipeline

        self.msg_printer = Printer()

        # Special tokens get very large, strictly decreasing frequencies so
        # that frequency-based ranking puts them at indices 0..3 in this order.
        if self.include_special_vocab:
            self.special_vocab = {
                self.unk_token: (self.special_token_freq + 3, 0),
                self.pad_token: (self.special_token_freq + 2, 1),
                self.start_token: (self.special_token_freq + 1, 2),
                self.end_token: (self.special_token_freq, 3),
            }
        else:
            # Without an UNK token nothing can absorb clipped tokens, so any
            # clipping setting is suspicious: warn the user.
            if self.min_count != 1:
                self.msg_printer.warn(
                    "Warning: You are building vocab without special vocab. "
                    "Please make sure that min_count is 1")
            if self.max_num_tokens is not None:
                self.msg_printer.warn(
                    "You are building vocab without special vocab. Please make "
                    "sure that max_num_tokens is None")
            self.special_vocab = {}

        if instances is not None:
            # Flatten the nested instances into one flat list of tokens
            self.instances = list(flatten(instances))
            # Guard the peek at the first element: an empty corpus has none
            if self.instances and isinstance(self.instances[0], Token):
                self.instances = [tok.text for tok in self.instances]
            if self.preprocessing_pipeline:
                self.instances = self.apply_preprocessing()

    def apply_preprocessing(self):
        """Run every callable of the preprocessing pipeline, in order, over a
        copy of the stored instances and return the result."""
        instances = deepcopy(self.instances)
        for preprocessor in self.preprocessing_pipeline:
            instances = preprocessor(instances)

        return instances

    def map_tokens_to_freq_idx(self) -> Dict[str, Tuple[int, int]]:
        """
        Build vocab from instances
        return the word -> (freq, idx)
        :return:
        """
        # most_common() yields (token, count) pairs ordered by decreasing
        # count, ties kept in first-seen order. Counter only reads its input,
        # so no defensive copy of the instances is needed.
        ranked_tokens = Counter(self.instances).most_common()

        offset = len(self.special_vocab)
        vocab = {
            token: (freq, offset + idx)
            for idx, (token, freq) in enumerate(ranked_tokens)
        }

        # Special tokens override corpus tokens that spell the same.
        vocab = {**vocab, **self.special_vocab}

        # NOTE: when a corpus token collides with a special token the override
        # above leaves a hole in the index range, so the indices are
        # recomputed from the (freq, idx) ranking.
        ranked = sorted(vocab.items(), key=itemgetter(1), reverse=True)
        return {
            token: (freq, idx)
            for idx, (token, (freq, _)) in enumerate(ranked)
        }

    def clip_on_mincount(
            self, vocab: Dict[str, Tuple[int,
                                         int]]) -> Dict[str, Tuple[int, int]]:
        """
        Clip the vocab based on min count
        We decide to keep the word and it count
        We just change the idx of the token to idx of the unknown token
        :return: vocab: type: Dict[str, Tuple[int, int]]
        The new vocab (the passed dictionary, modified in place)
        """
        # Only values of existing keys are replaced, so mutating the dict
        # while iterating over it is safe.
        for key, (freq, _) in vocab.items():
            if freq < self.min_count:
                vocab[key] = (freq, vocab[self.unk_token][1])

        return vocab

    def clip_on_max_num(
            self, vocab: Dict[str, Tuple[int,
                                         int]]) -> Dict[str, Tuple[int, int]]:
        """
        Clip the vocab based on the maximum number of words
        We return `max_num_words + len(self.special_vocab)` words effectively
        The rest of them will be mapped to `self.unk_token`
        Parameters
        ----------
        vocab : Dict[str, Tuple[int, int]]
            The mapping from token to idx and frequency
        Returns
        -------
        Dict[str, Tuple[int, int]]
            The new vocab (the passed dictionary, modified in place)

        """
        # ``None`` means "no limit": nothing to clip.
        if self.max_num_tokens is None:
            return vocab

        first_clipped_idx = len(self.special_vocab) + self.max_num_tokens
        for key, (freq, idx) in vocab.items():
            if idx >= first_clipped_idx:
                vocab[key] = (freq, vocab[self.unk_token][1])

        return vocab

    def _add_token(self, token: str, save_vocab: bool = False):
        """
        Add token to an already existing vocabulary
        :param token: type str
        :param save_vocab: type bool
        Write the vocab to ``self.store_location`` after adding the token
        :return:
        :raises ValueError: if the vocab has not been built yet
        """
        if not self.vocab:
            raise ValueError("Build vocab first by calling build_vocab()")

        if token in self.vocab:
            return

        # Make sure the lookup tables exist before extending them.
        if not self.idx2token:
            self.idx2token = self.get_idx2token_mapping()
        if not self.token2idx:
            self.token2idx = self.get_token2idx_mapping()

        # The new token is appended right after the highest index in use.
        new_idx = max(idx for _, idx in self.vocab.values()) + 1
        self.vocab[token] = (1, new_idx)
        self.idx2token[new_idx] = token
        self.token2idx[token] = new_idx
        if save_vocab:
            self.save_to_file(self.store_location)  # this can be expensive.

    def add_tokens(self, tokens: List[str]):
        """Add several tokens to an already built vocab and, when a store
        location is configured, save the vocab once at the end.

        :raises ValueError: if the vocab has not been built yet
        """
        if not self.vocab:
            raise ValueError("Build vocab first by calling build_vocab()")

        for token in tokens:
            self._add_token(token, save_vocab=False)

        if self.store_location:
            self.save_to_file(self.store_location)

    def build_vocab(self) -> Dict[str, Tuple[int, int]]:
        """Build the vocab, or restore it from ``store_location`` if that
        file already exists.

        Returns
        -------
        Dict[str, Tuple[int, int]]
            The (clipped) mapping from token to ``(freq, idx)``
        """
        if self.store_location and os.path.isfile(self.store_location):
            vocab_object = self.load_from_file(self.store_location)
            self.msg_printer.good("Loaded vocab from file {0}".format(
                self.store_location))
            self.vocab = vocab_object.vocab
            self.orig_vocab = vocab_object.orig_vocab
            self.idx2token = vocab_object.idx2token
            self.token2idx = vocab_object.token2idx
            vocab = vocab_object.vocab

        else:
            self.msg_printer.info("BUILDING VOCAB")
            vocab = self.map_tokens_to_freq_idx()

            # The clipping below mutates ``vocab`` in place, so keep an
            # independent copy of the unclipped version.
            self.orig_vocab = deepcopy(vocab)

            # set max num of tokens to maximum possible if it is not set
            if self.max_num_tokens is None:
                self.max_num_tokens = len(self.orig_vocab.keys())

            vocab = self.clip_on_mincount(vocab)
            vocab = self.clip_on_max_num(vocab)
            self.vocab = vocab
            self.idx2token = self.get_idx2token_mapping()
            self.token2idx = self.get_token2idx_mapping()

            if self.store_location:
                self.msg_printer.info("SAVING VOCAB TO FILE")
                self.save_to_file(self.store_location)
        return vocab

    def get_vocab_len(self) -> int:
        """Return the number of distinct indices in the (clipped) vocab."""
        if not self.vocab:
            raise ValueError("Build vocab first by calling build_vocab()")

        return len({idx for _, idx in self.vocab.values()})

    def get_orig_vocab_len(self) -> int:
        """Return the number of distinct indices in the unclipped vocab."""
        if not self.orig_vocab:
            raise ValueError("Build vocab first by calling build_vocab()")

        return len({idx for _, idx in self.orig_vocab.values()})

    def get_token2idx_mapping(self) -> Dict[str, int]:
        """Return a fresh ``token -> idx`` dictionary derived from the vocab."""
        if not self.vocab:
            raise ValueError("Build vocab first by calling build_vocab()")

        return {word: idx for word, (_, idx) in self.vocab.items()}

    def get_idx2token_mapping(self) -> Dict[int, str]:
        """Return a fresh ``idx -> token`` dictionary derived from the vocab.

        Clipped tokens share the index of the unknown token; that index
        always decodes to ``self.unk_token``.
        """
        if not self.vocab:
            raise ValueError("Build vocab first by calling build_vocab()")

        idx2words = {idx: word for word, (_, idx) in self.vocab.items()}

        # Every clipped token carries the UNK index, so the plain mapping
        # above would let the last clipped word shadow the UNK token.
        if self.unk_token in self.vocab:
            idx2words[self.vocab[self.unk_token][1]] = self.unk_token
        return idx2words

    def save_to_file(self, filename: str):
        """
        :param filename: str
        The filename where the result to the file will be stored
        The vocab will be stored in the json file name
        Please make sure that this is a json filename

        :return: None
        The whole vocab object will be saved to the file
        """

        if not self.vocab:
            raise ValueError("Build vocab first by calling build_vocab()")

        vocab_state = {
            "options": {
                "max_num_words": self.max_num_tokens,
                "min_count": self.min_count,
                "unk_token": self.unk_token,
                "pad_token": self.pad_token,
                "start_token": self.start_token,
                "end_token": self.end_token,
                "special_token_freq": self.special_token_freq,
                "special_vocab": self.special_vocab,
            },
            "vocab": self.vocab,
            "orig_vocab": self.orig_vocab,
        }
        try:
            with open(filename, "w") as fp:
                json.dump(vocab_state, fp)

        except FileNotFoundError:
            # A missing parent directory is reported, not raised.
            print("You passed {0} for the filename. Please check whether "
                  "the path exists and try again".format(filename))

    @classmethod
    def load_from_file(cls, filename: str) -> "Vocab":
        """Restore a vocab object from a JSON file written by `save_to_file`.

        :param filename: str
        The json file to read
        :return: Vocab
        The restored vocab. If the file does not exist a message is printed
        and ``None`` is returned.
        """
        try:
            with open(filename, "r") as fp:
                vocab_state = json.load(fp)
        except FileNotFoundError:
            print("You passed {0} for the filename. Please check whether "
                  "the path exists and try again. Please pass "
                  "a json file".format(filename))
            return None

        vocab_options = vocab_state["options"]

        # restore all the property values from the file
        vocab = cls(
            max_num_tokens=vocab_options["max_num_words"],
            min_count=vocab_options["min_count"],
            unk_token=vocab_options["unk_token"],
            pad_token=vocab_options["pad_token"],
            start_token=vocab_options["start_token"],
            end_token=vocab_options["end_token"],
            instances=None,
            special_token_freq=vocab_options["special_token_freq"],
            store_location=filename,
        )

        # A vocab saved without special tokens stored an empty special
        # vocab; mirror that instead of keeping the constructor default.
        if vocab_options.get("special_vocab") == {}:
            vocab.include_special_vocab = False
            vocab.special_vocab = {}

        # instead of building the vocab, set the vocab from the stored state
        vocab.set_vocab(vocab=vocab_state["vocab"])
        vocab.set_orig_vocab(vocab_state["orig_vocab"])
        vocab.set_idx2token(vocab.get_idx2token_mapping())
        vocab.set_token2idx(vocab.get_token2idx_mapping())

        return vocab

    def get_token_from_idx(self, idx: int) -> str:
        """Return the token stored at ``idx``.

        :raises ValueError: if the vocab is not built or ``idx`` is larger
        than the highest valid index
        """
        if not self.vocab:
            raise ValueError("Please build the vocab first")

        if not self.idx2token:
            self.idx2token = self.get_idx2token_mapping()

        vocab_len = self.get_vocab_len()

        if idx > vocab_len - 1:
            message = (
                f"You tried to access idx {idx} of the vocab The length of the vocab is "
                f"{vocab_len}. Please Provide Number between 0 and {vocab_len - 1}"
            )
            raise ValueError(message)

        return self.idx2token.get(idx)

    def get_idx_from_token(self, token: str) -> int:
        """Return the index of ``token``; unknown tokens get the index of the
        unknown token (``None`` if the vocab has no unknown token).

        :raises ValueError: if the vocab is not built
        """
        if not self.vocab:
            raise ValueError("Please build the vocab first")

        if not self.token2idx:
            self.token2idx = self.get_token2idx_mapping()

        try:
            return self.token2idx[token]
        except KeyError:
            return self.token2idx.get(self.unk_token, None)

    def get_topn_frequent_words(self, n: int = 5) -> List[Tuple[str, int]]:
        """Return up to ``n`` ``(token, frequency)`` pairs for the indices
        that directly follow the special tokens, i.e. the most frequent
        regular tokens. Frequencies are read from the unclipped vocab."""
        if not self.idx2token:
            self.idx2token = self.get_idx2token_mapping()

        first_idx = len(self.special_vocab)
        last_idx = min(first_idx + n, self.get_vocab_len())

        token_freqs = []
        for idx in range(first_idx, last_idx):
            token = self.idx2token[idx]
            token_freqs.append((token, self.orig_vocab[token][0]))

        return token_freqs

    def print_stats(self) -> None:
        """Print a table with the original and clipped vocab sizes and the
        five most frequent words."""
        orig_vocab_len = self.get_orig_vocab_len()
        vocab_len = self.get_vocab_len()
        N = 5
        top_n = self.get_topn_frequent_words(n=N)

        data = [
            ("Original vocab length", orig_vocab_len),
            ("Clipped vocab length", vocab_len),
            ("Top {0} words".format(N), top_n),
        ]
        header = ("Stats Description", "#")
        table_string = wasabi.table(data=data, header=header, divider=True)
        self.msg_printer.divider("VOCAB STATS")
        print(table_string)

    def set_vocab(self, vocab: Dict[str, Tuple[int, int]]):
        """Replace the (clipped) vocab."""
        self.vocab = vocab

    def set_orig_vocab(self, orig_vocab: Dict[str, Tuple[int, int]]):
        """Replace the unclipped vocab."""
        self.orig_vocab = orig_vocab

    def set_idx2token(self, idx2token: Dict[int, str]):
        """Replace the ``idx -> token`` lookup table."""
        self.idx2token = idx2token

    def set_token2idx(self, token2idx: Dict[str, int]):
        """Replace the ``token -> idx`` lookup table."""
        self.token2idx = token2idx

    def get_disp_sentence_from_indices(self, indices: List[int]) -> str:
        """ Given a set of indices in vocab, it returns a sentence mapping the index to string

        The pad, start and end tokens are left out of the sentence.

        Parameters
        ----------
        indices : List[int]
            A list of indices where every index is between ``[0, vocab_len - 1]``.

        Returns
        -------
        str
            A string representing the index
        """
        if self.special_vocab:
            special_indices = [
                self.get_idx_from_token(self.pad_token),
                self.get_idx_from_token(self.start_token),
                self.get_idx_from_token(self.end_token),
            ]
        else:
            special_indices = []

        tokens = [
            self.get_token_from_idx(idx) for idx in indices
            if idx not in special_indices
        ]
        return " ".join(tokens)

    @property
    def token2idx(self):
        """The ``token -> idx`` lookup table (``None`` until it is built)."""
        return self._token2idx

    @token2idx.setter
    def token2idx(self, value):
        self._token2idx = value

    @property
    def idx2token(self):
        """The ``idx -> token`` lookup table (``None`` until it is built)."""
        return self._idx2token

    @idx2token.setter
    def idx2token(self, value):
        self._idx2token = value
Esempio n. 11
0
def top_prediction_errors(
    recognizer: EntityRecognizer,
    data: List[Example],
    labels: List[str] = None,
    n: int = None,
    k: int = None,
    exclude_fp: bool = False,
    exclude_fn: bool = False,
    verbose: bool = False,
) -> List[PredictionError]:
    """Get a sorted list of examples your model is worst at predicting.

    Args:
        recognizer (EntityRecognizer): An instance of EntityRecognizer
        data (List[Example]): List of annotated Examples
        labels (List[str], optional): List of labels to get errors for.
            Defaults to the labels property of `recognizer`.
            NOTE: the resolved labels are currently not used to filter errors.
        n (int, optional): If set, only use the top n examples from data.
        k (int, optional): If set, return the top k prediction errors, otherwise the whole list.
            Scanning of the examples also stops once more than k errors were counted.
        exclude_fp (bool, optional): Flag to exclude False Positive errors.
        exclude_fn (bool, optional): Flag to exclude False Negative errors.
        verbose (bool, optional): Show verbose output.

    Returns:
        List[PredictionError]: List of Prediction Errors your model is making, sorted by the
            spans your model has the most trouble with.
    """
    # Resolved for parity with the documented default; not applied as a filter.
    labels_ = labels or recognizer.labels  # noqa: F841
    if n is not None:
        data = data[:n]

    n_examples = len(data)
    texts = (e.text for e in data)
    anns = (e.spans for e in data)

    # errors[true_label][span_text][predicted_label] -> number of occurrences
    errors = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))  # type: ignore
    # "<text>||<true_label>||<pred_label>" -> examples showing that error
    error_examples: DefaultDict[str, List[PredictionErrorExamplePair]] = defaultdict(list)
    n_errors = 0

    for orig_example, pred_example, ann in zip(data, recognizer.predict(texts), anns):
        if k is not None and n_errors > k:
            break

        pred_error_example_pair = PredictionErrorExamplePair(
            original=orig_example, predicted=pred_example
        )

        cand = {(s.start, s.end, s.label) for s in pred_example.spans}
        gold = {(s.start, s.end, s.label) for s in ann}

        fp_diff = cand - gold
        fn_diff = gold - cand

        # Offsets already reported as a false positive; the matching false
        # negative must not be counted a second time.
        seen = set()

        if fp_diff and not exclude_fp:
            for fp in fp_diff:
                # A gold span with identical offsets means the span is right
                # and only the label is wrong.
                gold_ent = next(
                    (ge for ge in gold if ge[0] == fp[0] and ge[1] == fp[1]), None
                )
                if gold_ent:
                    start, end, label = gold_ent
                    text = pred_example.text[start:end]
                    false_label = fp[2]
                    errors[label][text][false_label] += 1
                    error_examples[f"{text}||{label}||{false_label}"].append(
                        pred_error_example_pair
                    )
                else:
                    # No gold span at these offsets: a spurious prediction.
                    start, end, false_label = fp
                    text = pred_example.text[start:end]
                    errors[NONE][text][false_label] += 1
                    error_examples[f"{text}||{NONE}||{false_label}"].append(pred_error_example_pair)
                n_errors += 1
                seen.add((start, end))

        if fn_diff and not exclude_fn:
            for fn in fn_diff:
                start, end, label = fn
                if (start, end) not in seen:
                    text = pred_example.text[start:end]
                    errors[label][text][NONE] += 1
                    error_examples[f"{text}||{label}||{NONE}"].append(pred_error_example_pair)
                    n_errors += 1

    ranked_errors_map: Dict[str, PredictionError] = {}

    for label, errors_per_label in errors.items():
        for error_text, error_labels in errors_per_label.items():
            for error_label, count in error_labels.items():
                pe_hash = f"{error_text}||{label}||{error_label}"
                ranked_errors_map[pe_hash] = PredictionError(
                    text=error_text,
                    true_label=label,
                    pred_label=error_label,
                    count=count,
                    examples=error_examples[pe_hash],
                )

    ranked_errors: List[PredictionError] = sorted(
        ranked_errors_map.values(), key=lambda error: error.count, reverse=True  # type: ignore
    )

    if verbose:
        # Distinct example texts that contain at least one error
        error_texts = set()
        for ranked_error in ranked_errors:
            if ranked_error.examples:
                for pair in ranked_error.examples:
                    error_texts.add(pair.original.text)

        # Empty data has no meaningful rate; report 0.0 instead of dividing by zero.
        error_rate = round(len(error_texts) / n_examples, 2) if n_examples else 0.0
        error_summary = {
            "N Examples": n_examples,
            "N Errors": len(ranked_errors),
            "N Error Examples": len(error_texts),
            "Error Rate": error_rate,
        }
        msg = Printer()
        msg.divider("Error Analysis")
        msg.table(error_summary)

    # Honor the documented contract of ``k``: at most the top k errors.
    if k is not None:
        return ranked_errors[:k]
    return ranked_errors
Esempio n. 12
0
def pretrain(
    texts_loc,
    vectors_model,
    output_dir,
    width=96,
    depth=4,
    embed_rows=2000,
    loss_func="cosine",
    use_vectors=False,
    dropout=0.2,
    n_iter=1000,
    batch_size=3000,
    max_length=500,
    min_length=5,
    seed=0,
    n_save_every=None,
):
    """
    Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components,
    using an approximate language-modelling objective. Specifically, we load
    pre-trained vectors, and train a component like a CNN, BiLSTM, etc to predict
    vectors which match the pre-trained ones. The weights are saved to a directory
    after each epoch. You can then pass a path to one of these pre-trained weights
    files to the 'spacy train' command.

    This technique may be especially helpful if you have little labelled data.
    However, it's still quite experimental, so your mileage may vary.

    To load the weights back in during 'spacy train', you need to ensure
    all settings are the same between pretraining and training. The API and
    errors around this need some improvement.
    """
    # Must stay the first statement: snapshot of the call arguments only.
    config = dict(locals())
    # Path objects are not JSON serializable; store them as plain strings.
    for key, value in config.items():
        if isinstance(value, Path):
            config[key] = str(value)
    msg = Printer()
    util.fix_random_seed(seed)

    has_gpu = prefer_gpu()
    msg.info("Using GPU" if has_gpu else "Not using GPU")

    output_dir = Path(output_dir)
    if not output_dir.exists():
        output_dir.mkdir()
        msg.good("Created output directory")
    srsly.write_json(output_dir / "config.json", config)
    msg.good("Saved settings to config.json")

    # Load texts from file or stdin
    if texts_loc != "-":  # reading from a file
        texts_loc = Path(texts_loc)
        if not texts_loc.exists():
            msg.fail("Input text file doesn't exist", texts_loc, exits=1)
        with msg.loading("Loading input texts..."):
            texts = list(srsly.read_jsonl(texts_loc))
        msg.good("Loaded input texts")
        random.shuffle(texts)
    else:  # reading from stdin
        msg.text("Reading input text from stdin...")
        texts = srsly.read_jsonl("-")

    with msg.loading("Loading model '{}'...".format(vectors_model)):
        nlp = util.load_model(vectors_model)
    msg.good("Loaded model '{}'".format(vectors_model))
    pretrained_vectors = None if not use_vectors else nlp.vocab.vectors.name
    model = create_pretraining_model(
        nlp,
        Tok2Vec(
            width,
            embed_rows,
            conv_depth=depth,
            pretrained_vectors=pretrained_vectors,
            bilstm_depth=0,  # Requires PyTorch. Experimental.
            cnn_maxout_pieces=3,  # You can try setting this higher
            subword_features=True,  # Set to False for Chinese etc
        ),
    )
    optimizer = create_default_optimizer(model.ops)
    tracker = ProgressTracker(frequency=10000)
    msg.divider("Pre-training tok2vec layer")
    row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")}
    msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings)

    def _save_model(epoch, is_temp=False):
        """Write the tok2vec weights of `epoch` and append a line to log.jsonl."""
        is_temp_str = ".temp" if is_temp else ""
        with model.use_params(optimizer.averages):
            with (output_dir / ("model%d%s.bin" % (epoch, is_temp_str))).open(
                "wb"
            ) as file_:
                file_.write(model.tok2vec.to_bytes())
            log = {
                "nr_word": tracker.nr_word,
                "loss": tracker.loss,
                "epoch_loss": tracker.epoch_loss,
                "epoch": epoch,
            }
            with (output_dir / "log.jsonl").open("a") as file_:
                file_.write(srsly.json_dumps(log) + "\n")

    for epoch in range(n_iter):
        for batch_id, batch in enumerate(
            util.minibatch_by_words(((text, None) for text in texts), size=batch_size)
        ):
            docs = make_docs(
                nlp,
                [text for (text, _) in batch],
                max_length=max_length,
                min_length=min_length,
            )
            loss = make_update(
                model, docs, optimizer, objective=loss_func, drop=dropout
            )
            progress = tracker.update(epoch, loss, docs)
            if progress:
                msg.row(progress, **row_settings)
                # stdin has no natural epoch boundary: end the epoch after
                # 10 ** 7 words
                if texts_loc == "-" and tracker.words_per_epoch[epoch] >= 10 ** 7:
                    break
            if n_save_every and (batch_id % n_save_every == 0):
                _save_model(epoch, is_temp=True)
        _save_model(epoch)
        tracker.epoch_loss = 0.0
        if texts_loc != "-":
            # Reshuffle the texts if texts were loaded from a file
            random.shuffle(texts)
Esempio n. 13
0
def pretrain(
    texts_loc,
    vectors_model,
    output_dir,
    width=96,
    depth=4,
    bilstm_depth=2,
    embed_rows=2000,
    loss_func="cosine",
    use_vectors=False,
    dropout=0.2,
    n_iter=1000,
    batch_size=3000,
    max_length=500,
    min_length=5,
    seed=0,
    n_save_every=None,
    init_tok2vec=None,
    epoch_start=None,
):
    """
    Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components,
    using an approximate language-modelling objective. Specifically, we load
    pretrained vectors, and train a component like a CNN, BiLSTM, etc to predict
    vectors which match the pretrained ones. The weights are saved to a directory
    after each epoch. You can then pass a path to one of these pretrained weights
    files to the 'spacy train' command.

    This technique may be especially helpful if you have little labelled data.
    However, it's still quite experimental, so your mileage may vary.

    To load the weights back in during 'spacy train', you need to ensure
    all settings are the same between pretraining and training. The API and
    errors around this need some improvement.
    """
    # Must stay the first statement: snapshot of the call arguments only.
    config = dict(locals())
    # Path objects are not JSON serializable; store them as plain strings.
    for key, value in config.items():
        if isinstance(value, Path):
            config[key] = str(value)
    msg = Printer()
    util.fix_random_seed(seed)

    has_gpu = prefer_gpu()
    if has_gpu:
        import torch

        torch.set_default_tensor_type("torch.cuda.FloatTensor")
    msg.info("Using GPU" if has_gpu else "Not using GPU")

    output_dir = Path(output_dir)
    if not output_dir.exists():
        output_dir.mkdir()
        msg.good("Created output directory")
    srsly.write_json(output_dir / "config.json", config)
    msg.good("Saved settings to config.json")

    # Load texts from file or stdin
    if texts_loc != "-":  # reading from a file
        texts_loc = Path(texts_loc)
        if not texts_loc.exists():
            msg.fail("Input text file doesn't exist", texts_loc, exits=1)
        with msg.loading("Loading input texts..."):
            texts = list(srsly.read_jsonl(texts_loc))
        if not texts:
            msg.fail("Input file is empty", texts_loc, exits=1)
        msg.good("Loaded input texts")
        random.shuffle(texts)
    else:  # reading from stdin
        msg.text("Reading input text from stdin...")
        texts = srsly.read_jsonl("-")

    with msg.loading("Loading model '{}'...".format(vectors_model)):
        nlp = util.load_model(vectors_model)
    msg.good("Loaded model '{}'".format(vectors_model))
    pretrained_vectors = None if not use_vectors else nlp.vocab.vectors.name
    model = create_pretraining_model(
        nlp,
        Tok2Vec(
            width,
            embed_rows,
            conv_depth=depth,
            pretrained_vectors=pretrained_vectors,
            bilstm_depth=bilstm_depth,  # Requires PyTorch. Experimental.
            cnn_maxout_pieces=3,  # You can try setting this higher
            subword_features=True,  # Set to False for Chinese etc
        ),
    )
    # Load in pretrained weights
    if init_tok2vec is not None:
        components = _load_pretrained_tok2vec(nlp, init_tok2vec)
        msg.text("Loaded pretrained tok2vec for: {}".format(components))
        # Parse the epoch number from the given weight file
        model_name = re.search(r"model\d+\.bin", str(init_tok2vec))
        if model_name:
            # Default weight file name so read epoch_start from it by cutting off 'model' and '.bin'
            epoch_start = int(model_name.group(0)[5:-4]) + 1
        else:
            # Compare against None explicitly: 0 is a valid start epoch and
            # must not be mistaken for a missing argument.
            if epoch_start is None:
                msg.fail(
                    "You have to use the '--epoch-start' argument when using a renamed weight file for "
                    "'--init-tok2vec'",
                    exits=True,
                )
            elif epoch_start < 0:
                msg.fail(
                    "The argument '--epoch-start' has to be greater or equal to 0. '%d' is invalid"
                    % epoch_start,
                    exits=True,
                )
    else:
        # Without '--init-tok2vec' the '--epoch-start' argument is ignored
        epoch_start = 0

    optimizer = create_default_optimizer(model.ops)
    tracker = ProgressTracker(frequency=10000)
    msg.divider("Pre-training tok2vec layer - starting at epoch %d" %
                epoch_start)
    row_settings = {
        "widths": (3, 10, 10, 6, 4),
        "aligns": ("r", "r", "r", "r", "r")
    }
    msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings)

    def _save_model(epoch, is_temp=False):
        """Write the tok2vec weights of `epoch` and append a line to log.jsonl."""
        is_temp_str = ".temp" if is_temp else ""
        with model.use_params(optimizer.averages):
            with (output_dir / ("model%d%s.bin" %
                                (epoch, is_temp_str))).open("wb") as file_:
                file_.write(model.tok2vec.to_bytes())
            log = {
                "nr_word": tracker.nr_word,
                "loss": tracker.loss,
                "epoch_loss": tracker.epoch_loss,
                "epoch": epoch,
            }
            with (output_dir / "log.jsonl").open("a") as file_:
                file_.write(srsly.json_dumps(log) + "\n")

    # Running total of the second value returned by make_docs, reported
    # below as the number of skipped empty values.
    skip_counter = 0
    for epoch in range(epoch_start, n_iter + epoch_start):
        for batch_id, batch in enumerate(
                util.minibatch_by_words(((text, None) for text in texts),
                                        size=batch_size)):
            docs, count = make_docs(
                nlp,
                [text for (text, _) in batch],
                max_length=max_length,
                min_length=min_length,
            )
            skip_counter += count
            loss = make_update(model,
                               docs,
                               optimizer,
                               objective=loss_func,
                               drop=dropout)
            progress = tracker.update(epoch, loss, docs)
            if progress:
                msg.row(progress, **row_settings)
                # stdin has no natural epoch boundary: end the epoch after
                # 10**7 words
                if texts_loc == "-" and tracker.words_per_epoch[epoch] >= 10**7:
                    break
            if n_save_every and (batch_id % n_save_every == 0):
                _save_model(epoch, is_temp=True)
        _save_model(epoch)
        tracker.epoch_loss = 0.0
        if texts_loc != "-":
            # Reshuffle the texts if texts were loaded from a file
            random.shuffle(texts)
    if skip_counter > 0:
        msg.warn(
            "Skipped {count} empty values".format(count=str(skip_counter)))
    msg.good("Successfully finished pretrain")
Esempio n. 14
0
def debug_data(
    lang,
    train_path,
    dev_path,
    base_model=None,
    pipeline="tagger,parser,ner",
    ignore_warnings=False,
    ignore_validation=False,
    verbose=False,
    no_format=False,
):
    """Analyze training and development data and print stats, warnings and
    errors for the selected pipeline components.

    lang: Language identifier handed to `get_lang_class` to build a blank
        pipeline when no base model is given.
    train_path: Location of the training data. Must provide `.exists()` and
        is handed to `GoldCorpus`.
    dev_path: Location of the development data. Must provide `.exists()` and
        is handed to `GoldCorpus`.
    base_model: Optional model name or path handed to `load_model`. If
        falsy, a blank `lang` pipeline is created instead.
    pipeline (str): Comma-separated names of the components to check.
    ignore_warnings (bool): Forwarded to the `Printer`.
    ignore_validation (bool): Don't exit when format validation reports
        errors.
    verbose (bool): Also show the additional detail messages.
    no_format (bool): Disable pretty printing.

    The process exits with status 1 if a data path is missing, the corpus
    cannot be loaded, or at least one failure message was recorded.
    """
    msg = Printer(pretty=not no_format, ignore_warnings=ignore_warnings)

    # Make sure all files and paths exists if they are needed
    if not train_path.exists():
        msg.fail("Training data not found", train_path, exits=1)
    if not dev_path.exists():
        msg.fail("Development data not found", dev_path, exits=1)

    # Initialize the model and pipeline
    pipeline = [p.strip() for p in pipeline.split(",")]
    if base_model:
        nlp = load_model(base_model)
    else:
        lang_cls = get_lang_class(lang)
        nlp = lang_cls()

    msg.divider("Data format validation")

    # Validate data format using the JSON schema
    # TODO: update once the new format is ready
    # TODO: move validation to GoldCorpus in order to be able to load from dir
    # Both error lists are placeholders for now, so the checks below always
    # report the format as valid.
    train_data_errors = []  # TODO: validate_json
    dev_data_errors = []  # TODO: validate_json
    if not train_data_errors:
        msg.good("Training data JSON format is valid")
    if not dev_data_errors:
        msg.good("Development data JSON format is valid")
    for error in train_data_errors:
        msg.fail("Training data: {}".format(error))
    for error in dev_data_errors:
        msg.fail("Develoment data: {}".format(error))
    if (train_data_errors or dev_data_errors) and not ignore_validation:
        sys.exit(1)

    # Create the gold corpus to be able to better analyze data
    loading_train_error_message = ""
    loading_dev_error_message = ""
    with msg.loading("Loading corpus..."):
        corpus = GoldCorpus(train_path, dev_path)
        try:
            train_docs = list(corpus.train_docs(nlp))
            train_docs_unpreprocessed = list(
                corpus.train_docs_without_preprocessing(nlp))
        except ValueError as e:
            loading_train_error_message = "Training data cannot be loaded: {}".format(
                str(e))
        try:
            dev_docs = list(corpus.dev_docs(nlp))
        except ValueError as e:
            loading_dev_error_message = "Development data cannot be loaded: {}".format(
                str(e))
    # Report both loading problems (if any) before exiting, so the user sees
    # all of them in a single run.
    if loading_train_error_message or loading_dev_error_message:
        if loading_train_error_message:
            msg.fail(loading_train_error_message)
        if loading_dev_error_message:
            msg.fail(loading_dev_error_message)
        sys.exit(1)
    msg.good("Corpus is loadable")

    # Create all gold data here to avoid iterating over the train_docs constantly
    gold_train_data = _compile_gold(train_docs, pipeline)
    gold_train_unpreprocessed_data = _compile_gold(train_docs_unpreprocessed,
                                                   pipeline)
    gold_dev_data = _compile_gold(dev_docs, pipeline)

    train_texts = gold_train_data["texts"]
    dev_texts = gold_dev_data["texts"]

    msg.divider("Training stats")
    msg.text("Training pipeline: {}".format(", ".join(pipeline)))
    for pipe in [p for p in pipeline if p not in nlp.factories]:
        msg.fail(
            "Pipeline component '{}' not available in factories".format(pipe))
    if base_model:
        msg.text("Starting with base model '{}'".format(base_model))
    else:
        msg.text("Starting with blank model '{}'".format(lang))
    msg.text("{} training docs".format(len(train_docs)))
    msg.text("{} evaluation docs".format(len(dev_docs)))

    overlap = len(train_texts.intersection(dev_texts))
    if overlap:
        msg.warn(
            "{} training examples also in evaluation data".format(overlap))
    else:
        msg.good("No overlap between training and evaluation data")
    if not base_model and len(train_docs) < BLANK_MODEL_THRESHOLD:
        text = "Low number of examples to train from a blank model ({})".format(
            len(train_docs))
        if len(train_docs) < BLANK_MODEL_MIN_THRESHOLD:
            msg.fail(text)
        else:
            msg.warn(text)
        msg.text(
            "It's recommended to use at least {} examples (minimum {})".format(
                BLANK_MODEL_THRESHOLD, BLANK_MODEL_MIN_THRESHOLD),
            show=verbose,
        )

    msg.divider("Vocab & Vectors")
    n_words = gold_train_data["n_words"]
    msg.info("{} total {} in the data ({} unique)".format(
        n_words, "word" if n_words == 1 else "words",
        len(gold_train_data["words"])))
    if gold_train_data["n_misaligned_words"] > 0:
        msg.warn("{} misaligned tokens in the training data".format(
            gold_train_data["n_misaligned_words"]))
    if gold_dev_data["n_misaligned_words"] > 0:
        msg.warn("{} misaligned tokens in the dev data".format(
            gold_dev_data["n_misaligned_words"]))
    most_common_words = gold_train_data["words"].most_common(10)
    msg.text(
        "10 most common words: {}".format(
            _format_labels(most_common_words, counts=True)),
        show=verbose,
    )
    if len(nlp.vocab.vectors):
        msg.info("{} vectors ({} unique keys, {} dimensions)".format(
            len(nlp.vocab.vectors),
            nlp.vocab.vectors.n_keys,
            nlp.vocab.vectors_length,
        ))
    else:
        msg.info("No word vectors present in the model")

    if "ner" in pipeline:
        # Get all unique NER labels present in the data
        labels = set(label for label in gold_train_data["ner"]
                     if label not in ("O", "-"))
        label_counts = gold_train_data["ner"]
        model_labels = _get_labels_from_model(nlp, "ner")
        new_labels = [l for l in labels if l not in model_labels]
        existing_labels = [l for l in labels if l in model_labels]
        has_low_data_warning = False
        has_no_neg_warning = False
        has_ws_ents_error = False

        msg.divider("Named Entity Recognition")
        msg.info("{} new {}, {} existing {}".format(
            len(new_labels),
            "label" if len(new_labels) == 1 else "labels",
            len(existing_labels),
            "label" if len(existing_labels) == 1 else "labels",
        ))
        missing_values = label_counts["-"]
        msg.text("{} missing {} (tokens with '-' label)".format(
            missing_values, "value" if missing_values == 1 else "values"))
        if new_labels:
            labels_with_counts = [
                (label, count) for label, count in label_counts.most_common()
                if label != "-"
            ]
            labels_with_counts = _format_labels(labels_with_counts,
                                                counts=True)
            msg.text("New: {}".format(labels_with_counts), show=verbose)
        if existing_labels:
            msg.text("Existing: {}".format(_format_labels(existing_labels)),
                     show=verbose)

        if gold_train_data["ws_ents"]:
            msg.fail("{} invalid whitespace entity spans".format(
                gold_train_data["ws_ents"]))
            has_ws_ents_error = True

        for label in new_labels:
            if label_counts[label] <= NEW_LABEL_THRESHOLD:
                msg.warn(
                    "Low number of examples for new label '{}' ({})".format(
                        label, label_counts[label]))
                has_low_data_warning = True

                # Negative examples are only checked for labels that are
                # already low on data.
                with msg.loading("Analyzing label distribution..."):
                    neg_docs = _get_examples_without_label(train_docs, label)
                if neg_docs == 0:
                    msg.warn(
                        "No examples for texts WITHOUT new label '{}'".format(
                            label))
                    has_no_neg_warning = True

        if not has_low_data_warning:
            msg.good("Good amount of examples for all labels")
        if not has_no_neg_warning:
            msg.good("Examples without occurrences available for all labels")
        if not has_ws_ents_error:
            msg.good(
                "No entities consisting of or starting/ending with whitespace")

        if has_low_data_warning:
            msg.text(
                "To train a new entity type, your data should include at "
                "least {} instances of the new label".format(
                    NEW_LABEL_THRESHOLD),
                show=verbose,
            )
        if has_no_neg_warning:
            msg.text(
                "Training data should always include examples of entities "
                "in context, as well as examples without a given entity "
                "type.",
                show=verbose,
            )
        if has_ws_ents_error:
            msg.text(
                "As of spaCy v2.1.0, entity spans consisting of or starting/ending "
                "with whitespace characters are considered invalid.")

    if "textcat" in pipeline:
        msg.divider("Text Classification")
        labels = [label for label in gold_train_data["textcat"]]
        model_labels = _get_labels_from_model(nlp, "textcat")
        new_labels = [l for l in labels if l not in model_labels]
        existing_labels = [l for l in labels if l in model_labels]
        msg.info("Text Classification: {} new label(s), {} existing label(s)".
                 format(len(new_labels), len(existing_labels)))
        if new_labels:
            labels_with_counts = _format_labels(
                gold_train_data["textcat"].most_common(), counts=True)
            msg.text("New: {}".format(labels_with_counts), show=verbose)
        if existing_labels:
            msg.text("Existing: {}".format(_format_labels(existing_labels)),
                     show=verbose)

    if "tagger" in pipeline:
        msg.divider("Part-of-speech Tagging")
        labels = [label for label in gold_train_data["tags"]]
        tag_map = nlp.Defaults.tag_map
        msg.info("{} {} in data ({} {} in tag map)".format(
            len(labels),
            "label" if len(labels) == 1 else "labels",
            len(tag_map),
            "label" if len(tag_map) == 1 else "labels",
        ))
        labels_with_counts = _format_labels(
            gold_train_data["tags"].most_common(), counts=True)
        msg.text(labels_with_counts, show=verbose)
        non_tagmap = [l for l in labels if l not in tag_map]
        if not non_tagmap:
            msg.good("All labels present in tag map for language '{}'".format(
                nlp.lang))
        for label in non_tagmap:
            msg.fail(
                "Label '{}' not found in tag map for language '{}'".format(
                    label, nlp.lang))

    if "parser" in pipeline:
        msg.divider("Dependency Parsing")

        # Track low-data warnings for the parser independently of the NER
        # section: the flag must exist even when "ner" is not in the pipeline
        # and must not carry over an NER warning into the parser hint below.
        has_low_data_warning = False

        # profile sentence length
        n_sents = gold_train_data["n_sents"]
        # Guard against data without any sentence annotations.
        avg_sent_length = n_words / n_sents if n_sents else 0.0
        msg.info("Found {} sentence{} with an average length of {:.1f} words.".
                 format(
                     n_sents,
                     "" if n_sents == 1 else "s",
                     avg_sent_length))

        # profile labels
        labels_train = [label for label in gold_train_data["deps"]]
        labels_train_unpreprocessed = [
            label for label in gold_train_unpreprocessed_data["deps"]
        ]
        labels_dev = [label for label in gold_dev_data["deps"]]

        if gold_train_unpreprocessed_data["n_nonproj"] > 0:
            msg.info("Found {} nonprojective train sentence{}".format(
                gold_train_unpreprocessed_data["n_nonproj"], "s"
                if gold_train_unpreprocessed_data["n_nonproj"] > 1 else ""))
        if gold_dev_data["n_nonproj"] > 0:
            msg.info("Found {} nonprojective dev sentence{}".format(
                gold_dev_data["n_nonproj"],
                "s" if gold_dev_data["n_nonproj"] > 1 else ""))

        # Pluralize according to the count that is actually printed.
        msg.info("{} {} in train data".format(
            len(labels_train_unpreprocessed),
            "label" if len(labels_train_unpreprocessed) == 1 else "labels"))
        msg.info("{} {} in projectivized train data".format(
            len(labels_train),
            "label" if len(labels_train) == 1 else "labels"))

        labels_with_counts = _format_labels(
            gold_train_unpreprocessed_data["deps"].most_common(), counts=True)
        msg.text(labels_with_counts, show=verbose)

        # rare labels in train
        for label in gold_train_unpreprocessed_data["deps"]:
            if gold_train_unpreprocessed_data["deps"][
                    label] <= DEP_LABEL_THRESHOLD:
                msg.warn("Low number of examples for label '{}' ({})".format(
                    label, gold_train_unpreprocessed_data["deps"][label]))
                has_low_data_warning = True

        # rare labels in projectivized train
        # (only labels containing "||" are collected here)
        rare_projectivized_labels = []
        for label in gold_train_data["deps"]:
            if gold_train_data["deps"][
                    label] <= DEP_LABEL_THRESHOLD and "||" in label:
                rare_projectivized_labels.append("{}: {}".format(
                    label, str(gold_train_data["deps"][label])))

        if len(rare_projectivized_labels) > 0:
            msg.warn(
                "Low number of examples for {} label{} in the "
                "projectivized dependency trees used for training. You may "
                "want to projectivize labels such as punct before "
                "training in order to improve parser performance.".format(
                    len(rare_projectivized_labels),
                    "s" if len(rare_projectivized_labels) > 1 else ""))
            msg.warn("Projectivized labels with low numbers of examples: "
                     "{}".format("\n".join(rare_projectivized_labels)),
                     show=verbose)
            has_low_data_warning = True

        # labels only in train
        if set(labels_train) - set(labels_dev):
            msg.warn("The following labels were found only in the train data: "
                     "{}".format(
                         ", ".join(set(labels_train) - set(labels_dev))),
                     show=verbose)

        # labels only in dev
        if set(labels_dev) - set(labels_train):
            msg.warn("The following labels were found only in the dev data: " +
                     ", ".join(set(labels_dev) - set(labels_train)),
                     show=verbose)

        if has_low_data_warning:
            msg.text(
                "To train a parser, your data should include at "
                "least {} instances of each label.".format(
                    DEP_LABEL_THRESHOLD),
                show=verbose,
            )

        # multiple root labels
        if len(gold_train_unpreprocessed_data["roots"]) > 1:
            msg.warn(
                "Multiple root labels ({}) ".format(", ".join(
                    gold_train_unpreprocessed_data["roots"])) +
                "found in training data. spaCy's parser uses a single root "
                "label ROOT so this distinction will not be available.")

        # these should not happen, but just in case
        if gold_train_data["n_nonproj"] > 0:
            msg.fail(
                "Found {} nonprojective projectivized train sentence{}".format(
                    gold_train_data["n_nonproj"],
                    "s" if gold_train_data["n_nonproj"] > 1 else ""))
        if gold_train_data["n_cycles"] > 0:
            msg.fail(
                "Found {} projectivized train sentence{} with cycles".format(
                    gold_train_data["n_cycles"],
                    "s" if gold_train_data["n_cycles"] > 1 else ""))

    # The summary is built from the message counters collected by the printer.
    msg.divider("Summary")
    good_counts = msg.counts[MESSAGES.GOOD]
    warn_counts = msg.counts[MESSAGES.WARN]
    fail_counts = msg.counts[MESSAGES.FAIL]
    if good_counts:
        msg.good("{} {} passed".format(
            good_counts, "check" if good_counts == 1 else "checks"))
    if warn_counts:
        msg.warn("{} {}".format(warn_counts,
                                "warning" if warn_counts == 1 else "warnings"))
    if fail_counts:
        msg.fail("{} {}".format(fail_counts,
                                "error" if fail_counts == 1 else "errors"))

    # Signal failure to the calling shell if any check failed.
    if fail_counts:
        sys.exit(1)
Esempio n. 15
0
class SectLabelDataset(BaseTextClassification, ClassNursery):
    # NOTE(review): several methods read attributes that are not assigned in
    # this class (``word_instances``, ``word_tokenizer``, ``word_vocab``,
    # ``word_numericalizer``, ``num_instances``, ``label_stats_table``).
    # Presumably a base class provides them - confirm.

    def __init__(
        self,
        filename: str,
        dataset_type: str,
        max_num_words: int,
        max_instance_length: int,
        word_vocab_store_location: str,
        debug: bool = False,
        debug_dataset_proportion: float = 0.1,
        word_embedding_type: Union[str, None] = None,
        word_embedding_dimension: Union[int, None] = None,
        word_start_token: str = "<SOS>",
        word_end_token: str = "<EOS>",
        word_pad_token: str = "<PAD>",
        word_unk_token: str = "<UNK>",
        train_size: float = 0.8,
        test_size: float = 0.2,
        validation_size: float = 0.5,
        word_tokenization_type="vanilla",
    ):
        """ SectLabel Dataset - A logical section classification dataset from WING-NUS

            Parameters
            ----------
            filename : str
                Name of the file where the SectLabel dataset is stored
            dataset_type : str
                Either of `[train, valid, test]` that this dataset represents
            max_num_words : int
                Maximum number of words to be included in the vocab. The top most frequent
                ``max_num_words`` will be included in the vocab. Everything else will be mapped to
                ``word_unk`` tag.
            max_instance_length : int
                The maximum length for every instance
            word_vocab_store_location : str
                The path where the word vocabulary will be stored
            debug : bool
                Is this dataset a debug dataset where a small portion will be used for testing purposes.
            debug_dataset_proportion : float
                The proportion of the dataset that would be used as debug dataset
            word_embedding_type : str
                The embedding type is any of those that are accepted in ``vocab.embedding_loader``
            word_embedding_dimension : int
                Word embedding size. This might depend on the ``embedding_type`` that is used.
            word_start_token : str
                Every instance will be prepended with a ``word_start_token``
            word_end_token : str
                Every instance will be appended with a ``word_end_token``
            word_pad_token : str
                Token used for padding instances
            word_unk_token : str
                If word is not found in the training vocab, then the word
                is replaced with ``word_unk_token``
            train_size : float
                The portion of the dataset that will be used for training
            test_size : float
                The portion of the dataset that will be used for testing
            validation_size : float
                The portion of the dataset that will be used for validation
            word_tokenization_type : str
                The kind of word tokenization. ``tokenizers.word_tokenizer`` has more information

            Notes
            -----
            The vocabulary, embedding, special-token and tokenization
            parameters are accepted but not referenced in this constructor
            body.
            """
        self.classname2idx = self.get_classname2idx()
        self.idx2classname = {
            idx: classname
            for classname, idx in self.classname2idx.items()
        }
        self.filename = filename
        self.train_size = train_size
        self.test_size = test_size
        self.validation_size = validation_size
        self.dataset_type = dataset_type
        self.debug = debug
        self.debug_dataset_proportion = debug_dataset_proportion
        self.max_instance_length = max_instance_length
        # ``get_lines_labels`` reads ``dataset_type``, ``debug`` and
        # ``debug_dataset_proportion``, so it must run after they are set.
        self.lines, self.labels = self.get_lines_labels(filename=self.filename)

        self.msg_printer = Printer()

    def __len__(self) -> int:
        # NOTE(review): ``__getitem__`` indexes ``self.lines`` while the
        # length comes from ``self.word_instances`` - confirm both always
        # have the same size.
        return len(self.word_instances)

    def __getitem__(self, idx) -> Dict[str, Any]:
        """Return the tensor dictionary for the line and label at ``idx``."""
        line = self.lines[idx]
        label = self.labels[idx]

        return self.get_iter_dict(lines=line, labels=label)

    def get_lines_labels(self, filename: str) -> (List[str], List[str]):
        """ Read the SectLabel file and return the lines and labels of the
        split named by ``self.dataset_type``

        Parameters
        ----------
        filename : str
            File that is converted with ``convert_sectlabel_to_json``

        Returns
        -------
        (List[str], List[str])
            The texts and their labels. If ``self.dataset_type`` is none of
            ``train``, ``valid`` or ``test`` the unsplit data is returned.
            In debug mode a random sample (drawn with replacement, fixed
            seed) of the selected split is returned instead.
        """
        parsect_json = convert_sectlabel_to_json(filename)["parse_sect"]
        texts = [line_json["text"] for line_json in parsect_json]
        labels = [line_json["label"] for line_json in parsect_json]

        (train_lines, train_labels), (validation_lines, validation_labels), (
            test_lines,
            test_labels,
        ) = self.get_train_valid_test_stratified_split(texts, labels,
                                                       self.classname2idx)

        if self.dataset_type == "train":
            texts = train_lines
            labels = train_labels
        elif self.dataset_type == "valid":
            texts = validation_lines
            labels = validation_labels
        elif self.dataset_type == "test":
            texts = test_lines
            labels = test_labels

        if self.debug:
            # randomly sample `self.debug_dataset_proportion` samples and return
            num_text = len(texts)
            np.random.seed(1729)  # so we can debug deterministically
            num_samples = int(self.debug_dataset_proportion * num_text)
            if num_text > 0:
                # ``high`` is exclusive in ``np.random.randint``; passing
                # ``num_text`` keeps the last item reachable and also works
                # for a split with a single item.
                random_ints = list(
                    np.random.randint(0, num_text, size=num_samples))
            else:
                # Nothing to sample from an empty split.
                random_ints = []
            texts = [texts[random_int] for random_int in random_ints]
            labels = [labels[random_int] for random_int in random_ints]

        return texts, labels

    @classmethod
    def get_classname2idx(cls) -> Dict[str, int]:
        """Return the mapping from class name to class index. The index is
        the position of the name in the list below."""
        categories = [
            "address",
            "affiliation",
            "author",
            "bodyText",
            "category",
            "construct",
            "copyright",
            "email",
            "equation",
            "figure",
            "figureCaption",
            "footnote",
            "keyword",
            "listItem",
            "note",
            "page",
            "reference",
            "sectionHeader",
            "subsectionHeader",
            "subsubsectionHeader",
            "tableCaption",
            "table",
            "title",
        ]
        return {word: idx for idx, word in enumerate(categories)}

    def get_num_classes(self) -> int:
        """Return the number of classes known to this dataset."""
        return len(self.classname2idx)

    def get_class_names_from_indices(self, indices: List) -> List[str]:
        """ Given a list of indices maps back to classnames
        Mostly used for inference and other higher level applications

        Parameters
        ----------
        indices : List[int]
            A list of indices where every index is in ``[0, num_classes)``

        Returns
        -------
        List[str]
            The class name for every index, in the same order

        """
        return [self.idx2classname[idx] for idx in indices]

    def print_stats(self):
        """Print the label statistics table and the number of instances."""
        num_instances = self.num_instances
        formatted = self.label_stats_table
        self.msg_printer.divider("Stats for {0} dataset".format(
            self.dataset_type))
        print(formatted)
        self.msg_printer.info(
            f"Number of instances in {self.dataset_type} dataset - {num_instances}"
        )

    @classmethod
    def emits_keys(cls):
        """Describe the keys of the dictionary built by ``get_iter_dict``.

        Declared as a classmethod (the first parameter is ``cls``) so that it
        can be called on the class as well as on an instance.
        """
        return {
            "tokens":
            "A torch.LongTensor of size `max_length`. "
            "Example [0, 0, 1, 100] where every number represents an index in the vocab",
            "len_tokens":
            "A torch.LongTensor. "
            "Example [2] representing the number of tokens without padding",
            "label":
            "A torch.LongTensor representing the label corresponding to the "
            "instance. Example [2] representing class 2",
            "instance":
            "A string that is padded to ``max_length``.",
            "raw_instance":
            "A string that is not padded",
        }

    def get_iter_dict(
        self,
        lines: Union[List[str], str],
        labels: Optional[Union[str, List[str]]] = None,
    ):
        """ Tokenize, pad and numericalize ``lines`` and return them as
        tensors together with their string forms

        Parameters
        ----------
        lines : Union[List[str], str]
            One line or a list of lines
        labels : Optional[Union[str, List[str]]]
            The class name(s) of the line(s). If given, the class indices
            are added to the result under ``label``

        Returns
        -------
        Dict[str, Any]
            A dictionary with the keys ``tokens``, ``len_tokens``,
            ``instance``, ``raw_instance`` and, if labels are given,
            ``label``
        """
        if isinstance(lines, str):
            lines = [lines]

        word_instances = self.word_tokenizer.tokenize_batch(lines)
        len_instances = [len(instance) for instance in word_instances]
        classnames2idx = SectLabelDataset.get_classname2idx()

        padded_instances = [
            pack_to_length(
                tokenized_text=word_instance,
                max_length=self.max_instance_length,
                pad_token=self.word_vocab.pad_token,
                add_start_end_token=True,
                start_token=self.word_vocab.start_token,
                end_token=self.word_vocab.end_token,
            ) for word_instance in word_instances
        ]

        tokens = self.word_numericalizer.numericalize_batch_instances(
            padded_instances)
        tokens = torch.LongTensor(tokens)
        # Drop the leading dimension when it has size one.
        tokens = tokens.squeeze(0)

        instances = [" ".join(instance) for instance in padded_instances]
        raw_instances = [" ".join(instance) for instance in word_instances]

        len_tokens = torch.LongTensor(len_instances)

        # A single line is returned as a plain string instead of a list with
        # one element.
        if len(instances) == 1:
            instances = instances[0]
            raw_instances = raw_instances[0]

        instance_dict = {
            "tokens": tokens,
            "len_tokens": len_tokens,
            "instance": instances,
            "raw_instance": raw_instances,
        }

        if labels is not None:
            if isinstance(labels, str):
                labels = [labels]

            labels = [classnames2idx[label] for label in labels]
            label = torch.LongTensor(labels)
            instance_dict["label"] = label

        return instance_dict
Esempio n. 16
0
def validate():
    """
    Check the installed models against the compatibility table of the
    currently installed spaCy version and print the results. Should be run
    after `pip install -U spacy`. Exits with status 1 if incompatible models
    or links are found.
    """
    msg = Printer()
    with msg.loading("Loading compatibility table..."):
        response = requests.get(about.__compatibility__)
        if response.status_code != 200:
            msg.fail(
                "Server error ({})".format(response.status_code),
                "Couldn't fetch compatibility table.",
                exits=1,
            )
    msg.good("Loaded compatibility table")
    compat = response.json()["spacy"]
    # Look the version up without a ".dev" suffix
    version = about.__version__.rsplit(".dev", 1)[0]
    current_compat = compat.get(version)
    if not current_compat:
        msg.fail(
            "Can't find spaCy v{} in compatibility table".format(version),
            about.__compatibility__,
            exits=1,
        )

    # Collect every known model name and reformat the version lists in place
    # (current_compat refers to one of these dicts, so it is updated as well)
    all_models = set()
    for models in list(compat.values()):
        all_models.update(models)
        for model in models:
            models[model] = [reformat_version(v) for v in models[model]]

    model_links = get_model_links(current_compat)
    model_pkgs = get_model_pkgs(current_compat, all_models)

    incompat_links = set()
    incompat_models = set()
    for pkg_data in model_pkgs.values():
        if not pkg_data["compat"]:
            incompat_models.add(pkg_data["name"])
    for link_name, link_data in model_links.items():
        if not link_data["compat"]:
            incompat_links.add(link_name)
            incompat_models.add(link_data["name"])

    # Incompatible models listed for this version can be updated, the others
    # are not available for it
    na_models = []
    update_models = []
    for model_name in incompat_models:
        if model_name in current_compat:
            update_models.append(model_name)
        else:
            na_models.append(model_name)

    spacy_dir = Path(__file__).parent.parent

    msg.divider("Installed models (spaCy v{})".format(about.__version__))
    msg.info("spaCy installation: {}".format(path2str(spacy_dir)))

    if not (model_links or model_pkgs):
        msg.text("No models found in your current environment.", exits=0)
    else:
        header = ("TYPE", "NAME", "MODEL", "VERSION", "")
        rows = [
            get_model_row(current_compat, name, data, msg)
            for name, data in model_pkgs.items()
        ]
        rows.extend(
            get_model_row(current_compat, name, data, msg, "link")
            for name, data in model_links.items()
        )
        msg.table(rows, header=header)

    if update_models:
        msg.divider("Install updates")
        msg.text("Use the following commands to update the model packages:")
        cmd = "python -m spacy download {}"
        commands = [cmd.format(pkg) for pkg in update_models]
        print("\n".join(commands) + "\n")
    if na_models:
        msg.text(
            "The following models are not available for spaCy "
            "v{}: {}".format(about.__version__, ", ".join(na_models))
        )
    if incompat_links:
        msg.text(
            "You may also want to overwrite the incompatible links using the "
            "`python -m spacy link` command with `--force`, or remove them "
            "from the data directory. "
            "Data path: {path}".format(path=path2str(get_data_path()))
        )
    if incompat_models or incompat_links:
        sys.exit(1)
Esempio n. 17
0
def create_index(
    model: str,
    kb_dir: Path,
    output_dir: Path,
    new_model_name: str = "ann_linker",
    cg_threshold: float = 0.8,
    n_iter: int = 5,
    verbose: bool = True,
):

    """Create an AnnLinker based on the Character N-Gram
    TF-IDF vectors for aliases in a KnowledgeBase

    model (str): spaCy language model directory or name to load
    kb_dir (Path): path to the directory with kb entities.jsonl and aliases.jsonl files
        (a plain string is accepted as well)
    output_dir (Path): path to output_dir for spaCy model with ann_linker pipe
        (a plain string is accepted as well). The model is written to
        `output_dir / new_model_name`.
    new_model_name (str): name stored in the model meta and used as the
        output sub-directory
    cg_threshold (float): currently not used by this function
    n_iter (int): currently not used by this function
    verbose (bool): show the loading animations and let the candidate
        generator report its progress


    kb File Formats
    
    e.g. entities.jsonl

    {"id": "a1", "description": "Machine learning (ML) is the scientific study of algorithms and statistical models..."}
    {"id": "a2", "description": "ML (\"Meta Language\") is a general-purpose functional programming language. It has roots in Lisp, and has been characterized as \"Lisp with types\"."}

    e.g. aliases.jsonl
    {"alias": "ML", "entities": ["a1", "a2"], "probabilities": [0.5, 0.5]}
    """
    msg = Printer(hide_animation=not verbose)

    msg.divider("Load Model")
    with msg.loading(f"Loading model {model}"):
        nlp = spacy.load(model)
        msg.good("Done.")

    # Normalize the paths so that plain strings work with the `/` operator.
    kb_dir = Path(kb_dir)
    if output_dir is not None:
        output_dir = Path(output_dir) / new_model_name
        output_dir.mkdir(parents=True, exist_ok=True)

    entities = list(srsly.read_jsonl(kb_dir / "entities.jsonl"))
    aliases = list(srsly.read_jsonl(kb_dir / "aliases.jsonl"))
    kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=INPUT_DIM)

    # set up the data
    entity_ids = [e["id"] for e in entities]
    descriptions = [e.get("description", "") for e in entities]
    # Every entity gets the same frequency.
    freqs = [100] * len(entities)

    msg.divider("Apply EntityEncoder")

    with msg.loading("Applying EntityEncoder to descriptions"):
        # The entity vectors are the document vectors of the descriptions;
        # no separate entity encoder is trained here.
        embeddings = [nlp.make_doc(desc).vector for desc in descriptions]
        msg.good("Finished, embeddings created")

    with msg.loading("Setting kb entities and aliases"):
        # set the entities, skipping ids that were already added
        for entity, freq, embedding in zip(entity_ids, freqs, embeddings):
            if not kb.contains_entity(entity):
                kb.add_entity(entity, freq, embedding)

        for a in aliases:
            # Only keep entities known to the kb and give them a uniform
            # prior; the probabilities from the file are not used.
            ents = [e for e in a["entities"] if kb.contains_entity(e)]
            n_ents = len(ents)
            if n_ents > 0:
                prior_prob = [1.0 / n_ents] * n_ents
                kb.add_alias(alias=a["alias"], entities=ents, probabilities=prior_prob)

        msg.good("Done adding entities and aliases to kb")

    msg.divider("Create ANN Index")

    # Honor the caller's verbosity setting instead of always being verbose.
    cg = CandidateGenerator().fit(kb.get_alias_strings(), verbose=verbose)

    ann_linker = nlp.create_pipe("ann_linker")
    ann_linker.set_kb(kb)
    ann_linker.set_cg(cg)

    nlp.add_pipe(ann_linker, last=True)

    nlp.meta["name"] = new_model_name
    nlp.to_disk(output_dir)
    # Load the saved model back from disk.
    nlp.from_disk(output_dir)
Esempio n. 18
0
def debug_data(
    lang,
    train_path,
    dev_path,
    base_model=None,
    pipeline="tagger,parser,ner",
    ignore_warnings=False,
    ignore_validation=False,
    verbose=False,
    no_format=False,
):
    """Analyze training and development data and print diagnostics.

    Loads both data files, builds a GoldCorpus from them and prints general
    corpus statistics followed by one section for each of "ner", "textcat",
    "tagger" and "parser" that is listed in `pipeline`. A final summary
    reports how many good/warning/fail messages were printed, and the process
    exits with status 1 if at least one fail message was emitted.

    lang: Language code, used to create a blank model when no `base_model`
        is given.
    train_path: Location of the training data (must provide `.exists()`).
    dev_path: Location of the development data (must provide `.exists()`).
    base_model: Optional model to load instead of a blank language class.
    pipeline (str): Comma-separated names of the components to analyze.
    ignore_warnings (bool): Forwarded to the Printer.
    ignore_validation (bool): Don't exit when data validation errors exist.
    verbose (bool): Also show the additional detail messages.
    no_format (bool): Disable pretty-printing of the output.
    """
    msg = Printer(pretty=not no_format, ignore_warnings=ignore_warnings)

    # Make sure all files and paths exists if they are needed
    if not train_path.exists():
        msg.fail("Training data not found", train_path, exits=1)
    if not dev_path.exists():
        msg.fail("Development data not found", dev_path, exits=1)

    # Initialize the model and pipeline
    pipeline = [p.strip() for p in pipeline.split(",")]
    if base_model:
        nlp = load_model(base_model)
    else:
        lang_cls = get_lang_class(lang)
        nlp = lang_cls()

    msg.divider("Data format validation")
    # Load the data in one go - might take a while but okay in this case
    train_data = _load_file(train_path, msg)
    dev_data = _load_file(dev_path, msg)

    # Validate data format using the JSON schema
    # TODO: update once the new format is ready
    # Both error lists are currently always empty, so the two "valid" messages
    # below are always printed and the failure branches are never taken.
    train_data_errors = []  # TODO: validate_json
    dev_data_errors = []  # TODO: validate_json
    if not train_data_errors:
        msg.good("Training data JSON format is valid")
    if not dev_data_errors:
        msg.good("Development data JSON format is valid")
    for error in train_data_errors:
        msg.fail("Training data: {}".format(error))
    for error in dev_data_errors:
        # NOTE(review): "Develoment" in the message below is a typo
        msg.fail("Develoment data: {}".format(error))
    if (train_data_errors or dev_data_errors) and not ignore_validation:
        sys.exit(1)

    # Create the gold corpus to be able to better analyze data
    with msg.loading("Analyzing corpus..."):
        train_data = read_json_object(train_data)
        dev_data = read_json_object(dev_data)
        corpus = GoldCorpus(train_data, dev_data)
        train_docs = list(corpus.train_docs(nlp))
        dev_docs = list(corpus.dev_docs(nlp))
    msg.good("Corpus is loadable")

    # Create all gold data here to avoid iterating over the train_docs constantly
    gold_data = _compile_gold(train_docs, pipeline)
    train_texts = gold_data["texts"]
    dev_texts = set([doc.text for doc, gold in dev_docs])

    msg.divider("Training stats")
    msg.text("Training pipeline: {}".format(", ".join(pipeline)))
    # Every requested component that the nlp object has no factory for is
    # reported as a failure (and therefore counts towards the final summary)
    for pipe in [p for p in pipeline if p not in nlp.factories]:
        msg.fail("Pipeline component '{}' not available in factories".format(pipe))
    if base_model:
        msg.text("Starting with base model '{}'".format(base_model))
    else:
        msg.text("Starting with blank model '{}'".format(lang))
    msg.text("{} training docs".format(len(train_docs)))
    msg.text("{} evaluation docs".format(len(dev_docs)))

    # Texts that occur in both the training and the evaluation data
    overlap = len(train_texts.intersection(dev_texts))
    if overlap:
        msg.warn("{} training examples also in evaluation data".format(overlap))
    else:
        msg.good("No overlap between training and evaluation data")
    # Only blank models are checked against the example-count thresholds:
    # below the minimum threshold is a failure, otherwise only a warning
    if not base_model and len(train_docs) < BLANK_MODEL_THRESHOLD:
        text = "Low number of examples to train from a blank model ({})".format(
            len(train_docs)
        )
        if len(train_docs) < BLANK_MODEL_MIN_THRESHOLD:
            msg.fail(text)
        else:
            msg.warn(text)
        msg.text(
            "It's recommended to use at least {} examples (minimum {})".format(
                BLANK_MODEL_THRESHOLD, BLANK_MODEL_MIN_THRESHOLD
            ),
            show=verbose,
        )

    msg.divider("Vocab & Vectors")
    n_words = gold_data["n_words"]
    msg.info(
        "{} total {} in the data ({} unique)".format(
            n_words, "word" if n_words == 1 else "words", len(gold_data["words"])
        )
    )
    most_common_words = gold_data["words"].most_common(10)
    msg.text(
        "10 most common words: {}".format(
            _format_labels(most_common_words, counts=True)
        ),
        show=verbose,
    )
    if len(nlp.vocab.vectors):
        msg.info(
            "{} vectors ({} unique keys, {} dimensions)".format(
                len(nlp.vocab.vectors),
                nlp.vocab.vectors.n_keys,
                nlp.vocab.vectors_length,
            )
        )
    else:
        msg.info("No word vectors present in the model")

    if "ner" in pipeline:
        # Get all unique NER labels present in the data
        labels = set(label for label in gold_data["ner"] if label not in ("O", "-"))
        label_counts = gold_data["ner"]
        model_labels = _get_labels_from_model(nlp, "ner")
        new_labels = [l for l in labels if l not in model_labels]
        existing_labels = [l for l in labels if l in model_labels]
        # Flags remember which problems were reported so that the matching
        # "all good" message or explanation can be printed afterwards
        has_low_data_warning = False
        has_no_neg_warning = False
        has_ws_ents_error = False

        msg.divider("Named Entity Recognition")
        msg.info(
            "{} new {}, {} existing {}".format(
                len(new_labels),
                "label" if len(new_labels) == 1 else "labels",
                len(existing_labels),
                "label" if len(existing_labels) == 1 else "labels",
            )
        )
        missing_values = label_counts["-"]
        msg.text(
            "{} missing {} (tokens with '-' label)".format(
                missing_values, "value" if missing_values == 1 else "values"
            )
        )
        if new_labels:
            # Lists every counted label except "-", not only the new ones
            labels_with_counts = [
                (label, count)
                for label, count in label_counts.most_common()
                if label != "-"
            ]
            labels_with_counts = _format_labels(labels_with_counts, counts=True)
            msg.text("New: {}".format(labels_with_counts), show=verbose)
        if existing_labels:
            msg.text(
                "Existing: {}".format(_format_labels(existing_labels)), show=verbose
            )

        if gold_data["ws_ents"]:
            msg.fail("{} invalid whitespace entity spans".format(gold_data["ws_ents"]))
            has_ws_ents_error = True

        for label in new_labels:
            if label_counts[label] <= NEW_LABEL_THRESHOLD:
                msg.warn(
                    "Low number of examples for new label '{}' ({})".format(
                        label, label_counts[label]
                    )
                )
                has_low_data_warning = True

                # The negative-example check only runs for labels that were
                # already flagged as having too few examples
                with msg.loading("Analyzing label distribution..."):
                    neg_docs = _get_examples_without_label(train_docs, label)
                if neg_docs == 0:
                    msg.warn(
                        "No examples for texts WITHOUT new label '{}'".format(label)
                    )
                    has_no_neg_warning = True

        if not has_low_data_warning:
            msg.good("Good amount of examples for all labels")
        if not has_no_neg_warning:
            # NOTE(review): "occurences" in the message below is a typo
            msg.good("Examples without occurences available for all labels")
        if not has_ws_ents_error:
            msg.good("No entities consisting of or starting/ending with whitespace")

        if has_low_data_warning:
            # NOTE(review): "insteances" in the message below is a typo
            msg.text(
                "To train a new entity type, your data should include at "
                "least {} insteances of the new label".format(NEW_LABEL_THRESHOLD),
                show=verbose,
            )
        if has_no_neg_warning:
            msg.text(
                "Training data should always include examples of entities "
                "in context, as well as examples without a given entity "
                "type.",
                show=verbose,
            )
        if has_ws_ents_error:
            msg.text(
                "As of spaCy v2.1.0, entity spans consisting of or starting/ending "
                "with whitespace characters are considered invalid."
            )

    if "textcat" in pipeline:
        msg.divider("Text Classification")
        labels = [label for label in gold_data["textcat"]]
        model_labels = _get_labels_from_model(nlp, "textcat")
        new_labels = [l for l in labels if l not in model_labels]
        existing_labels = [l for l in labels if l in model_labels]
        msg.info(
            "Text Classification: {} new label(s), {} existing label(s)".format(
                len(new_labels), len(existing_labels)
            )
        )
        if new_labels:
            labels_with_counts = _format_labels(
                gold_data["textcat"].most_common(), counts=True
            )
            msg.text("New: {}".format(labels_with_counts), show=verbose)
        if existing_labels:
            msg.text(
                "Existing: {}".format(_format_labels(existing_labels)), show=verbose
            )

    if "tagger" in pipeline:
        msg.divider("Part-of-speech Tagging")
        labels = [label for label in gold_data["tags"]]
        tag_map = nlp.Defaults.tag_map
        msg.info(
            "{} {} in data ({} {} in tag map)".format(
                len(labels),
                "label" if len(labels) == 1 else "labels",
                len(tag_map),
                "label" if len(tag_map) == 1 else "labels",
            )
        )
        labels_with_counts = _format_labels(
            gold_data["tags"].most_common(), counts=True
        )
        msg.text(labels_with_counts, show=verbose)
        # Tags used in the data that the language's tag map does not contain
        non_tagmap = [l for l in labels if l not in tag_map]
        if not non_tagmap:
            msg.good("All labels present in tag map for language '{}'".format(nlp.lang))
        for label in non_tagmap:
            msg.fail(
                "Label '{}' not found in tag map for language '{}'".format(
                    label, nlp.lang
                )
            )

    if "parser" in pipeline:
        msg.divider("Dependency Parsing")
        labels = [label for label in gold_data["deps"]]
        msg.info(
            "{} {} in data".format(
                len(labels), "label" if len(labels) == 1 else "labels"
            )
        )
        labels_with_counts = _format_labels(
            gold_data["deps"].most_common(), counts=True
        )
        msg.text(labels_with_counts, show=verbose)

    msg.divider("Summary")
    # The counts are read before the summary lines themselves are printed, so
    # they reflect only the messages emitted by the checks above
    good_counts = msg.counts[MESSAGES.GOOD]
    warn_counts = msg.counts[MESSAGES.WARN]
    fail_counts = msg.counts[MESSAGES.FAIL]
    if good_counts:
        msg.good(
            "{} {} passed".format(
                good_counts, "check" if good_counts == 1 else "checks"
            )
        )
    if warn_counts:
        msg.warn(
            "{} {}".format(warn_counts, "warning" if warn_counts == 1 else "warnings")
        )
    if fail_counts:
        msg.fail("{} {}".format(fail_counts, "error" if fail_counts == 1 else "errors"))

    # Any recorded failure makes the command exit with a non-zero status
    if fail_counts:
        sys.exit(1)
def pretrain(
    texts_loc,
    vectors_model,
    output_dir,
    width=96,
    depth=4,
    embed_rows=2000,
    loss_func="cosine",
    use_vectors=False,
    dropout=0.2,
    n_iter=1000,
    batch_size=3000,
    max_length=500,
    min_length=5,
    seed=0,
    n_save_every=None,
):
    """
    Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components
    with an approximate language-modelling objective: pre-trained vectors are
    loaded, and a tok2vec component is trained to predict vectors that match
    them. The weights are written to the output directory after every epoch,
    and a path to one of these weights files can later be passed to the
    'spacy train' command.

    The technique may be especially useful when little labelled data is
    available, but it is still experimental, so your mileage may vary.

    To load the weights back in during 'spacy train', all settings have to be
    the same between pretraining and training. The API and errors around this
    need some improvement.

    Texts are read as JSONL, either from the file at `texts_loc` or from stdin
    when `texts_loc` is "-". The call arguments are saved to "config.json" and
    one log record per saved model is appended to "log.jsonl", both in
    `output_dir`.
    """
    # Must stay the first statement: snapshots the call arguments before any
    # other local name exists.
    config = dict(locals())
    printer = Printer()
    util.fix_random_seed(seed)

    printer.info("Using GPU" if prefer_gpu() else "Not using GPU")

    output_dir = Path(output_dir)
    if not output_dir.exists():
        output_dir.mkdir()
        printer.good("Created output directory")
    srsly.write_json(output_dir / "config.json", config)
    printer.good("Saved settings to config.json")

    # Decide once where the texts come from; the later checks reuse this flag
    from_stdin = texts_loc == "-"
    if from_stdin:
        printer.text("Reading input text from stdin...")
        texts = srsly.read_jsonl("-")
    else:
        texts_loc = Path(texts_loc)
        if not texts_loc.exists():
            printer.fail("Input text file doesn't exist", texts_loc, exits=1)
        with printer.loading("Loading input texts..."):
            texts = list(srsly.read_jsonl(texts_loc))
        printer.good("Loaded input texts")
        random.shuffle(texts)

    with printer.loading("Loading model '{}'...".format(vectors_model)):
        nlp = util.load_model(vectors_model)
    printer.good("Loaded model '{}'".format(vectors_model))

    pretrained_vectors = nlp.vocab.vectors.name if use_vectors else None
    tok2vec = Tok2Vec(
        width,
        embed_rows,
        conv_depth=depth,
        pretrained_vectors=pretrained_vectors,
        bilstm_depth=0,  # Requires PyTorch. Experimental.
        cnn_maxout_pieces=3,  # You can try setting this higher
        subword_features=True,  # Set to False for Chinese etc
    )
    model = create_pretraining_model(nlp, tok2vec)
    optimizer = create_default_optimizer(model.ops)
    tracker = ProgressTracker(frequency=10000)

    printer.divider("Pre-training tok2vec layer")
    row_settings = dict(
        widths=(3, 10, 10, 6, 4),
        aligns=("r", "r", "r", "r", "r"),
    )
    printer.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings)

    def _save_model(epoch, is_temp=False):
        # Write the tok2vec weights using the optimizer's averaged parameters
        # and append a record of the tracker state to the shared log file.
        suffix = ".temp" if is_temp else ""
        weights_path = output_dir / ("model%d%s.bin" % (epoch, suffix))
        with model.use_params(optimizer.averages):
            with weights_path.open("wb") as weights_file:
                weights_file.write(model.tok2vec.to_bytes())
            record = {
                "nr_word": tracker.nr_word,
                "loss": tracker.loss,
                "epoch_loss": tracker.epoch_loss,
                "epoch": epoch,
            }
            with (output_dir / "log.jsonl").open("a") as log_file:
                log_file.write(srsly.json_dumps(record) + "\n")

    for epoch in range(n_iter):
        batches = util.minibatch_by_words(
            ((text, None) for text in texts), size=batch_size
        )
        for batch_id, batch in enumerate(batches):
            batch_texts = [text for (text, _) in batch]
            docs = make_docs(
                nlp, batch_texts, max_length=max_length, min_length=min_length
            )
            loss = make_update(
                model, docs, optimizer, objective=loss_func, drop=dropout
            )
            progress = tracker.update(epoch, loss, docs)
            if progress:
                printer.row(progress, **row_settings)
                # Stdin has no natural epoch boundary, so an epoch is cut off
                # once enough words have been seen.
                if from_stdin and tracker.words_per_epoch[epoch] >= 10**7:
                    break
            if n_save_every and (batch_id % n_save_every == 0):
                _save_model(epoch, is_temp=True)
        _save_model(epoch)
        tracker.epoch_loss = 0.0
        if not from_stdin:
            # Reshuffle the texts if texts were loaded from a file
            random.shuffle(texts)
Esempio n. 20
0
class Engine(ClassNursery):
    def __init__(
        self,
        model: nn.Module,
        datasets_manager: DatasetsManager,
        optimizer: optim,
        batch_size: int,
        save_dir: str,
        num_epochs: int,
        save_every: int,
        log_train_metrics_every: int,
        train_metric: BaseMetric,
        validation_metric: BaseMetric,
        test_metric: BaseMetric,
        experiment_name: Optional[str] = None,
        experiment_hyperparams: Optional[Dict[str, Any]] = None,
        tensorboard_logdir: Optional[str] = None,
        track_for_best: str = "loss",
        collate_fn=list,
        device: Union[torch.device, str] = torch.device("cpu"),
        gradient_norm_clip_value: Optional[float] = 5.0,
        lr_scheduler: Optional[torch.optim.lr_scheduler._LRScheduler] = None,
        use_wandb: bool = False,
        sample_proportion: float = 1.0,
        seeds: Optional[Dict[str, int]] = None,
    ):
        """ Engine runs the models end to end. It iterates through the train dataset and passes
        it through the model. During training it helps in tracking a lot of parameters for the run
        and saving the parameters. It also reports validation and test parameters from time to time.
        Many utilities required for end-end running of the model are here.

        Parameters
        ----------
        model : nn.Module
            A pytorch module defining a model to be run
        datasets_manager : DatasetsManager
            A datasets manager that handles all the different datasets
        optimizer : torch.optim
            Any Optimizer object instantiated using  ``torch.optim``
        batch_size : int
            Batch size for the dataset. The same batch size is used for ``train``, ``valid``
            and ``test`` dataset
        save_dir : str
            The experiments are saved in ``save_dir``. We save checkpoints, the best model,
            logs and other information into the save dir. The directory is created
            (including parents) if it does not exist
        num_epochs : int
            The number of epochs to run the training
        save_every : int
            The model will be checkpointed every ``save_every`` number of epochs
        log_train_metrics_every : int
            The train metrics will be reported every ``log_train_metrics_every`` iterations
            during training
        train_metric : BaseMetric
            Anything that is an instance of ``BaseMetric`` for calculating training metrics
        validation_metric : BaseMetric
            Anything that is an instance of ``BaseMetric`` for calculating validation metrics
        test_metric : BaseMetric
            Anything that is an instance of ``BaseMetric`` for calculating test metrics
        experiment_name : str
            The experiment should be given a name for ease of tracking. If the experiment
            name is not given, we generate a 10 character sha1 prefix from the current time.
        experiment_hyperparams : Dict[str, Any]
            This is mostly used for tracking the different hyper-params of the experiment
            being run. This may be used by ``wandb`` to save the hyper-params. They are
            also written to ``hyperparams.json`` in ``save_dir``
        tensorboard_logdir : str
            The directory where all the tensorboard runs are stored. If ``None`` is passed
            then it defaults to the tensorboard default of storing the log in the current directory.
        track_for_best : str
            Which metric should be tracked for deciding the best model?. Anything that
            the metric emits and is a single value can be used for tracking. The default value
            is ``loss``. If its loss, then the best value will be the lowest one. For some
            other metrics like ``macro_fscore``, the best metric might be the one that has the highest
            value
        collate_fn : Callable[[List[Any]], List[Any]]
            Collates the different examples into a single batch of examples.
            This is the same terminology adopted from ``pytorch``.
        device : torch.device
            The device on which the model will be placed. If this is "cpu", then the model
            and the tensors will all be on cpu. If this is "cuda:0", then the model and
            the tensors will be placed on cuda device 0. You can mention any other cuda
            device that is suitable for your environment
        gradient_norm_clip_value : float
            To avoid gradient explosion, the gradients of the norm will be clipped
            if the gradient norm exceeds this value
        lr_scheduler : torch.optim.lr_scheduler
            Any pytorch ``lr_scheduler`` can be used for reducing the learning rate
            if the performance on the validation set reduces.
        use_wandb : bool
            wandb or weights and biases is a tool that is used to track experiments
            online. Sciwing comes with inbuilt functionality to track experiments
            on weights and biases
        sample_proportion : float
            Proportion of every dataset that is sampled when the data loaders are built
        seeds: Dict[str, int]
            The dict of seeds to be set.
            Set the random_seed, pytorch_seed and numpy_seed
            Found in
            https://github.com/allenai/allennlp/blob/master/allennlp/common/util.py
        """

        if isinstance(device, str):
            device = torch.device(device)

        if seeds is None:
            seeds = {}
        self.seeds = seeds

        self._set_seeds()

        self.model = model
        self.datasets_manager = datasets_manager
        self.train_dataset = self.datasets_manager.train_dataset
        self.validation_dataset = self.datasets_manager.dev_dataset
        self.test_dataset = self.datasets_manager.test_dataset
        self.optimizer = optimizer
        self.batch_size = batch_size
        self.save_dir = pathlib.Path(save_dir)
        self.num_epochs = num_epochs
        self.msg_printer = Printer()
        self.save_every = save_every
        self.log_train_metrics_every = log_train_metrics_every
        self.tensorboard_logdir = tensorboard_logdir
        self.train_metric_calc = train_metric
        self.validation_metric_calc = validation_metric
        self.test_metric_calc = test_metric
        self.summaryWriter = SummaryWriter(log_dir=tensorboard_logdir)
        self.track_for_best = track_for_best
        self.collate_fn = collate_fn
        self.device = device
        # start from the neutral best value for the tracked metric
        self.best_track_value = None
        self.set_best_track_value(self.best_track_value)
        self.gradient_norm_clip_value = gradient_norm_clip_value
        self.lr_scheduler = lr_scheduler
        self.lr_scheduler_is_plateau = isinstance(
            self.lr_scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau)
        # falsy when the module-level ``wandb`` name is falsy, whatever the flag says
        self.use_wandb = wandb and use_wandb
        self.sample_proportion = sample_proportion
        self.label_namespaces = self.datasets_manager.label_namespaces
        self.datasets_manager.print_stats()

        if experiment_name is None:
            hash_ = hashlib.sha1()
            hash_.update(str(time.time()).encode("utf-8"))
            digest = hash_.hexdigest()
            experiment_name = digest[:10]

        self.experiment_name = experiment_name
        self.experiment_hyperparams = experiment_hyperparams or {}

        if self.use_wandb:
            wandb.init(
                project="project-scwing",
                name=self.experiment_name,
                config=self.experiment_hyperparams,
            )

        if not self.save_dir.is_dir():
            self.save_dir.mkdir(parents=True)

        with open(self.save_dir.joinpath("hyperparams.json"), "w") as fp:
            json.dump(self.experiment_hyperparams, fp)

        self.num_workers = 1
        self.model.to(self.device)

        self.train_loader = self.get_loader(self.train_dataset)
        self.validation_loader = self.get_loader(self.validation_dataset)
        self.test_loader = self.get_loader(self.test_dataset)

        # refresh the iters at the beginning of every epoch
        self.train_iter = None
        self.validation_iter = None
        self.test_iter = None

        # initializing loss meters
        self.train_loss_meter = LossMeter()
        self.validation_loss_meter = LossMeter()

        self.msg_printer.divider("ENGINE STARTING")
        time.sleep(3)

        # get the loggers ready
        self.train_log_filename = self.save_dir.joinpath("train.log")
        self.validation_log_filename = self.save_dir.joinpath("validation.log")
        self.test_log_filename = self.save_dir.joinpath("test.log")

        self.train_logger = logzero.setup_logger(
            name="train-logger",
            logfile=self.train_log_filename,
            level=logging.INFO)
        self.validation_logger = logzero.setup_logger(
            name="valid-logger",
            logfile=self.validation_log_filename,
            level=logging.INFO,
        )
        self.test_logger = logzero.setup_logger(name="test-logger",
                                                logfile=self.test_log_filename,
                                                level=logging.INFO)

        # Sanity-check that a ReduceLROnPlateau scheduler moves in the same
        # direction as the tracked metric. The comparison has to use the NAME
        # of the tracked metric (``track_for_best``); ``best_track_value``
        # holds a number and can never equal a metric name.
        if self.lr_scheduler_is_plateau:
            scheduler_mode = self.lr_scheduler.mode
            if self.track_for_best == "loss" and scheduler_mode == "max":
                self.msg_printer.warn(
                    "You are optimizing loss and lr schedule mode is max instead of min"
                )
            higher_is_better = ("macro_fscore", "fscore", "micro_fscore")
            if self.track_for_best in higher_is_better and scheduler_mode == "min":
                self.msg_printer.warn(
                    f"You are optimizing for {self.track_for_best} and lr scheduler mode is min instead of max"
                )

    def get_loader(self, dataset: Dataset) -> DataLoader:
        """ Returns the DataLoader for the Dataset

        Parameters
        ----------
        dataset : Dataset

        Returns
        -------
        DataLoader
            A pytorch DataLoader

        """
        dataset_size = len(dataset)
        sample_size = int(np.floor(dataset_size * self.sample_proportion))
        indices = np.random.choice(range(dataset_size),
                                   size=sample_size,
                                   replace=False)
        sampler = SubsetRandomSampler(indices=indices)
        loader = DataLoader(
            dataset=dataset,
            batch_size=self.batch_size,
            num_workers=self.num_workers,
            collate_fn=self.collate_fn,
            pin_memory=True,
            sampler=sampler,
        )
        return loader

    def is_best_lower(self, current_best=None):
        """ Returns True if the current value of the metric is lower than the best metric.
        This is useful for tracking metrics like loss where, lower the value, the better it is

        Parameters
        ----------
        current_best : float
            The current value for the metric that is being tracked

        Returns
        -------
        bool


        """
        return True if current_best < self.best_track_value else False

    def is_best_higher(self, current_best=None):
        """ Returns ``True`` if the current value of the metric is HIGHER than the best metric.
        This is useful for tracking metrics like FSCORE where, higher the value, the better it is

        Parameters
        ----------
        current_best : float
            The current value for the metric that is being tracked

        Returns
        -------
        bool
        """
        return True if current_best >= self.best_track_value else False

    def set_best_track_value(self, current_best=None):
        """ Set the best value of the value being tracked

        Parameters
        ----------
        current_best : float
            The current value that is best

        Returns
        -------

        """
        if self.track_for_best == "loss":
            self.best_track_value = np.inf if current_best is None else current_best
        elif self.track_for_best == "macro_fscore" or self.track_for_best == "fscore":
            self.best_track_value = 0 if current_best is None else current_best
        elif self.track_for_best == "micro_fscore":
            self.best_track_value = 0 if current_best is None else current_best

    def run(self):
        """
        Run the engine
        :return:
        """
        for epoch_num in range(self.num_epochs):
            self.train_epoch(epoch_num)
            self.validation_epoch(epoch_num)

        self.test_epoch(epoch_num)

    def train_epoch(self, epoch_num: int):
        """ Runs the training for one epoch

        Every batch is passed through the model, the train metric is updated,
        and the loss found under the ``"loss"`` key of the model output is
        back-propagated. Gradients are clipped to ``gradient_norm_clip_value``
        unless that value is ``None``. ``train_epoch_end`` is called once the
        batches are exhausted.

        Parameters
        ----------
        epoch_num : int
            The current epoch number (0 based)
        """

        # refresh everything necessary before training begins
        num_iterations = 0
        train_iter = self.get_iter(self.train_loader)
        self.train_loss_meter.reset()
        self.train_metric_calc.reset()
        self.model.train()

        self.msg_printer.info(
            f"Starting Training Epoch: {epoch_num+1}/{self.num_epochs}")

        # Iterating directly means only exhaustion of the loader ends the
        # epoch; a StopIteration leaking out of the model is no longer
        # mistaken for the end of the data.
        for lines_labels in train_iter:
            # every batch is a list of (line, label) pairs
            lines_labels = list(zip(*lines_labels))
            lines = lines_labels[0]
            labels = lines_labels[1]
            batch_size = len(lines)

            model_forward_out = self.model(
                lines=lines,
                labels=labels,
                is_training=True,
                is_validation=False,
                is_test=False,
            )
            self.train_metric_calc.calc_metric(
                lines=lines,
                labels=labels,
                model_forward_dict=model_forward_out)

            # Only the lookup is guarded, so that a KeyError raised during the
            # backward pass or the optimizer step is not misreported as a
            # missing loss.
            try:
                loss = model_forward_out["loss"]
            except KeyError:
                self.msg_printer.fail(
                    "The model output dictionary does not have "
                    "a key called loss. Please check to have "
                    "loss in the model output")
            else:
                self.optimizer.zero_grad()
                loss.backward()
                # the clip value is optional; None disables clipping
                if self.gradient_norm_clip_value is not None:
                    torch.nn.utils.clip_grad_norm_(
                        self.model.parameters(),
                        max_norm=self.gradient_norm_clip_value)
                self.optimizer.step()
                self.train_loss_meter.add_loss(loss.item(), batch_size)

            num_iterations += 1
            # NOTE(review): num_iterations is already incremented here, so the
            # first report comes after ``log_train_metrics_every - 1`` batches
            # - confirm that this offset is intended.
            if (num_iterations + 1) % self.log_train_metrics_every == 0:
                metrics = self.train_metric_calc.report_metrics()
                for label_namespace, table in metrics.items():
                    self.msg_printer.divider(
                        text=f"Train Metrics for {label_namespace.upper()}"
                    )
                    print(table)

        self.train_epoch_end(epoch_num)

    def train_epoch_end(self, epoch_num: int):
        """ House-keeping that runs once a training epoch has finished

        Prints and logs the average training loss, optionally sends the loss
        and the tracked metric to wandb, checkpoints the model and optimizer
        state every ``self.save_every`` epochs, and writes the loss to
        tensorboard.

        Parameters
        ----------
        epoch_num : int
            The current epoch number (0 based)

        """
        # everything below reports the 1-based epoch number
        display_epoch = epoch_num + 1

        self.msg_printer.divider(f"Training end @ Epoch {display_epoch}")
        mean_loss = self.train_loss_meter.get_average()
        self.msg_printer.text(f"Average Loss: {mean_loss}")
        self.train_logger.info(
            f"Average loss @ Epoch {display_epoch} - {mean_loss}")
        epoch_metrics = self.train_metric_calc.get_metric()

        if self.use_wandb:
            wandb.log({"train_loss": mean_loss}, step=display_epoch)
            if self.track_for_best != "loss":
                tracked = self.track_for_best
                for namespace in self.label_namespaces:
                    wandb_key = f"train_{tracked}_{namespace}"
                    wandb.log(
                        {wandb_key: epoch_metrics[namespace][tracked]},
                        step=display_epoch,
                    )

        # checkpoint the model after every `self.save_every` epochs
        if display_epoch % self.save_every == 0:
            checkpoint = {
                "epoch_num": epoch_num,
                "optimizer_state": self.optimizer.state_dict(),
                "model_state": self.model.state_dict(),
                "loss": mean_loss,
            }
            checkpoint_path = self.save_dir.joinpath(
                f"model_epoch_{display_epoch}.pt")
            torch.save(checkpoint, checkpoint_path)

        # log loss to tensor board
        # NOTE(review): any falsy average (None, but also exactly 0.0) is
        # reported as infinity here - confirm that this is intended.
        tensorboard_scalars = {"train_loss": mean_loss or np.inf}
        self.summaryWriter.add_scalars(
            "train_validation_loss", tensorboard_scalars, display_epoch)

    def validation_epoch(self, epoch_num: int):
        """ Runs one validation epoch on the validation dataset

        Puts the model in evaluation mode, resets the validation loss meter
        and metric calculator, passes every batch of
        ``self.validation_loader`` through the model with gradients disabled
        and finally calls ``validation_epoch_end``.

        Parameters
        ----------
        epoch_num : int
            0-based epoch number

        """
        self.model.eval()
        valid_iter = iter(self.validation_loader)
        self.validation_loss_meter.reset()
        self.validation_metric_calc.reset()

        self.msg_printer.info(
            f"Starting Validation Epoch: {epoch_num + 1}/{self.num_epochs}")

        # A plain ``for`` loop ends only when the loader is exhausted. A
        # StopIteration raised by the model or the metric code now surfaces
        # as an error instead of silently cutting the epoch short.
        for lines_labels in valid_iter:
            # zip(*batch) transposes the batch; element 0 is used as the
            # lines and element 1 as the labels
            columns = list(zip(*lines_labels))
            lines = columns[0]
            labels = columns[1]
            batch_size = len(lines)

            with torch.no_grad():
                model_forward_out = self.model(
                    lines=lines,
                    labels=labels,
                    is_training=False,
                    is_validation=True,
                    is_test=False,
                )
            loss = model_forward_out["loss"]
            self.validation_loss_meter.add_loss(loss, batch_size)
            self.validation_metric_calc.calc_metric(
                lines=lines,
                labels=labels,
                model_forward_dict=model_forward_out)

        self.validation_epoch_end(epoch_num)

    def validation_epoch_end(self, epoch_num: int):
        """Performs house-keeping at the end of validation epoch

        Prints the per-namespace metric tables and the average validation
        loss, logs them (logger, optionally wandb, summary writer), decides
        whether this epoch is the best one so far according to
        ``self.track_for_best``, steps the learning-rate scheduler if there
        is one and saves ``best_model.pt`` when a new best is found.

        Parameters
        ----------
        epoch_num : int
            The current epoch number
        """
        display_epoch = epoch_num + 1  # everything reported below is 1-based

        self.msg_printer.divider(f"Validation @ Epoch {display_epoch}")

        metric_report = self.validation_metric_calc.report_metrics()

        average_loss = self.validation_loss_meter.get_average()

        for label_namespace, table in metric_report.items():
            self.msg_printer.divider(
                text=f"Validation Metrics for {label_namespace.upper()}")
            print(table)

        self.msg_printer.text(f"Average Loss: {average_loss}")

        self.validation_logger.info(
            f"Validation Loss @ Epoch {display_epoch} - {average_loss}")

        if self.use_wandb:
            wandb.log({"validation_loss": average_loss}, step=display_epoch)
            metric = self.validation_metric_calc.get_metric()
            if self.track_for_best != "loss":
                for label_namespace in self.label_namespaces:
                    wandb.log(
                        {
                            f"validation_{self.track_for_best}_{label_namespace}":
                            metric[label_namespace][self.track_for_best]
                        },
                        step=display_epoch,
                    )

        # Fall back to infinity only when there is no average at all; a
        # truthiness test would also turn a legitimate loss of 0 into inf.
        logged_loss = np.inf if average_loss is None else average_loss
        self.summaryWriter.add_scalars(
            "train_validation_loss",
            {"validation_loss": logged_loss},
            display_epoch,
        )

        # Both stay None when `track_for_best` is none of the values below.
        is_best = None
        value_tracked = None
        if self.track_for_best == "loss":
            value_tracked = average_loss
            is_best = self.is_best_lower(average_loss)
        elif self.track_for_best in ("micro_fscore", "macro_fscore", "fscore"):
            # If there are multiple namespaces for the metric
            # we decide the best model based on the average score
            metrics = self.validation_metric_calc.get_metric()
            values_tracked = [
                metrics[label_namespace][self.track_for_best]
                for label_namespace in self.label_namespaces
            ]
            value_tracked = sum(values_tracked) / len(values_tracked)
            is_best = self.is_best_higher(current_best=value_tracked)

        if self.lr_scheduler is not None:
            self.lr_scheduler.step(value_tracked)

        if is_best:
            self.set_best_track_value(current_best=value_tracked)
            self.msg_printer.good(f"Found Best Model @ epoch {display_epoch}")
            torch.save(
                {
                    "epoch_num": epoch_num,
                    "optimizer_state": self.optimizer.state_dict(),
                    "model_state": self.model.state_dict(),
                    "loss": average_loss,
                },
                self.save_dir.joinpath("best_model.pt"),
            )

    def test_epoch(self, epoch_num: int):
        """Runs the test epoch for ``epoch_num``

        Loads ``best_model.pt`` from ``self.save_dir``, puts the model in
        evaluation mode, passes every batch of ``self.test_loader`` through
        it with gradients disabled and finally calls ``test_epoch_end``.

        Parameters
        ----------
        epoch_num : int
            zero based epoch number for which the test dataset is run
            This is after the last training epoch.

        """
        self.msg_printer.divider("Running on Test Batch")
        self.load_model_from_file(self.save_dir.joinpath("best_model.pt"))
        self.model.eval()
        test_iter = iter(self.test_loader)

        # A plain ``for`` loop ends only when the loader is exhausted. A
        # StopIteration raised by the model or the metric code now surfaces
        # as an error instead of silently cutting the epoch short.
        for lines_labels in test_iter:
            # zip(*batch) transposes the batch; element 0 is used as the
            # lines and element 1 as the labels
            columns = list(zip(*lines_labels))
            lines = columns[0]
            labels = columns[1]

            with torch.no_grad():
                model_forward_out = self.model(
                    lines=lines,
                    labels=labels,
                    is_training=False,
                    is_validation=False,
                    is_test=True,
                )
            self.test_metric_calc.calc_metric(
                lines=lines,
                labels=labels,
                model_forward_dict=model_forward_out)

        self.test_epoch_end(epoch_num)

    def test_epoch_end(self, epoch_num: int):
        """ Performs house-keeping at the end of the test epoch

        Prints one metric table per label namespace, writes the metrics
        returned by the test metric calculator to the test logger (and to
        wandb when enabled) and closes the summary writer.

        Parameters
        ----------
        epoch_num : int
            Epoch num after which the test dataset is run

        """
        tables_by_namespace = self.test_metric_calc.report_metrics()
        for namespace, metric_table in tables_by_namespace.items():
            heading = f"Test Metrics for {namespace.upper()}"
            self.msg_printer.divider(text=heading)
            print(metric_table)

        test_metrics = self.test_metric_calc.get_metric()
        display_epoch = epoch_num + 1  # reported 1-based
        self.msg_printer.divider(f"Test @ Epoch {display_epoch}")
        self.test_logger.info(
            f"Test Metrics @ Epoch {display_epoch} - {test_metrics}")

        if self.use_wandb:
            wandb.log({"test_metrics": str(test_metrics)})

        # the test epoch is the last user of the summary writer in this method
        self.summaryWriter.close()

    def get_train_dataset(self):
        """ Gives access to the training dataset held by this object

        Returns
        -------
        Dataset
            The object stored in ``self.train_dataset``, returned as is.

        """
        return self.train_dataset

    def get_validation_dataset(self):
        """ Gives access to the validation dataset held by this object

        Returns
        -------
        Dataset
            The object stored in ``self.validation_dataset``, returned as is.

        """
        return self.validation_dataset

    def get_test_dataset(self):
        """ Gives access to the test dataset held by this object

        Returns
        -------
        Dataset
            The object stored in ``self.test_dataset``, returned as is.

        """
        return self.test_dataset

    @staticmethod
    def get_iter(loader: DataLoader) -> Iterator:
        """ Returns the iterator for a pytorch data loader.

        The ``loader`` is a pytorch DataLoader that iterates
        over the dataset in batches and employs many strategies to do
        so. We want an iterator that returns the dataset in batches.
        The end of the iterator would signify the end of an epoch
        and then we can use that information to perform house-keeping.


        Parameters
        ----------
        loader : DataLoader
            a pytorch data loader

        Returns
        -------
        Iterator
            An iterator over the data loader
        """
        iterator = iter(loader)
        return iterator

    def load_model_from_file(self, filename: str):
        """ Restores the model weights from a checkpoint file

        The checkpoint is read with ``torch.load`` and its ``"model_state"``
        entry is loaded into ``self.model``.

        NOTE: ``torch.load`` unpickles the file, so only checkpoints from a
        trusted source should be loaded.

        Parameters
        ----------
        filename : str
            Location of the checkpoint file

        """
        printer = self.msg_printer
        printer.divider("LOADING MODEL FROM FILE")
        with printer.loading(f"Loading Pytorch Model from file {filename}"):
            checkpoint = torch.load(filename)

        printer.good("Finished Loading the Model")

        # only the weights are restored; other checkpoint entries are ignored
        self.model.load_state_dict(checkpoint["model_state"])

    def _set_seeds(self):
        seed = self.seeds.get("random_seed", 17290)
        numpy_seed = self.seeds.get("numpy_seed", 1729)
        torch_seed = self.seeds.get("pytorch_seed", 172)

        if seed is not None:
            random.seed(seed)
        if numpy_seed is not None:
            np.random.seed(numpy_seed)
        if torch_seed is not None:
            torch.manual_seed(torch_seed)
            # Seed all GPUs with the same seed if available.
            if torch.cuda.is_available():
                torch.cuda.manual_seed_all(torch_seed)
Esempio n. 21
0
class Vocab:
    def __init__(
        self,
        instances: Optional[List[List[str]]] = None,
        max_num_tokens: int = None,
        min_count: int = 1,
        unk_token: str = "<UNK>",
        pad_token: str = "<PAD>",
        start_token: str = "<SOS>",
        end_token: str = "<EOS>",
        special_token_freq: float = 1e10,
        store_location: str = None,
        embedding_type: Union[str, None] = None,
        embedding_dimension: Union[int, None] = None,
    ):
        """
        Store the vocabulary options and prepare the special token table.
        Nothing is built here: ``vocab``, ``orig_vocab``, ``idx2token`` and
        ``token2idx`` stay ``None`` until they are set later.

        :param instances: type: List[List[str]]
        The tokenized instances from which the vocab is built
        :param max_num_tokens: type: int
        Tokens whose index falls beyond the special tokens plus
        `max_num_tokens` are mapped to the index of `unk_token`
        :param min_count: type: int
        Tokens that occur fewer than `min_count` times are mapped to the
        index of `unk_token`
        :param unk_token: type: str
        The token that stands for unknown words
        :param pad_token: type: str
        The token that stands for padding
        :param start_token: type: str
        The token that marks the start of a sentence
        :param end_token: type: str
        The token that marks the end of a sentence
        :param special_token_freq: type: float
        Base frequency given to the special tokens. It should be high so
        that the special tokens rank before every regular token
        :param store_location: type: str
        Optional file location of the vocab. `build_vocab` restores the
        vocab from this file when it exists and saves to it otherwise
        :param embedding_type: type: str
        The type of pre-trained embedding passed to `EmbeddingLoader`
        when the embeddings are loaded
        :param embedding_dimension: type: int
        Embedding dimension of the embedding type
        """
        # options, stored exactly as they were passed
        self.instances = instances
        self.max_num_tokens = max_num_tokens
        self.min_count = min_count
        self.special_token_freq = special_token_freq
        self.store_location = store_location
        self.embedding_type = embedding_type
        self.embedding_dimension = embedding_dimension

        # the four special tokens
        self.unk_token = unk_token
        self.pad_token = pad_token
        self.start_token = start_token
        self.end_token = end_token

        # state that only exists once a vocab has been built or loaded
        self.vocab = None
        self.orig_vocab = None
        self.idx2token = None
        self.token2idx = None

        self.msg_printer = Printer()

        # special token -> (frequency, index); the decreasing frequencies
        # keep the order unk, pad, start, end when sorted by frequency
        base_freq = special_token_freq
        self.special_vocab = {
            unk_token: (base_freq + 3, 0),
            pad_token: (base_freq + 2, 1),
            start_token: (base_freq + 1, 2),
            end_token: (base_freq, 3),
        }

    def map_tokens_to_freq_idx(self) -> Dict[str, Tuple[int, int]]:
        """
        Build vocab from instances
        return the word -> (freq, idx)
        :return:
        """
        all_tokens = []
        for instance in self.instances:
            all_tokens.extend(instance)

        # counter will map a list to Dict[str, count] values
        counter = Counter(all_tokens)

        # order the order in decreasing order of their frequencies
        # List[Tuple]
        counter = sorted(counter.items(), key=itemgetter(1), reverse=True)

        vocab = {}

        for idx, (token, freq) in enumerate(counter):
            vocab[token] = (freq, len(self.special_vocab) + idx)

        # merge the two dictionaries
        # courtesy https://stackoverflow.com/questions/38987/how-to-merge-two-dictionaries-in-a-single-expression
        vocab = {**vocab, **self.special_vocab}

        # BUG: if vocab and special vocab share same token, then
        # the index of the vocab will get overwritten by special vocab
        # the only way now is to recalculate indices based on frequencies
        vocab = sorted(vocab.items(), key=itemgetter(1), reverse=True)
        new_vocab = {}
        for idx, (token, (freq, _)) in enumerate(vocab):
            new_vocab[token] = (freq, idx)
        return new_vocab

    def clip_on_mincount(
            self, vocab: Dict[str, Tuple[int,
                                         int]]) -> Dict[str, Tuple[int, int]]:
        """
        Clip the vocab based on min count
        We decide to keep the word and it count
        We just change the idx of the token to idx of the unknown token
        :return: vocab: type: Dict[str, Tuple[int, int]]
        The new vocab
        """
        for key, (freq, idx) in vocab.items():
            if freq < self.min_count:
                vocab[key] = (freq, vocab[self.unk_token][1])

        return vocab

    def clip_on_max_num(
            self, vocab: Dict[str, Tuple[int,
                                         int]]) -> Dict[str, Tuple[int, int]]:
        """
        Clip the vocab based on the maximum number of words
        We return `max_num_words + len(self.special_vocab)` words effectively
        The rest of them will be mapped to `self.unk_token`
        :param vocab: type: Dict[str, Tuple[int, int]]
        :return: vocab: type: Dict[str, Tuple[int, int]]
        The new vocab
        """
        for key, (freq, idx) in vocab.items():
            if idx >= len(self.special_vocab) + self.max_num_tokens:
                vocab[key] = (freq, vocab[self.unk_token][1])

        return vocab

    def _add_token(self, token: str, save_vocab: bool = False):
        """
        Add token to an already existing vocabulary
        :param token: type str
        :return:
        """
        try:
            vocab = self.vocab
        except AttributeError:
            self.msg_printer.fail("Please build vocab using build vocab")
        tokens = vocab.keys()
        indices = [idx for freq, idx in vocab.values()]
        indices = sorted(indices, reverse=True)
        highest_idx = indices[0]

        if token not in tokens:
            self.vocab[token] = (1, highest_idx + 1)
            self.idx2token[highest_idx + 1] = token
            self.token2idx[token] = highest_idx + 1
            if save_vocab:
                self.save_to_file(
                    self.store_location)  # this can be expensive.

    def add_tokens(self, tokens: List[str]):
        try:
            vocab = self.vocab
        except AttributeError:
            self.msg_printer.fail("Please build vocab first")

        for token in tokens:
            self._add_token(token, save_vocab=False)

        if self.store_location:
            self.save_to_file(self.store_location)

    def build_vocab(self) -> Dict[str, Tuple[int, int]]:

        if self.store_location and os.path.isfile(self.store_location):
            vocab_object = self.load_from_file(self.store_location)
            self.msg_printer.good("Loaded vocab from file {0}".format(
                self.store_location))
            self.vocab = vocab_object.vocab
            self.orig_vocab = vocab_object.orig_vocab
            self.idx2token = vocab_object.idx2token
            self.token2idx = vocab_object.token2idx
            vocab = vocab_object.vocab

        else:
            self.msg_printer.info("BUILDING VOCAB")
            vocab = self.map_tokens_to_freq_idx()
            self.orig_vocab = deepcopy(
                vocab)  # dictionary are passed by reference. Be careful
            vocab = self.clip_on_mincount(vocab)
            vocab = self.clip_on_max_num(vocab)
            self.vocab = vocab
            self.idx2token = self.get_idx2token_mapping()
            self.token2idx = self.get_token2idx_mapping()

            if self.store_location:
                self.msg_printer.info("SAVING VOCAB TO FILE")
                self.save_to_file(self.store_location)
        return vocab

    def get_vocab_len(self) -> int:
        if not self.vocab:
            raise ValueError("Build vocab first by calling build_vocab()")

        length = len(set(idx for freq, idx in self.vocab.values()))
        return length

    def get_orig_vocab_len(self) -> int:
        if not self.orig_vocab:
            raise ValueError("Build vocab first by calling build_vocab()")

        length = len(set(idx for freq, idx in self.orig_vocab.values()))
        return length

    def get_token2idx_mapping(self) -> Dict[str, int]:
        if not self.vocab:
            raise ValueError("Build vocab first by calling build_vocab()")

        token2idx = {}
        for word, (freq, idx) in self.vocab.items():
            token2idx[word] = idx

        return token2idx

    def get_idx2token_mapping(self) -> Dict[int, str]:
        if not self.vocab:
            raise ValueError("Build vocab first by calling build_vocab()")

        idx2words = {}
        for word, (freq, idx) in self.vocab.items():
            idx2words[idx] = word
        return idx2words

    def save_to_file(self, filename: str):
        """
        :param filename: str
        The filename where the result to the file will be stored
        The vocab will be stored in the json file name
        Please make sure that this is a json filename

        :return: None
        The whole vocab object will be saved to the file
        """

        if not self.vocab:
            raise ValueError("Build vocab first by calling build_vocab()")

        vocab_state = dict()
        vocab_state["options"] = {
            "max_num_words": self.max_num_tokens,
            "min_count": self.min_count,
            "unk_token": self.unk_token,
            "pad_token": self.pad_token,
            "start_token": self.start_token,
            "end_token": self.end_token,
            "special_token_freq": self.special_token_freq,
            "embedding_type": self.embedding_type,
            "embedding_dimension": self.embedding_dimension,
            "special_vocab": self.special_vocab,
        }
        vocab_state["vocab"] = self.vocab
        vocab_state["orig_vocab"] = self.orig_vocab
        try:
            with open(filename, "w") as fp:
                json.dump(vocab_state, fp)

        except FileNotFoundError:
            print("You passed {0} for the filename. Please check whether "
                  "the path exists and try again".format(filename))

    @classmethod
    def load_from_file(cls, filename: str) -> "Vocab":
        try:
            with open(filename, "r") as fp:
                vocab_state = json.load(fp)
                vocab_options = vocab_state["options"]
                vocab_dict = vocab_state["vocab"]
                orig_vocab_dict = vocab_state["orig_vocab"]

                # restore the object
                # restore all the property values from the file

                max_num_tokens = vocab_options["max_num_words"]
                min_count = vocab_options["min_count"]
                unk_token = vocab_options["unk_token"]
                pad_token = vocab_options["pad_token"]
                start_token = vocab_options["start_token"]
                end_token = vocab_options["end_token"]
                special_token_freq = vocab_options["special_token_freq"]
                store_location = filename
                embedding_type = vocab_options["embedding_type"]
                embedding_dimension = vocab_options["embedding_dimension"]
                vocab = cls(
                    max_num_tokens=max_num_tokens,
                    min_count=min_count,
                    unk_token=unk_token,
                    pad_token=pad_token,
                    start_token=start_token,
                    end_token=end_token,
                    instances=None,
                    special_token_freq=special_token_freq,
                    store_location=store_location,
                    embedding_type=embedding_type,
                    embedding_dimension=embedding_dimension,
                )

                # instead of building the vocab, set the vocab from vocab_dict
                vocab.set_vocab(vocab=vocab_dict)
                vocab.set_orig_vocab(orig_vocab_dict)
                idx2token = vocab.get_idx2token_mapping()
                token2idx = vocab.get_token2idx_mapping()
                vocab.set_idx2token(idx2token)
                vocab.set_token2idx(token2idx)

                return vocab
        except FileNotFoundError:
            print("You passed {0} for the filename. Please check whether "
                  "the path exists and try again. Please pass "
                  "a json file".format(filename))

    def get_token_from_idx(self, idx: int) -> str:
        if not self.vocab:
            raise ValueError("Please build the vocab first")

        if not self.idx2token:
            self.idx2token = self.get_idx2token_mapping()

        try:
            if idx == self.special_vocab[self.unk_token][1]:
                return self.unk_token
            else:
                token = self.idx2token[idx]
                return token
        except KeyError:
            vocab_len = self.get_vocab_len()
            message = ("You tried to access idx {0} of the vocab "
                       "The length of the vocab is {1}. Please Provide "
                       "Number between {2}".format(idx, vocab_len,
                                                   vocab_len - 1))
            raise ValueError(message)

    def get_idx_from_token(self, token: str) -> int:
        if not self.vocab:
            raise ValueError("Please build the vocab first")

        if not self.token2idx:
            self.token2idx = self.get_token2idx_mapping()

        try:
            return self.token2idx[token]
        except KeyError:
            return self.token2idx[self.unk_token]

    def get_topn_frequent_words(self, n: int = 5) -> List[Tuple[str, int]]:
        idx2token = self.idx2token
        token_freqs = []
        max_n = min(len(self.special_vocab) + n, self.get_vocab_len())
        for idx in range(len(self.special_vocab), max_n):
            token = idx2token[idx]
            freq = self.orig_vocab[token][0]
            token_freqs.append((token, freq))

        return token_freqs

    def print_stats(self) -> None:
        """
        Print a table with the statistics of the vocab
        The table shows the length of the original vocab, the length of the
        clipped vocab and the first 5 tokens after the special tokens
        :return: None
        """
        top_count = 5
        orig_vocab_len = self.get_orig_vocab_len()
        vocab_len = self.get_vocab_len()
        top_words = self.get_topn_frequent_words(n=top_count)

        rows = [
            ("Original vocab length", orig_vocab_len),
            ("Clipped vocab length", vocab_len),
            ("Top {0} words".format(top_count), top_words),
        ]
        table_string = wasabi.table(
            data=rows, header=("Stats Description", "#"), divider=True)

        self.msg_printer.divider("VOCAB STATS")
        print(table_string)

    def load_embedding(self) -> torch.FloatTensor:
        """
        Load the embeddings of all tokens in the vocab
        An `EmbeddingLoader` is created for the configured embedding type
        and dimension and its `vocab_embedding` is read once per idx, in
        increasing idx order
        :return: type: torch.FloatTensor
        The embeddings, stacked in increasing idx order
        :raises ValueError: if the vocab has not been built yet
        """
        if not self.vocab:
            raise ValueError("Please build the vocab first")

        loader = EmbeddingLoader(
            token2idx=self.token2idx,
            embedding_type=self.embedding_type,
            embedding_dimension=self.embedding_dimension,
        )

        # row i of the result belongs to the i-th smallest idx
        vectors = [
            loader.vocab_embedding[self.idx2token[idx]]
            for idx in sorted(self.idx2token)
        ]
        return torch.FloatTensor(vectors)

    def set_vocab(self, vocab: Dict[str, Tuple[int, int]]):
        self.vocab = vocab

    def set_orig_vocab(self, orig_vocab: Dict[str, Tuple[int, int]]):
        self.orig_vocab = orig_vocab

    def set_idx2token(self, idx2token: Dict[int, str]):
        self.idx2token = idx2token

    def set_token2idx(self, token2idx: Dict[str, int]):
        self.token2idx = token2idx

    def get_disp_sentence_from_indices(self, indices: List[int]) -> str:
        """ Given a set of indices in vocab, it returns a sentence mapping the index to string

        Parameters
        ----------
        indices : List[int]
            A list of indices where every index is between ``[0, vocab_len-1)``.

        Returns
        -------
        str
            A string representing the index
        """
        pad_token_index = self.get_idx_from_token(self.pad_token)
        start_token_index = self.get_idx_from_token(self.start_token)
        end_token_index = self.get_idx_from_token(self.end_token)
        special_indices = [pad_token_index, start_token_index, end_token_index]

        token = [
            self.get_token_from_idx(idx) for idx in indices
            if idx not in special_indices
        ]
        sentence = " ".join(token)
        return sentence