Example #1
0
def setup_gpu(use_gpu: int, silent=None) -> None:
    """Select the compute device and report the choice.

    use_gpu (int): GPU id to activate; a negative value selects the CPU.
    silent: When None, a default Printer is used. Otherwise the value is
        passed to the Printer as `no_print` and its negation as `pretty`.
    """
    printer_kwargs = {}
    if silent is not None:
        printer_kwargs = {"no_print": silent, "pretty": not silent}
    local_msg = Printer(**printer_kwargs)

    if use_gpu < 0:
        local_msg.info("Using CPU")
        # Point out the GPU option when one could have been used.
        if gpu_is_available():
            local_msg.info("To switch to GPU 0, use the option: --gpu-id 0")
        return

    local_msg.info(f"Using GPU: {use_gpu}")
    require_gpu(use_gpu)
Example #2
0
def main(uri, table_path, schema, write_mode):
    """Load newline-delimited JSON from a URI into a BigQuery table.

    Args:
        uri: Source URI handed to ``client.load_table_from_uri``.
        table_path: Dotted ``project.dataset.table`` path of the destination.
        schema: Schema source passed to ``client.schema_from_json``.
        write_mode: ``"CREATE_NEW"`` drops and recreates the table if it
            already exists before loading; ``"WRITE_APPEND"`` loads into
            the table as it is.

    Raises:
        ValueError: If ``write_mode`` is not one of the supported modes.
    """
    # Validate before touching BigQuery; an `assert` would vanish under -O.
    valid_modes = ("CREATE_NEW", "WRITE_APPEND")
    if write_mode not in valid_modes:
        raise ValueError(
            f"write_mode must be one of {valid_modes}, got {write_mode!r}")

    msg = Printer()
    project_id, dataset_id, table_id = table_path.split(".")
    config = Config(project_id=project_id, dataset_id=dataset_id)
    client = config.client()
    table_ref = str_to_bq_ref(table_path)

    # Parse the schema once: a second parse of the same source would come
    # back empty if `schema` is an already-consumed file object.
    table_schema = client.schema_from_json(schema)

    load_job_config = bq.LoadJobConfig()
    load_job_config.schema = table_schema
    load_job_config.source_format = bq.SourceFormat.NEWLINE_DELIMITED_JSON
    load_job_config.ignore_unknown_values = True
    load_job_config.write_disposition = "WRITE_APPEND"
    load_job_config.max_bad_records = 100

    exists = any(
        table_id == table.table_id
        for table in client.list_tables(client.dataset(dataset_id))
    )

    if exists and write_mode == "CREATE_NEW":
        msg.info(f"{table_path} already exists. Write_mode: {write_mode}")
        # Recreate the table empty so the append below starts from scratch.
        client.delete_table(table_ref)
        client.create_table(bq.Table(table_ref, schema=table_schema))

    load_job = client.load_table_from_uri(uri,
                                          table_ref,
                                          job_config=load_job_config)
    with msg.loading("Loading data..."):
        load_job.result()
    msg.good("Data successfully loaded!")
Example #3
0
def cached_path(path: Union[pathlib.Path, str],
                url: str,
                unzip=True) -> pathlib.Path:
    """Return `path`, downloading it from `url` first when it is missing.

    If `path` already exists (as a file or directory) it is returned
    untouched. Otherwise the url is downloaded to `path` and, when `unzip`
    is true, zip and tar archives are extracted into the parent directory.
    """
    path = pathlib.Path(path) if isinstance(path, str) else path
    msg_printer = Printer()
    if path.is_file() or path.is_dir():
        msg_printer.info(f"{path} exists.")
        return path

    target = str(path)
    download_file(url=url, dest_filename=target)
    if not unzip:
        return path

    parent_dir = str(path.parent)
    if zipfile.is_zipfile(target):
        extract_zip(filename=target, destination_dir=parent_dir)
    if tarfile.is_tarfile(target):
        suffix = path.suffix
        # Only a gz suffix that does not also contain "tar" gets "r:gz".
        gzip_only = "tar" not in suffix and "gz" in suffix
        extract_tar(filename=target,
                    destination_dir=parent_dir,
                    mode="r:gz" if gzip_only else "r")

    return path
Example #4
0
def dashboard(data_dir: Path) -> None:
    """Load a Corpus, compute its NER statistics and serve the app.

    Loads the corpus from `data_dir`, applies `get_ner_stats` to it and then
    starts uvicorn on port 9090.

    Args:
        data_dir (Path): Path to data folder
    """
    corpus = Corpus.from_disk(data_dir)

    # The statistics are not consumed by this function; the call is kept
    # in case applying the operation has effects beyond its return value.
    corpus.apply(get_ner_stats)

    # NOTE(review): `app` is not defined in this function; presumably a
    # module-level ASGI application -- confirm it exists where this lives.
    uvicorn.run(app, port=9090)
Example #5
0
def handle_scores_per_type(
    scores: Dict[str, Any],
    data: Optional[Dict[str, Any]] = None,
    *,
    spans_key: str = "sc",
    silent: bool = False,
) -> Dict[str, Any]:
    """Print the per-type score tables and copy them into `data`.

    scores (Dict[str, Any]): Scores to look the per-type entries up in.
    data (Optional[Dict[str, Any]]): Dict that receives every non-empty
        per-type entry (mutated in place). A fresh dict is used when omitted,
        so no state is shared between calls.
    spans_key (str): Name used to build the "spans_{spans_key}_per_type" key.
    silent (bool): Passed to the Printer as `no_print` (negated as `pretty`).
    RETURNS (Dict[str, Any]): The `scores` dict that was passed in.
    """
    msg = Printer(no_print=silent, pretty=not silent)
    if data is None:
        data = {}
    # (key in scores, table title, label of the type column)
    prf_tables = (
        ("morph_per_feat", "MORPH", "feat"),
        ("dep_las_per_type", "LAS", "type"),
        ("ents_per_type", "NER", "type"),
        (f"spans_{spans_key}_per_type", "SPANS", "type"),
        ("cats_f_per_type", "Textcat F", "label"),
    )
    for key, title, type_label in prf_tables:
        per_type = scores.get(key)
        # Missing and empty entries are both skipped.
        if per_type:
            print_prf_per_type(msg, per_type, title, type_label)
            data[key] = per_type
    auc_per_cat = scores.get("cats_auc_per_type")
    if auc_per_cat:
        print_textcats_auc_per_cat(msg, auc_per_cat)
        data["cats_auc_per_type"] = auc_per_cat
    return scores
Example #6
0
    def pipe_(self, operations: List[Union[str, OperationState]]) -> None:
        """Run a sequence of operations on dataset data.
        Internally calls Dataset.apply_ and will resolve named
        operations in registry.operations

        Args:
            operations (List[Union[str, OperationState]]): List of operations.
                An OperationState supplies its own name, args and kwargs and
                is passed on as the initial state; anything else is treated
                as an operation name with no arguments.
        """

        msg = Printer(no_print=self.verbose == False)
        msg.text(f"Applying pipeline of operations inplace to the dataset: {self.name}")

        # Announce the whole pipeline before anything is applied.
        for op in operations:
            op_name = op.name if isinstance(op, OperationState) else op
            msg.text(f"|_ {op_name}")

        for op in operations:
            # Every variable is rebound on each iteration so that no
            # args/kwargs can leak over from the previous operation.
            if isinstance(op, OperationState):
                op_name, args, kwargs, initial_state = op.name, op.args, op.kwargs, op
            else:
                op_name, args, kwargs, initial_state = op, [], {}, None

            operation = registry.operations.get(op_name)

            self.apply_(operation, *args, initial_state=initial_state, **kwargs)
Example #7
0
def conllu_to_docs(
    input_data,
    n_sents=10,
    append_morphology=False,
    ner_map=None,
    merge_subtokens=False,
    no_print=False,
    **_
):
    """
    Convert conllu data into Doc objects, yielding one merged Doc for every
    n_sents sentence docs read (plus one for any remainder).
    append_morphology parameter enables appending morphology to tags, which is
    useful for languages such as Spanish, where UD tags are not so rich.

    Extract NER tags if available and convert them so that they follow
    BILUO and the Wikipedia scheme

    A falsy n_sents (e.g. 0) disables the grouping: all sentences are then
    merged into a single Doc instead of raising ZeroDivisionError.
    """
    MISC_NER_PATTERN = "^((?:name|NE)=)?([BILU])-([A-Z_]+)|O$"
    msg = Printer(no_print=no_print)
    n_sents_info(msg, n_sents)
    sent_docs = read_conllx(
        input_data,
        append_morphology=append_morphology,
        ner_tag_pattern=MISC_NER_PATTERN,
        ner_map=ner_map,
        merge_subtokens=merge_subtokens,
    )
    sent_docs_to_merge = []
    for sent_doc in sent_docs:
        sent_docs_to_merge.append(sent_doc)
        # Check n_sents first so the modulo is never taken with zero.
        if n_sents and len(sent_docs_to_merge) % n_sents == 0:
            yield Doc.from_docs(sent_docs_to_merge)
            sent_docs_to_merge = []
    # Flush whatever did not fill a complete group.
    if sent_docs_to_merge:
        yield Doc.from_docs(sent_docs_to_merge)
Example #8
0
def profile(model, inputs=None, n_texts=10000):
    """
    Profile a spaCy pipeline, to find out which functions take the most time.
    Input should be formatted as one JSON object per line with a key "text".
    It can either be provided as a JSONL file, or be read from sys.stdin.
    If no input file is specified, the IMDB dataset is loaded via Thinc.
    """
    msg = Printer()
    inputs = None if inputs is None else _read_inputs(inputs, msg)
    if inputs is None:
        # No usable input: fall back to the IMDB texts.
        n_inputs = 25000
        with msg.loading("Loading IMDB dataset via Thinc..."):
            train_data, _ = thinc.extra.datasets.imdb()
            inputs, _ = zip(*train_data)
        msg.info("Loaded IMDB dataset and using {} examples".format(n_inputs))
        inputs = inputs[:n_inputs]
    with msg.loading("Loading model '{}'...".format(model)):
        nlp = load_model(model)
    msg.good("Loaded model '{}'".format(model))
    texts = list(itertools.islice(inputs, n_texts))
    # The profiled statement looks up `nlp` and `texts` in locals().
    cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(), "Profile.prof")
    stats = pstats.Stats("Profile.prof")
    msg.divider("Profile stats")
    stats.strip_dirs().sort_stats("time").print_stats()
Example #9
0
    def __init__(self, idx2labelname_mapping: Optional[Dict[int, str]] = None):
        """ Set up the metric with empty per-class counters

        Parameters
        ----------
        idx2labelname_mapping : Dict[int, str]
            Mapping from class index to label name. It is stored on the
            instance and forwarded to ``ClassificationMetricsUtils``.
            When omitted, the class indices are used in the reports
        """
        super(PrecisionRecallFMeasure, self).__init__()
        self.idx2labelname_mapping = idx2labelname_mapping
        self.msg_printer = Printer()
        self.classification_metrics_utils = ClassificationMetricsUtils(
            idx2labelname_mapping=idx2labelname_mapping
        )

        # One counter each for true positives, false positives, false
        # negatives and true negatives. Every counter maps a class index
        # of the dataset to the corresponding count and starts out empty.
        for counter_name in ("tp_counter", "fp_counter", "fn_counter", "tn_counter"):
            setattr(self, counter_name, {})
Example #10
0
    def __init__(self, datasets_manager: DatasetsManager):
        """ Set up the metric with empty per-class counters

        Parameters
        ----------
        datasets_manager : DatasetsManager
            The dataset manager managing the labels and other information.
            Its first label namespace and the matching numericalizer are
            stored on the instance
        """
        super(PrecisionRecallFMeasure,
              self).__init__(datasets_manager=datasets_manager)
        self.datasets_manager = datasets_manager
        self.idx2labelname_mapping = None
        self.msg_printer = Printer()
        self.classification_metrics_utils = ClassificationMetricsUtils()

        manager = self.datasets_manager
        self.label_namespace = manager.label_namespaces[0]
        self.normalized_probs_namespace = "normalized_probs"
        self.label_numericalizer = manager.namespace_to_numericalizer[
            self.label_namespace]

        # One counter each for true positives, false positives, false
        # negatives and true negatives. Every counter maps a class index
        # of the dataset to the corresponding count and starts out empty.
        for counter_name in ("tp_counter", "fp_counter", "fn_counter", "tn_counter"):
            setattr(self, counter_name, {})
Example #11
0
def op_iter(
        data: List[Example],
        pre: List[PreProcessor],
        verbose: bool = True) -> Iterator[Tuple[int, Example, Dict[str, Any]]]:
    """Iterate over list of examples for an operation
    yielding tuples of (example hash, example copy, preprocessor outputs)

    Every preprocessor is run over the full data before the first tuple
    is yielded.

    Args:
        data (List[Example]): List of examples to iterate
        pre (List[PreProcessor]): List of preprocessors to run
        verbose (bool, optional): Show verbose output.

    Yields:
        Iterator[Tuple[int, Example, Dict[str, Any]]]: Tuples of
            (example hash, deep copy of the example, outputs for that
            example keyed by preprocessor name)
    """
    msg = Printer(no_print=verbose == False, hide_animation=verbose == False)
    preprocessed_outputs: Dict[Example, Dict[str, Any]] = defaultdict(dict)
    for processor in pre:
        with msg.loading(f"\t=> Running preprocessor {processor.name}..."):
            processor_outputs = list(processor(data))
            msg.good("Done")

        # Outputs are paired with the examples by position.
        for example, output in zip(data, processor_outputs):
            preprocessed_outputs[example][processor.name] = output

    for example in data:
        yield hash(example), example.copy(
            deep=True), preprocessed_outputs[example]
Example #12
0
    def __init__(
        self,
        encoder: nn.Module,
        encoding_dim: int,
        num_classes: int,
        classification_layer_bias: bool,
    ):
        """ SimpleClassifier is a linear classifier head on top of any encoder

        Parameters
        ----------
        encoder : nn.Module
            Any encoder that takes in instances
        encoding_dim : int
            The encoding dimension; used as the input size of the linear
            classification layer
        num_classes : int
            The number of classes; used as the output size of the linear
            classification layer
        classification_layer_bias : bool
            Whether to add classification layer bias or no
            This is set to false only for debugging purposes
        """
        super(SimpleClassifier, self).__init__()
        self.encoder = encoder
        self.encoding_dim = encoding_dim
        self.num_classes = num_classes
        self.classification_layer_bias = classification_layer_bias
        self.classification_layer = nn.Linear(
            encoding_dim, num_classes, bias=self.classification_layer_bias)
        self._loss = CrossEntropyLoss()
        self.msg_printer = Printer()
Example #13
0
def print_summary(nlp, pretty=True, no_print=False):
    """Print a formatted overview of the pipeline of the given nlp object:
    a table of the components with what they require and assign and whether
    they retokenize, followed by any unmet requirements.
    nlp (Language): The nlp object.
    pretty (bool): Pretty-print the results (color etc).
    no_print (bool): Don't print anything, just return the data.
    RETURNS (dict): A dict with "overview" and "problems" if no_print is
        set, otherwise None.
    """
    msg = Printer(pretty=pretty, no_print=no_print)
    overview = []
    problems = {}
    for index, (name, pipe) in enumerate(nlp.pipeline):
        row = (
            index,
            name,
            getattr(pipe, "requires", []),
            getattr(pipe, "assigns", []),
            getattr(pipe, "retokenizes", False),
        )
        overview.append(row)
        problems[name] = analyze_pipes(nlp.pipeline, name, pipe, index, warn=False)
    msg.divider("Pipeline Overview")
    msg.table(
        overview,
        header=("#", "Component", "Requires", "Assigns", "Retokenizes"),
        divider=True,
        multiline=True,
    )
    n_problems = sum(len(found) for found in problems.values())
    if not any(problems.values()):
        msg.good("No problems found.")
    else:
        msg.divider("Problems ({})".format(n_problems))
        for name, found in problems.items():
            if not found:
                continue
            joined = ", ".join(found)
            msg.warn("'{}' requirements not met: {}".format(name, joined))
    return {"overview": overview, "problems": problems} if no_print else None
Example #14
0
def info(
    model: Optional[str] = None,
    *,
    markdown: bool = False,
    silent: bool = True,
    exclude: Optional[List[str]] = None,
) -> Union[str, dict]:
    """Collect info about spaCy or, if a model is given, about that pipeline.

    model (Optional[str]): Pipeline to report on; general info if not set.
    markdown (bool): Return (and, unless silent, print) the Markdown version.
    silent (bool): Don't print anything, just return.
    exclude (Optional[List[str]]): Keys to leave out of the Markdown and
        the printed table.
    RETURNS (Union[str, dict]): The Markdown string if markdown is set,
        otherwise the data with lowercased, underscore-separated keys.
    """
    msg = Printer(no_print=silent, pretty=not silent)
    exclude = exclude or []
    if model:
        title = f"Info about pipeline '{model}'"
        data = info_model(model, silent=silent)
    else:
        title = "Info about spaCy"
        data = info_spacy()

    # Snapshot the raw values before "Pipelines" is flattened to a string.
    raw_data = {}
    for key, value in data.items():
        raw_data[key.lower().replace(" ", "_")] = value

    if "Pipelines" in data and isinstance(data["Pipelines"], dict):
        pipeline_parts = (f"{n} ({v})" for n, v in data["Pipelines"].items())
        data["Pipelines"] = ", ".join(pipeline_parts)

    markdown_data = get_markdown(data, title=title, exclude=exclude)
    if markdown:
        if not silent:
            print(markdown_data)
        return markdown_data

    if not silent:
        table_data = {}
        for key, value in data.items():
            if key not in exclude:
                table_data[key] = value
        msg.table(table_data, title=title)
    return raw_data
Example #15
0
File: base.py Project: yajiez/utify
def spinner(text='Loading...', clean=False):
    """Animate a spinner in a child process while the caller's block runs.

    This is a generator with a single ``yield``.
    NOTE(review): presumably wrapped with ``contextlib.contextmanager``
    where it is defined; the decorator is not visible here -- confirm.

    Args:
        text: Message shown next to the spinner and used in the final
            success/failure line.
        clean: If true, erase the spinner line when done instead of
            printing a success message with the elapsed time.

    Raises:
        Whatever the wrapped block raises. For ``Exception`` subclasses a
        failure message is printed before re-raising.
    """
    printer = Printer()
    spinchars = '⠙⠹⠸⠼⠴⠦⠧⠇⠏'

    def spin(s):
        # Redraw the same line forever; the parent terminates this process.
        for char in itertools.cycle(spinchars):
            sys.stdout.write("\r\033[96m{} {}".format(char, s))
            sys.stdout.flush()
            time.sleep(0.1)

    stime = time.time()
    t = Process(target=spin, args=(text, ))
    t.start()
    try:
        try:
            yield
        finally:
            # Stop the child on every exit path, including
            # KeyboardInterrupt and generator close, so it cannot keep
            # running and writing to the terminal.
            t.terminate()
    except Exception:
        printer.fail(text + ' failed.')
        raise

    sys.stdout.write("\r")
    if clean:
        for _ in range(len(text) // 4 + 1):
            sys.stdout.write("\x1b[2K")
    else:
        time_used = strfsec(int(time.time() - stime))
        printer.good(f'{text} succeed in {time_used}.')
    sys.stdout.flush()
Example #16
0
def download_file(url: str, dest_filename: str) -> None:
    """ Download a file from the given url

    The body is streamed to disk in 64 KiB chunks with a progress bar.

    Parameters
    ----------
    url : str
        The url from which the file will be downloaded
    dest_filename : str
        The destination filename

    """
    msg_printer = Printer()
    block_size = 65536
    # stream=True defers the body download until iter_content is consumed;
    # the `with` block releases the connection afterwards.
    with requests.get(url, stream=True) as r:
        total_size = int(r.headers.get("content-length", 0))
        # True division so a trailing partial chunk is counted; None tells
        # tqdm the length is unknown when there is no content-length.
        n_chunks = math.ceil(total_size / block_size) if total_size else None
        with open(dest_filename, "wb") as f:
            for chunk in tqdm(
                    r.iter_content(chunk_size=block_size),
                    total=n_chunks,
                    desc=f"Downloading from {url}",
            ):
                if chunk:  # filter out keep-alive new chunks
                    f.write(chunk)
    msg_printer.good(f"Finished downloading {url} to {dest_filename}")
Example #17
0
def run_on_all_states(f, index_slice=None):
    """Run `f` (wrapped by `catch_errors`) for every state and print a report.

    Args:
        f: Task called once per state.
        index_slice: Optional index applied to the list of states to
            restrict which ones are processed.
    """
    states = list(us.STATES)
    if index_slice is not None:
        states = states[index_slice]

    run_task = catch_errors(f)
    results = []
    for state in states:
        results.append(run_task(state))

    successes = sum(outcome is Result.Success for outcome in results)
    errors = sum(outcome is Result.Error for outcome in results)

    # One row per state; a missing result is reported as "Error".
    names = list(states)
    outcomes = ["Error" if outcome is None else str(outcome) for outcome in results]
    rows = list(zip(names, outcomes))

    printer = Printer()
    printer.info("Final result:")
    printer.info(f"{successes} were created successfully. {errors} errored.")
    printer.table(rows, header=("State", "Created"), divider=True)
Example #18
0
def extract_tar(filename: str, destination_dir: str, mode="r"):
    """ Extracts tar, targz and other files

    Parameters
    ----------
    filename : str
        The tar zipped file
    destination_dir : str
        The destination directory in which the files should be placed
    mode : str
        A valid tar mode. You can refer to https://docs.python.org/3/library/tarfile.html
        for the different modes.

    Returns
    -------
    None
        A ``tarfile.ExtractError`` is reported through the printer
        instead of being raised.

    """
    msg_printer = Printer()
    try:
        with msg_printer.loading(
                f"Unzipping file {filename} to {destination_dir}"):
            stdout.flush()
            with tarfile.open(filename, mode) as t:
                # NOTE(review): extractall trusts the member paths stored in
                # the archive; do not use it on archives from untrusted
                # sources without validating the members first.
                t.extractall(destination_dir)

        msg_printer.good(
            f"Finished extraction {filename} to {destination_dir}")
    except tarfile.ExtractError:
        msg_printer.fail(
            f"Could not extract {filename} to {destination_dir}")
Example #19
0
def info_model(model: str, *, silent: bool = True) -> Dict[str, Any]:
    """Generate info about a specific model.

    model (str): Model name or path.
    silent (bool): Don't print anything, just return.
    RETURNS (dict): The model meta with a "source" entry added and the
        "accuracy", "performance" and "speed" entries left out.
    """
    msg = Printer(no_print=silent, pretty=not silent)
    model_path = util.get_package_path(model) if util.is_package(model) else Path(model)
    meta_path = model_path / "meta.json"
    if not meta_path.is_file():
        msg.fail("Can't find pipeline meta.json", meta_path, exits=1)
    meta = srsly.read_json(meta_path)

    # Record the resolved location when it differs from the given path.
    resolved = model_path.resolve()
    meta["source"] = str(resolved) if resolved != model_path else str(model_path)

    hidden_keys = ("accuracy", "performance", "speed")
    visible = {}
    for key, value in meta.items():
        if key not in hidden_keys:
            visible[key] = value
    return visible
Example #20
0
def convert(
    input_path: Union[str, Path],
    output_dir: Union[str, Path],
    *,
    file_type: str = "json",
    n_sents: int = 1,
    seg_sents: bool = False,
    model: Optional[str] = None,
    morphology: bool = False,
    merge_subtokens: bool = False,
    converter: str = "auto",
    ner_map: Optional[Path] = None,
    lang: Optional[str] = None,
    concatenate: bool = False,
    silent: bool = True,
    msg: Optional[Printer] = None,
) -> None:
    """Convert every input file found under input_path and write the result.

    input_path (Union[str, Path]): Input file or directory, walked with
        `walk_directory`.
    output_dir (Union[str, Path]): Output directory, or "-" to print the
        converted data to stdout.
    file_type (str): "json" serializes via `docs_to_json`; anything else is
        written as a DocBin.
    converter (str): Key into CONVERTERS selecting the conversion function.
    ner_map (Optional[Path]): JSON file read with srsly and handed to the
        converter.
    concatenate (bool): Chain the docs of all inputs into a single output.
    silent (bool): Suppress printing (also forwarded as `no_print`).
    msg (Optional[Printer]): Printer to report with; one is created from
        `silent` when omitted.
    The remaining keyword arguments are forwarded to the converter.
    """
    # Normalise once: the comparisons, `relative_to` and `.parts` below all
    # need a Path, but the signature also accepts a plain string.
    input_path = Path(input_path)
    if not msg:
        msg = Printer(no_print=silent)
    ner_map = srsly.read_json(ner_map) if ner_map is not None else None
    doc_files = []
    for input_loc in walk_directory(Path(input_path), converter):
        with input_loc.open("r", encoding="utf-8") as infile:
            input_data = infile.read()
        # Use converter function to convert data
        func = CONVERTERS[converter]
        docs = func(
            input_data,
            n_sents=n_sents,
            seg_sents=seg_sents,
            append_morphology=morphology,
            merge_subtokens=merge_subtokens,
            lang=lang,
            model=model,
            no_print=silent,
            ner_map=ner_map,
        )
        doc_files.append((input_loc, docs))
    if concatenate:
        all_docs = itertools.chain.from_iterable([docs for _, docs in doc_files])
        doc_files = [(input_path, all_docs)]
    for input_loc, docs in doc_files:
        if file_type == "json":
            data = [docs_to_json(docs)]
            len_docs = len(data)
        else:
            db = DocBin(docs=docs, store_user_data=True)
            len_docs = len(db)
            data = db.to_bytes()
        if output_dir == "-":
            _print_docs_to_stdout(data, file_type)
        else:
            if input_loc != input_path:
                # Mirror the input directory layout below output_dir.
                subpath = input_loc.relative_to(input_path)
                output_file = Path(output_dir) / subpath.with_suffix(f".{file_type}")
            else:
                output_file = Path(output_dir) / input_loc.parts[-1]
                output_file = output_file.with_suffix(f".{file_type}")
            _write_docs_to_file(data, output_file, file_type)
            msg.good(f"Generated output file ({len_docs} documents): {output_file}")
Example #21
0
def evaluate(
    model,
    data_path,
    gpu_id=-1,
    gold_preproc=False,
    displacy_path=None,
    displacy_limit=25,
    return_scores=False,
):
    """
    Evaluate a model on the data at data_path and print a results table.
    To render a sample of parses in a HTML file, set an output directory
    as the displacy_path argument. The scores are only returned if
    return_scores is set.
    """
    msg = Printer()
    util.fix_random_seed()
    if gpu_id >= 0:
        util.use_gpu(gpu_id)
    util.set_env_log(False)
    data_path = util.ensure_path(data_path)
    displacy_path = util.ensure_path(displacy_path)
    if not data_path.exists():
        msg.fail("Evaluation data not found", data_path, exits=1)
    if displacy_path and not displacy_path.exists():
        msg.fail("Visualization output directory not found", displacy_path, exits=1)
    corpus = GoldCorpus(data_path, data_path)
    nlp = util.load_model(model)
    dev_docs = list(corpus.dev_docs(nlp, gold_preproc=gold_preproc))

    # Only the scoring call itself is timed.
    begin = timer()
    scorer = nlp.evaluate(dev_docs, verbose=False)
    end = timer()
    elapsed = end - begin

    nwords = sum(len(pair[0]) for pair in dev_docs)
    results = {
        "Time": "%.2f s" % elapsed,
        "Words": nwords,
        "Words/s": "%.0f" % (nwords / elapsed),
    }
    accuracy_rows = (
        ("TOK", scorer.token_acc),
        ("POS", scorer.tags_acc),
        ("UAS", scorer.uas),
        ("LAS", scorer.las),
        ("NER P", scorer.ents_p),
        ("NER R", scorer.ents_r),
        ("NER F", scorer.ents_f),
    )
    for label, value in accuracy_rows:
        results[label] = "%.2f" % value
    msg.table(results, title="Results")

    if displacy_path:
        docs, _golds = zip(*dev_docs)
        pipeline = nlp.meta.get("pipeline", [])
        render_parses(
            docs,
            displacy_path,
            model_name=model,
            limit=displacy_limit,
            deps="parser" in pipeline,
            ents="ner" in pipeline,
        )
        msg.good("Generated {} parses as HTML".format(displacy_limit), displacy_path)
    if return_scores:
        return scorer.scores
Example #22
0
def convert(
    input_file,
    output_dir="-",
    file_type="jsonl",
    n_sents=1,
    morphology=False,
    converter="auto",
    lang=None,
):
    """
    Convert files into JSON format for use with train command and other
    experiment management functions. If no output_dir is specified, the data
    is written to stdout, so you can pipe them forward to a JSONL file:
    $ spacy convert some_file.conllu > some_file.jsonl

    With converter="auto" the converter is picked from the suffix of the
    input file. Invalid arguments are reported via msg.fail with exits=1.
    """
    msg = Printer()
    input_path = Path(input_file)
    if file_type not in FILE_TYPES:
        msg.fail(
            "Unknown file type: '{}'".format(file_type),
            "Supported file types: '{}'".format(", ".join(FILE_TYPES)),
            exits=1,
        )
    if file_type not in FILE_TYPES_STDOUT and output_dir == "-":
        # TODO: support msgpack via stdout in srsly?
        msg.fail(
            "Can't write .{} data to stdout.".format(file_type),
            "Please specify an output directory.",
            exits=1,
        )
    if not input_path.exists():
        msg.fail("Input file not found", input_path, exits=1)
    if output_dir != "-" and not Path(output_dir).exists():
        msg.fail("Output directory not found", output_dir, exits=1)
    if converter == "auto":
        # Suffix without the leading dot, e.g. ".conllu" -> "conllu".
        converter = input_path.suffix[1:]
    if converter not in CONVERTERS:
        msg.fail("Can't find converter for {}".format(converter), exits=1)
    # Use converter function to convert data
    func = CONVERTERS[converter]
    # Read inside a context manager so the file handle is closed promptly.
    with input_path.open("r", encoding="utf-8") as infile:
        input_data = infile.read()
    data = func(input_data, n_sents=n_sents, use_morphology=morphology, lang=lang)
    if output_dir != "-":
        # Export data to a file
        suffix = ".{}".format(file_type)
        output_file = Path(output_dir) / Path(input_path.parts[-1]).with_suffix(suffix)
        if file_type == "json":
            srsly.write_json(output_file, data)
        elif file_type == "jsonl":
            srsly.write_jsonl(output_file, data)
        elif file_type == "msg":
            srsly.write_msgpack(output_file, data)
        msg.good("Generated output file ({} documents)".format(len(data)), output_file)
    else:
        # Print to stdout
        if file_type == "json":
            srsly.write_json("-", data)
        elif file_type == "jsonl":
            srsly.write_jsonl("-", data)
Example #23
0
def get_build_formats(formats: List[str]) -> Tuple[bool, bool]:
    """Translate the requested build formats into (sdist, wheel) flags.

    formats (List[str]): Requested formats; each must be "sdist", "wheel"
        or "none". An unknown entry is reported via Printer.fail with
        exits=1.
    RETURNS (Tuple[bool, bool]): Whether to build an sdist and a wheel.
        Both are False if nothing was requested or "none" is among the
        formats.
    """
    supported = ["sdist", "wheel", "none"]
    for form in formats:
        if form in supported:
            continue
        msg = Printer()
        err = f"Unknown build format: {form}. Supported: {', '.join(supported)}"
        msg.fail(err, exits=1)
    if formats and "none" not in formats:
        return ("sdist" in formats, "wheel" in formats)
    return (False, False)
Example #24
0
def cached_path(path: pathlib.Path, url: str, unzip=True) -> pathlib.Path:
    """Return `path`, downloading a zip archive for it first when missing.

    If `path` already exists (as a file or directory) it is returned
    untouched. Otherwise `url` is downloaded to "<path>.zip" and, when
    `unzip` is true, extracted into the parent directory of `path`.

    Parameters
    ----------
    path : pathlib.Path
        The expected local location of the resource
    url : str
        The url to download from when `path` does not exist
    unzip : bool
        Whether to extract the downloaded archive

    Returns
    -------
    pathlib.Path
        `path`, on every code path
    """
    msg_printer = Printer()
    if path.is_file() or path.is_dir():
        msg_printer.info(f"{path} exists.")
        return path

    archive_name = f"{path}.zip"
    download_file(url=url, dest_filename=archive_name)

    if unzip:
        extract_zip(filename=archive_name, destination_dir=str(path.parent))

    # Return the path after a fresh download as well, as annotated.
    return path
Example #25
0
def convert_sectlabel_to_json(filename: str) -> Dict:
    """ Converts the sectlabel file into a more readable json-like dict

    Parameters
    ----------
    filename : str
        The sectlabel file name available at WING-NUS website

    Returns
    -------
    Dict[str, Any]
        A dict with a single key ``parse_sect`` holding one entry per
        non-empty line, each with

        text
            The first whitespace-separated column with ``|||`` replaced
            by spaces
        label
            The last whitespace-separated column
        file_no
            A file number, increased at every empty line
        line_count
            A running count of the non-empty lines read so far (it is not
            reset when a new file starts)

    """
    records = []
    output_json = {"parse_sect": records}
    file_count = 1
    line_count = 1
    msg_printer = Printer()

    with open(filename) as fp:
        for raw_line in tqdm(fp, desc="Converting SectLabel File to JSON"):
            line = raw_line.replace("\n", "")

            # An empty line separates two files: bump the file number.
            if line == "":
                file_count += 1
                continue

            columns = line.split()
            # The words of the content column are separated by "|||".
            text = columns[0].replace("|||", " ")
            records.append({
                "text": text,
                "label": columns[-1],
                "file_no": file_count,
                "line_count": line_count,
            })
            line_count += 1

    msg_printer.good("Finished converting sect label file to JSON")
    return output_json
Example #26
0
def convert_cli(
    # fmt: off
    input_path: str = Arg(..., help="Input file or directory", exists=True),
    output_dir: Path = Arg("-", help="Output directory. '-' for stdout.", allow_dash=True, exists=True),
    file_type: FileTypes = Opt("spacy", "--file-type", "-t", help="Type of data to produce"),
    n_sents: int = Opt(1, "--n-sents", "-n", help="Number of sentences per doc (0 to disable)"),
    seg_sents: bool = Opt(False, "--seg-sents", "-s", help="Segment sentences (for -c ner)"),
    model: Optional[str] = Opt(None, "--model", "--base", "-b", help="Trained spaCy pipeline for sentence segmentation to use as base (for --seg-sents)"),
    morphology: bool = Opt(False, "--morphology", "-m", help="Enable appending morphology to tags"),
    merge_subtokens: bool = Opt(False, "--merge-subtokens", "-T", help="Merge CoNLL-U subtokens"),
    converter: str = Opt("auto", "--converter", "-c", help=f"Converter: {tuple(CONVERTERS.keys())}"),
    ner_map: Optional[Path] = Opt(None, "--ner-map", "-nm", help="NER tag mapping (as JSON-encoded dict of entity types)", exists=True),
    lang: Optional[str] = Opt(None, "--lang", "-l", help="Language (if tokenizer required)"),
    concatenate: bool = Opt(None, "--concatenate", "-C", help="Concatenate output to a single file"),
    # fmt: on
):
    """
    Convert files into json or DocBin format for training. The resulting .spacy
    file can be used with the train command and other experiment management
    functions.

    If no output_dir is specified and the output format is JSON, the data
    is written to stdout, so you can pipe them forward to a JSON file:
    $ spacy convert some_file.conllu --file-type json > some_file.json

    DOCS: https://spacy.io/api/cli#convert
    """
    # NOTE: the docstring above is kept verbatim; it may be surfaced as the
    # command's help text by the CLI layer.

    # A FileTypes member is reduced to its plain string value; anything else
    # is passed through unchanged.
    file_type_name = file_type.value if isinstance(file_type, FileTypes) else file_type
    source = Path(input_path)
    # Collapse the dash placeholder path to the literal string "-".
    target = "-" if output_dir == Path("-") else output_dir
    # With "-" as the target the printer is created with no_print=True.
    to_stdout = target == "-"
    msg = Printer(no_print=to_stdout)
    verify_cli_args(msg, source, target, file_type_name, converter, ner_map)
    resolved_converter = _get_converter(msg, converter, source)
    convert(
        source,
        target,
        file_type=file_type_name,
        n_sents=n_sents,
        seg_sents=seg_sents,
        model=model,
        morphology=morphology,
        merge_subtokens=merge_subtokens,
        converter=resolved_converter,
        ner_map=ner_map,
        lang=lang,
        concatenate=concatenate,
        silent=to_stdout,
        msg=msg,
    )
def main(path, name="bert-base-uncased", lang="en"):
    """Build a PyTT pipeline for ``name`` and save it to ``path``.

    The pipeline is a ``PyTT_Language`` object to which a sentencizer, a
    word piecer and a token vector encoder are added, in that order. The
    latter two are created with ``from_pretrained`` using ``name``.
    Progress and the final pipe names / location are reported through a
    ``Printer``.
    """
    printer = Printer()
    printer.info(f"Creating model for '{name}' ({lang})")
    with printer.loading("Setting up the pipeline..."):
        pipeline = PyTT_Language(pytt_name=name, meta={"lang": lang})
        # Each component is created and added before the next one is built.
        sentencizer = pipeline.create_pipe("sentencizer")
        pipeline.add_pipe(sentencizer)
        wordpiecer = PyTT_WordPiecer.from_pretrained(pipeline.vocab, name)
        pipeline.add_pipe(wordpiecer)
        encoder = PyTT_TokenVectorEncoder.from_pretrained(pipeline.vocab, name)
        pipeline.add_pipe(encoder)
    printer.good("Initialized the model pipeline")
    pipeline.to_disk(path)
    printer.good(f"Saved '{name}' ({lang})")
    printer.text(f"Pipeline: {pipeline.pipe_names}")
    printer.text(f"Location: {path}")
Example #28
0
    def __init__(
            self,
            encoder: nn.Module,
            encoding_dim: int,
            num_classes: int,
            classification_layer_bias: bool = True,
            label_namespace: str = "label",
            datasets_manager: DatasetsManager = None,
            device: Union[torch.device, str] = torch.device("cpu"),
    ):
        """ Sets up a linear classification layer on top of the given encoder

        Parameters
        ----------
        encoder : nn.Module
            The encoder module stored on the classifier
        encoding_dim : int
            Input size of the linear classification layer
        num_classes : int
            Output size of the linear classification layer
        classification_layer_bias : bool
            Whether the linear classification layer has a bias term
        label_namespace : str
            Key used to look up the label numericalizer in
            ``datasets_manager.namespace_to_numericalizer``
        datasets_manager: DatasetsManager
            The datasets manager for the model. Although the default is
            ``None``, it is dereferenced during construction, so a real
            manager has to be passed.
        device: Union[torch.device, str]
            The device for the model; a string is converted to a
            ``torch.device``
        """
        super(SimpleClassifier, self).__init__()

        # Submodules are assigned in the same order as before: encoder,
        # classification layer, loss.
        self.encoder = encoder
        self.encoding_dim = encoding_dim
        self.num_classes = num_classes
        self.classification_layer_bias = classification_layer_bias
        self.classification_layer = nn.Linear(
            encoding_dim, num_classes, bias=classification_layer_bias
        )
        self._loss = CrossEntropyLoss()

        self.label_namespace = label_namespace
        self.datasets_manager = datasets_manager
        numericalizers = self.datasets_manager.namespace_to_numericalizer
        self.label_numericalizer = numericalizers[self.label_namespace]

        # Accept either a ready torch.device or its string spelling.
        if isinstance(device, str):
            device = torch.device(device)
        self.device = device

        self.msg_printer = Printer()
def main(path, name="bert-base-uncased", lang="en"):
    """Build a transformers pipeline for ``name``, save it and reload it.

    The pipeline is a ``TransformersLanguage`` object to which a
    sentencizer, a word piecer and a tok2vec component are added, in that
    order. The latter two are created with ``from_pretrained`` using
    ``name``. After saving to ``path`` the pipeline is read back with
    ``from_disk`` as a load check. Progress is reported through a
    ``Printer``.
    """
    printer = Printer()
    printer.info(f"Creating model for '{name}' ({lang})")
    with printer.loading("Setting up the pipeline..."):
        pipeline = TransformersLanguage(trf_name=name, meta={"lang": lang})
        # Each component is created and added before the next one is built.
        sentencizer = pipeline.create_pipe("sentencizer")
        pipeline.add_pipe(sentencizer)
        wordpiecer = TransformersWordPiecer.from_pretrained(pipeline.vocab, name)
        pipeline.add_pipe(wordpiecer)
        tok2vec = TransformersTok2Vec.from_pretrained(pipeline.vocab, name)
        pipeline.add_pipe(tok2vec)
    printer.good("Initialized the model pipeline")
    pipeline.to_disk(path)
    printer.good(f"Saved '{name}' ({lang})")
    printer.text(f"Pipeline: {pipeline.pipe_names}")
    printer.text(f"Location: {path}")
    with printer.loading("Verifying model loads..."):
        pipeline.from_disk(path)
    printer.good("Model loads!")
Example #30
0
def _resume_model(
    model: Model, resume_path: Path, epoch_resume: int, silent: bool = True
) -> None:
    """Load saved tok2vec weights into the model and log the resume epoch.

    model (Model): Object whose "tok2vec" reference receives the bytes read
        from resume_path via from_bytes.
    resume_path (Path): File holding the serialized weights; opened in
        binary mode.
    epoch_resume (int): Epoch that is logged when the file name does not
        match the "model<digits>.bin" pattern.
    silent (bool): Passed to Printer as no_print.

    NOTE(review): in the code visible here the epoch parsed from the file
    name is only rebound locally and logged; it is not handed back to the
    caller. Confirm that this is intended.
    """
    msg = Printer(no_print=silent)
    msg.info(f"Resume training tok2vec from: {resume_path}")
    with resume_path.open("rb") as file_:
        weights_data = file_.read()
        model.get_ref("tok2vec").from_bytes(weights_data)
    # Parse the epoch number from the given weight file
    model_name = re.search(r"model\d+\.bin", str(resume_path))
    if model_name:
        # Default weight file name so read epoch_start from it by cutting off 'model' and '.bin'
        # ([5:] drops the leading "model", [:-4] drops the trailing ".bin");
        # the +1 moves on to the epoch after the one named in the file.
        epoch_resume = int(model_name.group(0)[5:][:-4]) + 1
        msg.info(f"Resuming from epoch: {epoch_resume}")
    else:
        # No epoch in the file name: log the caller-supplied value as is.
        msg.info(f"Resuming from epoch: {epoch_resume}")