Example #1
def create_a_labeled_jsonl_dataset(pattern_file_path, jsonl_datafile,
                                   annotated_jsonl_datafile):
    nlp = spacy.load("en_core_web_sm")
    matcher = Matcher(nlp.vocab)
    with open(pattern_file_path, "r") as fr:
        for idx, line in enumerate(fr):
            pattern_json = srsly.json_loads(line)
            pattern = pattern_json["pattern"]
            label = pattern_json["label"]
            # pattern_name = "_".join([x["lower"] for x in pattern])
            matcher.add(label, [pattern])

    with open(annotated_jsonl_datafile, "w") as fw:
        with open(jsonl_datafile, "r") as fr:
            for idx, line in enumerate(fr):
                line_json = srsly.json_loads(line)
                line_nlp = nlp(line_json["text"])
                matches = matcher(line_nlp)
                spans = []
                for match_id, start, end in matches:
                    span = Span(line_nlp, start, end, label=match_id)
                    spans.append(span)
                spans = filter_spans(spans)
                if spans:
                    print(f"{idx}, spans({len(spans)}):{spans}")
                spans_dicts_list = _spans_to_spans_dicts_list(spans)

                line_json["spans"] = spans_dicts_list
                fw.write(json.dumps(line_json) + "\n")
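
A minimal sketch of the inputs this function expects, inferred from the keys it reads; the file names, label, pattern and text below are hypothetical:

    import srsly

    # Pattern file: one JSON object per line with "label" and "pattern" keys.
    srsly.write_jsonl("patterns.jsonl", [
        {"label": "TOOL", "pattern": [{"lower": "nmap"}]},
    ])
    # Data file: one JSON object per line with a "text" key.
    srsly.write_jsonl("data.jsonl", [
        {"text": "The scan was run with Nmap."},
    ])
    # The function above would then write an annotated JSONL file in which each
    # line gains a "spans" list describing the matched entities.
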
Example #2
    def from_bytes(self, bytes_data, exclude=tuple(), disable=None, **kwargs):
        """Load state from a binary string.

        bytes_data (bytes): The data to load from.
        exclude (list): Names of components or serialization fields to exclude.
        RETURNS (Language): The `Language` object.

        DOCS: https://spacy.io/api/language#from_bytes
        """
        if disable is not None:
            deprecation_warning(Warnings.W014)
            exclude = disable
        deserializers = OrderedDict()
        deserializers["meta.json"] = lambda b: self.meta.update(srsly.json_loads(b))
        deserializers["vocab"] = lambda b: self.vocab.from_bytes(b) and _fix_pretrained_vectors_name(self)
        deserializers["tokenizer"] = lambda b: self.tokenizer.from_bytes(b, exclude=["vocab"])
        for name, proc in self.pipeline:
            if name in exclude:
                continue
            if not hasattr(proc, "from_bytes"):
                continue
            deserializers[name] = lambda b, proc=proc: proc.from_bytes(b, exclude=["vocab"])
        exclude = util.get_serialization_exclude(deserializers, exclude, kwargs)
        util.from_bytes(bytes_data, deserializers, exclude)
        return self
Example #3
    def from_bytes(self, bytes_data, exclude=tuple(), disable=None, **kwargs):
        """Load state from a binary string.

        bytes_data (bytes): The data to load from.
        exclude (list): Names of components or serialization fields to exclude.
        RETURNS (Language): The `Language` object.

        DOCS: https://spacy.io/api/language#from_bytes
        """
        if disable is not None:
            warnings.warn(Warnings.W014, DeprecationWarning)
            exclude = disable
        deserializers = OrderedDict()
        deserializers["meta.json"] = lambda b: self.meta.update(
            srsly.json_loads(b))
        deserializers["vocab"] = lambda b: self.vocab.from_bytes(
            b) and _fix_pretrained_vectors_name(self)
        deserializers["tokenizer"] = lambda b: self.tokenizer.from_bytes(
            b, exclude=["vocab"])
        for name, proc in self.pipeline:
            if name in exclude:
                continue
            if not hasattr(proc, "from_bytes"):
                continue
            deserializers[name] = lambda b, proc=proc: proc.from_bytes(
                b, exclude=["vocab"])
        exclude = util.get_serialization_exclude(deserializers, exclude,
                                                 kwargs)
        util.from_bytes(bytes_data, deserializers, exclude)
        return self
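
Examples #2 and #3 implement the same serialization contract in slightly different spaCy versions, so a typical round trip looks like the sketch below (spacy.blank("en") stands in for any pipeline):

    import spacy

    nlp = spacy.blank("en")
    bytes_data = nlp.to_bytes()    # serialize the whole Language object

    nlp2 = spacy.blank("en")       # the target object is constructed first...
    nlp2.from_bytes(bytes_data)    # ...then meta, vocab, tokenizer and pipes are restored
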
Example #4
    def from_bytes(self, bytes_data, *, exclude=tuple()):
        """Load the pipe from a bytestring.

        exclude (Iterable[str]): String names of serialization fields to exclude.
        RETURNS (TrainablePipe): The loaded object.

        DOCS: https://spacy.io/api/entitylinker#from_bytes
        """
        self._validate_serialization_attrs()

        def load_model(b):
            try:
                self.model.from_bytes(b)
            except AttributeError:
                raise ValueError(Errors.E149) from None

        deserialize = {}
        if hasattr(self, "cfg") and self.cfg is not None:
            deserialize["cfg"] = lambda b: self.cfg.update(srsly.json_loads(b))
        deserialize["vocab"] = lambda b: self.vocab.from_bytes(b,
                                                               exclude=exclude)
        deserialize["kb"] = lambda b: self.kb.from_bytes(b)
        deserialize["model"] = load_model
        util.from_bytes(bytes_data, deserialize, exclude)
        return self
Example #5
def _parse_overrides(args: List[str], is_cli: bool = False) -> Dict[str, Any]:
    result = {}
    while args:
        opt = args.pop(0)
        err = f"Invalid config override '{opt}'"
        if opt.startswith("--"):  # new argument
            orig_opt = opt
            opt = opt.replace("--", "")
            if "." not in opt:
                if is_cli:
                    raise NoSuchOption(orig_opt)
                else:
                    msg.fail(f"{err}: can't override top-level sections",
                             exits=1)
            if "=" in opt:  # we have --opt=value
                opt, value = opt.split("=", 1)
                opt = opt.replace("-", "_")
            else:
                if not args or args[0].startswith("--"):  # flag with no value
                    value = "true"
                else:
                    value = args.pop(0)
            # Just like we do in the config, we're calling json.loads on the
            # values. But since they come from the CLI, it'd be unintuitive to
            # explicitly mark strings with escaped quotes. So we're working
            # around that here by falling back to a string if parsing fails.
            # TODO: improve logic to handle simple types like list of strings?
            try:
                result[opt] = srsly.json_loads(value)
            except ValueError:
                result[opt] = str(value)
        else:
            msg.fail(f"{err}: name should start with --", exits=1)
    return result
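
For illustration, a call like the following (the override names and values are made up) returns a dict whose values have already been JSON-parsed where possible; bare flags become True and unparseable values stay strings:

    overrides = _parse_overrides(
        ["--training.max_epochs=3", "--paths.train", "train.jsonl", "--training.gpu"]
    )
    # {"training.max_epochs": 3, "paths.train": "train.jsonl", "training.gpu": True}
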
Example #6
    def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        with open(file_path, "r") as smart_file:
            dataset = (srsly.json_loads(line)
                       for line in smart_file.readlines())

            for sample in dataset:
                text = sample["text"]
                words = [
                    text[t["span"]["start"]:t["span"]["end"]]
                    for t in sample["tokens"]
                ]
                tokens = [Token(w) for w in words]

                if self.tag_label == "ner":
                    entities = extract_entities(sample)
                elif self.tag_label == "relation":
                    relations = extract_relations_from_smart_sample(
                        sample, include_trigger=self.include_trigger)
                    entities = [
                        e for relation in relations for e in relation.entities
                    ]

                yield self.text_to_instance(tokens,
                                            entities=entities,
                                            relations=relations)
Example #7
def ner_jsonl2json(input_data,
                   lang=None,
                   n_sents=10,
                   use_morphology=False,
                   **_):
    if lang is None:
        raise ValueError("No --lang specified, but tokenization required")
    json_docs = []
    input_examples = [
        srsly.json_loads(line) for line in input_data.strip().split("\n")
    ]
    nlp = get_lang_class(lang)()
    sentencizer = nlp.create_pipe("sentencizer")
    for i, batch in enumerate(minibatch(input_examples, size=n_sents)):
        docs = []
        for record in batch:
            raw_text = record["text"]
            if "entities" in record:
                ents = record["entities"]
            else:
                ents = record["spans"]
            ents = [(e["start"], e["end"], e["label"]) for e in ents]
            doc = nlp.make_doc(raw_text)
            sentencizer(doc)
            spans = [doc.char_span(s, e, label=L) for s, e, L in ents]
            doc.ents = _cleanup_spans(spans)
            docs.append(doc)
        json_docs.append(docs_to_json(docs, id=i))
    return json_docs
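
The shape of one input line, inferred from the keys accessed above (the text and offsets are hypothetical); either an "entities" or a "spans" list of character offsets is accepted:

    record = {
        "text": "Apple is looking at buying U.K. startup",
        "spans": [{"start": 0, "end": 5, "label": "ORG"}],
    }
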
Example #8
def to_patterns(dataset=None, label=None, output_file=None):
    """
    Convert a list of seed phrases to a list of match patterns that can be used
    with ner.match. If no output file is specified, each pattern is printed
    so the recipe's output can be piped forward to ner.match.

    This is pretty much an exact copy of terms.to-patterns.
    The pattern for each example is just split on whitespace, so instead of:

        {"label": "SHOE_BRAND", "pattern": [{"LOWER": "new balance"}]}

    which won't match anything, you'll get:

        {"label": "SHOE_BRAND", "pattern": [{"LOWER": "new"}, {"LOWER": "balance"}]}
    """
    if label is None:
        prints(
            "--label is a required argument",
            "This is the label that will be assigned to all patterns "
            "created from terms collected in this dataset. ",
            exits=1,
            error=True,
        )

    DB = connect()

    def get_pattern(term, label):
        return {
            "label": label,
            "pattern": [{
                "lower": t.lower()
            } for t in term["text"].split()]
        }

    log("RECIPE: Starting recipe phrases.to-patterns", locals())
    if dataset is None:
        log("RECIPE: Reading input terms from sys.stdin")
        terms = (srsly.json_loads(line) for line in sys.stdin)
    else:
        if dataset not in DB:
            prints("Can't find dataset '{}'".format(dataset),
                   exits=1,
                   error=True)
        terms = DB.get_dataset(dataset)
        log("RECIPE: Reading {} input phrases from dataset {}".format(
            len(terms), dataset))
    if output_file:
        patterns = [
            get_pattern(term, label) for term in terms
            if term["answer"] == "accept"
        ]
        log("RECIPE: Generated {} patterns".format(len(patterns)))
        srsly.write_jsonl(output_file, patterns)
        prints("Exported {} patterns".format(len(patterns)), output_file)
    else:
        log("RECIPE: Outputting patterns")
        for term in terms:
            if term["answer"] == "accept":
                print(srsly.json_dumps(get_pattern(term, label)))
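
As a quick, self-contained check of the pattern format described in the docstring (the term dict and label below are made up):

    import srsly

    def get_pattern(term, label):
        return {"label": label,
                "pattern": [{"lower": t.lower()} for t in term["text"].split()]}

    term = {"text": "New Balance", "answer": "accept"}
    print(srsly.json_dumps(get_pattern(term, "SHOE_BRAND")))
    # -> {"label": "SHOE_BRAND", "pattern": [{"lower": "new"}, {"lower": "balance"}]}
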
Example #9
def main(model: str, origin_jsonl_path: str, label_by_model_jsonl_path: str):
    nlp = spacy.load(model)
    origin_jsonl_path = Path(origin_jsonl_path)
    label_by_model_jsonl_path = Path(label_by_model_jsonl_path)
    print(
        f"annotate with {model}:\n{origin_jsonl_path}->{label_by_model_jsonl_path}"
    )
    with open(label_by_model_jsonl_path, "w") as fw:
        with open(origin_jsonl_path, "r") as fr:
            for count_lines, _ in enumerate(tqdm(fr)):
                pass
        with open(origin_jsonl_path, "r") as fr:
            for idx, line in enumerate(tqdm(fr, total=count_lines + 1)):
                line_json = srsly.json_loads(line)
                line_json["spans"] = list(
                )  # delete any existing spans (labels)
                line_nlp = nlp(line_json["text"])
                spans = []
                for ent in line_nlp.ents:
                    span = Span(line_nlp, ent.start, ent.end, label=ent.label_)
                    spans.append(span)
                spans = filter_spans(
                    spans
                )  # likely redundant: the NER model should not produce overlapping spans
                # if spans:
                #     print(f"{idx}, spans({len(spans)}):{spans}")
                spans_dicts_list = _spans_to_spans_dicts_list(spans)
                line_json["spans"] = spans_dicts_list
                fw.write(json.dumps(line_json) + "\n")
Example #10
 def from_bytes(self, data: bytes, **kwargs) -> "JapaneseTokenizer":
     deserializers = {
         "cfg": lambda b: self._set_config(srsly.json_loads(b))
     }
     util.from_bytes(data, deserializers, [])
     self.tokenizer = try_sudachi_import(self.split_mode)
     return self
Example #11
def create_df_index_of_jsonl_by_meta_and_spans(annotated_jsonl_datafile):
    labels_dict = _get_mitre_labels_dict()
    list_of_labels = labels_dict.keys()
    df_columns = ["git_repo", "year", "filename"] + list(list_of_labels)
    print(df_columns)

    df_list_of_lists = []
    with open(annotated_jsonl_datafile, "r") as fr:
        for idx, line in enumerate(fr):
            if idx % 100000 == 0:
                print("create_df_index: line =", idx)
            line_json = srsly.json_loads(line)
            try:
                line_text = line_json["text"]
                git_repo = line_json["meta"]["git_repo"]
                year = line_json["meta"]["year"]
                filename = line_json["meta"]["filename"]
                spans = line_json["spans"]
            except ValueError:
                raise

            spans_dict = collections.defaultdict(list)
            for span in spans:
                span_text = str(span["text"]).lower()  # save just lower-case values
                span_label = span["label"]
                spans_dict[span_label].append(span_text)

            df_row_list = [git_repo, year, filename] + [spans_dict[label] for label in
                                                        df_columns[-len(list_of_labels):]]
            df_list_of_lists.append(df_row_list)

        df = pd.DataFrame(df_list_of_lists, columns=df_columns)
        _save_zip_of_df(df, annotated_jsonl_datafile)
        _save_zip_of_df(df, "index_df.zip")
Example #12
def create_df_index_of_jsonl_by_meta_and_spans(annotated_jsonl_datafile, dst_zip_path):
    list_of_labels = get_list_of_entities_types(annotated_jsonl_datafile)
    df_columns = ["git_repo", "year", "filename"] + list(list_of_labels)
    df_list_of_lists = []
    with open(annotated_jsonl_datafile, "r") as fr:
        for count_lines, _ in enumerate(fr):
            pass

    with open(annotated_jsonl_datafile, "r") as fr:
        for line in tqdm(fr, total=count_lines + 1):
            line_json = srsly.json_loads(line)
            try:
                line_text = line_json["text"]
                git_repo = line_json["meta"]["git_repo"]
                year = line_json["meta"]["year"]
                filename = line_json["meta"]["filename"]
                spans = line_json["spans"]
            except ValueError:
                raise

            spans_dict = collections.defaultdict(list)
            for span in spans:
                span_text = str(span["text"]).lower()
                span_label = span["label"]
                spans_dict[span_label].append(span_text)

            df_row_list = [git_repo, year, filename] + [spans_dict[label] for label in
                                                        df_columns[-len(list_of_labels):]]
            df_list_of_lists.append(df_row_list)

        df = pd.DataFrame(df_list_of_lists, columns=df_columns)
        save_zip_of_df(df, dst_zip_path)
Example #13
 def from_bytes(self, data, **kwargs):
     deserializers = OrderedDict(
         (
             ("cfg", lambda b: self._set_config(srsly.json_loads(b))),
         )
     )
     util.from_bytes(data, deserializers, [])
     self.tokenizer = try_sudachi_import(self.split_mode)
     return self
Example #14
 def before_read(self, parser, section, option, value):
     # If we're dealing with a quoted string as the interpolation value,
     # make sure we load and unquote it so we don't end up with '"value"'
     try:
         json_value = srsly.json_loads(value)
         if isinstance(json_value, str) and json_value not in JSON_EXCEPTIONS:
             value = json_value
     except Exception:
         pass
     return super().before_read(parser, section, option, value)
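
The unquoting above relies on the fact that a JSON-encoded string keeps its quotes until it is parsed; a quick check:

    import srsly

    assert srsly.json_loads('"value"') == "value"   # quoted string -> plain str
    assert srsly.json_loads("3.5") == 3.5           # numbers parse to numbers
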
Example #15
def _parse_override(value: Any) -> Any:
    # Just like we do in the config, we're calling json.loads on the
    # values. But since they come from the CLI, it'd be unintuitive to
    # explicitly mark strings with escaped quotes. So we're working
    # around that here by falling back to a string if parsing fails.
    # TODO: improve logic to handle simple types like list of strings?
    try:
        return srsly.json_loads(value)
    except ValueError:
        return str(value)
Example #16
def stack_exchange(loc=None):
    if loc is None:
        raise ValueError("No default path for Stack Exchange yet")
    rows = []
    with loc.open("r", encoding="utf8") as file_:
        for line in file_:
            eg = json_loads(line)
            rows.append(((eg["text1"], eg["text2"]), int(eg["label"])))
    train, dev = partition(rows, 0.7)
    return train, dev
Example #17
def get_list_of_entities_types(annotated_jsonl_datafile):
    entities_types = set()
    with open(annotated_jsonl_datafile, "r") as fr:
        for line in fr:
            line_json = srsly.json_loads(line)
            for span in line_json.get("spans"):
                entity_type = span.get("label")
                if entity_type:
                    entities_types.add(entity_type)
    return list(entities_types)
Example #18
 def __iter__(self):
     for file_path in self.iter_files():
         with bz2.open(str(file_path)) as f:
             for line in f:
                 line = line.strip()
                 if not line:
                     continue
                 comment = srsly.json_loads(line)
                 if self.is_valid(comment):
                     text = self.strip_tags(comment["body"])
                     yield {"text": text}
Example #19
    def from_bytes(self, bytes_data, *, exclude=tuple()):
        deserializers = {
            "cfg": lambda b: self.cfg.update(srsly.json_loads(b)),
            "model": lambda b: self.model.from_bytes(b),
            "vocab": lambda b: self.vocab.from_bytes(b, exclude=exclude),
            "trees": lambda b: self.trees.from_bytes(b),
        }

        util.from_bytes(bytes_data, deserializers, exclude)

        return self
Example #20
 def __iter__(self):
     for file_path in self.iter_files():
         with bz2.open(str(file_path)) as f:
             for line in f:
                 line = line.strip()
                 if not line:
                     continue
                 comment = srsly.json_loads(line)
                 if self.is_valid(comment):
                     text = self.strip_tags(comment["body"])
                     yield {"text": text}
Example #21
def read_snli(loc, label_scheme):
    rows = []
    with loc.open("r", encoding="utf8") as file_:
        for line in file_:
            eg = json_loads(line)
            label = eg["gold_label"]
            if label == "-":
                continue
            rows.append(
                ((eg["sentence1"], eg["sentence2"]), label_scheme[label]))
    return rows
Example #22
    def from_bytes(self, data, **kwargs):
        pkuseg_features_b = b""
        pkuseg_weights_b = b""
        pkuseg_processors_data = None

        def deserialize_pkuseg_features(b):
            nonlocal pkuseg_features_b
            pkuseg_features_b = b

        def deserialize_pkuseg_weights(b):
            nonlocal pkuseg_weights_b
            pkuseg_weights_b = b

        def deserialize_pkuseg_processors(b):
            nonlocal pkuseg_processors_data
            pkuseg_processors_data = srsly.msgpack_loads(b)

        deserializers = OrderedDict((
            ("cfg", lambda b: self._set_config(srsly.json_loads(b))),
            ("pkuseg_features", deserialize_pkuseg_features),
            ("pkuseg_weights", deserialize_pkuseg_weights),
            ("pkuseg_processors", deserialize_pkuseg_processors),
        ))
        util.from_bytes(data, deserializers, [])

        if pkuseg_features_b and pkuseg_weights_b:
            with tempfile.TemporaryDirectory() as tempdir:
                tempdir = Path(tempdir)
                with open(tempdir / "features.pkl", "wb") as fileh:
                    fileh.write(pkuseg_features_b)
                with open(tempdir / "weights.npz", "wb") as fileh:
                    fileh.write(pkuseg_weights_b)
                try:
                    import pkuseg
                except ImportError:
                    raise ImportError(
                        "pkuseg not installed. To use this model, " +
                        _PKUSEG_INSTALL_MSG)
                self.pkuseg_seg = pkuseg.pkuseg(str(tempdir))
            if pkuseg_processors_data:
                (
                    user_dict,
                    do_process,
                    common_words,
                    other_words,
                ) = pkuseg_processors_data
                self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(user_dict)
                self.pkuseg_seg.postprocesser.do_process = do_process
                self.pkuseg_seg.postprocesser.common_words = set(common_words)
                self.pkuseg_seg.postprocesser.other_words = set(other_words)

        return self
Example #23
def package_wikigraph(input_path: Path, output_path: Path, force: bool = None):
    """
    Generate an installable Python package for a `WikiGraph`.

    After packaging, "python setup.py sdist" must be run in the package directory,
    which will create a .tar.gz archive that can be installed via "pip install".

    Parameters
    ----------
    input_path : Path
        [description]
    output_path : Path
        [description]
    force : bool, optional
        [description], by default None
    """
    if not input_path or not input_path.exists():
        msg.fail("Can't locate graph data", input_path, exits=1)
    if not output_path:
        msg.fail("Output directory is missing", output_path, exits=1)
    if not output_path.exists():
        output_path.mkdir()
        msg.good("Created output directory: {}".format(output_path))
    meta_path = input_path / "meta.json"
    if not meta_path.exists():
        msg.fail("Can't find graph meta.json", meta_path, exits=1)
    meta = json_loads(meta_path.read_text())
    graph_fullname = meta["fullname"]
    package_path = output_path / graph_fullname
    if package_path.exists():
        if not force:
            msg.fail(
                "Package directory already exists",
                "Please delete the directory and try again, or use the "
                "`--force` flag to overwrite existing "
                "directories.",
                exits=1,
            )
        shutil.rmtree(package_path)
    package_path.mkdir()
    shutil.copy(meta_path, package_path)
    copy_tree(str(pkg_path), str(package_path))
    graph_name = meta["name"]
    rename(package_path / "graph-name", package_path / graph_name)
    module_path = package_path / graph_name
    copy_tree(str(input_path), str(module_path / graph_fullname))
    msg.good("Successfully created package {}".format(graph_name),
             package_path)
    msg.text(
        "To build the package, run `python setup.py sdist` in this directory.")
Example #24
    def from_bytes(self, data, **kwargs):
        pkuseg_data = {
            "features_b": b"",
            "weights_b": b"",
            "processors_data": None
        }

        def deserialize_pkuseg_features(b):
            pkuseg_data["features_b"] = b

        def deserialize_pkuseg_weights(b):
            pkuseg_data["weights_b"] = b

        def deserialize_pkuseg_processors(b):
            pkuseg_data["processors_data"] = srsly.msgpack_loads(b)

        deserializers = {
            "cfg": lambda b: self._set_config(srsly.json_loads(b)),
            "pkuseg_features": deserialize_pkuseg_features,
            "pkuseg_weights": deserialize_pkuseg_weights,
            "pkuseg_processors": deserialize_pkuseg_processors,
        }
        util.from_bytes(data, deserializers, [])

        if pkuseg_data["features_b"] and pkuseg_data["weights_b"]:
            with tempfile.TemporaryDirectory() as tempdir:
                tempdir = Path(tempdir)
                with open(tempdir / "features.msgpack", "wb") as fileh:
                    fileh.write(pkuseg_data["features_b"])
                with open(tempdir / "weights.npz", "wb") as fileh:
                    fileh.write(pkuseg_data["weights_b"])
                try:
                    import spacy_pkuseg
                except ImportError:
                    raise ImportError(
                        "spacy-pkuseg not installed. To use this model, " +
                        _PKUSEG_INSTALL_MSG) from None
                self.pkuseg_seg = spacy_pkuseg.pkuseg(str(tempdir))
            if pkuseg_data["processors_data"]:
                processors_data = pkuseg_data["processors_data"]
                (user_dict, do_process, common_words,
                 other_words) = processors_data
                self.pkuseg_seg.preprocesser = spacy_pkuseg.Preprocesser(
                    user_dict)
                self.pkuseg_seg.postprocesser.do_process = do_process
                self.pkuseg_seg.postprocesser.common_words = set(common_words)
                self.pkuseg_seg.postprocesser.other_words = set(other_words)

        return self
Example #25
def _read_inputs(loc: Union[Path, str], msg: Printer) -> Iterator[str]:
    if loc == "-":
        msg.info("Reading input from sys.stdin")
        file_ = sys.stdin
        file_ = (line.encode("utf8") for line in file_)
    else:
        input_path = Path(loc)
        if not input_path.exists() or not input_path.is_file():
            msg.fail("Not a valid input data file", loc, exits=1)
        msg.info(f"Using data from {input_path.parts[-1]}")
        file_ = input_path.open()  # type: ignore[assignment]
    for line in file_:
        data = srsly.json_loads(line)
        text = data["text"]
        yield text
Example #26
def _read_inputs(loc, msg):
    if loc == "-":
        msg.info("Reading input from sys.stdin")
        file_ = sys.stdin
        file_ = (line.encode("utf8") for line in file_)
    else:
        input_path = Path(loc)
        if not input_path.exists() or not input_path.is_file():
            msg.fail("Not a valid input data file", loc, exits=1)
        msg.info("Using data from {}".format(input_path.parts[-1]))
        file_ = input_path.open()
    for line in file_:
        data = srsly.json_loads(line)
        text = data["text"]
        yield text
Example #27
    def from_bytes(
        self,
        bytes_data: bytes,
        *,
        exclude: Iterable[str] = SimpleFrozenList()) -> "SpanRuler":
        """Load the span ruler from a bytestring.

        bytes_data (bytes): The bytestring to load.
        RETURNS (SpanRuler): The loaded span ruler.

        DOCS: https://spacy.io/api/spanruler#from_bytes
        """
        self.clear()
        deserializers = {
            "patterns": lambda b: self.add_patterns(srsly.json_loads(b)),
        }
        util.from_bytes(bytes_data, deserializers, exclude)
        return self
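
A round-trip sketch for the method above (assumes spaCy v3.3+, where the "span_ruler" factory and this pattern format are available):

    import spacy

    nlp = spacy.blank("en")
    ruler = nlp.add_pipe("span_ruler")
    ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}])
    data = ruler.to_bytes()

    nlp2 = spacy.blank("en")
    ruler2 = nlp2.add_pipe("span_ruler")
    ruler2.from_bytes(data)    # patterns are restored via add_patterns
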
Example #28
 def interpret_config(self, config: Union[Dict[str, Any], "ConfigParser"]):
     """Interpret a config, parse nested sections and parse the values
     as JSON. Mostly used internally and modifies the config in place.
     """
     for section, values in config.items():
         if section == "DEFAULT":
             # Skip [DEFAULT] section for now since it causes validation
             # errors and we don't want to use it
             continue
         parts = section.split(".")
         node = self
         for part in parts:
             node = node.setdefault(part, {})
         if not isinstance(node, dict):
             # Happens if both value *and* subsection were defined for a key
             err = [{"loc": parts, "msg": "found conflicting values"}]
             raise ConfigValidationError(
                 f"{self}\n{({part: dict(values)})}", err)
         for key, value in values.items():
             node[key] = srsly.json_loads(config.get(section, key))
Example #29
def setup_package():
    root_path = Path(__file__).parent.absolute()
    meta_path = root_path / "meta.json"
    meta = json_loads(meta_path.read_text())
    graph_name = meta["name"]
    pkg_data = list_files(path.join(graph_name, meta["fullname"]))
    requirements = [f"spikex{meta['spikex_version']}"]
    copy(meta_path, path.join(graph_name))
    setup(
        name=graph_name,
        description=meta["description"],
        author=meta["author"],
        author_email=meta["email"],
        url=meta["url"],
        version=meta["version"],
        license=meta["license"],
        packages=[graph_name],
        package_data={graph_name: pkg_data},
        install_requires=requirements,
        zip_safe=False,
    )
Example #30
def dataset():
    smart_training_data_path = "https://fh-aachen.sciebo.de/s/MjcrDC3gDjwU7Vd/download"
    smart_validation_data_path = "https://fh-aachen.sciebo.de/s/3GpXCZLhjwm2SJU/download"
    smart_test_data_path = "https://fh-aachen.sciebo.de/s/9ghU4Qi1azUMFPW/download"

    data_paths = [
        smart_training_data_path, smart_validation_data_path,
        smart_test_data_path
    ]

    dataset = []

    for file_path in data_paths:

        file_path = cached_path(file_path)

        with open(file_path, "r") as file:

            dataset += [srsly.json_loads(line) for line in file.readlines()]

    return dataset
Example #31
    def _read(self, file_path: str) -> Iterable[Instance]:

        file_path = cached_path(file_path)

        with open(file_path, "r") as smart_file:
            logger.info("Reading instances from lines in file at: %s",
                        file_path)

            dataset = (srsly.json_loads(line)
                       for line in smart_file.readlines())

            for sample in dataset:
                text = sample["text"]
                words = [
                    text[t["span"]["start"]:t["span"]["end"]]
                    for t in sample["tokens"]
                ]
                tokens = [Token(w) for w in words]

                entities = extract_entities(sample)
                entity_tags = combine_spans_to_entity_tags(
                    entities, len(words))

                relations = extract_relations_from_smart_sample(
                    sample,
                    only_mandatory=False,
                    include_trigger=self.include_trigger)
                relation_tags = combined_relation_tags(
                    relations,
                    len(words),
                    include_mode=self._include_mode,
                )

                idx = sample["id"]
                yield self.text_to_instance(tokens,
                                            relation_tags=relation_tags,
                                            relations=relations,
                                            entity_tags=entity_tags,
                                            entities=entities,
                                            idx=idx)
Example #32
 def interpret_config(self, config: Union[Dict[str, Any], "ConfigParser"]):
     """Interpret a config, parse nested sections and parse the values
     as JSON. Mostly used internally and modifies the config in place.
     """
     # Sort sections by depth, so that we can iterate breadth-first. This
     # allows us to check that we're not expanding an undefined block.
     get_depth = lambda item: len(item[0].split("."))
     for section, values in sorted(config.items(), key=get_depth):
         if section == "DEFAULT":
             # Skip [DEFAULT] section for now since it causes validation
             # errors and we don't want to use it
             continue
         parts = section.split(".")
         node = self
         for part in parts[:-1]:
             if part == "*":
                 node = node.setdefault(part, {})
             elif part not in node:
                 err_title = f"Error parsing config section. Perhaps a section name is wrong?"
                 err = [{
                     "loc": parts,
                     "msg": f"Section '{part}' is not defined"
                 }]
                 raise ConfigValidationError(self, err, message=err_title)
             else:
                 node = node[part]
         node = node.setdefault(parts[-1], {})
         if not isinstance(node, dict):
             # Happens if both value *and* subsection were defined for a key
             err = [{"loc": parts, "msg": "found conflicting values"}]
             raise ConfigValidationError(
                 f"{self}\n{({part: dict(values)})}", err)
         for key, value in values.items():
             try:
                 node[key] = srsly.json_loads(config.get(section, key))
             except Exception as e:
                 raise ValueError(
                     f"Error reading key '{key}' in section '{section}': {e}"
                 )
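
To see the effect of parsing every value as JSON, a small config can be loaded through thinc's Config.from_str (a sketch; the section and key names below are made up), and each value comes back as the JSON type it encodes:

    import textwrap
    from thinc.api import Config

    cfg = Config().from_str(textwrap.dedent("""
        [training]
        max_epochs = 3
        labels = ["PERSON", "ORG"]

        [training.optimizer]
        learn_rate = 0.001
    """))
    assert cfg["training"]["max_epochs"] == 3
    assert cfg["training"]["labels"] == ["PERSON", "ORG"]
    assert cfg["training"]["optimizer"]["learn_rate"] == 0.001
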
Example #33
def ner_jsonl2json(input_data, lang=None, n_sents=10, use_morphology=False):
    if lang is None:
        raise ValueError("No --lang specified, but tokenization required")
    json_docs = []
    input_examples = [srsly.json_loads(line) for line in input_data.strip().split("\n")]
    nlp = get_lang_class(lang)()
    sentencizer = nlp.create_pipe("sentencizer")
    for i, batch in enumerate(minibatch(input_examples, size=n_sents)):
        docs = []
        for record in batch:
            raw_text = record["text"]
            if "entities" in record:
                ents = record["entities"]
            else:
                ents = record["spans"]
            ents = [(e["start"], e["end"], e["label"]) for e in ents]
            doc = nlp.make_doc(raw_text)
            sentencizer(doc)
            spans = [doc.char_span(s, e, label=L) for s, e, L in ents]
            doc.ents = _cleanup_spans(spans)
            docs.append(doc)
        json_docs.append(docs_to_json(docs, id=i))
    return json_docs