def create_a_labeled_jsonl_dataset(pattern_file_path, jsonl_datafile, annotated_jsonl_datafile):
    """Annotate a JSONL dataset with the matches of rule-based patterns.

    Each line of ``pattern_file_path`` is parsed as JSON and must provide a
    ``"pattern"`` and a ``"label"``; every pattern is registered on a spaCy
    ``Matcher`` under its label. Each record of ``jsonl_datafile`` is then
    run through the ``en_core_web_sm`` pipeline and written, one JSON object
    per line, to ``annotated_jsonl_datafile`` with its ``"spans"`` key set
    to the filtered matches.
    """
    nlp = spacy.load("en_core_web_sm")
    matcher = Matcher(nlp.vocab)

    with open(pattern_file_path, "r") as patterns_in:
        for raw_pattern in patterns_in:
            entry = srsly.json_loads(raw_pattern)
            token_pattern = entry["pattern"]
            matcher.add(entry["label"], [token_pattern])

    with open(annotated_jsonl_datafile, "w") as annotated_out, \
            open(jsonl_datafile, "r") as records_in:
        for line_no, raw_record in enumerate(records_in):
            record = srsly.json_loads(raw_record)
            doc = nlp(record["text"])
            candidates = [
                Span(doc, start, end, label=match_id)
                for match_id, start, end in matcher(doc)
            ]
            kept = filter_spans(candidates)
            if kept:
                # Progress output: only records with at least one match.
                print(f"{line_no}, spans({len(kept)}):{kept}")
            record["spans"] = _spans_to_spans_dicts_list(kept)
            annotated_out.write(json.dumps(record) + "\n")
def from_bytes(self, bytes_data, exclude=tuple(), disable=None, **kwargs):
    """Load state from a binary string.

    bytes_data (bytes): The data to load from.
    exclude (list): Names of components or serialization fields to exclude.
    disable (list): If not None, warning W014 is issued and this value is
        used in place of `exclude`.
    RETURNS (Language): The `Language` object.

    DOCS: https://spacy.io/api/language#from_bytes
    """
    if disable is not None:
        deprecation_warning(Warnings.W014)
        exclude = disable

    def load_meta(b):
        return self.meta.update(srsly.json_loads(b))

    def load_vocab(b):
        # The vectors-name fix only runs if the vocab loader returns
        # something truthy.
        return self.vocab.from_bytes(b) and _fix_pretrained_vectors_name(self)

    def load_tokenizer(b):
        return self.tokenizer.from_bytes(b, exclude=["vocab"])

    deserializers = OrderedDict(
        [
            ("meta.json", load_meta),
            ("vocab", load_vocab),
            ("tokenizer", load_tokenizer),
        ]
    )
    for name, proc in self.pipeline:
        if name not in exclude and hasattr(proc, "from_bytes"):
            # `proc=proc` binds the current component to this callback.
            deserializers[name] = lambda b, proc=proc: proc.from_bytes(
                b, exclude=["vocab"]
            )
    exclude = util.get_serialization_exclude(deserializers, exclude, kwargs)
    util.from_bytes(bytes_data, deserializers, exclude)
    return self
def from_bytes(self, bytes_data, exclude=tuple(), disable=None, **kwargs):
    """Load state from a binary string.

    bytes_data (bytes): The data to load from.
    exclude (list): Names of components or serialization fields to exclude.
    disable (list): If not None, a `DeprecationWarning` (W014) is raised via
        `warnings.warn` and this value is used in place of `exclude`.
    RETURNS (Language): The `Language` object.

    DOCS: https://spacy.io/api/language#from_bytes
    """
    if disable is not None:
        warnings.warn(Warnings.W014, DeprecationWarning)
        exclude = disable

    def make_component_loader(component):
        # One callback per component, each bound to its own `component`.
        return lambda b: component.from_bytes(b, exclude=["vocab"])

    deserializers = OrderedDict()
    deserializers["meta.json"] = lambda b: self.meta.update(srsly.json_loads(b))
    deserializers["vocab"] = lambda b: (
        self.vocab.from_bytes(b) and _fix_pretrained_vectors_name(self)
    )
    deserializers["tokenizer"] = lambda b: self.tokenizer.from_bytes(
        b, exclude=["vocab"]
    )
    loadable = [
        (name, proc)
        for name, proc in self.pipeline
        if name not in exclude and hasattr(proc, "from_bytes")
    ]
    for name, proc in loadable:
        deserializers[name] = make_component_loader(proc)

    exclude = util.get_serialization_exclude(deserializers, exclude, kwargs)
    util.from_bytes(bytes_data, deserializers, exclude)
    return self
def from_bytes(self, bytes_data, *, exclude=tuple()):
    """Load the pipe from a bytestring.

    bytes_data (bytes): The serialized data to load.
    exclude (Iterable[str]): String names of serialization fields to exclude.
    RETURNS (TrainablePipe): The loaded object.

    DOCS: https://spacy.io/api/entitylinker#from_bytes
    """
    self._validate_serialization_attrs()

    def load_cfg(b):
        return self.cfg.update(srsly.json_loads(b))

    def load_vocab(b):
        return self.vocab.from_bytes(b, exclude=exclude)

    def load_kb(b):
        return self.kb.from_bytes(b)

    def load_model(b):
        try:
            self.model.from_bytes(b)
        except AttributeError:
            # `from None` drops the AttributeError from the traceback.
            raise ValueError(Errors.E149) from None

    deserialize = {}
    # The config is only restored when the pipe actually carries one.
    if getattr(self, "cfg", None) is not None:
        deserialize["cfg"] = load_cfg
    deserialize.update(vocab=load_vocab, kb=load_kb, model=load_model)
    util.from_bytes(bytes_data, deserialize, exclude)
    return self
def _parse_overrides(args: List[str], is_cli: bool = False) -> Dict[str, Any]:
    """Parse command-line style config overrides into a dict.

    args (List[str]): Arguments such as ``["--training.dropout", "0.2"]`` or
        ``["--training.dropout=0.2"]``. The list is consumed in place.
    is_cli (bool): If True, an override without a "." in it raises
        ``NoSuchOption``; otherwise it is reported through ``msg.fail``.
    RETURNS (Dict[str, Any]): Option name mapped to its value. Values are
        parsed as JSON and kept as plain strings if that fails.
    """
    result = {}
    while args:
        opt = args.pop(0)
        err = f"Invalid config override '{opt}'"
        if opt.startswith("--"):  # new argument
            orig_opt = opt
            # Strip only the leading marker. str.replace("--", "") would
            # also delete any "--" later in the argument, e.g. inside an
            # inline "=value".
            opt = opt[2:]
            if "." not in opt:
                if is_cli:
                    raise NoSuchOption(orig_opt)
                else:
                    msg.fail(f"{err}: can't override top-level sections", exits=1)
            if "=" in opt:  # we have --opt=value
                opt, value = opt.split("=", 1)
                opt = opt.replace("-", "_")
            else:
                if not args or args[0].startswith("--"):  # flag with no value
                    value = "true"
                else:
                    value = args.pop(0)
            # Just like we do in the config, we're calling json.loads on the
            # values. But since they come from the CLI, it'd be unintuitive to
            # explicitly mark strings with escaped quotes. So we're working
            # around that here by falling back to a string if parsing fails.
            # TODO: improve logic to handle simple types like list of strings?
            try:
                result[opt] = srsly.json_loads(value)
            except ValueError:
                result[opt] = str(value)
        else:
            msg.fail(f"{err}: name should start with --", exits=1)
    return result
def _read(self, file_path: str):
    """Yield one instance per JSON line of the file at `file_path`.

    Each line is parsed as JSON. Tokens are cut out of the sample's
    ``"text"`` using the ``"span"`` offsets of its ``"tokens"``. Depending on
    ``self.tag_label`` the instance is built from entities only (``"ner"``)
    or from relations plus the entities they contain (``"relation"``).

    Raises ValueError for any other ``self.tag_label``.
    """
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)
    with open(file_path, "r") as smart_file:
        for line in smart_file:
            sample = srsly.json_loads(line)
            text = sample["text"]
            words = [
                text[t["span"]["start"]:t["span"]["end"]]
                for t in sample["tokens"]
            ]
            tokens = [Token(w) for w in words]
            if self.tag_label == "ner":
                entities = extract_entities(sample)
                # Previously `relations` was left unbound here, so the yield
                # below raised UnboundLocalError.
                # NOTE(review): assumes text_to_instance treats
                # relations=None as "no relations" - confirm.
                relations = None
            elif self.tag_label == "relation":
                relations = extract_relations_from_smart_sample(
                    sample, include_trigger=self.include_trigger)
                entities = [
                    e for relation in relations for e in relation.entities
                ]
            else:
                raise ValueError(
                    f"Unsupported tag_label {self.tag_label!r}; "
                    "expected 'ner' or 'relation'")
            yield self.text_to_instance(tokens,
                                        entities=entities,
                                        relations=relations)
def ner_jsonl2json(input_data, lang=None, n_sents=10, use_morphology=False, **_):
    """Convert JSONL records with character-offset entities via docs_to_json.

    input_data (str): JSONL text. Every record needs a "text" key and either
        an "entities" or a "spans" list whose items have "start", "end" and
        "label". Blank lines are ignored.
    lang (str): Language code used to create the tokenizing pipeline.
    n_sents (int): Number of records grouped into one output document.
    use_morphology (bool): Accepted but not used by this converter.
    RETURNS (list): One `docs_to_json` result per batch of records.

    Raises ValueError if `lang` is None.
    """
    if lang is None:
        raise ValueError("No --lang specified, but tokenization required")
    # Blank lines (and therefore empty input) are not valid JSON, so they
    # are skipped instead of being handed to the parser.
    input_examples = [
        srsly.json_loads(line)
        for line in input_data.strip().split("\n")
        if line.strip()
    ]
    nlp = get_lang_class(lang)()
    sentencizer = nlp.create_pipe("sentencizer")
    json_docs = []
    for i, batch in enumerate(minibatch(input_examples, size=n_sents)):
        docs = []
        for record in batch:
            raw_text = record["text"]
            # "entities" takes precedence over "spans" when both exist.
            ents = record["entities"] if "entities" in record else record["spans"]
            ents = [(e["start"], e["end"], e["label"]) for e in ents]
            doc = nlp.make_doc(raw_text)
            sentencizer(doc)
            spans = [doc.char_span(s, e, label=L) for s, e, L in ents]
            doc.ents = _cleanup_spans(spans)
            docs.append(doc)
        json_docs.append(docs_to_json(docs, id=i))
    return json_docs
def to_patterns(dataset=None, label=None, output_file=None):
    """
    Convert a list of seed phrases to a list of match patterns that can be
    used with ner.match. If no output file is specified, each pattern is
    printed so the recipe's output can be piped forward to ner.match.

    This is pretty much an exact copy of terms.to-patterns.
    The pattern for each example is split on whitespace, so instead of a
    single-token pattern such as
        {"label": "SHOE_BRAND", "pattern": [{"lower": "new balance"}]}
    which won't match anything, you'll get one token per word:
        {"label": "SHOE_BRAND", "pattern": [{"lower": "new"}, {"lower": "balance"}]}
    """
    if label is None:
        prints(
            "--label is a required argument",
            "This is the label that will be assigned to all patterns "
            "created from terms collected in this dataset. ",
            exits=1,
            error=True,
        )

    DB = connect()

    def get_pattern(term, label):
        token_specs = [{"lower": token.lower()} for token in term["text"].split()]
        return {"label": label, "pattern": token_specs}

    # NOTE: locals() is logged here, so the names defined above are part of
    # the log output.
    log("RECIPE: Starting recipe phrases.to-patterns", locals())
    if dataset is not None:
        if dataset not in DB:
            prints(f"Can't find dataset '{dataset}'", exits=1, error=True)
        terms = DB.get_dataset(dataset)
        log(f"RECIPE: Reading {len(terms)} input phrases from dataset {dataset}")
    else:
        log("RECIPE: Reading input terms from sys.stdin")
        terms = (srsly.json_loads(line) for line in sys.stdin)

    # Lazily keep only the terms that were accepted.
    accepted = (term for term in terms if term["answer"] == "accept")
    if not output_file:
        log("RECIPE: Outputting patterns")
        for term in accepted:
            print(srsly.json_dumps(get_pattern(term, label)))
        return
    patterns = [get_pattern(term, label) for term in accepted]
    log(f"RECIPE: Generated {len(patterns)} patterns")
    srsly.write_jsonl(output_file, patterns)
    prints(f"Exported {len(patterns)} patterns", output_file)
def main(model: str, origin_jsonl_path: str, label_by_model_jsonl_path: str):
    """Re-label a JSONL dataset with the entities predicted by a spaCy model.

    Every record of `origin_jsonl_path` is parsed as JSON, its "text" is run
    through the pipeline loaded from `model`, and the record is written to
    `label_by_model_jsonl_path` with "spans" replaced by the model's
    entities. Any spans already present in the input are discarded.
    """
    nlp = spacy.load(model)
    origin_jsonl_path = Path(origin_jsonl_path)
    label_by_model_jsonl_path = Path(label_by_model_jsonl_path)
    print(
        f"annotate with {model}:\n{origin_jsonl_path}->{label_by_model_jsonl_path}"
    )
    with open(label_by_model_jsonl_path, "w") as fw:
        # First pass: count the lines so the progress bar of the second pass
        # knows its total. Counting with sum() also works for an empty file,
        # where the previous enumerate-based counter was never assigned.
        with open(origin_jsonl_path, "r") as fr:
            total_lines = sum(1 for _ in tqdm(fr))
        with open(origin_jsonl_path, "r") as fr:
            for line in tqdm(fr, total=total_lines):
                line_json = srsly.json_loads(line)
                line_nlp = nlp(line_json["text"])
                spans = [
                    Span(line_nlp, ent.start, ent.end, label=ent.label_)
                    for ent in line_nlp.ents
                ]
                # Kept as a safety net; the NER model should not output
                # problematic spans.
                spans = filter_spans(spans)
                # Overwrites (i.e. deletes) any existing spans (labels).
                line_json["spans"] = _spans_to_spans_dicts_list(spans)
                fw.write(json.dumps(line_json) + "\n")
def from_bytes(self, data: bytes, **kwargs) -> "JapaneseTokenizer":
    """Restore the tokenizer from a bytestring.

    The "cfg" entry of `data` is parsed as JSON and applied through
    `_set_config`; afterwards `self.tokenizer` is re-created with
    `try_sudachi_import(self.split_mode)`. Extra keyword arguments are
    ignored. Returns `self`.
    """

    def load_cfg(b):
        return self._set_config(srsly.json_loads(b))

    util.from_bytes(data, {"cfg": load_cfg}, [])
    self.tokenizer = try_sudachi_import(self.split_mode)
    return self
def create_df_index_of_jsonl_by_meta_and_spans(annotated_jsonl_datafile):
    """Build a per-record index DataFrame of span texts and save it zipped.

    Every line of `annotated_jsonl_datafile` is parsed as JSON and must have
    "meta" (with "git_repo", "year", "filename") and "spans" (items with
    "text" and "label"). The resulting frame has the three meta columns plus
    one column per key of `_get_mitre_labels_dict()`; each label cell holds
    the list of lower-cased span texts carrying that label. The frame is
    passed to `_save_zip_of_df` twice: once with `annotated_jsonl_datafile`
    and once with "index_df.zip".
    """
    label_names = list(_get_mitre_labels_dict().keys())
    df_columns = ["git_repo", "year", "filename"] + label_names
    print(df_columns)
    rows = []
    with open(annotated_jsonl_datafile, "r") as fr:
        for idx, line in enumerate(fr):
            if idx % 100000 == 0:
                print("create_df_index: line =", idx)
            line_json = srsly.json_loads(line)
            meta = line_json["meta"]
            texts_by_label = collections.defaultdict(list)
            for span in line_json["spans"]:
                # save just lower-case values
                texts_by_label[span["label"]].append(str(span["text"]).lower())
            # Iterate the label names directly: slicing the column list with
            # [-len(labels):] returns the WHOLE list when there are no labels.
            rows.append(
                [meta["git_repo"], meta["year"], meta["filename"]]
                + [texts_by_label[label] for label in label_names]
            )
    df = pd.DataFrame(rows, columns=df_columns)
    _save_zip_of_df(df, annotated_jsonl_datafile)
    _save_zip_of_df(df, "index_df.zip")
def create_df_index_of_jsonl_by_meta_and_spans(annotated_jsonl_datafile, dst_zip_path):
    """Build a per-record index DataFrame of span texts and save it zipped.

    Every line of `annotated_jsonl_datafile` is parsed as JSON and must have
    "meta" (with "git_repo", "year", "filename") and "spans" (items with
    "text" and "label"). The resulting frame has the three meta columns plus
    one column per label returned by `get_list_of_entities_types`; each
    label cell holds the list of lower-cased span texts carrying that label.
    The frame is handed to `save_zip_of_df` together with `dst_zip_path`.
    """
    label_names = list(get_list_of_entities_types(annotated_jsonl_datafile))
    df_columns = ["git_repo", "year", "filename"] + label_names
    # Count the lines up front for the progress bar. sum() yields 0 for an
    # empty file, where an enumerate-based counter would stay unassigned.
    with open(annotated_jsonl_datafile, "r") as fr:
        line_count = sum(1 for _ in fr)
    rows = []
    with open(annotated_jsonl_datafile, "r") as fr:
        for line in tqdm(fr, total=line_count):
            line_json = srsly.json_loads(line)
            meta = line_json["meta"]
            texts_by_label = collections.defaultdict(list)
            for span in line_json["spans"]:
                texts_by_label[span["label"]].append(str(span["text"]).lower())
            # Iterate the label names directly: slicing the column list with
            # [-len(labels):] returns the WHOLE list when there are no labels.
            rows.append(
                [meta["git_repo"], meta["year"], meta["filename"]]
                + [texts_by_label[label] for label in label_names]
            )
    df = pd.DataFrame(rows, columns=df_columns)
    save_zip_of_df(df, dst_zip_path)
def from_bytes(self, data, **kwargs):
    """Restore the tokenizer from a bytestring.

    The "cfg" entry of `data` is parsed as JSON and applied through
    `_set_config`; afterwards `self.tokenizer` is re-created with
    `try_sudachi_import(self.split_mode)`. Extra keyword arguments are
    ignored. Returns `self`.
    """

    def load_cfg(b):
        return self._set_config(srsly.json_loads(b))

    deserializers = OrderedDict()
    deserializers["cfg"] = load_cfg
    util.from_bytes(data, deserializers, [])
    self.tokenizer = try_sudachi_import(self.split_mode)
    return self
def before_read(self, parser, section, option, value):
    """Unquote JSON string values before delegating to the parent class.

    If `value` parses as a JSON string that is not listed in
    `JSON_EXCEPTIONS`, the unquoted string is passed on instead, so that an
    interpolated value does not end up as '"value"'. Anything that fails to
    parse is passed on unchanged.
    """
    try:
        parsed = srsly.json_loads(value)
        use_parsed = isinstance(parsed, str) and parsed not in JSON_EXCEPTIONS
    except Exception:
        # Best effort only: values that are not valid JSON stay as they are.
        use_parsed = False
    if use_parsed:
        value = parsed
    return super().before_read(parser, section, option, value)
def _parse_override(value: Any) -> Any:
    """Parse a single override value as JSON, falling back to a string."""
    # Config values are parsed with json.loads, and overrides get the same
    # treatment. Since they come from the CLI, requiring escaped quotes
    # around plain strings would be unintuitive, so anything that fails to
    # parse is simply kept as a string.
    # TODO: improve logic to handle simple types like list of strings?
    try:
        parsed = srsly.json_loads(value)
    except ValueError:
        parsed = str(value)
    return parsed
def stack_exchange(loc=None):
    """Load labelled text pairs from a JSONL file and split them.

    `loc` must offer a pathlib-style `open`. Every line is parsed as JSON
    and turned into `((text1, text2), int(label))`. The rows are handed to
    `partition(rows, 0.7)` and the resulting pair is returned as
    `(train, dev)`. Raises ValueError if `loc` is None.
    """
    if loc is None:
        raise ValueError("No default path for Stack Exchange yet")
    with loc.open("r", encoding="utf8") as file_:
        examples = (json_loads(line) for line in file_)
        rows = [
            ((eg["text1"], eg["text2"]), int(eg["label"]))
            for eg in examples
        ]
    train, dev = partition(rows, 0.7)
    return train, dev
def get_list_of_entities_types(annotated_jsonl_datafile):
    """Return the distinct span labels found in a JSONL file.

    Every line of `annotated_jsonl_datafile` is parsed as JSON; the "label"
    of each item under "spans" is collected. Records without "spans" (or
    with a null value) and spans without a truthy label are skipped. The
    labels are returned as a list sorted by their string form, so the
    result does not depend on set iteration order.
    """
    entities_types = set()
    with open(annotated_jsonl_datafile, "r") as fr:
        for line in fr:
            line_json = srsly.json_loads(line)
            # `.get("spans")` is None for unannotated records; treat that
            # as "no spans" instead of failing on iteration.
            for span in line_json.get("spans") or []:
                entity_type = span.get("label")
                if entity_type:
                    entities_types.add(entity_type)
    return sorted(entities_types, key=str)
def __iter__(self):
    """Yield `{"text": ...}` dicts for the valid comments of every file.

    Each path from `self.iter_files()` is opened with `bz2.open`. Non-empty
    lines are parsed as JSON; comments accepted by `self.is_valid` are
    yielded with their "body" passed through `self.strip_tags`.
    """
    for file_path in self.iter_files():
        with bz2.open(str(file_path)) as f:
            stripped = (raw.strip() for raw in f)
            # filter(None, ...) drops the lines that are empty after strip.
            for line in filter(None, stripped):
                comment = srsly.json_loads(line)
                if not self.is_valid(comment):
                    continue
                yield {"text": self.strip_tags(comment["body"])}
def from_bytes(self, bytes_data, *, exclude=tuple()):
    """Load the "cfg", "model", "vocab" and "trees" state from a bytestring.

    bytes_data (bytes): The serialized data to load.
    exclude (Iterable[str]): Names of serialization fields to exclude; also
        forwarded to the vocab loader.
    RETURNS: This object.
    """

    def load_cfg(b):
        return self.cfg.update(srsly.json_loads(b))

    def load_model(b):
        return self.model.from_bytes(b)

    def load_vocab(b):
        return self.vocab.from_bytes(b, exclude=exclude)

    def load_trees(b):
        return self.trees.from_bytes(b)

    deserializers = dict(
        cfg=load_cfg,
        model=load_model,
        vocab=load_vocab,
        trees=load_trees,
    )
    util.from_bytes(bytes_data, deserializers, exclude)
    return self
def __iter__(self):
    """Iterate over the valid comments of all files as `{"text": ...}`.

    Files come from `self.iter_files()` and are read through `bz2.open`.
    Blank lines are skipped. Every other line is parsed as JSON, checked
    with `self.is_valid`, and its "body" is cleaned with `self.strip_tags`.
    """
    for archive in self.iter_files():
        with bz2.open(str(archive)) as f:
            for raw in f:
                payload = raw.strip()
                if payload:
                    comment = srsly.json_loads(payload)
                    if self.is_valid(comment):
                        yield {"text": self.strip_tags(comment["body"])}
def read_snli(loc, label_scheme):
    """Read sentence pairs and mapped labels from a JSONL file.

    `loc` must offer a pathlib-style `open`. Every line is parsed as JSON.
    Records whose "gold_label" is "-" are skipped; all others become
    `((sentence1, sentence2), label_scheme[gold_label])`. Returns the list
    of these rows.
    """
    with loc.open("r", encoding="utf8") as file_:
        examples = (json_loads(line) for line in file_)
        return [
            ((eg["sentence1"], eg["sentence2"]), label_scheme[eg["gold_label"]])
            for eg in examples
            if eg["gold_label"] != "-"
        ]
def from_bytes(self, data, **kwargs):
    """Restore the tokenizer config and, if present, the pkuseg model.

    `data` is dispatched through `util.from_bytes` to the deserializers
    below. The "cfg" entry is parsed as JSON and applied via `_set_config`.
    The pkuseg entries are only buffered first and applied afterwards.
    Extra keyword arguments are ignored. Returns `self`.

    Raises ImportError if pkuseg data is present but `pkuseg` cannot be
    imported.
    """
    # Buffers filled by the deserializer callbacks below (via `nonlocal`).
    pkuseg_features_b = b""
    pkuseg_weights_b = b""
    pkuseg_processors_data = None

    def deserialize_pkuseg_features(b):
        nonlocal pkuseg_features_b
        pkuseg_features_b = b

    def deserialize_pkuseg_weights(b):
        nonlocal pkuseg_weights_b
        pkuseg_weights_b = b

    def deserialize_pkuseg_processors(b):
        nonlocal pkuseg_processors_data
        pkuseg_processors_data = srsly.msgpack_loads(b)

    deserializers = OrderedDict((
        ("cfg", lambda b: self._set_config(srsly.json_loads(b))),
        ("pkuseg_features", deserialize_pkuseg_features),
        ("pkuseg_weights", deserialize_pkuseg_weights),
        ("pkuseg_processors", deserialize_pkuseg_processors),
    ))
    util.from_bytes(data, deserializers, [])

    # The model is only rebuilt when both features and weights were found.
    if pkuseg_features_b and pkuseg_weights_b:
        # The buffered bytes are written to a temporary directory, whose
        # path is then handed to pkuseg.pkuseg().
        with tempfile.TemporaryDirectory() as tempdir:
            tempdir = Path(tempdir)
            with open(tempdir / "features.pkl", "wb") as fileh:
                fileh.write(pkuseg_features_b)
            with open(tempdir / "weights.npz", "wb") as fileh:
                fileh.write(pkuseg_weights_b)
            # Imported lazily, only when pkuseg data is actually present.
            try:
                import pkuseg
            except ImportError:
                raise ImportError(
                    "pkuseg not installed. To use this model, " +
                    _PKUSEG_INSTALL_MSG)
            self.pkuseg_seg = pkuseg.pkuseg(str(tempdir))
        # Re-attach the pre-/post-processor state, stored as a 4-item
        # sequence.
        if pkuseg_processors_data:
            (
                user_dict,
                do_process,
                common_words,
                other_words,
            ) = pkuseg_processors_data
            self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(user_dict)
            self.pkuseg_seg.postprocesser.do_process = do_process
            self.pkuseg_seg.postprocesser.common_words = set(common_words)
            self.pkuseg_seg.postprocesser.other_words = set(other_words)
    return self
def package_wikigraph(input_path: Path, output_path: Path, force: bool = None):
    """
    Generate an installable Python package for a `WikiGraph`.

    After packaging, "python setup.py sdist" must be run in the package
    directory, which will create a .tar.gz archive that can be installed via
    "pip install".

    Problems (missing input, missing meta.json, existing package directory
    without `force`) are reported through ``msg.fail(..., exits=1)``.

    Parameters
    ----------
    input_path : Path
        Directory holding the graph data. It must contain a ``meta.json``
        providing at least the ``"name"`` and ``"fullname"`` keys.
    output_path : Path
        Directory in which the package directory, named after the graph's
        ``"fullname"``, is created. It is created (including missing
        parents) if it does not exist yet.
    force : bool, optional
        Whether an already existing package directory is deleted and
        rebuilt instead of being reported as an error, by default None
    """
    if not input_path or not input_path.exists():
        msg.fail("Can't locate graph data", input_path, exits=1)
    if not output_path:
        msg.fail("Output directory is missing", output_path, exits=1)
    if not output_path.exists():
        # parents=True so that a nested, not yet existing output path works.
        output_path.mkdir(parents=True, exist_ok=True)
        msg.good("Created output directory: {}".format(output_path))
    meta_path = input_path / "meta.json"
    if not meta_path.exists():
        msg.fail("Can't find graph meta.json", meta_path, exits=1)
    meta = json_loads(meta_path.read_text())
    graph_fullname = meta["fullname"]
    package_path = output_path / graph_fullname
    if package_path.exists():
        if not force:
            msg.fail(
                "Package directory already exists",
                "Please delete the directory and try again, or use the "
                "`--force` flag to overwrite existing "
                "directories.",
                exits=1,
            )
        shutil.rmtree(package_path)
    package_path.mkdir()
    shutil.copy(meta_path, package_path)
    # Copy the package template, then rename its placeholder module
    # directory to the graph's name.
    copy_tree(str(pkg_path), str(package_path))
    graph_name = meta["name"]
    rename(package_path / "graph-name", package_path / graph_name)
    module_path = package_path / graph_name
    copy_tree(str(input_path), str(module_path / graph_fullname))
    msg.good("Successfully created package {}".format(graph_name), package_path)
    msg.text(
        "To build the package, run `python setup.py sdist` in this directory.")
def from_bytes(self, data, **kwargs):
    """Restore the tokenizer config and, if present, the pkuseg model.

    `data` is dispatched through `util.from_bytes` to the deserializers
    below. The "cfg" entry is parsed as JSON and applied via `_set_config`.
    The pkuseg entries are only buffered first and applied afterwards.
    Extra keyword arguments are ignored. Returns `self`.

    Raises ImportError if pkuseg data is present but `spacy_pkuseg` cannot
    be imported.
    """
    # Mutable holder filled by the deserializer callbacks below.
    pkuseg_data = {
        "features_b": b"",
        "weights_b": b"",
        "processors_data": None
    }

    def deserialize_pkuseg_features(b):
        pkuseg_data["features_b"] = b

    def deserialize_pkuseg_weights(b):
        pkuseg_data["weights_b"] = b

    def deserialize_pkuseg_processors(b):
        pkuseg_data["processors_data"] = srsly.msgpack_loads(b)

    deserializers = {
        "cfg": lambda b: self._set_config(srsly.json_loads(b)),
        "pkuseg_features": deserialize_pkuseg_features,
        "pkuseg_weights": deserialize_pkuseg_weights,
        "pkuseg_processors": deserialize_pkuseg_processors,
    }
    util.from_bytes(data, deserializers, [])

    # The model is only rebuilt when both features and weights were found.
    if pkuseg_data["features_b"] and pkuseg_data["weights_b"]:
        # The buffered bytes are written to a temporary directory, whose
        # path is then handed to spacy_pkuseg.pkuseg().
        with tempfile.TemporaryDirectory() as tempdir:
            tempdir = Path(tempdir)
            with open(tempdir / "features.msgpack", "wb") as fileh:
                fileh.write(pkuseg_data["features_b"])
            with open(tempdir / "weights.npz", "wb") as fileh:
                fileh.write(pkuseg_data["weights_b"])
            # Imported lazily, only when pkuseg data is actually present;
            # `from None` drops the original ImportError from the traceback.
            try:
                import spacy_pkuseg
            except ImportError:
                raise ImportError(
                    "spacy-pkuseg not installed. To use this model, " +
                    _PKUSEG_INSTALL_MSG) from None
            self.pkuseg_seg = spacy_pkuseg.pkuseg(str(tempdir))
        # Re-attach the pre-/post-processor state, stored as a 4-item
        # sequence.
        if pkuseg_data["processors_data"]:
            processors_data = pkuseg_data["processors_data"]
            (user_dict, do_process, common_words,
             other_words) = processors_data
            self.pkuseg_seg.preprocesser = spacy_pkuseg.Preprocesser(
                user_dict)
            self.pkuseg_seg.postprocesser.do_process = do_process
            self.pkuseg_seg.postprocesser.common_words = set(common_words)
            self.pkuseg_seg.postprocesser.other_words = set(other_words)
    return self
def _read_inputs(loc: Union[Path, str], msg: Printer) -> Iterator[str]:
    """Yield the "text" value of every JSON line of the input.

    loc (Union[Path, str]): Path of a JSONL file, or "-" to read sys.stdin.
    msg (Printer): Printer used for status messages; an invalid path is
        reported with `msg.fail(..., exits=1)`.
    YIELDS (str): The "text" of each record.
    """
    if loc == "-":
        msg.info("Reading input from sys.stdin")
        for line in sys.stdin:
            # stdin lines are handed to the parser as UTF-8 bytes.
            yield srsly.json_loads(line.encode("utf8"))["text"]
        return
    input_path = Path(loc)
    if not input_path.exists() or not input_path.is_file():
        msg.fail("Not a valid input data file", loc, exits=1)
    msg.info(f"Using data from {input_path.parts[-1]}")
    # Context manager so the file handle is closed once the generator is
    # exhausted or closed (it was previously never closed).
    with input_path.open() as file_:
        for line in file_:
            yield srsly.json_loads(line)["text"]
def _read_inputs(loc, msg):
    """Yield the "text" value of every JSON line of the input.

    loc: Path of a JSONL file, or "-" to read from sys.stdin.
    msg: Printer used for status messages; an invalid path is reported with
        `msg.fail(..., exits=1)`.
    YIELDS: The "text" of each record.
    """
    if loc == "-":
        msg.info("Reading input from sys.stdin")
        for line in sys.stdin:
            # stdin lines are handed to the parser as UTF-8 bytes.
            yield srsly.json_loads(line.encode("utf8"))["text"]
        return
    input_path = Path(loc)
    if not input_path.exists() or not input_path.is_file():
        msg.fail("Not a valid input data file", loc, exits=1)
    msg.info("Using data from {}".format(input_path.parts[-1]))
    # Context manager so the file handle is closed once the generator is
    # exhausted or closed (it was previously never closed).
    with input_path.open() as file_:
        for line in file_:
            yield srsly.json_loads(line)["text"]
def from_bytes(
    self,
    bytes_data: bytes,
    *,
    exclude: Iterable[str] = SimpleFrozenList(),
) -> "SpanRuler":
    """Load the span ruler from a bytestring.

    Existing state is removed with `self.clear()` first; the "patterns"
    entry is then parsed as JSON and passed to `self.add_patterns`.

    bytes_data (bytes): The bytestring to load.
    exclude (Iterable[str]): Names of serialization fields to exclude.
    RETURNS (SpanRuler): The loaded span ruler.

    DOCS: https://spacy.io/api/spanruler#from_bytes
    """
    self.clear()

    def load_patterns(b):
        return self.add_patterns(srsly.json_loads(b))

    util.from_bytes(bytes_data, {"patterns": load_patterns}, exclude)
    return self
def interpret_config(self, config: Union[Dict[str, Any], "ConfigParser"]):
    """Interpret a config: nest dotted sections and parse values as JSON.

    Every section name is split on "." and turned into nested dicts inside
    `self`; each value of the section is read with `config.get` and parsed
    as JSON. Mostly used internally; `self` is modified in place.
    """
    for section, values in config.items():
        # The [DEFAULT] section is skipped for now since it causes
        # validation errors and we don't want to use it.
        if section == "DEFAULT":
            continue
        parts = section.split(".")
        target = self
        for name in parts:
            target = target.setdefault(name, {})
        if not isinstance(target, dict):
            # Both a value *and* a subsection were defined for the same key.
            conflict = {parts[-1]: dict(values)}
            err = [{"loc": parts, "msg": "found conflicting values"}]
            raise ConfigValidationError(f"{self}\n{conflict}", err)
        for key, _ in values.items():
            target[key] = srsly.json_loads(config.get(section, key))
def setup_package():
    """Run `setup()` for the graph package described by meta.json.

    The meta.json next to this file supplies the package name, the metadata
    fields and the required spikex version. The file is also copied into
    the package directory before `setup` is called.
    """
    here = Path(__file__).parent.absolute()
    meta_path = here / "meta.json"
    meta = json_loads(meta_path.read_text())
    graph_name = meta["name"]
    data_dir = path.join(graph_name, meta["fullname"])
    pkg_data = list_files(data_dir)
    requirements = ["spikex{}".format(meta["spikex_version"])]
    copy(meta_path, path.join(graph_name))
    setup_kwargs = {
        "name": graph_name,
        "description": meta["description"],
        "author": meta["author"],
        "author_email": meta["email"],
        "url": meta["url"],
        "version": meta["version"],
        "license": meta["license"],
        "packages": [graph_name],
        "package_data": {graph_name: pkg_data},
        "install_requires": requirements,
        "zip_safe": False,
    }
    setup(**setup_kwargs)
def dataset():
    """Return all samples of the training, validation and test files.

    Each URL is resolved through `cached_path`; every line of the resulting
    file is parsed as JSON. The samples of the three files are returned in
    one list, in training, validation, test order.
    """
    smart_training_data_path = "https://fh-aachen.sciebo.de/s/MjcrDC3gDjwU7Vd/download"
    smart_validation_data_path = "https://fh-aachen.sciebo.de/s/3GpXCZLhjwm2SJU/download"
    smart_test_data_path = "https://fh-aachen.sciebo.de/s/9ghU4Qi1azUMFPW/download"

    samples = []
    for url in (
        smart_training_data_path,
        smart_validation_data_path,
        smart_test_data_path,
    ):
        local_path = cached_path(url)
        with open(local_path, "r") as file:
            samples.extend(srsly.json_loads(line) for line in file.readlines())
    return samples
def _read(self, file_path: str) -> Iterable[Instance]:
    """Yield one instance per JSON line of the file at `file_path`.

    `file_path` is first resolved through `cached_path`. For every sample
    the tokens are cut out of its "text" using the "span" offsets of its
    "tokens"; entities, entity tags, relations and relation tags are
    extracted and passed, together with the sample's "id", to
    `self.text_to_instance`.
    """
    file_path = cached_path(file_path)
    with open(file_path, "r") as smart_file:
        logger.info("Reading instances from lines in file at: %s", file_path)
        for raw_line in smart_file.readlines():
            sample = srsly.json_loads(raw_line)
            text = sample["text"]
            words = []
            for token_info in sample["tokens"]:
                span = token_info["span"]
                words.append(text[span["start"]:span["end"]])
            tokens = [Token(word) for word in words]
            n_words = len(words)

            entities = extract_entities(sample)
            entity_tags = combine_spans_to_entity_tags(entities, n_words)
            relations = extract_relations_from_smart_sample(
                sample,
                only_mandatory=False,
                include_trigger=self.include_trigger,
            )
            relation_tags = combined_relation_tags(
                relations,
                n_words,
                include_mode=self._include_mode,
            )
            yield self.text_to_instance(
                tokens,
                relation_tags=relation_tags,
                relations=relations,
                entity_tags=entity_tags,
                entities=entities,
                idx=sample["id"],
            )
def interpret_config(self, config: Union[Dict[str, Any], "ConfigParser"]):
    """Interpret a config, parse nested sections and parse the values as JSON.

    Mostly used internally and modifies the config in place.

    Raises ConfigValidationError if a section refers to an undefined parent
    section or collides with a plain value, and ValueError if a value cannot
    be read or parsed.
    """

    def get_depth(item):
        return len(item[0].split("."))

    # Sort sections by depth, so that we can iterate breadth-first. This
    # allows us to check that we're not expanding an undefined block.
    for section, values in sorted(config.items(), key=get_depth):
        if section == "DEFAULT":
            # Skip [DEFAULT] section for now since it causes validation
            # errors and we don't want to use it
            continue
        parts = section.split(".")
        node = self
        for part in parts[:-1]:
            if part == "*":
                node = node.setdefault(part, {})
            elif part not in node:
                err_title = "Error parsing config section. Perhaps a section name is wrong?"
                err = [{
                    "loc": parts,
                    "msg": f"Section '{part}' is not defined"
                }]
                raise ConfigValidationError(self, err, message=err_title)
            else:
                node = node[part]
        node = node.setdefault(parts[-1], {})
        if not isinstance(node, dict):
            # Happens if both value *and* subsection were defined for a key.
            # Report the conflicting key itself (parts[-1]); the loop
            # variable `part` is stale here, or unbound for sections with a
            # single component.
            err = [{"loc": parts, "msg": "found conflicting values"}]
            raise ConfigValidationError(
                f"{self}\n{({parts[-1]: dict(values)})}", err)
        for key, value in values.items():
            try:
                node[key] = srsly.json_loads(config.get(section, key))
            except Exception as e:
                raise ValueError(
                    f"Error reading key '{key}' in section '{section}': {e}"
                ) from e
def ner_jsonl2json(input_data, lang=None, n_sents=10, use_morphology=False):
    """Convert JSONL records with character-offset entities via docs_to_json.

    input_data (str): JSONL text. Every record needs a "text" key and either
        an "entities" or a "spans" list whose items have "start", "end" and
        "label". Blank lines are ignored.
    lang (str): Language code used to create the tokenizing pipeline.
    n_sents (int): Number of records grouped into one output document.
    use_morphology (bool): Accepted but not used by this converter.
    RETURNS (list): One `docs_to_json` result per batch of records.

    Raises ValueError if `lang` is None.
    """
    if lang is None:
        raise ValueError("No --lang specified, but tokenization required")
    # Blank lines (and therefore empty input) are not valid JSON, so they
    # are skipped instead of being handed to the parser.
    input_examples = [
        srsly.json_loads(line)
        for line in input_data.strip().split("\n")
        if line.strip()
    ]
    nlp = get_lang_class(lang)()
    sentencizer = nlp.create_pipe("sentencizer")
    json_docs = []
    for i, batch in enumerate(minibatch(input_examples, size=n_sents)):
        docs = []
        for record in batch:
            raw_text = record["text"]
            # "entities" takes precedence over "spans" when both exist.
            ents = record["entities"] if "entities" in record else record["spans"]
            ents = [(e["start"], e["end"], e["label"]) for e in ents]
            doc = nlp.make_doc(raw_text)
            sentencizer(doc)
            spans = [doc.char_span(s, e, label=L) for s, e, L in ents]
            doc.ents = _cleanup_spans(spans)
            docs.append(doc)
        json_docs.append(docs_to_json(docs, id=i))
    return json_docs