def convert(
    input_file,
    output_dir="-",
    file_type="jsonl",
    n_sents=1,
    morphology=False,
    converter="auto",
    lang=None,
):
    """
    Convert files into JSON format for use with train command and other
    experiment management functions. If no output_dir is specified, the data
    is written to stdout, so you can pipe them forward to a JSONL file:
    $ spacy convert some_file.conllu > some_file.jsonl
    """
    msg = Printer()
    input_path = Path(input_file)
    if file_type not in FILE_TYPES:
        msg.fail(
            "Unknown file type: '{}'".format(file_type),
            "Supported file types: '{}'".format(", ".join(FILE_TYPES)),
            exits=1,
        )
    if file_type not in FILE_TYPES_STDOUT and output_dir == "-":
        # TODO: support msgpack via stdout in srsly?
        msg.fail(
            "Can't write .{} data to stdout.".format(file_type),
            "Please specify an output directory.",
            exits=1,
        )
    if not input_path.exists():
        msg.fail("Input file not found", input_path, exits=1)
    if output_dir != "-" and not Path(output_dir).exists():
        msg.fail("Output directory not found", output_dir, exits=1)
    if converter == "auto":
        converter = input_path.suffix[1:]
    if converter not in CONVERTERS:
        msg.fail("Can't find converter for {}".format(converter), exits=1)
    # Use converter function to convert data
    func = CONVERTERS[converter]
    input_data = input_path.open("r", encoding="utf-8").read()
    data = func(input_data, n_sents=n_sents, use_morphology=morphology, lang=lang)
    if output_dir != "-":
        # Export data to a file
        suffix = ".{}".format(file_type)
        output_file = Path(output_dir) / Path(input_path.parts[-1]).with_suffix(suffix)
        if file_type == "json":
            srsly.write_json(output_file, data)
        elif file_type == "jsonl":
            srsly.write_jsonl(output_file, data)
        elif file_type == "msg":
            srsly.write_msgpack(output_file, data)
        msg.good("Generated output file ({} documents)".format(len(data)), output_file)
    else:
        # Print to stdout
        if file_type == "json":
            srsly.write_json("-", data)
        elif file_type == "jsonl":
            srsly.write_jsonl("-", data)
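# A tiny self-contained demonstration of the three srsly writers the
# converter dispatches on. Passing "-" as the path sends json/jsonl output
# to stdout (the same mechanism the CLI uses above); msgpack is a binary
# format with no stdout mode, which is why the CLI rejects that
# combination. The file name is an example only.
import srsly

data = [{"id": 0, "paragraphs": []}]
srsly.write_json("-", data)              # pretty-printed JSON to stdout
srsly.write_jsonl("-", data)             # one JSON object per line to stdout
srsly.write_msgpack("sample.msg", data)  # binary msgpack, file targets only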
# Nested helper: it is defined inside a serialization method (spaCy's
# Chinese tokenizer) and captures `self` from the enclosing scope, which
# is why it takes no `self` parameter itself.
def save_pkuseg_processors(path):
    if self.pkuseg_seg:
        data = (
            _get_pkuseg_trie_data(self.pkuseg_seg.preprocesser.trie),
            self.pkuseg_seg.postprocesser.do_process,
            sorted(list(self.pkuseg_seg.postprocesser.common_words)),
            sorted(list(self.pkuseg_seg.postprocesser.other_words)),
        )
        srsly.write_msgpack(path, data)
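# A hedged sketch of the matching loader, assuming the same four-tuple
# layout written above. Like the saver, it would live inside a method and
# capture `self`; pkuseg.Preprocesser accepting the trie data is an
# assumption for this sketch, not a verified pkuseg API.
def load_pkuseg_processors(path):
    import pkuseg  # assumed available, mirroring the saver's dependency

    if self.pkuseg_seg:
        (user_dict, do_process, common_words, other_words) = srsly.read_msgpack(path)
        self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(user_dict)  # assumption
        self.pkuseg_seg.postprocesser.do_process = do_process
        self.pkuseg_seg.postprocesser.common_words = set(common_words)
        self.pkuseg_seg.postprocesser.other_words = set(other_words)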
def to_disk(self, path, **kwargs):
    """Serialize waterwheel data to a file.

    Parameters
    ----------
    path : Path
        path to file.
    """
    path = ensure_path(path)
    serial = self.to_bytes()
    srsly.write_msgpack(path, serial)
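# Hedged counterpart to the saver above: a minimal from_disk sketch that
# assumes the class exposes a from_bytes() inverse to the to_bytes() used
# when writing; that method name is an assumption beyond what the snippet
# shows. msgpack round-trips the bytes payload unchanged.
def from_disk(self, path, **kwargs):
    """Load waterwheel data from a file written by to_disk."""
    path = ensure_path(path)
    serial = srsly.read_msgpack(path)
    return self.from_bytes(serial)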
def to_disk(self, path: Union[Path, str], exclude: Sequence[str] = tuple()):
    """Serialize a Sense2Vec object to a directory.

    path (unicode / Path): The path.
    exclude (list): Names of serialization fields to exclude.
    """
    path = Path(path)
    self.vectors.to_disk(path)
    srsly.write_json(path / "cfg", self.cfg)
    srsly.write_json(path / "freqs.json", list(self.freqs.items()))
    if "strings" not in exclude:
        self.strings.to_disk(path / "strings.json")
    if "cache" not in exclude and self.cache:
        srsly.write_msgpack(path / "cache", self.cache)
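# A hedged sketch of loading the directory written above, mirroring each
# write call with its read counterpart. Vectors and StringStore are spaCy
# classes; treating cfg as a dict and freqs as key/value pairs follows the
# writer, but the exact constructor usage here is an assumption.
def from_disk(self, path: Union[Path, str], exclude: Sequence[str] = tuple()):
    path = Path(path)
    self.vectors = Vectors().from_disk(path)
    self.cfg.update(srsly.read_json(path / "cfg"))
    self.freqs = dict(srsly.read_json(path / "freqs.json"))
    if "strings" not in exclude and (path / "strings.json").exists():
        self.strings = StringStore().from_disk(path / "strings.json")
    if "cache" not in exclude and (path / "cache").exists():
        self.cache = srsly.read_msgpack(path / "cache")
    return self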
def write_data_files(vocab: Dict, wikidata: Dict, stop_words: set, doc_bins_bytes: Dict):
    """Writes the necessary data to resource files.

    Parameters
    ----------
    vocab : Dict
        A dictionary mapping each water body type to its hash value.
    wikidata : Dict
        A dictionary with the Wikidata link for each water body.
    stop_words : set
        A set of common words in English.
    doc_bins_bytes : Dict
        A dictionary of DocBin bytes for each water body type.
    """
    serial = OrderedDict(
        (
            ("stop_words", list(stop_words)),
            ("vocab", vocab),
            ("wikidata", wikidata),
            ("doc_bins", doc_bins_bytes),
        )
    )
    # doc_bins_file is a module-level Path to the target resource file.
    srsly.write_msgpack(doc_bins_file, serial)
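# A hedged sketch of reading the resource file back, assuming the key
# layout written above; DocBin is spaCy's container for serialized docs,
# and read_data_files is a hypothetical helper named for this sketch.
from spacy.tokens import DocBin

def read_data_files():
    serial = srsly.read_msgpack(doc_bins_file)
    stop_words = set(serial["stop_words"])
    vocab = serial["vocab"]
    wikidata = serial["wikidata"]
    doc_bins = {
        name: DocBin().from_bytes(data)
        for name, data in serial["doc_bins"].items()
    }
    return vocab, wikidata, stop_words, doc_bins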
def to_disk(
    self, path: Union[Path, str], exclude: Iterable[str] = SimpleFrozenList()
) -> None:
    """Serialize the AttributeRuler to disk.

    path (Union[Path, str]): A path to a directory.
    exclude (Iterable[str]): String names of serialization fields to exclude.

    DOCS: https://spacy.io/api/attributeruler#to_disk
    """
    serialize = {
        "vocab": lambda p: self.vocab.to_disk(p, exclude=exclude),
        "patterns": lambda p: srsly.write_msgpack(p, self.patterns),
    }
    util.to_disk(path, serialize, exclude)
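# A hedged sketch of the mirror-image from_disk, following spaCy's usual
# serializer-dict pattern seen above; load_patterns is assumed to be the
# method that re-adds patterns read back from msgpack.
def from_disk(
    self, path: Union[Path, str], exclude: Iterable[str] = SimpleFrozenList()
) -> "AttributeRuler":
    deserialize = {
        "vocab": lambda p: self.vocab.from_disk(p, exclude=exclude),
        "patterns": lambda p: self.load_patterns(srsly.read_msgpack(p)),
    }
    util.from_disk(path, deserialize, exclude)
    return self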
def main(vectors, gpu_id=-1, n_neighbors=100, batch_size=1024, cutoff=0, start=0, end=None):
    """
    Step 6: Precompute nearest-neighbor queries (optional)

    Precompute nearest-neighbor queries for every entry in the vocab to make
    Sense2Vec.most_similar faster. The --cutoff option limits neighbor
    candidates to the earliest rows of the vector table. For instance, if
    cutoff is 100000, no word will have a nearest neighbor outside of the
    top 100k vectors.
    """
    if gpu_id == -1:
        xp = numpy
    else:
        import cupy as xp
        import cupy.cuda.device

        # take_along_axis is a module-level backport for cupy versions
        # that don't provide it natively.
        cupy.take_along_axis = take_along_axis
        device = cupy.cuda.device.Device(gpu_id)
        device.use()
    vectors_dir = Path(vectors)
    vectors_file = vectors_dir / "vectors"
    if not vectors_dir.is_dir() or not vectors_file.exists():
        err = "Are you passing in the exported sense2vec directory containing a vectors file?"
        msg.fail(f"Can't load vectors from {vectors}", err, exits=1)
    with msg.loading(f"Loading vectors from {vectors}"):
        vectors = xp.load(str(vectors_file))
    msg.good(f"Loaded {vectors.shape[0]:,} vectors with dimension {vectors.shape[1]}")
    norms = xp.linalg.norm(vectors, axis=1, keepdims=True)
    norms[norms == 0] = 1
    # Normalize to unit norm
    vectors /= norms
    if cutoff < 1:
        cutoff = vectors.shape[0]
    if end is None:
        end = vectors.shape[0]
    mean = float(norms.mean())
    var = float(norms.var())
    msg.good(f"Normalized (mean {mean:,.2f}, variance {var:,.2f})")
    msg.info(f"Finding {n_neighbors:,} neighbors among {cutoff:,} most frequent")
    n = min(n_neighbors, vectors.shape[0])
    subset = vectors[:cutoff]
    best_rows = xp.zeros((end - start, n), dtype="i")
    scores = xp.zeros((end - start, n), dtype="f")
    for i in tqdm.tqdm(list(range(start, end, batch_size))):
        size = min(batch_size, end - i)
        batch = vectors[i : i + size]
        sims = xp.dot(batch, subset.T)
        # Set self-similarities to -inf, so that we don't return them.
        for j in range(size):
            if i + j < sims.shape[1]:
                sims[j, i + j] = -xp.inf
        # This used to use argpartition for a partial sort, but that turned
        # into convoluted numpy code. Sorting the whole array isn't
        # noticeably slower and is much easier to read.
        ranks = xp.argsort(sims, axis=1)
        batch_rows = ranks[:, -n:]
        # Reverse so the best match comes first
        batch_rows = batch_rows[:, ::-1]
        batch_scores = xp.take_along_axis(sims, batch_rows, axis=1)
        # Offset by start so rows align with the preallocated output arrays,
        # which cover [start, end) rather than [0, end).
        best_rows[i - start : i - start + size] = batch_rows
        scores[i - start : i - start + size] = batch_scores
    msg.info("Saving output")
    if not isinstance(best_rows, numpy.ndarray):
        best_rows = best_rows.get()
    if not isinstance(scores, numpy.ndarray):
        scores = scores.get()
    output = {
        "indices": best_rows,
        "scores": scores.astype("float16"),
        "start": start,
        "end": end,
        "cutoff": cutoff,
    }
    output_file = vectors_dir / "cache"
    with msg.loading("Saving output..."):
        srsly.write_msgpack(output_file, output)
    msg.good(f"Saved cache to {output_file}")
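# A minimal sketch of consuming the cache written above, assuming srsly
# round-trips the numpy arrays (srsly's msgpack has numpy support) and
# that vectors_dir points at the same directory as in the script. Row i
# of "indices"/"scores" holds the neighbors for vector row start + i,
# best match first.
import srsly

cache = srsly.read_msgpack(vectors_dir / "cache")
row = 0  # neighbors for vector number cache["start"] + 0
neighbor_ids = cache["indices"][row]   # row indices into the vector table
neighbor_sims = cache["scores"][row]   # cosine similarities (float16)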
def convert(
    input_file,
    output_dir="-",
    file_type="json",
    n_sents=1,
    seg_sents=False,
    model=None,
    morphology=False,
    converter="auto",
    lang=None,
):
    """
    Convert files into JSON format for use with train command and other
    experiment management functions. If no output_dir is specified, the data
    is written to stdout, so you can pipe them forward to a JSON file:
    $ spacy convert some_file.conllu > some_file.json
    """
    no_print = output_dir == "-"
    msg = Printer(no_print=no_print)
    input_path = Path(input_file)
    if file_type not in FILE_TYPES:
        msg.fail(
            "Unknown file type: '{}'".format(file_type),
            "Supported file types: '{}'".format(", ".join(FILE_TYPES)),
            exits=1,
        )
    if file_type not in FILE_TYPES_STDOUT and output_dir == "-":
        # TODO: support msgpack via stdout in srsly?
        msg.fail(
            "Can't write .{} data to stdout.".format(file_type),
            "Please specify an output directory.",
            exits=1,
        )
    if not input_path.exists():
        msg.fail("Input file not found", input_path, exits=1)
    if output_dir != "-" and not Path(output_dir).exists():
        msg.fail("Output directory not found", output_dir, exits=1)
    input_data = input_path.open("r", encoding="utf-8").read()
    if converter == "auto":
        converter = input_path.suffix[1:]
    if converter == "ner" or converter == "iob":
        converter_autodetect = autodetect_ner_format(input_data)
        if converter_autodetect == "ner":
            msg.info("Auto-detected token-per-line NER format")
            converter = converter_autodetect
        elif converter_autodetect == "iob":
            msg.info("Auto-detected sentence-per-line NER format")
            converter = converter_autodetect
        else:
            msg.warn(
                "Can't automatically detect NER format. Conversion may not "
                "succeed. See https://spacy.io/api/cli#convert"
            )
    if converter not in CONVERTERS:
        msg.fail("Can't find converter for {}".format(converter), exits=1)
    # Use converter function to convert data
    func = CONVERTERS[converter]
    data = func(
        input_data,
        n_sents=n_sents,
        seg_sents=seg_sents,
        use_morphology=morphology,
        lang=lang,
        model=model,
        no_print=no_print,
    )
    if output_dir != "-":
        # Export data to a file
        suffix = ".{}".format(file_type)
        output_file = Path(output_dir) / Path(input_path.parts[-1]).with_suffix(suffix)
        if file_type == "json":
            srsly.write_json(output_file, data)
        elif file_type == "jsonl":
            srsly.write_jsonl(output_file, data)
        elif file_type == "msg":
            srsly.write_msgpack(output_file, data)
        msg.good(
            "Generated output file ({} documents): {}".format(len(data), output_file)
        )
    else:
        # Print to stdout
        if file_type == "json":
            srsly.write_json("-", data)
        elif file_type == "jsonl":
            srsly.write_jsonl("-", data)
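# An illustrative heuristic for the format auto-detection step above, NOT
# spaCy's actual autodetect_ner_format: "ner" input has one token and tag
# per line, while "iob" input joins token|tag pairs on one line per
# sentence. guess_ner_format is a hypothetical name for this sketch.
def guess_ner_format(input_data):
    for line in input_data.splitlines():
        line = line.strip()
        if not line:
            continue
        fields = line.split()
        if len(fields) <= 2:  # e.g. "Paris B-LOC" -> token-per-line
            return "ner"
        if all("|" in field for field in fields):  # e.g. "Paris|B-LOC ..."
            return "iob"
    return None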