def _load_tree(self, tree: dict) -> None:
    self.__dict__.update(tree)
    self.tokens = split_strings(self.tokens)
    self.frequencies = {
        w: self.frequencies["vals"][i]
        for i, w in enumerate(split_strings(self.frequencies["keys"]))}
    self.checker = SymSpell(max_dictionary_edit_distance=self.max_distance)
    self.checker.__dict__.update(tree["checker"])
    # Unpack the flattened deletes index: for each key in "indexes", consume the
    # next lengths[i] entries of "data" as positions into the words list.
    deletes = {}
    words = split_strings(self.checker._deletes["strings"])
    lengths = self.checker._deletes["lengths"]
    data = self.checker._deletes["data"]
    offset = 0
    for i, delindex in enumerate(self.checker._deletes["indexes"]):
        length = lengths[i]
        deletes[delindex] = [words[j] for j in data[offset:offset + length]]
        offset += length
    self.checker._deletes = deletes
    self.checker._words = {w: self.checker._words[i] for i, w in enumerate(words)}
    # Rebuild the fastText keyed vectors from the raw serialized arrays.
    vectors = self.wv["vectors"]
    wv = FastTextKeyedVectors(vectors.shape[1], self.wv["min_n"], self.wv["max_n"],
                              self.wv["bucket"], True)
    wv.vectors = numpy.array(vectors)
    vocab = split_strings(self.wv["vocab"]["strings"])
    wv.vocab = {
        s: Vocab(index=i, count=self.wv["vocab"]["counts"][i])
        for i, s in enumerate(vocab)}
    wv.bucket = self.wv["bucket"]
    wv.index2word = wv.index2entity = vocab
    wv.num_ngram_vectors = self.wv["num_ngram_vectors"]
    wv.vectors_ngrams = numpy.array(self.wv["vectors_ngrams"])
    wv.hash2index = {k: v for v, k in enumerate(self.wv["hash2index"])}
    self.wv = wv
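# A minimal sketch (not from the source) of the inverse of the deletes
# unpacking above. The save-side counterpart is assumed, not shown here;
# _pack_deletes is a hypothetical name, and "strings" is shown as a plain
# list of words, whereas the real format stores them merged so that
# split_strings() can invert it.
def _pack_deletes(deletes: dict) -> dict:
    words = sorted({w for lst in deletes.values() for w in lst})
    index = {w: i for i, w in enumerate(words)}
    return {
        "strings": words,                              # unique words
        "indexes": list(deletes),                      # dict keys, in order
        "lengths": [len(v) for v in deletes.values()],  # list length per key
        "data": [index[w] for v in deletes.values() for w in v],
    }

# Replaying the loader's loop over this dict reconstructs the original mapping:
# _pack_deletes({7: ["foo", "bar"], 9: ["bar"]})
# -> {"strings": ["bar", "foo"], "indexes": [7, 9], "lengths": [2, 1], "data": [1, 0, 0]}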
def _load_tree(self, tree):
    library_names = {}
    library_metadata = {lang: {} for lang in tree["langs"]}
    for lang in tree["langs"]:
        library_names[lang] = set(split_strings(tree[lang]["library_names"]))
        for meta, libs in tree[lang]["library_metadata"].items():
            library_metadata[lang][meta] = set(split_strings(libs))
    self.construct(library_names, library_metadata)
def _load_tree(self, tree):
    matrix = assemble_sparse_matrix(tree["matrix"])
    files = split_strings(tree["files"])
    deps = split_strings(tree["deps"])
    ind_to_langs = dict(enumerate(split_strings(tree["ind_to_langs"])))
    ind_to_repos = dict(enumerate(split_strings(tree["ind_to_repos"])))
    self.construct(matrix, files, deps, ind_to_langs, ind_to_repos)
def _load_tree(self, tree: dict, tokens=None):
    if tokens is None:
        tokens = split_strings(tree["tokens"])
    freqs = tree["freqs"]
    self._log.info("Building the docfreq dictionary...")
    tokfreq = dict(zip(tokens, freqs))
    self.construct(docs=tree["docs"], tokfreqs=tokfreq)
def _load_tree(self, tree):
    self._levels = {}
    for key, vals in tree["schemes"].items():
        classes = split_strings(vals["classes"])
        levels = vals["levels"]
        # Split the flat levels array into one equal-sized chunk per class.
        self._levels[key] = dict(zip(classes, numpy.split(levels, len(classes))))
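# Hypothetical example (not from the source) showing how the numpy.split call
# above distributes a flat "levels" array over its classes.
import numpy

classes = ["low", "mid", "high"]
levels = numpy.arange(6)  # length must divide evenly by len(classes)
chunks = dict(zip(classes, numpy.split(levels, len(classes))))
# chunks == {"low": array([0, 1]), "mid": array([2, 3]), "high": array([4, 5])}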
def validate_asdf_file(obj, filename): data = asdf.open(filename) obj.assertIn("meta", data.tree) obj.assertIn("filenames", data.tree) obj.assertIn("uasts", data.tree) obj.assertIn("repository", data.tree) obj.assertEqual(data.tree["meta"]["model"], "uast") Node.FromString(split_strings(data.tree["uasts"])[0]) obj.assertEqual(0, len(data.tree["meta"]["dependencies"])) obj.assertEqual(data.tree["meta"]["model"], "uast")
def validate_asdf_file(obj, filename): data = asdf.open(filename) obj.assertIn("meta", data.tree) obj.assertIn("filenames", data.tree) obj.assertIn("sources", data.tree) obj.assertIn("uasts", data.tree) obj.assertIn("repository", data.tree) obj.assertIn("positions", data.tree) Node.FromString(split_strings(data.tree["uasts"])[0]) obj.assertEqual(data.tree["sources"]["lengths"].shape[0], data.tree["uasts"]["lengths"].shape[0]) obj.assertEqual(0, len(data.tree["meta"]["dependencies"])) obj.assertEqual(data.tree["meta"]["model"], "snippet")
def _load_tree(self, tree: dict) -> None:
    self.construct(
        split_strings(tree["tokens"]),
        split_strings(tree["topics"]) if tree["topics"] else None,
        assemble_sparse_matrix(tree["matrix"]))
def _load_tree(self, tree):
    self.construct(embeddings=tree["embeddings"].copy(),
                   tokens=split_strings(tree["tokens"]))
def _load_tree(self, tree):
    tokens = split_strings(tree["tokens"])
    super()._load_tree(tree, tokens)
    self._log.info("Mapping the keys order...")
    self._order = {k: i for i, k in enumerate(tokens)}
def _load_tree(self, tree: dict) -> None:
    super()._load_tree(tree)
    self._identifiers = set(split_strings(tree["identifiers"]))
def _load_tree(self, tree): self.id_to_element = split_strings(tree["elements"]) data, indptr = tree["data"], tree["indptr"] self.communities = [data[i:j] for i, j in zip(indptr, indptr[1:])]
def _load_tree_kwargs(self, tree: dict):
    return {
        "documents": split_strings(tree["documents"]),
        "matrix": assemble_sparse_matrix(tree["matrix"]),
        "tokens": split_strings(tree["tokens"]),
    }
def _load_tree(self, tree): self.id_to_cc = tree["cc"] self.id_to_cc[0] # do not remove - loads the array from disk self.id_to_element = split_strings(tree["elements"]) self.id_to_buckets = assemble_sparse_matrix(tree["buckets"])
def _load_tree_kwargs(self, tree: dict):
    return dict(documents=split_strings(tree["documents"]),
                matrix=assemble_sparse_matrix(tree["matrix"]),
                tokens=split_strings(tree["tokens"]))