Example #1
 def _load_tree_kwargs(self, tree):
     return dict(repository=tree["repository"],
                 filenames=split_strings(tree["filenames"]),
                 uasts=[
                     type(self).parse_bblfsh_response(uast)
                     for uast in split_strings(tree["uasts"])
                 ])
Example #2
 def _load_tree(self, tree: dict) -> None:
     """
     Attaches the needed data from the tree.

     :param tree: asdf file tree.
     :return: None
     """
     self.construct(X=split_strings(tree["X"]),
                    y_text=split_strings(tree["y_text"]),
                    y_pos=tree["y_pos"],
                    y_uast=[
                        self.parse_bblfsh_response(response)
                        for response in split_strings(tree["y_uast"])
                    ])
Example #3
 def test_split_bytes(self):
     strings = split_strings({
         "strings": numpy.array([b"abcdef"]),
         "lengths": numpy.array([1, 2, 3]),
         "str": False
     })
     self.assertEqual(strings, [b"a", b"bc", b"def"])
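Taken together with Example #9 below, this test pins down what split_strings has to do: cut a single concatenated buffer back into pieces using the "lengths" array, returning bytes when "str" is False and decoded str otherwise. A minimal sketch of that behavior, written only from what these tests show (an illustration, not the library's actual implementation):
 def split_strings_sketch(tree):
     # Illustration only: rebuild the pieces from the concatenated buffer
     # and the "lengths" array; decode unless "str" is explicitly False.
     buf = tree["strings"][0] if len(tree["strings"]) else b""
     as_str = tree.get("str", True) is not False
     pieces, offset = [], 0
     for length in tree["lengths"]:
         piece = bytes(buf[offset:offset + length])
         pieces.append(piece.decode() if as_str else piece)
         offset += length
     return pieces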
Example #4
 def test_empty_split_save_load_merge(self):
     strings = []
     merged = merge_strings(strings)
     assert_array_equal(merged["strings"], numpy.array([], dtype="S1"))
     assert_array_equal(merged["lengths"], numpy.array([], dtype=int))
     self.assertIsNone(merged["str"])
     af = asdf.AsdfFile(merged)
     buffer = BytesIO()
     af.write_to(buffer)
     buffer.seek(0)
     af_loaded = asdf.open(buffer)
     strings_restored = split_strings(af_loaded.tree)
     self.assertEqual(strings, strings_restored)
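The same round trip works with non-empty data; the following usage sketch simply mirrors the calls in the test above (merge_strings and split_strings are assumed to be imported as in these tests, and the sample data is made up):
 def roundtrip_demo():
     # Hypothetical usage sketch: serialize a string list into an ASDF tree
     # and restore it, exactly as the empty-list test above does.
     # merge_strings / split_strings: same helpers as in the tests above.
     import asdf
     from io import BytesIO
     strings = ["foo", "bar", "baz"]          # made-up sample data
     buffer = BytesIO()
     asdf.AsdfFile(merge_strings(strings)).write_to(buffer)
     buffer.seek(0)
     restored = split_strings(asdf.open(buffer).tree)
     assert restored == strings
     return restored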
Example #5
 def _load_tree(self, tree):
     self.construct(embeddings=tree["embeddings"].copy(),
                    tokens=split_strings(tree["tokens"]))
Example #6
 def _load_tree(self, tree):
     self.construct(docs=tree["docs"],
                    tokens=split_strings(tree["tokens"]),
                    freqs=tree["freqs"])
Example #7
 def _load_tree(self, tree):
     self.construct(tokens=split_strings(tree["tokens"]),
                    matrix=assemble_sparse_matrix(tree["matrix"]))
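Once constructed this way, tokens indexes the rows of the sparse matrix, which is how the test in Example #8 below walks it. A hypothetical follow-up lookup of a single token's row (the token name is made up):
 # Hypothetical lookup once tokens and matrix are loaded:
 token_index = {t: i for i, t in enumerate(tokens)}
 row = matrix.tocsr()[token_index["some_token"]]
 for col, value in zip(row.indices, row.data):
     print(tokens[col], value)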
Example #8
 def test_preprocess(self):
     import tensorflow as tf
     with tempfile.TemporaryDirectory() as tmpdir:
         args = default_preprocess_params(tmpdir, VOCAB)
         with captured_output() as (out, err, log):
             id2vec_preprocess(args)
         self.assertFalse(out.getvalue())
         self.assertFalse(err.getvalue())
         self.assertEqual(sorted(os.listdir(tmpdir)), [
             "col_sums.txt", "col_vocab.txt", "row_sums.txt",
             "row_vocab.txt", "shard-000-000.pb"
         ])
         df = OrderedDocumentFrequencies().load(source=args.docfreq_in)
         self.assertEqual(len(df), VOCAB)
         with open(os.path.join(tmpdir, "col_sums.txt")) as fin:
             col_sums = fin.read()
         with open(os.path.join(tmpdir, "row_sums.txt")) as fin:
             row_sums = fin.read()
         self.assertEqual(col_sums, row_sums)
         with open(os.path.join(tmpdir, "col_vocab.txt")) as fin:
             col_vocab = fin.read()
         with open(os.path.join(tmpdir, "row_vocab.txt")) as fin:
             row_vocab = fin.read()
         self.assertEqual(col_vocab, row_vocab)
         self.assertEqual(row_vocab.split("\n"), df.tokens())
         for word in row_vocab.split("\n"):
             self.assertGreater(df[word], 0)
         with open(os.path.join(tmpdir, "shard-000-000.pb"), "rb") as fin:
             features = tf.parse_single_example(
                 fin.read(),
                 features={
                     "global_row": tf.FixedLenFeature([VOCAB],
                                                      dtype=tf.int64),
                     "global_col": tf.FixedLenFeature([VOCAB],
                                                      dtype=tf.int64),
                     "sparse_local_row": tf.VarLenFeature(dtype=tf.int64),
                     "sparse_local_col": tf.VarLenFeature(dtype=tf.int64),
                     "sparse_value": tf.VarLenFeature(dtype=tf.float32)
                 })
         with tf.Session() as session:
             global_row, global_col, local_row, local_col, value = session.run(
                 [
                     features[n]
                     for n in ("global_row", "global_col",
                               "sparse_local_row", "sparse_local_col",
                               "sparse_value")
                 ])
         self.assertEqual(set(range(VOCAB)), set(global_row))
         self.assertEqual(set(range(VOCAB)), set(global_col))
         nnz = 16001
         self.assertEqual(value.values.shape, (nnz, ))
         self.assertEqual(local_row.values.shape, (nnz, ))
         self.assertEqual(local_col.values.shape, (nnz, ))
         numpy.random.seed(0)
         all_tokens = row_vocab.split("\n")
         chosen_indices = numpy.random.choice(list(range(VOCAB)),
                                              128,
                                              replace=False)
         chosen = [all_tokens[i] for i in chosen_indices]
         freqs = numpy.zeros((len(chosen), ) * 2, dtype=int)
         index = {w: i for i, w in enumerate(chosen)}
         chosen = set(chosen)
         with asdf.open(args.input) as model:
             matrix = assemble_sparse_matrix(model.tree["matrix"]).tocsr()
             tokens = split_strings(model.tree["tokens"])
             interesting = {i for i, t in enumerate(tokens) if t in chosen}
             for y in interesting:
                 row = matrix[y]
                 yi = index[tokens[y]]
                 for x, v in zip(row.indices, row.data):
                     if x in interesting:
                         freqs[yi, index[tokens[x]]] += v
         matrix = coo_matrix(
             (value.values,
              ([global_row[row] for row in local_row.values],
               [global_col[col] for col in local_col.values])),
             shape=(VOCAB, VOCAB))
         matrix = matrix.tocsr()[chosen_indices][:, chosen_indices]
         matrix = matrix.todense().astype(int)
         self.assertTrue((matrix == freqs).all())
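The trickiest step above is the reconstruction at the end: shard-local sparse coordinates are mapped back to global vocabulary rows and columns through global_row and global_col before being compared against the frequencies read from the model. A toy illustration of that index mapping with made-up numbers:
 import numpy
 from scipy.sparse import coo_matrix

 # Made-up toy shard: shard row i corresponds to vocabulary row global_row[i].
 global_row = numpy.array([4, 7])
 global_col = numpy.array([1, 9])
 local_row = numpy.array([0, 0, 1])       # shard-local coordinates of non-zeros
 local_col = numpy.array([0, 1, 1])
 value = numpy.array([3.0, 1.0, 2.0])
 dense = coo_matrix(
     (value, (global_row[local_row], global_col[local_col])),
     shape=(10, 10)).toarray()
 # dense[4, 1] == 3.0, dense[4, 9] == 1.0, dense[7, 9] == 2.0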
Example #9
 def test_split_strings(self):
     strings = split_strings({
         "strings": numpy.array([b"abcdef"]),
         "lengths": numpy.array([1, 2, 3])
     })
     self.assertEqual(strings, ["a", "bc", "def"])
Example #10
 def _load_tree(self, tree: dict) -> None:
     self.construct(
         split_strings(tree["tokens"]),
         split_strings(tree["topics"]) if tree["topics"] else None,
         assemble_sparse_matrix(tree["matrix"]))
Example #11
 def _load_tree_kwargs(self, tree):
     return dict(repos=split_strings(tree["repos"]),
                 matrix=assemble_sparse_matrix(tree["matrix"]))
Example #12
 def _load_tree_kwargs(self, tree):
     tree_kwargs = super()._load_tree_kwargs(tree)
     tree_kwargs["tokens"] = split_strings(tree["tokens"])
     return tree_kwargs
Example #13
 def _load_tree_kwargs(self, tree):
     tree_kwargs = super(Source, self)._load_tree_kwargs(tree)
     tree_kwargs["sources"] = split_strings(tree["sources"])
     return tree_kwargs
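Examples #12 and #13 rely on the same layering: a base _load_tree_kwargs builds the shared keyword dict and each subclass augments it before construction. A self-contained sketch of that pattern (class names and fields here are hypothetical):
 class BaseSketch:
     def _load_tree_kwargs(self, tree):
         # Shared keyword arguments for every subclass.
         return dict(tokens=split_strings(tree["tokens"]))

 class SourceSketch(BaseSketch):
     def _load_tree_kwargs(self, tree):
         # Extend the base kwargs with subclass-specific fields.
         kwargs = super()._load_tree_kwargs(tree)
         kwargs["sources"] = split_strings(tree["sources"])
         return kwargs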