def test_present_embeddings_run_server(self):
    """Check the files and the URL produced by ``present_embeddings``.

    ``present_embeddings`` is called on a temporary directory with the
    second argument set to ``True`` (presumably the "run server" switch,
    judging by the test name - confirm against its signature), one label
    ``"one"``, five string tokens ``"0".."4"`` and five ``(i, i)`` vectors.

    The test then verifies that:

    * ``id2vec.json`` exists and parses as JSON;
    * ``id2vec_meta.tsv`` lists the tokens one per line;
    * ``id2vec_data.tsv`` lists the vectors as tab-separated rows;
    * the captured stdout contains the projector URL which points at
      ``http://0.0.0.0:8000/id2vec.json``.

    ``shutil.which`` and the ``BROWSER`` environment variable are patched
    for the duration of the test and restored afterwards, after which
    ``web_server.stop()`` is called.
    """

    def sweded_which(prog):
        # Stub for shutil.which: report every program as missing.
        return None

    which = shutil.which
    # None means "BROWSER was not defined at all"; this must be told apart
    # from an empty value so that the environment can be restored exactly.
    browser = os.environ.get("BROWSER")
    try:
        # Patch inside the try block so that the finally clause always
        # undoes whatever part of the patching has been applied.
        shutil.which = sweded_which
        os.environ["BROWSER"] = ""
        with tempfile.TemporaryDirectory(
                prefix="sourced.ml-test-") as tmpdir:
            with captured_output() as (stdout, _, _):
                present_embeddings(tmpdir, True, ["one"],
                                   [str(i) for i in range(5)],
                                   [(i, i) for i in range(5)])
            with open(os.path.join(tmpdir, "id2vec.json")) as fin:
                # Only validity matters here: json.load raises on bad JSON.
                json.load(fin)
            with open(os.path.join(tmpdir, "id2vec_meta.tsv")) as fin:
                self.assertEqual(fin.read(), "0\n1\n2\n3\n4\n")
            with open(os.path.join(tmpdir, "id2vec_data.tsv")) as fin:
                self.assertEqual(fin.read(),
                                 "0\t0\n1\t1\n2\t2\n3\t3\n4\t4\n")
        self.assertIn(
            "\thttp://projector.tensorflow.org/?config=http://0.0.0.0:8000/id2vec.json\n",
            stdout.getvalue())
    finally:
        shutil.which = which
        if browser is None:
            # The variable did not exist before the test: remove it instead
            # of leaking an empty BROWSER into the rest of the process.
            os.environ.pop("BROWSER", None)
        else:
            os.environ["BROWSER"] = browser
        web_server.stop()
def test_empty(self):
    """Run ``main.main()`` without CLI arguments and expect usage text.

    ``sys.argv`` is reduced to the program name only and
    ``argparse.ArgumentParser.error`` is swapped for a no-op, so a parser
    error does not go through the stock handler (which terminates the
    process). Both are put back in the ``finally`` clause. The captured
    stdout must contain ``"usage:"``.
    """

    def silent_error(parser, message):
        # Replacement for ArgumentParser.error: swallow the error.
        return None

    saved_argv = sys.argv
    saved_error = argparse.ArgumentParser.error
    try:
        argparse.ArgumentParser.error = silent_error
        sys.argv = [main.__file__]
        with captured_output() as streams:
            stdout = streams[0]
            main.main()
    finally:
        sys.argv = saved_argv
        argparse.ArgumentParser.error = saved_error
    usage_output = stdout.getvalue()
    self.assertIn("usage:", usage_output)
def test_preprocess(self):
    """Run ``id2vec_preprocess`` and validate everything it writes.

    The preprocessing is executed in a temporary directory with the
    arguments built by ``default_preprocess_params(tmpdir, VOCAB)``.
    The test then checks, in order:

    1. nothing was written to the first two captured streams;
    2. exactly five files were produced (sums, vocabularies, one shard);
    3. row and column sums / vocabularies are identical, and the
       vocabulary equals the tokens of the document frequencies loaded
       from ``args.docfreq_in``;
    4. the single shard parses as a TensorFlow example with the expected
       features and the expected number of sparse values;
    5. for 128 randomly sampled tokens, the sub-matrix rebuilt from the
       shard equals the sub-matrix taken from the model at ``args.input``.

    NOTE(review): the source of this method was flattened; the nesting of
    the ``with`` blocks below is reconstructed - confirm against history.
    """
    # Imported lazily inside the test rather than at module import time.
    import tensorflow as tf
    with tempfile.TemporaryDirectory() as tmpdir:
        args = default_preprocess_params(tmpdir, VOCAB)
        with captured_output() as (out, err, log):
            id2vec_preprocess(args)
        # The first two captured streams must stay empty; `log` is not
        # inspected.
        self.assertFalse(out.getvalue())
        self.assertFalse(err.getvalue())
        self.assertEqual(sorted(os.listdir(tmpdir)), [
            "col_sums.txt", "col_vocab.txt", "row_sums.txt", "row_vocab.txt",
            "shard-000-000.pb"
        ])
        df = OrderedDocumentFrequencies().load(source=args.docfreq_in)
        self.assertEqual(len(df), VOCAB)
        # Row and column marginals must be textually identical.
        with open(os.path.join(tmpdir, "col_sums.txt")) as fin:
            col_sums = fin.read()
        with open(os.path.join(tmpdir, "row_sums.txt")) as fin:
            row_sums = fin.read()
        self.assertEqual(col_sums, row_sums)
        # Same for the vocabularies, which must also match df.tokens().
        with open(os.path.join(tmpdir, "col_vocab.txt")) as fin:
            col_vocab = fin.read()
        with open(os.path.join(tmpdir, "row_vocab.txt")) as fin:
            row_vocab = fin.read()
        self.assertEqual(col_vocab, row_vocab)
        self.assertEqual(row_vocab.split("\n"), df.tokens())
        for word in row_vocab.split("\n"):
            self.assertGreater(df[word], 0)
        # Build the parsing op from the raw bytes of the only shard.
        with open(os.path.join(tmpdir, "shard-000-000.pb"), "rb") as fin:
            features = tf.parse_single_example(
                fin.read(),
                features={
                    "global_row": tf.FixedLenFeature([VOCAB], dtype=tf.int64),
                    "global_col": tf.FixedLenFeature([VOCAB], dtype=tf.int64),
                    "sparse_local_row": tf.VarLenFeature(dtype=tf.int64),
                    "sparse_local_col": tf.VarLenFeature(dtype=tf.int64),
                    "sparse_value": tf.VarLenFeature(dtype=tf.float32)
                })
        # Evaluate all five parsed features in a single session run.
        with tf.Session() as session:
            global_row, global_col, local_row, local_col, value = session.run(
                [
                    features[n]
                    for n in ("global_row", "global_col", "sparse_local_row",
                              "sparse_local_col", "sparse_value")
                ])
        # The shard must reference every index in range(VOCAB) on both axes.
        self.assertEqual(set(range(VOCAB)), set(global_row))
        self.assertEqual(set(range(VOCAB)), set(global_col))
        # Expected number of stored sparse entries for the test fixture.
        nnz = 16001
        self.assertEqual(value.values.shape, (nnz, ))
        self.assertEqual(local_row.values.shape, (nnz, ))
        self.assertEqual(local_col.values.shape, (nnz, ))
        # Fixed seed so the sampled 128 tokens are reproducible.
        numpy.random.seed(0)
        all_tokens = row_vocab.split("\n")
        chosen_indices = numpy.random.choice(list(range(VOCAB)), 128, replace=False)
        chosen = [all_tokens[i] for i in chosen_indices]
        # Reference 128x128 matrix, filled from the input model below.
        freqs = numpy.zeros((len(chosen), ) * 2, dtype=int)
        # token -> position in the sample (built before `chosen` is turned
        # into a set, while it still has the sampling order).
        index = {w: i for i, w in enumerate(chosen)}
        chosen = set(chosen)
        with asdf.open(args.input) as model:
            matrix = assemble_sparse_matrix(model.tree["matrix"]).tocsr()
            tokens = split_strings(model.tree["tokens"])
            # Indices, in the input model's token order, of sampled tokens.
            interesting = {i for i, t in enumerate(tokens) if t in chosen}
            for y in interesting:
                row = matrix[y]
                yi = index[tokens[y]]
                # Accumulate only entries whose column is also sampled.
                for x, v in zip(row.indices, row.data):
                    if x in interesting:
                        freqs[yi, index[tokens[x]]] += v
        # Rebuild the full matrix from the shard: shard-local row/column
        # indices are translated through global_row / global_col.
        matrix = coo_matrix(
            (value.values, ([global_row[row] for row in local_row.values ],
                            [global_col[col] for col in local_col.values])),
            shape=(VOCAB, VOCAB))
        # Restrict to the sampled rows and columns and compare to the
        # reference built from the input model.
        matrix = matrix.tocsr()[chosen_indices][:, chosen_indices].todense(
        ).astype(int)
        self.assertTrue((matrix == freqs).all())