def test_preprocess(self):
    import tensorflow as tf  # deferred import: TensorFlow is only needed by this test

    with tempfile.TemporaryDirectory() as tmpdir:
        args = default_preprocess_params(tmpdir, VOCAB)
        with captured_output() as (out, err, log):
            id2vec_preprocess(args)
        # Preprocessing must be silent and emit exactly the Swivel input files.
        self.assertFalse(out.getvalue())
        self.assertFalse(err.getvalue())
        self.assertEqual(sorted(os.listdir(tmpdir)), [
            "col_sums.txt", "col_vocab.txt", "row_sums.txt", "row_vocab.txt",
            "shard-000-000.pb"])
        df = OrderedDocumentFrequencies().load(source=args.docfreq_in)
        self.assertEqual(len(df), VOCAB)
        # The co-occurrence matrix is symmetric, so the row and column
        # marginals and vocabularies must coincide.
        with open(os.path.join(tmpdir, "col_sums.txt")) as fin:
            col_sums = fin.read()
        with open(os.path.join(tmpdir, "row_sums.txt")) as fin:
            row_sums = fin.read()
        self.assertEqual(col_sums, row_sums)
        with open(os.path.join(tmpdir, "col_vocab.txt")) as fin:
            col_vocab = fin.read()
        with open(os.path.join(tmpdir, "row_vocab.txt")) as fin:
            row_vocab = fin.read()
        self.assertEqual(col_vocab, row_vocab)
        # The vocabulary order must match the document frequency model, and
        # every token must have a positive frequency.
        self.assertEqual(row_vocab.split("\n"), df.tokens())
        for word in row_vocab.split("\n"):
            self.assertGreater(df[word], 0)
        # Parse the single shard and check the sparse co-occurrence payload.
        with open(os.path.join(tmpdir, "shard-000-000.pb"), "rb") as fin:
            features = tf.parse_single_example(fin.read(), features={
                "global_row": tf.FixedLenFeature([VOCAB], dtype=tf.int64),
                "global_col": tf.FixedLenFeature([VOCAB], dtype=tf.int64),
                "sparse_local_row": tf.VarLenFeature(dtype=tf.int64),
                "sparse_local_col": tf.VarLenFeature(dtype=tf.int64),
                "sparse_value": tf.VarLenFeature(dtype=tf.float32)})
        with tf.Session() as session:
            global_row, global_col, local_row, local_col, value = session.run(
                [features[n] for n in (
                    "global_row", "global_col", "sparse_local_row",
                    "sparse_local_col", "sparse_value")])
        self.assertEqual(set(range(VOCAB)), set(global_row))
        self.assertEqual(set(range(VOCAB)), set(global_col))
        nnz = 16001
        self.assertEqual(value.values.shape, (nnz,))
        self.assertEqual(local_row.values.shape, (nnz,))
        self.assertEqual(local_col.values.shape, (nnz,))
        # Cross-check a random 128-token submatrix against the frequencies
        # counted directly from the input model.
        numpy.random.seed(0)
        all_tokens = row_vocab.split("\n")
        chosen_indices = numpy.random.choice(
            list(range(VOCAB)), 128, replace=False)
        chosen = [all_tokens[i] for i in chosen_indices]
        freqs = numpy.zeros((len(chosen),) * 2, dtype=int)
        index = {w: i for i, w in enumerate(chosen)}
        chosen = set(chosen)
        with asdf.open(args.input) as model:
            matrix = assemble_sparse_matrix(model.tree["matrix"]).tocsr()
            tokens = split_strings(model.tree["tokens"])
            interesting = {i for i, t in enumerate(tokens) if t in chosen}
            for y in interesting:
                row = matrix[y]
                yi = index[tokens[y]]
                for x, v in zip(row.indices, row.data):
                    if x in interesting:
                        freqs[yi, index[tokens[x]]] += v
        # Translate the shard-local indices to global ones and compare.
        matrix = coo_matrix(
            (value.values,
             ([global_row[row] for row in local_row.values],
              [global_col[col] for col in local_col.values])),
            shape=(VOCAB, VOCAB))
        matrix = (matrix.tocsr()[chosen_indices][:, chosen_indices]
                  .todense().astype(int))
        self.assertTrue((matrix == freqs).all())
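
# A minimal standalone sketch of decoding one Swivel shard back into a dense
# co-occurrence matrix, assuming the same TF 1.x feature layout exercised by
# the test above; `shard_to_dense` and `shard_path` are hypothetical names
# introduced here for illustration, not part of the original code.
def shard_to_dense(shard_path, vocab):
    import tensorflow as tf
    from scipy.sparse import coo_matrix

    with open(shard_path, "rb") as fin:
        features = tf.parse_single_example(fin.read(), features={
            "global_row": tf.FixedLenFeature([vocab], dtype=tf.int64),
            "global_col": tf.FixedLenFeature([vocab], dtype=tf.int64),
            "sparse_local_row": tf.VarLenFeature(dtype=tf.int64),
            "sparse_local_col": tf.VarLenFeature(dtype=tf.int64),
            "sparse_value": tf.VarLenFeature(dtype=tf.float32)})
    with tf.Session() as session:
        global_row, global_col, local_row, local_col, value = session.run(
            [features[n] for n in (
                "global_row", "global_col", "sparse_local_row",
                "sparse_local_col", "sparse_value")])
    # Shard-local indices are positions into the global_* lookup vectors;
    # translating them yields global (row, col, value) triples.
    rows = [global_row[r] for r in local_row.values]
    cols = [global_col[c] for c in local_col.values]
    return coo_matrix((value.values, (rows, cols)),
                      shape=(vocab, vocab)).todense()
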
def __init__(self, output, df_model: OrderedDocumentFrequencies, **kwargs):
    super().__init__(**kwargs)
    # Cache the deterministic token order defined by the document frequency
    # model; it fixes the row/column layout of the generated Swivel input.
    self.tokens_list = df_model.tokens()
    self.output = output
    self.df_model = df_model
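
# A minimal sketch (not part of the original class) of how the cached token
# order could produce the identical row/column vocabulary files that
# test_preprocess checks above; `write_vocab` and `output_dir` are
# hypothetical names introduced here for illustration.
def write_vocab(output_dir, tokens_list):
    import os

    # Swivel uses the same vocabulary for rows and columns, one token per line.
    for name in ("row_vocab.txt", "col_vocab.txt"):
        with open(os.path.join(output_dir, name), "w") as fout:
            fout.write("\n".join(tokens_list))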