Example #1
    def test_shuffle_it(self):
        v = list(range(10))
        padding = -1

        b_it = batch_it(v, size=4, padding=True, padding_elem=padding)

        s_it = shuffle_it(b_it, 3)
        for elem in s_it:
            print(elem)
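shuffle_it appears to take an iterator plus a buffer size and to shuffle within that bounded buffer instead of materializing the whole stream; the second argument (3 here) looks like that buffer size, and the later examples pass it explicitly as buffer_size. A minimal pure-Python sketch of the pattern, under that assumption and not the library's actual implementation:

import random

def buffered_shuffle(iterable, buffer_size):
    """Yield items in locally shuffled order using a fixed-size buffer."""
    buffer = []
    for item in iterable:
        buffer.append(item)
        if len(buffer) >= buffer_size:
            # once the buffer is full, emit one element chosen at random
            yield buffer.pop(random.randrange(len(buffer)))
    # drain the remaining buffered items in random order
    random.shuffle(buffer)
    yield from buffer

With a small buffer the output is only locally shuffled, which is why the pipelines below size it with args.shuffle_buffer_size.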
Example #2
def data_pipeline(data, epochs=1, batch_size=args.batch_size, shuffle=False):
    def chunk_fn(x):
        return chunk_it(x, chunk_size=batch_size * 1000)

    if epochs > 1:
        data = repeat_apply(chunk_fn, data, epochs)
    else:
        data = chunk_fn(data)

    if shuffle:
        data = shuffle_it(data, args.shuffle_buffer_size)

    data = batch_it(data, size=batch_size, padding=False)
    return data
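repeat_apply(chunk_fn, data, epochs) presumably re-applies the chunking function to a re-iterable source once per epoch and chains the results, since a plain iterator would be exhausted after the first pass. A rough pure-Python equivalent under that assumption (the name repeat_apply_sketch is hypothetical):

from itertools import chain

def repeat_apply_sketch(fn, data, n):
    # call fn(data) lazily for each epoch and chain the resulting iterators
    return chain.from_iterable(fn(data) for _ in range(n))

Example #3 below uses the same pattern on an hdf5 dataset, which can be traversed once per epoch in exactly this way.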
Example #3
def data_pipeline(hdf5_dataset, epochs=1, batch_size=args.batch_size, shuffle=args.shuffle):
    def chunk_fn(x):
        return chunk_it(x, chunk_size=batch_size * 1000)

    if epochs > 1:
        dataset = repeat_apply(chunk_fn, hdf5_dataset, epochs)
    else:
        dataset = chunk_fn(hdf5_dataset)

    if shuffle:
        dataset = shuffle_it(dataset, args.shuffle_buffer_size)

    # cannot pad because 0 might be a valid index and that screws our evaluation
    # padding = np.zeros([args.ngram_size], dtype=np.int64)
    # dataset = batch_it(dataset, size=batch_size, padding=True, padding_elem=padding)
    dataset = batch_it(dataset, size=batch_size, padding=False)
    return dataset
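The comment above disables padding because batch_it presumably fills the last incomplete batch with the padding element, and a padding row of zeros is indistinguishable from a real n-gram whose ids are all 0. A tiny numpy illustration of that ambiguity (not part of the pipeline, names hypothetical):

import numpy as np

ngram_size = 3
real_ngram = np.zeros([ngram_size], dtype=np.int64)   # a legitimate n-gram made of id 0
padding_row = np.zeros([ngram_size], dtype=np.int64)  # the padding row that was considered

# the two are identical, so padded rows could not be filtered out at evaluation time
assert np.array_equal(real_ngram, padding_row)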
Example #4
    def test_chain_shuffle(self):
        n_samples = 4
        repeat = 2
        v = np.arange(0, n_samples, 1)

        def chunk_fn(x): return chunk_it(x, chunk_size=2)

        # first chain is normal, second is shuffled from the two repetitions
        data_it = repeat_apply(chunk_fn, v, repeat)

        data_it = chain_it(data_it, shuffle_it(repeat_apply(chunk_fn, v, repeat), buffer_size=8))

        data = list(data_it)

        unique_data = np.unique(data)
        counts = np.unique(np.bincount(data))

        self.assertEqual(len(unique_data), 4)
        self.assertEqual(len(counts), 1)
        self.assertEqual(counts[0], 4)
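The assertions follow from simple counting: v holds 4 distinct values, repeat_apply with repeat=2 emits each value twice per stream, and chaining the plain stream with its shuffled copy doubles that again.

# counting check mirroring the assertions above (not part of the original test)
n_values, repeat, n_chained_streams = 4, 2, 2
expected_count = repeat * n_chained_streams  # occurrences of each value in the chained stream
assert expected_count == 4                   # hence len(counts) == 1 and counts[0] == 4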
Example #5
    def corpus_pipeline(corpus_stream,
                        n_gram_size=args.ngram_size,
                        epochs=1,
                        batch_size=args.batch_size,
                        shuffle=args.shuffle,
                        flatten=False):
        """ Corpus Processing Pipeline.

        Transforms the corpus reader (a stream of sentences or words) into a stream of n-gram batches.

        Args:
            n_gram_size: the size of the n-gram window
            corpus_stream: the stream of sentences or words
            epochs: number of epochs we want to iterate over this corpus
            batch_size: batch size for the n-gram batch
            shuffle: if true, shuffles the n-grams according to a buffer size
            flatten: if true, sliding windows are applied over a stream of words rather than within each sentence
                (n-grams can cross sentence boundaries)
        """

        if flatten:
            word_it = flatten_it(corpus_stream)
            n_grams = window_it(word_it, n_gram_size)
        else:
            sentence_n_grams = (window_it(sentence, n_gram_size)
                                for sentence in corpus_stream)
            n_grams = flatten_it(sentence_n_grams)

        # at this point n_grams is an iterator over n-gram word sequences
        # n_grams = ([vocab[w] for w in ngram] for ngram in n_grams)
        n_grams = ([index.get_id(w) for w in ngram] for ngram in n_grams)

        if epochs > 1:
            n_grams = repeat_it(n_grams, epochs)

        if shuffle:
            n_grams = shuffle_it(n_grams, args.shuffle_buffer_size)

        n_grams = batch_it(n_grams, size=batch_size, padding=False)
        return n_grams
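The flatten flag decides whether the sliding window runs inside each sentence or over the concatenated word stream, so only the flattened variant produces n-grams that cross sentence boundaries. A small self-contained illustration of the difference (plain Python, not using the library's window_it/flatten_it):

corpus = [["the", "cat", "sat"], ["on", "the", "mat"]]
n = 2

# within-sentence windows: bigrams never cross the sentence boundary
per_sentence = [tuple(s[i:i + n]) for s in corpus for i in range(len(s) - n + 1)]
# -> [('the', 'cat'), ('cat', 'sat'), ('on', 'the'), ('the', 'mat')]

# flattened stream: the boundary-crossing bigram ('sat', 'on') also appears
words = [w for s in corpus for w in s]
flattened = [tuple(words[i:i + n]) for i in range(len(words) - n + 1)]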
Example #6
v = np.random.randint(2, size=[N, M])

v[..., C] = 0
v[R, C] = 1

# print("data:\n", v)
# data pipeline
batch_size = 1
epochs = 4

data = np.concatenate([v, labels], -1)

data = repeat_it(data, 2)

data = shuffle_it(iter(data), buffer_size=batch_size * 4)
data = batch_it(data, batch_size)

label_layer = tx.Input(1)
in_layer = tx.Input(M)

f1 = tx.FC(in_layer, 512, activation=tf.nn.tanh)
f2 = tx.FC(f1, 512, activation=tf.nn.relu)
fm = tx.Highway(f1, f2, carry_gate=True)

out = tx.Linear(f2, 1)
out_prob = tx.Activation(out, fn=tx.sigmoid)

loss = tx.binary_cross_entropy(labels=label_layer.tensor, logits=out.tensor)

model = tx.Model(run_inputs=in_layer,
Example #7
    def test_nce_nrp(self):
        vocab_size = 1000
        k = 500
        s = 8
        embed_size = 128
        nce_samples = 10
        noise_ratio = 0.1
        use_nce = True

        vocab = [str(i) for i in range(vocab_size)]

        generator = Generator(k, s)
        sign_index = TrieSignIndex(generator,
                                   vocabulary=vocab,
                                   pregen_indexes=True)
        ris = [
            sign_index.get_ri(sign_index.get_sign(i))
            for i in range(len(sign_index))
        ]
        # ris = [generator.generate() for _ in range(vocab_size)]

        ri_tensor = ris_to_sp_tensor_value(ri_seq=ris,
                                           dim=k,
                                           all_positive=False)

        ri_tensor_input = tx.SparseInput(n_units=k, value=ri_tensor)

        if use_nce:
            label_inputs = tx.SparseInput(k, name="target_random_indices")
        else:
            label_inputs = [
                tx.Input(1, dtype=tf.int64, name="ids"),
                tx.InputParam(dtype=tf.int32,
                              value=vocab_size,
                              name="vocab_size")
            ]

        eval_label_inputs = [
            tx.Input(1, dtype=tf.int64, name="ids_eval"),
            tx.InputParam(dtype=tf.int32, value=vocab_size, name="vocab_size")
        ]

        model = NRP(
            run_inputs=tx.SparseInput(n_units=k, name="random_index_inputs"),
            label_inputs=label_inputs,
            eval_label_input=eval_label_inputs,
            ctx_size=2,
            # vocab_size=vocab_size,
            k_dim=k,
            ri_tensor_input=ri_tensor_input,  # current dictionary state
            embed_dim=embed_size,
            h_dim=128,
            num_h=1,
            h_activation=tx.relu,
            use_dropout=True,
            embed_dropout=True,
            keep_prob=0.70,
            use_nce=use_nce,
            nce_samples=nce_samples,
            nce_noise_amount=noise_ratio,
            noise_input=tx.SparseInput(k, name="noise"))

        tf.summary.histogram("embeddings", model.embeddings.weights)
        for h in model.h_layers:
            tf.summary.histogram("h", h.linear.weights)

        # model.eval_tensors.append(model.train_loss_tensors[0])
        runner = tx.ModelRunner(model)
        runner.set_log_dir("/tmp")
        runner.log_graph()

        options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
        # options = None
        runner.set_session(runtime_stats=True, run_options=options)

        # options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)

        # runner.config_optimizer(tf.train.GradientDescentOptimizer(learning_rate=0.005))#,
        # SGD with 0.025

        # lr = tx.InputParam(init_value=0.0002)
        lr = tx.InputParam(value=0.025)
        # runner.config_optimizer(tf.train.AdamOptimizer(learning_rate=lr.tensor, beta1=0.9), params=lr,
        runner.config_optimizer(
            tf.train.GradientDescentOptimizer(learning_rate=lr.tensor),
            optimizer_params=lr,
            global_gradient_op=False,
            # gradient_op=lambda grad: tf.clip_by_global_norm(grad, 10.0)[0])
            gradient_op=lambda grad: tf.clip_by_norm(grad, 1.0))

        data = np.array([[0, 2], [5, 7], [9, 8], [3, 4], [1, 9], [12, 8]])
        labels = np.array([[32], [56], [12], [2], [5], [23]])

        ppl_curve = []
        n = 256
        batch_size = 128

        dataset = np.column_stack((data, labels))
        # print(dataset)
        dataset = views.repeat_it([dataset], n)
        dataset = views.flatten_it(dataset)
        # shuffle with a buffer of 6 elements
        dataset = views.shuffle_it(dataset, 6)
        dataset = views.batch_it(dataset, batch_size)

        # print(np.array(list(dataset)))
        # d = list(views.take_it(1, views.shuffle_it(d, 4)))[0]

        data_stream = dataset

        for batch in tqdm(data_stream, total=n * len(labels) // batch_size):
            sample = np.array(batch)

            # context ids: all columns except the last label column, flattened to 1-d
            ctx = sample[:, :-1]
            ctx = ctx.flatten()
            ctx_ris = [sign_index.get_ri(sign_index.get_sign(i)) for i in ctx]
            ctx_ris = ris_to_sp_tensor_value(
                ctx_ris,
                dim=sign_index.feature_dim(),
                all_positive=not sign_index.generator.symmetric)
            lbl_ids = sample[:, -1:]
            lbl = lbl_ids.flatten()

            if use_nce:
                lbl_ris = [
                    sign_index.get_ri(sign_index.get_sign(i)) for i in lbl
                ]
                lbl_ris = ris_to_sp_tensor_value(
                    lbl_ris,
                    dim=sign_index.feature_dim(),
                    all_positive=not sign_index.generator.symmetric)

                noise = generate_noise(k_dim=k,
                                       batch_size=lbl_ris.dense_shape[0] *
                                       nce_samples,
                                       ratio=noise_ratio)
                runner.train(ctx_ris, [lbl_ris, noise],
                             output_loss=True,
                             write_summaries=True)
            else:
                runner.train(model_input_data=ctx_ris,
                             loss_input_data=lbl_ids,
                             output_loss=True,
                             write_summaries=True)

        runner.close_session()