def test_get_sign(self):
    dim = 100
    act = 10
    gen = Generator(dim, act)
    signs = [str(i) for i in range(10)]
    sign_index = TrieSignIndex(gen, vocabulary=signs)

    for s in signs:
        self.assertTrue(sign_index.contains(s))
        id = sign_index.get_id(s)
        self.assertTrue(sign_index.contains_id(id))
        s2 = sign_index.get_sign(id)
        self.assertEqual(s, s2)

    # get sign for an id that doesn't exist
    id = 86
    s = sign_index.get_sign(id)
    self.assertEqual(s, None)
    self.assertFalse(sign_index.contains_id(id))

    self.assertEqual(len(sign_index.sign_trie), len(signs))
    self.assertTrue(sign_index.contains_id(len(signs) - 1))
    self.assertFalse(sign_index.contains_id(len(signs)))
def test_contains(self):
    dim = 100
    act = 10
    gen = Generator(dim, act)
    sign_index = TrieSignIndex(generator=gen)

    sign_index.add("0")
    self.assertTrue(sign_index.contains("0"))
    self.assertFalse(sign_index.contains("1"))

    sign_index.remove("0")
    self.assertFalse(sign_index.contains("0"))
def test_size(self):
    gen = Generator(100, 10)
    sign_index = TrieSignIndex(generator=gen)

    # adding elements should increase size
    self.assertEqual(len(sign_index), 0)
    sign_index.add("0")
    self.assertEqual(len(sign_index), 1)

    # duplicated elements are not added
    sign_index.add("0")
    self.assertEqual(len(sign_index), 1)

    sign_index.add("1")
    self.assertEqual(len(sign_index), 2)

    # removing elements should reduce size
    size_before = len(sign_index)
    sign_index.remove("0")
    size_after = len(sign_index)
    self.assertEqual(size_after, size_before - 1)
def test_save(self):
    dim = 100
    act = 10
    gen = Generator(dim, act)
    signs = [str(i) for i in range(10)]
    sign_index = TrieSignIndex(gen, vocabulary=signs)

    filename = "index.hdf5"
    directory = os.path.dirname(os.path.abspath(__file__))
    output_file = directory + "/" + filename
    self.assertFalse(os.path.exists(output_file))

    try:
        sign_index.save(output_file)
        self.assertTrue(os.path.exists(output_file))

        h5file = h5py.File(output_file, 'r')
        h5signs = h5file["signs"]
        h5ri = h5file["ri"]

        self.assertEqual(len(h5signs), len(signs))
        print(h5ri[0])
        print(h5ri.attrs["k"])
        print(h5ri.attrs["s"])
        print(h5ri.attrs["state"].tostring())

        h5file.close()
    except:
        raise
    finally:
        if os.path.exists(output_file):
            os.remove(output_file)
        self.assertFalse(os.path.exists(output_file))
def test_get_ri(self):
    dim = 100
    act = 10
    gen = Generator(dim, act)
    sign_index = TrieSignIndex(gen)

    sign_index.add("0")
    self.assertTrue(sign_index.contains("0"))

    ri0 = sign_index.get_ri("0")
    self.assertIsInstance(ri0, RandomIndex)
    self.assertEqual(ri0.dim, dim)
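# Conceptual sketch (not one of the library's classes): a random index of dimension `dim`
# with `act` active units is a sparse ternary vector, with half of the active positions
# set to +1 and half to -1. The helper below only illustrates that idea with numpy
# (assumed imported as np, as in test_load below); it is not the actual
# Generator/RandomIndex implementation.
def sketch_random_index(dim=100, act=10, rng=None):
    rng = rng or np.random.default_rng()
    vector = np.zeros(dim)
    # choose `act` distinct positions and split them into +1 and -1 units
    positions = rng.choice(dim, size=act, replace=False)
    vector[positions[:act // 2]] = 1.0
    vector[positions[act // 2:]] = -1.0
    return vector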
def test_load(self):
    """The ids should be the same when the index is loaded back up."""
    dim = 100
    act = 10
    gen = Generator(dim, act)
    signs1 = [str(i) for i in range(1000)]
    index1 = TrieSignIndex(gen, vocabulary=signs1)

    filename = "index.hdf5"
    directory = os.path.dirname(os.path.abspath(__file__))
    index_file = directory + "/" + filename
    self.assertFalse(os.path.exists(index_file))

    try:
        index1.save(index_file)
        self.assertTrue(os.path.exists(index_file))

        index2 = TrieSignIndex.load(index_file)
        self.assertEqual(len(index2), len(index1))

        for sign in signs1:
            self.assertTrue(index1.contains(sign))
            self.assertTrue(index2.contains(sign))

            id1 = index1.get_id(sign)
            id2 = index2.get_id(sign)
            self.assertEqual(id1, id2)

            ri1 = index1.get_ri(sign).to_vector()
            ri2 = index2.get_ri(sign).to_vector()
            np.testing.assert_array_equal(ri1, ri2)
    except:
        raise
    finally:
        if os.path.exists(index_file):
            os.remove(index_file)
        self.assertFalse(os.path.exists(index_file))
# iterates over lines but loads them as chunks
# n_rows = 100000
# sentences = chunk_it(corpus_dataset, n_rows=n_rows, chunk_size=20000)
n_rows = len(corpus_dataset)
sentences = chunk_it(corpus_dataset, chunk_size=100000)
pipeline = WaCKyPipe(datagen=sentences)

# ======================================================================================
# Load Vocabulary
# ======================================================================================
vocab_file = data_dir + "wacky_vocab_6M_spacy.hdf5"
vocab_hdf5 = h5py.File(vocab_file, 'r')

ri_gen = Generator(dim=k, num_active=s)
print("Loading Vocabulary...")
sign_index = TrieSignIndex(ri_gen,
                           list(vocab_hdf5["vocabulary"][:]),
                           pregen_indexes=False)

if subsampling:
    freq = TrieSignIndex.map_frequencies(list(vocab_hdf5["vocabulary"][:]),
                                         list(vocab_hdf5["frequencies"][:]),
                                         sign_index)
    total_freq = np.sum(vocab_hdf5["frequencies"])
print("done")

# ======================================================================================
# Neural Random Projections Model
# ======================================================================================
pos_labels = Input(n_units=k, name="yp")
neg_labels = Input(n_units=k, name="yn")
def run(**kwargs):
    arg_dict.from_dict(kwargs)
    args = arg_dict.to_namespace()

    # ======================================================================================
    # Load Corpus & Vocab
    # ======================================================================================
    corpus = PTBReader(path=args.corpus, mark_eos=args.mark_eos)
    corpus_stats = h5py.File(os.path.join(args.corpus, "ptb_stats.hdf5"), mode='r')

    ri_generator = Generator(dim=args.k_dim, num_active=args.s_active, symmetric=True)

    # vocab = marisa_trie.Trie(corpus_stats["vocabulary"])
    index = TrieSignIndex(generator=ri_generator,
                          vocabulary=corpus_stats["vocabulary"],
                          pregen_indexes=True)

    # for i in range(1000):
    #     w = index.get_sign(i)
    #     ri: RandomIndex = index.get_ri(w)
    #     print(w)
    #     print(ri)
    #     print(index.get_id(w))

    # pre-generate indices for the vocabulary; we could also do this iteratively, same thing
    # ris = [ri_generator.generate() for _ in range(len(vocab))]
    # print(vocab.keys())
    # index = TrieSignIndex(generator=ri_generator, vocabulary=vocab)

    # TODO: we could create the NRP model with NCE only, passing the random-index inputs to the
    # model dynamically. For inference and evaluation we could either use a dynamic encoding
    # process, or give the model the current ri tensor with all the known random indices,
    # provided there are no OOV words (words not seen during training).

    # table with random indices for all known symbols
    # ri_tensor = RandomIndexTensor.from_ri_list(ris, k=args.k_dim, s=args.s_active)

    def corpus_pipeline(corpus_stream,
                        n_gram_size=args.ngram_size,
                        epochs=1,
                        batch_size=args.batch_size,
                        shuffle=args.shuffle,
                        flatten=False):
        """Corpus Processing Pipeline.

        Transforms the corpus reader (a stream of sentences or words) into a stream of
        n-gram batches.
        Args:
            n_gram_size: the size of the n-gram window
            corpus_stream: the stream of sentences or words
            epochs: number of epochs we want to iterate over this corpus
            batch_size: batch size for the n-gram batch
            shuffle: if true, shuffles the n-grams according to a buffer size
            flatten: if true, sliding windows are applied over a stream of words rather than
                within each sentence (n-grams can cross sentence boundaries)
        """
        if flatten:
            word_it = flatten_it(corpus_stream)
            n_grams = window_it(word_it, n_gram_size)
        else:
            sentence_n_grams = (window_it(sentence, n_gram_size) for sentence in corpus_stream)
            n_grams = flatten_it(sentence_n_grams)

        # at this point this is an n-gram iterator
        # n_grams = ([vocab[w] for w in ngram] for ngram in n_grams)
        n_grams = ([index.get_id(w) for w in ngram] for ngram in n_grams)

        if epochs > 1:
            n_grams = repeat_it(n_grams, epochs)
        if shuffle:
            n_grams = shuffle_it(n_grams, args.shuffle_buffer_size)

        n_grams = batch_it(n_grams, size=batch_size, padding=False)
        return n_grams

    # print("counting dataset samples...")
    training_len = sum(1 for _ in corpus_pipeline(
        corpus.training_set(), batch_size=1, epochs=1, shuffle=False))
    validation_len = None
    test_len = None
    if args.eval_progress:
        validation_len = sum(1 for _ in corpus_pipeline(
            corpus.validation_set(), batch_size=1, epochs=1, shuffle=False))
        test_len = sum(1 for _ in corpus_pipeline(
            corpus.test_set(), batch_size=1, epochs=1, shuffle=False))
    # print("done")
    # print("dset len ", training_len)

    # ======================================================================================
    # Load Params, Prepare results assets
    # ======================================================================================
    # Experiment parameter summary
    res_param_filename = os.path.join(
        args.out_dir, "params_{id}_{run}.csv".format(id=args.id, run=args.run))
    with open(res_param_filename, "w") as param_file:
        writer = csv.DictWriter(f=param_file, fieldnames=arg_dict.keys())
        writer.writeheader()
        writer.writerow(arg_dict)
        param_file.flush()

    # make dir for model checkpoints
    if args.save_model:
        model_ckpt_dir = os.path.join(
            args.out_dir, "model_{id}_{run}".format(id=args.id, run=args.run))
        os.makedirs(model_ckpt_dir, exist_ok=True)
        model_path = os.path.join(
            model_ckpt_dir, "nnlm_{id}_{run}.ckpt".format(id=args.id, run=args.run))

    # start perplexity file
    ppl_header = ["id", "run", "epoch", "step", "lr", "dataset", "perplexity"]
    ppl_fname = os.path.join(
        args.out_dir, "perplexity_{id}_{run}.csv".format(id=args.id, run=args.run))

    ppl_file = open(ppl_fname, "w")
    ppl_writer = csv.DictWriter(f=ppl_file, fieldnames=ppl_header)
    ppl_writer.writeheader()

    # ======================================================================================
    # MODEL
    # ======================================================================================
    # Configure weight initializers based on activation functions
    if args.h_act == "relu":
        h_act = tx.relu
        h_init = tx.he_normal_init()
    elif args.h_act == "tanh":
        h_act = tx.tanh
        h_init = tx.glorot_uniform()
    elif args.h_act == "elu":
        h_act = tx.elu
        h_init = tx.he_normal_init()
    elif args.h_act == "selu":
        h_act = tf.nn.selu
        h_init = tx.glorot_uniform()

    # Configure embedding and logit weight initializers
    if args.embed_init == "normal":
        embed_init = tx.random_normal(mean=0., stddev=args.embed_init_val)
    elif args.embed_init == "uniform":
        embed_init = tx.random_uniform(minval=-args.embed_init_val,
                                       maxval=args.embed_init_val)

    if args.logit_init == "normal":
        logit_init = tx.random_normal(mean=0., stddev=args.logit_init_val)
    elif args.logit_init == "uniform":
== "uniform": logit_init = tx.random_uniform(minval=-args.logit_init_val, maxval=args.logit_init_val) f_init = None if args.use_f_predict: if args.f_init == "normal": f_init = tx.random_normal(mean=0., stddev=args.f_init_val) elif args.f_init == "uniform": f_init = tx.random_uniform(minval=-args.f_init_val, maxval=args.f_init_val) model = NRP(ctx_size=args.ngram_size - 1, sign_index=index, k_dim=args.k_dim, s_active=args.s_active, embed_dim=args.embed_dim, h_dim=args.h_dim, embed_init=embed_init, logit_init=logit_init, num_h=args.num_h, h_activation=h_act, h_init=h_init, use_dropout=args.dropout, embed_dropout=args.embed_dropout, keep_prob=args.keep_prob, l2_loss=args.l2_loss, l2_loss_coef=args.l2_loss_coef, f_init=f_init, embed_share=args.embed_share, logit_bias=args.logit_bias, use_nce=args.nce, nce_samples=args.nce_samples, nce_noise_amount=0.04) model_runner = tx.ModelRunner(model) # Input params can be changed during training by setting their value # lr_param = tx.InputParam(init_value=args.lr) lr_param = tensorx.train.EvalStepDecayParam( value=args.lr, improvement_threshold=args.eval_threshold, less_is_better=True, decay_rate=args.lr_decay_rate, decay_threshold=args.lr_decay_threshold) if args.optimizer == "sgd": optimizer = tf.train.GradientDescentOptimizer( learning_rate=lr_param.tensor) elif args.optimizer == "adam": optimizer = tf.train.AdamOptimizer(learning_rate=lr_param.tensor, beta1=args.optimizer_beta1, beta2=args.optimizer_beta2, epsilon=args.optimizer_epsilon) elif args.optimizer == "ams": optimizer = tx.AMSGrad(learning_rate=lr_param.tensor, beta1=args.optimizer_beta1, beta2=args.optimizer_beta2, epsilon=args.optimizer_epsilon) def clip_grad_global(grads): grads, _ = tf.clip_by_global_norm(grads, 12) return grads def clip_grad_local(grad): return tf.clip_by_norm(grad, args.clip_value) if args.clip_grads: if args.clip_local: clip_fn = clip_grad_local else: clip_fn = clip_grad_global if args.clip_grads: model_runner.config_optimizer(optimizer, optimizer_params=lr_param, gradient_op=clip_fn, global_gradient_op=not args.clip_local) else: model_runner.config_optimizer(optimizer, optimizer_params=lr_param) # ====================================================================================== # EVALUATION # ====================================================================================== def eval_model(runner, dataset_it, len_dataset=None, display_progress=False): if display_progress: pb = tqdm(total=len_dataset, ncols=60, position=1) batches_processed = 0 sum_loss = 0 for batch in dataset_it: batch = np.array(batch, dtype=np.int64) ctx = batch[:, :-1] target = batch[:, -1:] mean_loss = runner.eval(ctx, target) sum_loss += mean_loss if display_progress: pb.update(args.batch_size) batches_processed += 1 if display_progress: pb.close() return np.exp(sum_loss / batches_processed) def evaluation(runner: tx.ModelRunner, progress_bar, cur_epoch, step, display_progress=False): ppl_validation = eval_model( runner, corpus_pipeline(corpus.validation_set(), epochs=1, shuffle=False), validation_len, display_progress) res_row = { "id": args.id, "run": args.run, "epoch": cur_epoch, "step": step, "lr": lr_param.value, "dataset": "validation", "perplexity": ppl_validation } ppl_writer.writerow(res_row) if args.eval_test: # pb.write("[Eval Test Set]") ppl_test = eval_model( runner, corpus_pipeline(corpus.test_set(), epochs=1, shuffle=False), test_len, display_progress) res_row = { "id": args.id, "run": args.run, "epoch": cur_epoch, "step": step, "lr": lr_param.value, "dataset": "test", 
"perplexity": ppl_test } ppl_writer.writerow(res_row) ppl_file.flush() if args.eval_test: progress_bar.set_postfix({"test PPL ": ppl_test}) # pb.write("valid. ppl = {}".format(ppl_validation)) return ppl_validation # ====================================================================================== # TRAINING LOOP # ====================================================================================== # print("Starting TensorFlow Session") # preparing evaluation steps # I use ceil because I make sure we have padded batches at the end epoch_step = 0 global_step = 0 current_epoch = 0 patience = 0 cfg = tf.ConfigProto() cfg.gpu_options.allow_growth = True sess = tf.Session(config=cfg) model_runner.set_session(sess) model_runner.init_vars() progress = tqdm(total=training_len * args.epochs, position=args.pid + 1, disable=not args.display_progress) training_data = corpus_pipeline(corpus.training_set(), batch_size=args.batch_size, epochs=args.epochs, shuffle=args.shuffle) evaluations = [] try: for ngram_batch in training_data: epoch = progress.n // training_len + 1 # Start New Epoch if epoch != current_epoch: current_epoch = epoch epoch_step = 0 if args.display_progress: progress.set_postfix({"epoch": current_epoch}) # ================================================ # EVALUATION # ================================================ if epoch_step == 0: current_eval = evaluation(model_runner, progress, epoch, global_step, display_progress=args.eval_progress) evaluations.append(current_eval) lr_param.update(current_eval) # print(lr_param.eval_history) # print("improvement ", lr_param.eval_improvement()) if global_step > 0: if args.early_stop and epoch > 1: if lr_param.eval_improvement( ) < lr_param.improvement_threshold: if patience >= 3: break patience += 1 else: patience = 0 # ================================================ # TRAIN MODEL # ================================================ ngram_batch = np.array(ngram_batch, dtype=np.int64) ctx_ids = ngram_batch[:, :-1] word_ids = ngram_batch[:, -1:] model_runner.train(ctx_ids, word_ids) progress.update(args.batch_size) epoch_step += 1 global_step += 1 # if not early stop, evaluate last state of the model if not args.early_stop or patience < 3: current_eval = evaluation(model_runner, progress, epoch, epoch_step) evaluations.append(current_eval) ppl_file.close() if args.save_model: model_runner.save_model(model_name=model_path, step=global_step, write_state=False) model_runner.close_session() progress.close() tf.reset_default_graph() # return the best validation evaluation return min(evaluations) except Exception as e: traceback.print_exc() os.remove(ppl_file.name) os.remove(param_file.name) raise e
# n_rows = len(corpus_dataset)

# ======================================================================================
# Load Vocabulary
# ======================================================================================
if args.lemmas:
    vocab_file = data_dir + "bnc_vocab_lemma.hdf5"
else:
    vocab_file = data_dir + "bnc_vocab.hdf5"
vocab_hdf5 = h5py.File(vocab_file, 'r')

ri_gen = Generator(dim=k, num_active=s)
print("Loading Vocabulary...")
index = TrieSignIndex(ri_gen,
                      list(vocab_hdf5["vocabulary"][:]),
                      pregen_indexes=False)

if subsampling:
    freq = TrieSignIndex.map_frequencies(list(vocab_hdf5["vocabulary"][:]),
                                         list(vocab_hdf5["frequencies"][:]),
                                         index)
    total_freq = np.sum(vocab_hdf5["frequencies"])
print("done")

# ======================================================================================
# Neural Random Projections Model
# ======================================================================================
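# Illustrative sketch (not part of this script): one common way to use the frequency map
# above is word2vec-style subsampling, where a word with relative frequency f is kept in
# the training stream with probability min(1, sqrt(t / f)). The helper below is a
# hypothetical example of that idea; it only assumes the `freq` map (id -> count) and
# `total_freq` computed above, plus a threshold t.
def keep_probability(word_id, freq, total_freq, t=1e-5):
    """Probability of keeping `word_id` under word2vec-style frequency subsampling."""
    relative_freq = freq[word_id] / total_freq
    return min(1.0, np.sqrt(t / relative_freq))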
print("vocab loaded") print(t1 - t0) top10w = list(vocabulary[0:10]) top10f = list(frequencies[0:10]) top10ids = [trie.get(top10w[i]) for i in range(10)] top10w_trie = [trie.restore_key(i) for i in top10ids] print(top10w) print(top10f) print(top10w_trie) ri_gen = Generator(dim=1000, num_active=10) t0 = time.time() sign_index = TrieSignIndex(ri_gen, list(vocabulary[:])) t1 = time.time() print(t1 - t0) print(top10ids) top10w_index = [sign_index.get_sign(i) for i in top10ids] print(top10w_index) #test load top ten print("=============================================") index = TrieSignIndex(generator=ri_gen, vocabulary=top10w) print(top10w) top10ids = [index.get_id(w) for w in top10w] print(top10ids) freq = TrieSignIndex.map_frequencies(top10w, top10f, index) top10freq = [freq[i] for i in top10ids]
    result = pool.map(func=text_to_ri, iterable=args)
    pool.close()


if __name__ == '__main__':
    # model parameters
    max_sentences = 10000

    # corpus and output assets
    home = os.getenv("HOME")
    corpus_file = home + "/data/gold_standards/wacky_1M.hdf5"
    output_vectors = home + "/data/results/wacky_ri_1M.hdf5"

    # load sign index
    print("loading vocabulary")
    vocab_file = home + "/data/results/wacky_vocab_1M.hdf5"
    h5v = h5py.File(vocab_file, 'r')
    vocabulary = h5v["vocabulary"]
    frequencies = h5v["frequencies"]

    ri_gen = Generator(dim=1000, num_active=10)
    sign_index = TrieSignIndex(generator=ri_gen,
                               vocabulary=list(vocabulary[()]),
                               pregen_indexes=True)
    print("done")

    # index_filename = None
    parallel_ri(corpus_file, max_sentences, window_size=3, n_processes=16)
from deepsign.rp.index import TrieSignIndex as Index
from deepsign.data.corpora.toefl import TOEFLReader

import seaborn as sns

# model dir
home = os.getenv("HOME")
data_dir = home + "/data/gold_standards/"
result_dir = home + "/data/results/"
model_dir = result_dir + "nrp/300d_reg_embeddings/"

model_file = model_dir + "model_bnc"
embeddings_file = model_dir + "embeddings.npy"
index_file = model_dir + "index.hdf5"

# load index
print("loading word index")
index = Index.load(index_file)
print(len(index))

# load toefl
questions_file = data_dir + "toefl/questions.csv"
answers_file = data_dir + "toefl/answers.csv"
toefl = TOEFLReader(questions_file=questions_file, answers_file=answers_file)

# words in toefl and not in index
toefl_remove = set(w for w in toefl.words if not index.contains(w))

for (i, question) in enumerate(toefl.questions):
    qw = question[0]
    aw = question[1]
    # print(question)
    answer = toefl.answer(i)
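# Illustrative sketch (not from this script): TOEFL synonym questions are typically scored
# by picking the candidate whose embedding is closest, by cosine similarity, to the question
# word. The hypothetical helpers below assume numpy is imported as np, that `embeddings` is
# the array loaded from embeddings_file, and that rows are addressed by `index.get_id(w)`.
def cosine_similarity(u, v):
    return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))

def pick_answer(question_word, candidates, embeddings, index):
    """Return the position of the candidate most similar to the question word."""
    qv = embeddings[index.get_id(question_word)]
    scores = [cosine_similarity(qv, embeddings[index.get_id(c)]) for c in candidates]
    return int(np.argmax(scores))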
def test_nce_nrp(self):
    vocab_size = 1000
    k = 500
    s = 8
    embed_size = 128
    nce_samples = 10
    noise_ratio = 0.1
    use_nce = True

    vocab = [str(i) for i in range(vocab_size)]

    generator = Generator(k, s)
    sign_index = TrieSignIndex(generator, vocabulary=vocab, pregen_indexes=True)
    ris = [
        sign_index.get_ri(sign_index.get_sign(i))
        for i in range(len(sign_index))
    ]
    # ris = [generator.generate() for _ in range(vocab_size)]

    ri_tensor = ris_to_sp_tensor_value(ri_seq=ris, dim=k, all_positive=False)
    ri_tensor_input = tx.SparseInput(n_units=k, value=ri_tensor)

    if use_nce:
        label_inputs = tx.SparseInput(k, name="target_random_indices")
    else:
        label_inputs = [
            tx.Input(1, dtype=tf.int64, name="ids"),
            tx.InputParam(dtype=tf.int32, value=vocab_size, name="vocab_size")
        ]

    eval_label_inputs = [
        tx.Input(1, dtype=tf.int64, name="ids_eval"),
        tx.InputParam(dtype=tf.int32, value=vocab_size, name="vocab_size")
    ]

    model = NRP(
        run_inputs=tx.SparseInput(n_units=k, name="random_index_inputs"),
        label_inputs=label_inputs,
        eval_label_input=eval_label_inputs,
        ctx_size=2,
        # vocab_size=vocab_size,
        k_dim=k,
        ri_tensor_input=ri_tensor_input,  # current dictionary state
        embed_dim=embed_size,
        h_dim=128,
        num_h=1,
        h_activation=tx.relu,
        use_dropout=True,
        embed_dropout=True,
        keep_prob=0.70,
        use_nce=use_nce,
        nce_samples=nce_samples,
        nce_noise_amount=noise_ratio,
        noise_input=tx.SparseInput(k, name="noise"))

    tf.summary.histogram("embeddings", model.embeddings.weights)
    for h in model.h_layers:
        tf.summary.histogram("h", h.linear.weights)

    # model.eval_tensors.append(model.train_loss_tensors[0])
    runner = tx.ModelRunner(model)
    runner.set_log_dir("/tmp")
    runner.log_graph()

    options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
    # options = None
    runner.set_session(runtime_stats=True, run_options=options)

    # options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
    # runner.config_optimizer(tf.train.GradientDescentOptimizer(learning_rate=0.005))
    # SGD with 0.025
    # lr = tx.InputParam(init_value=0.0002)
    lr = tx.InputParam(value=0.025)
    # runner.config_optimizer(tf.train.AdamOptimizer(learning_rate=lr.tensor, beta1=0.9), params=lr,
    runner.config_optimizer(
        tf.train.GradientDescentOptimizer(learning_rate=lr.tensor),
        optimizer_params=lr,
        global_gradient_op=False,
        # gradient_op=lambda grad: tf.clip_by_global_norm(grad, 10.0)[0])
        gradient_op=lambda grad: tf.clip_by_norm(grad, 1.0))

    data = np.array([[0, 2], [5, 7], [9, 8], [3, 4], [1, 9], [12, 8]])
    labels = np.array([[32], [56], [12], [2], [5], [23]])

    ppl_curve = []
    n = 256
    batch_size = 128

    dataset = np.column_stack((data, labels))
    # print(dataset)
    dataset = views.repeat_it([dataset], n)
    dataset = views.flatten_it(dataset)
    # shuffle 5 at a time
    dataset = views.shuffle_it(dataset, 6)
    dataset = views.batch_it(dataset, batch_size)

    # print(np.array(list(dataset)))
    # d = list(views.take_it(1, views.shuffle_it(d, 4)))[0]

    data_stream = dataset

    for data_stream in tqdm(data_stream, total=n * 5 / batch_size):
        sample = np.array(data_stream)

        ctx = sample[:, :-1]
        ctx = ctx.flatten()
        ctx_ris = [sign_index.get_ri(sign_index.get_sign(i)) for i in ctx]
        ctx_ris = ris_to_sp_tensor_value(
            ctx_ris,
            dim=sign_index.feature_dim(),
            all_positive=not sign_index.generator.symmetric)

        lbl_ids = sample[:, -1:]
        lbl = lbl_ids.flatten()

        if use_nce:
            lbl_ris = [
                sign_index.get_ri(sign_index.get_sign(i)) for i in lbl
            ]
            lbl_ris = ris_to_sp_tensor_value(
                lbl_ris,
                dim=sign_index.feature_dim(),
                all_positive=not sign_index.generator.symmetric)

            noise = generate_noise(k_dim=k,
                                   batch_size=lbl_ris.dense_shape[0] * nce_samples,
                                   ratio=noise_ratio)

            runner.train(ctx_ris, [lbl_ris, noise],
                         output_loss=True,
                         write_summaries=True)
        else:
            runner.train(model_input_data=ctx_ris,
                         loss_input_data=lbl_ids,
                         output_loss=True,
                         write_summaries=True)

    runner.close_session()