def test_basic():
    # Smoke test for the data loaders (adapted from Tao's `rationale.py`):
    # read the aspect-1 train/heldout review sets and the rationale
    # annotation file from their default locations.
    train_path = 'data/reviews.aspect1.train.txt.gz'
    train_x, train_y = myio.read_annotations(train_path)
    heldout_path = 'data/reviews.aspect1.heldout.txt.gz'
    dev_x, dev_y = myio.read_annotations(heldout_path)
    annotations_path = 'data/annotations.json'
    rationale_data = myio.read_rationales(annotations_path)
def main(): print args embedding_layer = None if args.embedding: assert args.embedding, "Pre-trained word embeddings required." embedding_layer = myio.create_embedding_layer(args.embedding) max_len = args.max_len if args.train: train_x, train_y = myio.read_annotations(args.train) train_words = set([word for x in train_x for word in x]) embedding_layer = EmbeddingLayer(n_d=args.hidden_dimension, vocab=["<unk>", "<padding>"] + list(train_words), oov="<unk>", fix_init_embs=False) train_x = [embedding_layer.map_to_ids(x)[:max_len] for x in train_x] if args.dev: dev_x, dev_y = myio.read_annotations(args.dev) dev_x = [embedding_layer.map_to_ids(x)[:max_len] for x in dev_x] if args.load_rationale: rationale_data = myio.read_rationales(args.load_rationale) for x in rationale_data: x["xids"] = embedding_layer.map_to_ids(x["x"]) if args.train: model = Model(args=args, embedding_layer=embedding_layer, nclasses=len(train_y[0])) model.ready() #debug_func2 = theano.function( # inputs = [ model.x, model.z ], # outputs = model.generator.logpz # ) #theano.printing.debugprint(debug_func2) #return model.train( (train_x, train_y), (dev_x, dev_y) if args.dev else None, None, #(test_x, test_y), rationale_data if args.load_rationale else None)
def main(): print args assert args.embedding, "Pre-trained word embeddings required." embedding_layer = myio.create_embedding_layer( args.embedding ) max_len = args.max_len if args.train: train_x, train_y = myio.read_annotations(args.train) train_x = [ embedding_layer.map_to_ids(x)[:max_len] for x in train_x ] if args.dev: dev_x, dev_y = myio.read_annotations(args.dev) dev_x = [ embedding_layer.map_to_ids(x)[:max_len] for x in dev_x ] if args.load_rationale: rationale_data = myio.read_rationales(args.load_rationale) for x in rationale_data: x["xids"] = embedding_layer.map_to_ids(x["x"]) if args.train: model = Model( args = args, embedding_layer = embedding_layer, nclasses = len(train_y[0]) ) model.ready() #debug_func2 = theano.function( # inputs = [ model.x, model.z ], # outputs = model.generator.logpz # ) #theano.printing.debugprint(debug_func2) #return model.train( (train_x, train_y), (dev_x, dev_y) if args.dev else None, None, #(test_x, test_y), rationale_data if args.load_rationale else None )
def main(): print args assert args.embedding, "Pre-trained word embeddings required." embedding_layer = myio.create_embedding_layer(args.embedding) max_len = args.max_len if args.train: train_x, train_y = myio.read_annotations(args.train) train_x = [embedding_layer.map_to_ids(x)[:max_len] for x in train_x] if args.dev: dev_x, dev_y = myio.read_annotations(args.dev) dev_x = [embedding_layer.map_to_ids(x)[:max_len] for x in dev_x] if args.load_rationale: rationale_data = myio.read_rationales(args.load_rationale) for x in rationale_data: x["xids"] = embedding_layer.map_to_ids(x["x"]) if args.train: model = Model(args=args, embedding_layer=embedding_layer, nclasses=len(train_y[0])) model.ready() #debug_func2 = theano.function( # inputs = [ model.x, model.z ], # outputs = model.generator.logpz # ) #theano.printing.debugprint(debug_func2) #return model.train( (train_x, train_y), (dev_x, dev_y) if args.dev else None, None, #(test_x, test_y), rationale_data if args.load_rationale else None)
def main(): print args set_default_rng_seed(args.seed) assert args.embedding, "Pre-trained word embeddings required." embedding_layer = myio.create_embedding_layer(args.embedding) max_len = args.max_len if args.train: train_x, train_y = myio.read_annotations(args.train) if args.debug: len_ = len(train_x) * args.debug len_ = int(len_) train_x = train_x[:len_] train_y = train_y[:len_] print 'train size: ', len(train_x) #, train_x[0], len(train_x[0]) #exit() train_x = [embedding_layer.map_to_ids(x)[:max_len] for x in train_x] if args.dev: dev_x, dev_y = myio.read_annotations(args.dev) if args.debug: len_ = len(dev_x) * args.debug len_ = int(len_) dev_x = dev_x[:len_] dev_x = dev_y[:len_] print 'train size: ', len(train_x) dev_x = [embedding_layer.map_to_ids(x)[:max_len] for x in dev_x] if args.load_rationale: rationale_data = myio.read_rationales(args.load_rationale) for x in rationale_data: x["xids"] = embedding_layer.map_to_ids(x["x"]) #print 'in main: ', args.seed if args.train: model = Model(args=args, embedding_layer=embedding_layer, nclasses=len(train_y[0])) if args.load_model: model.load_model(args.load_model, seed=args.seed, select_all=args.select_all) say("model loaded successfully.\n") else: model.ready() #say(" ready time nedded {} \n".format(time.time()-start_ready_time)) #debug_func2 = theano.function( # inputs = [ model.x, model.z ], # outputs = model.generator.logpz # ) #theano.printing.debugprint(debug_func2) #return model.train( (train_x, train_y), (dev_x, dev_y) if args.dev else None, None, #(test_x, test_y), rationale_data if args.load_rationale else None, trained_max_epochs=args.trained_max_epochs) if args.load_model and not args.dev and not args.train: model = Model(args=args, embedding_layer=embedding_layer, nclasses=-1) model.load_model(args.load_model, seed=args.seed, select_all=args.select_all) say("model loaded successfully.\n") sample_generator = theano.function( inputs=[model.x], outputs=model.z, #updates = model.generator.sample_updates ) 
sample_encoder = theano.function( inputs=[model.x, model.y, model.z], outputs=[ model.encoder.obj, model.encoder.loss, model.encoder.pred_diff ], #updates = model.generator.sample_updates ) # compile an evaluation function eval_func = theano.function( inputs=[model.x, model.y], outputs=[ model.z, model.encoder.obj, model.encoder.loss, model.encoder.pred_diff ], #updates = model.generator.sample_updates ) debug_func_enc = theano.function( inputs=[model.x, model.y], outputs=[ model.z, model.encoder.obj, model.encoder.loss, model.encoder.pred_diff ], #updates = model.generator.sample_updates ) debug_func_gen = theano.function( inputs=[model.x, model.y], outputs=[ model.z, model.encoder.obj, model.encoder.loss, model.encoder.pred_diff ], #updates = model.generator.sample_updates ) # compile a predictor function pred_func = theano.function( inputs=[model.x], outputs=[model.z, model.encoder.preds], #updates = model.generator.sample_updates ) # batching data padding_id = embedding_layer.vocab_map["<padding>"] if rationale_data is not None: valid_batches_x, valid_batches_y = myio.create_batches( [u["xids"] for u in rationale_data], [u["y"] for u in rationale_data], args.batch, padding_id, sort=False) # disable dropout model.dropout.set_value(0.0) if rationale_data is not None: #model.dropout.set_value(0.0) start_rational_time = time.time() r_mse, r_p1, r_prec1, r_prec2, gen_time, enc_time, prec_cal_time = model.evaluate_rationale( rationale_data, valid_batches_x, valid_batches_y, sample_generator, sample_encoder, eval_func) #valid_batches_y, eval_func) #model.dropout.set_value(dropout_prob) #say(("\ttest rationale mser={:.4f} p[1]r={:.2f} prec1={:.4f}" + # " prec2={:.4f} generator time={:.4f} encoder time={:.4f} total test time={:.4f}\n").format( # r_mse, # r_p1, # r_prec1, # r_prec2, # gen_time, # enc_time, # time.time() - start_rational_time #)) data = str('%.5f' % r_mse) + "\t" + str( '%4.2f' % r_p1) + "\t" + str('%4.4f' % r_prec1) + "\t" + str( '%4.4f' % r_prec2) + 
"\t" + str('%4.2f' % gen_time) + "\t" + str( '%4.2f' % enc_time) + "\t" + str( '%4.2f' % prec_cal_time) + "\t" + str( '%4.2f' % (time.time() - start_rational_time) ) + "\t" + str(args.sparsity) + "\t" + str( args.coherent) + "\t" + str( args.max_epochs) + "\t" + str( args.cur_epoch) with open(args.graph_data_path, 'a') as g_f: print 'writning to file: ', data g_f.write(data + "\n")
def main():
    """Train a rationale model, or evaluate a saved one on the dev set.

    Evaluation mode (``--load_model`` with ``--dev`` and no ``--train``)
    compiles Theano functions with ``givens={model.z: z_pred}`` and reports
    objective/loss/diff/p1 on the dev batches.
    """
    print(args)
    assert args.embedding, "Pre-trained word embeddings required."
    embedding_layer = myio.create_embedding_layer(args.embedding)
    max_len = args.max_len

    if args.train:
        train_x, train_y = myio.read_annotations(args.train)
        train_x = [embedding_layer.map_to_ids(seq)[:max_len]
                   for seq in train_x]

    if args.dev:
        dev_x, dev_y = myio.read_annotations(args.dev)
        dev_x = [embedding_layer.map_to_ids(seq)[:max_len] for seq in dev_x]

    if args.load_rationale:
        rationale_data = myio.read_rationales(args.load_rationale)
        for entry in rationale_data:
            entry["xids"] = embedding_layer.map_to_ids(entry["x"])

    if args.train:
        model = Model(args=args,
                      embedding_layer=embedding_layer,
                      nclasses=len(train_y[0]))
        model.ready()
        model.train((train_x, train_y),
                    (dev_x, dev_y) if args.dev else None,
                    None,  # test set unused
                    rationale_data if args.load_rationale else None)

    # Evaluation-only path: saved model scored on the dev set.
    if args.load_model and args.dev and not args.train:
        model = Model(args=None, embedding_layer=embedding_layer, nclasses=-1)
        model.load_model(args.load_model)
        say("model loaded successfully.\n")

        # Evaluation function; z is substituted with the generator's
        # prediction via `givens`.
        evaluate_fn = theano.function(
            inputs=[model.x, model.y],
            outputs=[model.z, model.generator.obj, model.generator.loss,
                     model.encoder.pred_diff],
            givens={model.z: model.generator.z_pred},
        )
        # Predictor function (compiled for parity; unused below).
        predict_fn = theano.function(
            inputs=[model.x],
            outputs=[model.z, model.encoder.preds],
            givens={model.z: model.generator.z_pred},
        )

        pad_id = embedding_layer.vocab_map["<padding>"]
        batches_x, batches_y = myio.create_batches(
            dev_x, dev_y, args.batch, pad_id)

        model.dropout.set_value(0.0)  # no dropout at eval time
        dev_obj, dev_loss, dev_diff, dev_p1 = model.evaluate_data(
            batches_x, batches_y, evaluate_fn, sampling=True)
        say("{} {} {} {}\n".format(dev_obj, dev_loss, dev_diff, dev_p1))
def main(): print args assert args.embedding, "Pre-trained word embeddings required." embedding_layer = myio.create_embedding_layer( args.embedding ) max_len = args.max_len if args.train: train_x, train_y = myio.read_annotations(args.train) train_x = [ embedding_layer.map_to_ids(x)[:max_len] for x in train_x ] if args.dev: dev_x, dev_y = myio.read_annotations(args.dev) dev_x = [ embedding_layer.map_to_ids(x)[:max_len] for x in dev_x ] if args.load_rationale: rationale_data = myio.read_rationales(args.load_rationale) for x in rationale_data: x["xids"] = embedding_layer.map_to_ids(x["x"]) if args.train: model = Model( args = args, embedding_layer = embedding_layer, nclasses = len(train_y[0]) ) model.ready() #debug_func2 = theano.function( # inputs = [ model.x, model.z ], # outputs = model.generator.logpz # ) #theano.printing.debugprint(debug_func2) #return model.train( (train_x, train_y), (dev_x, dev_y) if args.dev else None, None, #(test_x, test_y), rationale_data if args.load_rationale else None ) if args.load_model and args.dev and not args.train: model = Model( args = None, embedding_layer = embedding_layer, nclasses = -1 ) model.load_model(args.load_model) say("model loaded successfully.\n") # compile an evaluation function eval_func = theano.function( inputs = [ model.x, model.y ], outputs = [ model.z, model.encoder.obj, model.encoder.loss, model.encoder.pred_diff ], updates = model.generator.sample_updates ) # compile a predictor function pred_func = theano.function( inputs = [ model.x ], outputs = [ model.z, model.encoder.preds ], updates = model.generator.sample_updates ) # batching data padding_id = embedding_layer.vocab_map["<padding>"] dev_batches_x, dev_batches_y = myio.create_batches( dev_x, dev_y, args.batch, padding_id ) # disable dropout model.dropout.set_value(0.0) dev_obj, dev_loss, dev_diff, dev_p1 = model.evaluate_data( dev_batches_x, dev_batches_y, eval_func, sampling=True) say("{} {} {} {}\n".format(dev_obj, dev_loss, dev_diff, dev_p1))
def main():
    """Document/headline variant: inputs and targets use separate
    embedding layers and sentence-based length caps; trains a model or
    evaluates a saved one on the dev set."""
    assert args.embedding, "Pre-trained word embeddings required."
    embedding_layer = myio.create_embedding_layer(args.embedding)
    embedding_layer_y = myio.create_embedding_layer(args.embedding)

    # Length caps, in tokens, for documents (x) and headlines (y).
    max_len_x = args.sentence_length * args.max_sentences
    max_len_y = args.sentence_length_hl * args.max_sentences_hl

    if args.train:
        train_x, train_y = myio.read_docs(args.train)
        train_x = [embedding_layer.map_to_ids(doc)[:max_len_x]
                   for doc in train_x]
        train_y = [embedding_layer_y.map_to_ids(hl)[:max_len_y]
                   for hl in train_y]

    if args.dev:
        dev_x, dev_y = myio.read_docs(args.dev)
        dev_x = [embedding_layer.map_to_ids(doc)[:max_len_x]
                 for doc in dev_x]
        dev_y = [embedding_layer_y.map_to_ids(hl)[:max_len_y]
                 for hl in dev_y]

    if args.load_rationale:
        rationale_data = myio.read_rationales(args.load_rationale)
        for entry in rationale_data:
            entry["xids"] = embedding_layer.map_to_ids(entry["x"])

    if args.train:
        model = Model(args=args,
                      embedding_layer=embedding_layer,
                      embedding_layer_y=embedding_layer_y,
                      nclasses=len(train_y[0]))
        model.ready()
        model.train((train_x, train_y),
                    (dev_x, dev_y) if args.dev else None,
                    None,  # test set unused
                    rationale_data if args.load_rationale else None)

    # Evaluation-only path: saved model scored on the dev set.
    if args.load_model and args.dev and not args.train:
        model = Model(args=None, embedding_layer=embedding_layer, nclasses=-1)
        model.load_model(args.load_model)
        say("model loaded successfully.\n")

        # Evaluation function; sampling updates refresh the generator RNG.
        evaluate_fn = theano.function(
            inputs=[model.x, model.y],
            outputs=[model.z, model.encoder.obj, model.encoder.loss,
                     model.encoder.pred_diff],
            updates=model.generator.sample_updates,
        )
        # Predictor function (compiled for parity; unused below).
        predict_fn = theano.function(
            inputs=[model.x],
            outputs=[model.z, model.encoder.preds],
            updates=model.generator.sample_updates,
        )

        pad_id = embedding_layer.vocab_map["<padding>"]
        batches_x, batches_y = myio.create_batches(
            dev_x, dev_y, args.batch, pad_id)

        model.dropout.set_value(0.0)  # no dropout at eval time
        dev_obj, dev_loss, dev_diff, dev_p1 = model.evaluate_data(
            batches_x, batches_y, evaluate_fn, sampling=True)
        say("{} {} {} {}\n".format(dev_obj, dev_loss, dev_diff, dev_p1))