def run_wild_test(_args):
    _args.rng = numpy.random.RandomState(_args.seed)
    _args.loaddata = load_data
    # Flags encoded in the circuit name control which model variant is built.
    if 'Graph' in _args.circuit:
        _args.graph = True
    if 'Add' in _args.circuit:
        _args.add = True
    if 'Weighted' in _args.circuit:
        _args.weighted = True
    _args.train_set, _args.valid_set, _args.test_set, _args.dicts = _args.loaddata(
        _args.train_path,
        _args.valid_path,
        num_entities=_args.num_entity,
        dep=_args.graph,
        train_dep=_args.train_graph,
        valid_dep=_args.valid_graph,
        add=_args.add)
    # Convert the datasets from Python lists to numpy arrays with context windows.
    _args.train_set, _args.valid_set, _args.test_set = batch_run_func(
        (_args.train_set, _args.valid_set, _args.test_set),
        conv_data_graph, _args.win_l, _args.win_r)
    print 'word dict size:', len(_args.dicts['words2idx'])
    print 'checking training data!'
    check_input(_args.train_set[:3], len(_args.dicts['words2idx']))
    print 'checking validation data!'
    check_input(_args.valid_set[:3], len(_args.dicts['words2idx']))
    print 'finished checking inputs!'
    word2idx = _args.dicts['words2idx']
    prepare_corpus(_args)
    if _args.emb_dir != 'RANDOM':
        print 'started loading embeddings from file', _args.emb_dir
        M_emb, _ = read_matrix_from_file(_args.emb_dir, word2idx)
        print 'global map size:', len(M_emb)
        emb_var = theano.shared(M_emb, name='emb_matrix')
        _args.emb_matrix = emb_var
        _args.emb_dim = len(M_emb[0])
        _args.wemb1_out_dim = _args.emb_dim
        if _args.fine_tuning:
            print 'fine tuning!'
            _args.emb_matrix.is_regularizable = True
    run_wild_prediction(_args)
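
# batch_run_func is defined elsewhere in the repo; based on its use above (one
# conversion function mapped over a tuple of datasets, with shared trailing
# arguments), a minimal sketch of the assumed behavior is:
def batch_run_func_sketch(datasets, func, *func_args):
    """Apply func(dataset, *func_args) to each dataset; return the results as a tuple."""
    return tuple(func(dataset, *func_args) for dataset in datasets)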
def run_single_corpus(_args):
    _args.rng = numpy.random.RandomState(_args.seed)
    _args.loaddata = load_data_cv
    if 'Graph' in _args.circuit:
        _args.graph = True
    if 'Add' in _args.circuit:
        _args.add = True
    if 'Weighted' in _args.circuit:
        _args.weighted = True
    # Load the dataset initially to build the word2idx dict.
    _args.train_set, _args.valid_set, _args.test_set, _args.dicts, _args.corpus_size = _args.loaddata(
        _args.data_dir,
        _args.total_fold,
        _args.dev_fold,
        _args.test_fold,
        num_entities=_args.num_entity,
        dep=_args.graph,
        content_fname=_args.content_file,
        dep_fname=_args.dependent_file,
        add=_args.add,
        additional=_args.additional)
    # Convert the datasets from Python lists to numpy arrays with context windows.
    _args.train_set, _args.valid_set, _args.test_set = batch_run_func(
        (_args.train_set, _args.valid_set, _args.test_set),
        conv_data_graph, _args.win_l, _args.win_r)
    print 'word dict size:', len(_args.dicts['words2idx'])
    print 'checking training data!'
    check_input(_args.train_set[:3], len(_args.dicts['words2idx']))
    print 'checking validation data!'
    check_input(_args.valid_set[:3], len(_args.dicts['words2idx']))
    print 'finished checking inputs!'
    word2idx = _args.dicts['words2idx']
    prepare_corpus(_args)
    if _args.emb_dir != 'RANDOM':
        print 'started loading embeddings from file', _args.emb_dir
        M_emb, _ = read_matrix_from_file(_args.emb_dir, word2idx)
        print 'global map size:', len(M_emb)
        emb_var = theano.shared(M_emb, name='emb_matrix')
        _args.emb_matrix = emb_var
        _args.emb_dim = len(M_emb[0])
        _args.wemb1_out_dim = _args.emb_dim
        if _args.fine_tuning:
            print 'fine tuning!'
            _args.emb_matrix.is_regularizable = True
    # Compile the model, then run training.
    cargs = compile_circuit(_args)
    run_epochs(_args, cargs)
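
# check_input is also defined elsewhere; the calls above pass the first few
# sub-arrays of a dataset plus the vocabulary size, so it presumably verifies
# that every word index lies inside the vocabulary. A hypothetical sketch of
# such a range check (assumes the sub-arrays hold integer indices):
def check_input_sketch(datasets, voc_size):
    for dataset in datasets:
        for sentence in dataset:
            idxs = numpy.asarray(sentence)
            assert idxs.min() >= 0 and idxs.max() < voc_size, 'word index out of range'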
def init_emb(_args):
    if _args.emb_dir != 'RANDOM':
        print 'started loading embeddings from file', _args.emb_dir
        # Load pretrained embeddings for both the question words and the objects.
        Q_emb, _ = read_matrix_from_file(_args.emb_dir, _args.dicts['words2idx'])
        _args.qemb_matrix = theano.shared(Q_emb, name='qemb_matrix')
        O_emb, _ = read_matrix_from_file(_args.emb_dir, _args.dicts['objs2idx'])
        _args.oemb_matrix = theano.shared(O_emb, name='oemb_matrix')
        _args.emb_dim = len(Q_emb[0])
        _args.emb_out_dim = _args.emb_dim
        _args.question_emb_out_dim = _args.emb_dim
        _args.object_emb_out_dim = _args.emb_dim
        _args.attention_out_dim = _args.question_lstm_out_dim * 2 + _args.object_emb_out_dim
        if _args.fine_tuning:
            print 'fine tuning!'
            _args.qemb_matrix.is_trainable = True
            _args.oemb_matrix.is_trainable = True
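
# read_matrix_from_file is used throughout this file to align pretrained
# vectors with a word->index dict. Its exact file format is defined elsewhere;
# a minimal sketch, assuming whitespace-separated "word v1 v2 ..." lines and
# random initialization for out-of-vocabulary words, might look like:
def read_matrix_from_file_sketch(fname, word2idx):
    vecs = {}
    dim = None
    with open(fname) as fh:
        for line in fh:
            parts = line.rstrip().split()
            vecs[parts[0]] = numpy.asarray(parts[1:], dtype='float32')
            dim = len(parts) - 1
    # Rows for unseen words are drawn uniformly from a small interval.
    M = 0.1 * numpy.random.uniform(-1.0, 1.0, (len(word2idx), dim)).astype('float32')
    for word, idx in word2idx.iteritems():
        if word in vecs:
            M[idx] = vecs[word]
    return M, word2idx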
def train():
    '''Load the vocabulary, the datasets (train, dev) and the maximum degree, then train.'''
    num_entities = int(sys.argv[2])
    train_set, dev_set, dicts, max_degree, arc_type_dict = data_utils.read_RE_dataset_graph(
        DIR, 5, int(sys.argv[1]), num_entities)
    print 'train', len(train_set)
    print 'dev', len(dev_set)
    num_emb = len(dicts['words2idx'])
    num_labels = len(dicts['labels2idx'])
    print 'num emb', num_emb
    print 'num labels', num_labels
    random.seed(SEED)
    np.random.seed(SEED)
    # Initialize the model.
    model = get_model(num_emb, num_labels, max_degree, num_entities, len(arc_type_dict))
    print 'Finished initializing the model!'
    # Initialize the model embeddings from the pretrained vectors.
    M_emb, _ = read_matrix_from_file(EMB_DIR, dicts['words2idx'])
    model.embeddings.set_value(M_emb)
    model.vocab = dict((v, k) for k, v in dicts['words2idx'].iteritems())
    print 'Finished loading embeddings!'
    for epoch in xrange(NUM_EPOCHS):
        print 'epoch', epoch
        tic = time.time()
        avg_loss = train_dataset(model, train_set)
        print '\n>> Epoch completed in %.2f (sec) << avg loss: %.2f' % (
            time.time() - tic, avg_loss)
        train_score = evaluate_dataset(model, train_set)
        print 'train score', train_score
        dev_score = evaluate_dataset(model, dev_set)
        print 'dev score', dev_score
    print 'finished training'
def init_emb_multi(_args):
    _args.emb_matrices = []
    # Initialize word embeddings, either pretrained or random.
    if _args.emb_dir != 'RANDOM':
        print 'started loading embeddings from file', _args.emb_dir
        M_emb, _ = read_matrix_from_file(_args.emb_dir, _args.global_word_map)
        print 'global map size:', len(M_emb), len(_args.global_word_map)
    else:
        print 'randomly initializing the embeddings!'
        M_emb = numpy.random.rand(len(_args.global_word_map) + 2, _args.emb_out_dim)
    _args.emb_matrices.append(theano.shared(M_emb, name='wemb_matrix'))
    # Add randomly initialized POS-tag embeddings.
    P_emb = numpy.random.rand(len(_args.dicts['poss2idx']), _args.pos_emb_dim)
    _args.emb_matrices.append(theano.shared(P_emb, name='pemb_matrix'))
    # Word and POS embeddings are concatenated downstream.
    _args.emb_out_dim = M_emb.shape[1] + P_emb.shape[1]
    if _args.fine_tuning:
        print 'fine tuning!'
        for matrix in _args.emb_matrices:
            matrix.is_trainable = True
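
# Downstream, the two shared matrices built above are presumably indexed per
# token and concatenated, which is why emb_out_dim is set to the sum of the
# word and POS dimensions. A hypothetical Theano sketch of that step:
import theano.tensor as T

def concat_embeddings_sketch(emb_matrices, word_idxs, pos_idxs):
    """word_idxs, pos_idxs: int vectors; returns an (n_tokens x emb_out_dim) matrix."""
    wemb = emb_matrices[0][word_idxs]
    pemb = emb_matrices[1][pos_idxs]
    return T.concatenate([wemb, pemb], axis=1)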
def load_all_data_multitask(_args):
    # Load the three corpora: drug-gene, drug-variant, and drug-gene-variant.
    _args.loaddata = load_data_cv
    dataSets = []
    dataset_map = dict()
    lr_arr = []
    arc_type_dict = dict()
    _args.prediction_files = [_args.drug_gene_prediction_file,
                              _args.drug_var_prediction_file,
                              _args.triple_prediction_file]
    _args.num_entity_d0 = 2
    dataSets.append(_args.loaddata(_args.drug_gene_dir, _args.total_fold,
                                   _args.dev_fold, _args.test_fold, arc_type_dict,
                                   _args.num_entity_d0, dep=_args.graph,
                                   content_fname=_args.content_file,
                                   dep_fname=_args.dependent_file, add=_args.add))
    dataset_map['drug_gene'] = len(dataset_map)
    lr_arr.append(_args.dg_lr)
    _args.num_entity_d1 = 2
    dataSets.append(_args.loaddata(_args.drug_variant_dir, _args.total_fold,
                                   _args.dev_fold, _args.test_fold, arc_type_dict,
                                   _args.num_entity_d1, dep=_args.graph,
                                   content_fname=_args.content_file,
                                   dep_fname=_args.dependent_file, add=_args.add))
    dataset_map['drug_variant'] = len(dataset_map)
    lr_arr.append(_args.dv_lr)
    _args.num_entity_d2 = 3
    dataSets.append(_args.loaddata(_args.drug_gene_variant_dir, _args.total_fold,
                                   _args.dev_fold, _args.test_fold, arc_type_dict,
                                   _args.num_entity_d2, dep=_args.graph,
                                   content_fname=_args.content_file,
                                   dep_fname=_args.dependent_file, add=_args.add))
    dataset_map['drug_gene_variant'] = len(dataset_map)
    lr_arr.append(_args.dgv_lr)
    # Build a global word map over all corpora, then load embeddings for it.
    _args.global_word_map = dict()
    for ds in dataSets:
        _args.global_word_map = combine_word_dicts(_args.global_word_map,
                                                   ds[-1]['words2idx'])
    if _args.emb_dir != 'RANDOM':
        print 'started loading embeddings from file', _args.emb_dir
        M_emb, _ = read_matrix_from_file(_args.emb_dir, _args.global_word_map)
        print 'global map size:', len(M_emb), len(_args.global_word_map)
        _args.emb_matrix = theano.shared(M_emb, name='emb_matrix')
        _args.emb_dim = len(M_emb[0])
        _args.wemb1_out_dim = _args.emb_dim
        if _args.fine_tuning:
            print 'fine tuning!'
            _args.emb_matrix.is_regularizable = True
    print 'finished loading data; dataset map:', dataset_map
    return dataSets, lr_arr, dataset_map
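
# combine_word_dicts is assumed to merge per-corpus word->index maps into a
# single global map, assigning fresh indices to words not seen before. A
# minimal sketch consistent with its use above:
def combine_word_dicts_sketch(dict_a, dict_b):
    combined = dict(dict_a)
    for word in dict_b:
        if word not in combined:
            combined[word] = len(combined)
    return combined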
def main(_args):
    if _args.only_test:
        tester = Tester(_args)
        res_test, pred_test = tester.run(_args)
        exit(0)
    print "loading data from:", _args.training_data, _args.valid_data, _args.test_data
    train_set, valid_set, test_set, dic = loaddata(
        _args.training_data, _args.valid_data, _args.test_data,
        feature_thresh=_args.ner_feature_thresh,
        mode=_args.emb_type,
        test_label=_args.eval_test)
    _args.label2idx = dic['labels2idx']
    _args.word2idx = dic['words2idx']
    _args.feature2idx = dic['features2idx']
    _args.win_l = -(_args.win // 2)
    _args.win_r = _args.win // 2
    train_set, valid_set, test_set = batch_run_func(
        (train_set, valid_set, test_set), conv_data,
        _args.win_l, _args.win_r, len(_args.feature2idx), len(_args.label2idx))
    _args.wemb1_win = _args.win
    print _args.label2idx
    nclasses = len(_args.label2idx)
    nsentences = len(train_set[1])
    numpy.random.seed(_args.seed)
    random.seed(_args.seed)
    _args.y_dim = nclasses
    _args.vocsize = len(_args.word2idx)
    _args.featsize = len(_args.feature2idx)
    _args.feature_size = _args.featsize + 1
    _args.voc_size = _args.vocsize
    if _args.circuit == 'plainOrderOneCRF':
        _args.emission_trans_out_dim = nclasses
        _args.emb_output_transform_out_dim = nclasses
        _args.model = 'crf'
        print 'emission_trans_out_dim:', _args.emission_trans_out_dim
    else:
        raise NotImplementedError
    _args.nsentences = nsentences
    # Evaluate all training- and topology-related parameters; this way we can
    # inject code from the command line.
    for a in TOPO_PARAM + TRAIN_PARAM:
        try:
            _args.__dict__[a] = eval(_args.__dict__[a])
        except:
            pass
    if _args.use_emb and _args.emb_init != 'RANDOM':
        M_emb, idx_map = read_matrix_from_file(_args.emb_file, _args.word2idx)
        emb_var = theano.shared(M_emb, name='emb_matrix')
        _args.emb_matrix = emb_var
        _args.emb_dim = len(M_emb[0])
        print 'embedding size:', _args.emb_dim
        _args.emb_matrix.is_regularizable = False
        if _args.fine_tuning:
            print 'fine tuning!'
            _args.emb_matrix.is_regularizable = True
    best_f1 = -numpy.inf
    param = dict(clr=_args.lr, ce=0, be=0)
    # Create the circuit.
    (_args.f_cost, _args.f_update, _args.f_classify, cargs) = create_circuit(_args, StackConfig)
    _args.idx2label = dict((k, v) for v, k in _args.label2idx.iteritems())
    _args.idx2word = dict((k, v) for v, k in _args.word2idx.iteritems())
    groundtruth_valid = convert_id_to_word(valid_set[2], _args.idx2label)
    groundtruth_test = None
    if _args.eval_test:
        groundtruth_test = convert_id_to_word(test_set[2], _args.idx2label)
    epoch_id = -1
    while epoch_id + 1 < _args.nepochs:
        epoch_id += 1
        # Train for one epoch, then evaluate every neval_epochs epochs.
        train_seq(train_set[1], train_set[0], train_set[2], _args,
                  _args.f_cost, _args.f_update, epoch_id, param['clr'])
        if epoch_id % _args.neval_epochs == 0:
            groundtruth_train = convert_id_to_word(train_set[2], _args.idx2label)
            res_train, pred_train = predict(train_set[0], train_set[1], _args,
                                            groundtruth_train)
            res_valid, pred_valid = predict(valid_set[0], valid_set[1], _args,
                                            groundtruth_valid)
            print('TEST: epoch', epoch_id,
                  'train F1', res_train['f1'],
                  'valid F1', res_valid['f1'])
            if _args.eval_test:
                res_test, pred_test = predict(test_set[0], test_set[1], _args,
                                              groundtruth_test)
                print 'test F1', res_test['f1']
            # If this update created a new best model, save it.
            if res_valid['f1'] > best_f1:
                best_f1 = res_valid['f1']
                param['be'] = epoch_id
                param['vf1'] = res_valid['f1']
                param['vp'] = res_valid['p']
                param['vr'] = res_valid['r']
                if _args.eval_test:
                    param['tf1'] = res_test['f1']
                    param['tp'] = res_test['p']
                    param['tr'] = res_test['r']
                print "saving parameters!"
                cargs['f_classify'] = _args.f_classify
                save_parameters(_args.save_model_param, cargs)
        # Decay the learning rate if there was no improvement for decay_epochs epochs.
        if _args.decay and (epoch_id - param['be']) >= _args.decay_epochs and (
                epoch_id - param['be']) % _args.decay_epochs == 0:
            param['clr'] *= 0.5
        # If the learning rate falls below the minimum, stop training.
        if param['clr'] < _args.minimum_lr:
            print "\nLearning rate became too small, breaking out of training"
            break
    print('BEST RESULT: epoch', param['be'],
          'valid F1', param['vf1'], param['vp'], param['vr'])
    if _args.eval_test:
        print 'best test F1', param['tf1'], param['tp'], param['tr']
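
# convert_id_to_word is assumed to map sequences of indices back to strings
# through an idx->string dict, producing the ground-truth label sequences fed
# to predict above. A minimal sketch:
def convert_id_to_word_sketch(sequences, idx2item):
    return [[idx2item[idx] for idx in seq] for seq in sequences]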
def main(_args):
    if _args.only_test:
        cargs = {}
        print "loading parameters!"
        load_params(_args.save_model_param, cargs)
        test_feat, test_lex_orig, test_y = get_data(
            _args.test_data, cargs['feature2idx'], cargs['word2idx'],
            cargs['label2idx'], cargs['emb_type'],
            anno=None, has_label=_args.eval_test)
        test_feat, test_lex, test_y = conv_data(test_feat, test_lex_orig, test_y,
                                                cargs['win'], cargs['vocsize'])
        idx2label = dict((k, v) for v, k in cargs['label2idx'].iteritems())
        idx2word = dict((k, v) for v, k in cargs['word2idx'].iteritems())
        groundtruth_test = None
        if _args.eval_test:
            groundtruth_test = convert_id_to_word(test_y, idx2label)
        original_text = convert_id_to_word(test_lex_orig, idx2word)
        f_classify = cargs['f_classify']
        res_test, pred_test = predict(test_feat, test_lex, idx2label, idx2word,
                                      _args, f_classify, groundtruth_test)
        write_prediction(_args.test_data + '.prediction', _args.output_dir,
                         original_text, pred_test)
        exit(0)
    print "loading data from:", _args.training_data, _args.valid_data, _args.test_data
    train_set, valid_set, test_set, dicts = loaddata(
        _args.training_data, _args.valid_data, _args.test_data,
        feature_thresh=_args.ner_feature_thresh,
        mode=_args.emb_type, test_label=_args.eval_test)
    train_feat, train_lex_orig, train_y = train_set
    valid_feat, valid_lex_orig, valid_y = valid_set
    test_feat, test_lex_orig, test_y = test_set
    feature2idx = dicts['features2idx']
    word2idx = dicts['words2idx']
    label2idx = dicts['labels2idx']
    _args.label2idx = label2idx
    _args.word2idx = word2idx
    _args.feature2idx = feature2idx
    nclasses = len(label2idx)
    nsentences = len(train_lex_orig)
    numpy.random.seed(_args.seed)
    random.seed(_args.seed)
    _args.y_dim = nclasses
    _args.vocsize = len(feature2idx)
    _args.in_dim = _args.vocsize
    if _args.circuit == 'plainOrderOneCRF':
        _args.emission_trans_out_dim = nclasses
    _args.nsentences = nsentences
    # Evaluate all training- and topology-related parameters; this way we can
    # inject code from the command line.
    for a in TOPO_PARAM + TRAIN_PARAM:
        try:
            _args.__dict__[a] = eval(_args.__dict__[a])
        except:
            pass
    if _args.use_emb == 'true':
        M_emb, idx_map = read_matrix_from_file(_args.emb_file, word2idx)
        emb_var = theano.shared(M_emb, name='emb_matrix')
        _args.emb_matrix = emb_var
        _args.emb_dim = len(M_emb[0])
        print 'embedding size:', _args.emb_dim
        if _args.fine_tuning == 'true':
            print 'fine tuning!'
            _args.emb_matrix.is_regularizable = True
    train_feat, train_lex, train_y = conv_data(train_feat, train_lex_orig, train_y,
                                               _args.win, _args.vocsize)
    valid_feat, valid_lex, valid_y = conv_data(valid_feat, valid_lex_orig, valid_y,
                                               _args.win, _args.vocsize)
    test_feat, test_lex, test_y = conv_data(test_feat, test_lex_orig, test_y,
                                            _args.win, _args.vocsize)
    best_f1 = -numpy.inf
    # Initialize last_decay here so the decay check below cannot hit a KeyError
    # before the first new-best epoch.
    param = dict(clr=_args.lr, ce=0, be=0, last_decay=0)
    # Create the circuit.
    (f_cost, f_update, f_classify, f_debug, cargs) = create_circuit(_args, StackConfig)
    idx2label = dict((k, v) for v, k in _args.label2idx.iteritems())
    idx2word = dict((k, v) for v, k in _args.word2idx.iteritems())
    groundtruth_valid = convert_id_to_word(valid_y, idx2label)
    groundtruth_test = None
    if _args.eval_test:
        groundtruth_test = convert_id_to_word(test_y, idx2label)
    epoch_id = -1
    while epoch_id + 1 < _args.nepochs:
        epoch_id += 1
        # Train for one epoch, then evaluate every neval_epochs epochs.
        train(train_feat, train_lex, train_y, _args, f_cost, f_update, f_debug,
              epoch_id, param['clr'])
        if epoch_id % _args.neval_epochs == 0:
            groundtruth_train = convert_id_to_word(train_y, idx2label)
            res_train, pred_train = predict(train_feat, train_lex, idx2label,
                                            idx2word, _args, f_classify,
                                            groundtruth_train)
            res_valid, pred_valid = predict(valid_feat, valid_lex, idx2label,
                                            idx2word, _args, f_classify,
                                            groundtruth_valid)
            res_test, pred_test = predict(test_feat, test_lex, idx2label,
                                          idx2word, _args, f_classify,
                                          groundtruth_test)
            print('TEST: epoch', epoch_id,
                  'train F1', res_train['f1'],
                  'valid F1', res_valid['f1'])
            if _args.eval_test:
                print 'test F1', res_test['f1']
            # If this update created a new best model, save it.
            if res_valid['f1'] > best_f1:
                best_f1 = res_valid['f1']
                param['be'] = epoch_id
                param['last_decay'] = epoch_id
                param['vf1'] = res_valid['f1']
                param['vp'] = res_valid['p']
                param['vr'] = res_valid['r']
                if _args.eval_test:
                    param['tf1'] = res_test['f1']
                    param['tp'] = res_test['p']
                    param['tr'] = res_test['r']
                print "saving parameters!"
                cargs['f_classify'] = f_classify
                save_parameters(_args.save_model_param, cargs)
        # Decay the learning rate if there was no improvement for decay_epochs epochs.
        if _args.decay and (epoch_id - param['last_decay']) >= _args.decay_epochs:
            print 'learning rate decay at epoch', epoch_id
            param['last_decay'] = epoch_id
            param['clr'] *= 0.5
        # If the learning rate falls below the minimum, stop training.
        if param['clr'] < _args.minimum_lr:
            print "\nLearning rate became too small, breaking out of training"
            break
    print('BEST RESULT: epoch', param['be'],
          'valid F1', param['vf1'], param['vp'], param['vr'])
    if _args.eval_test:
        print 'best test F1', param['tf1'], param['tp'], param['tr']
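
# Both training loops above implement variants of the same learning-rate
# policy: halve clr whenever decay_epochs pass without a new best validation
# F1, and stop once it drops below minimum_lr. A standalone sketch of that
# policy (last_improvement is the epoch of the last new best, param['be'] or
# param['last_decay'] above):
def decayed_lr_sketch(clr, epoch_id, last_improvement, decay_epochs, minimum_lr):
    """Return (new_lr, should_stop)."""
    if epoch_id - last_improvement >= decay_epochs:
        clr *= 0.5
    return clr, clr < minimum_lr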