Example #1
def run_wild_test(_args):
    _args.rng = numpy.random.RandomState(_args.seed)
    _args.loaddata = load_data
    if 'Graph' in _args.circuit:
        _args.graph = True
    if 'Add' in _args.circuit:
        _args.add = True
    if 'Weighted' in _args.circuit:
        _args.weighted = True
    _args.train_set, _args.valid_set, _args.test_set, _args.dicts = _args.loaddata(
        _args.train_path,
        _args.valid_path,
        num_entities=_args.num_entity,
        dep=_args.graph,
        train_dep=_args.train_graph,
        valid_dep=_args.valid_graph,
        add=_args.add)
    # convert the data from Python lists to numpy arrays
    _args.train_set, _args.valid_set, _args.test_set = batch_run_func(
        (_args.train_set, _args.valid_set, _args.test_set), conv_data_graph,
        _args.win_l, _args.win_r)
    print 'word dict size:', len(_args.dicts['words2idx'])
    print 'checking training data!'
    check_input(_args.train_set[:3], len(_args.dicts['words2idx']))
    print 'checking validation data!'
    check_input(_args.valid_set[:3], len(_args.dicts['words2idx']))
    print 'finish check inputs!!!'
    word2idx = _args.dicts['words2idx']
    prepare_corpus(_args)
    if _args.emb_dir != 'RANDOM':
        print 'started loading embeddings from file', _args.emb_dir        
        M_emb, _ = read_matrix_from_file(_args.emb_dir, word2idx) 
        #M_emb, _ = read_matrix_from_gzip(_args.emb_dir, word2idx) 
        print 'global map size:', len(M_emb) #, count, 'of them are initialized from glove'
        emb_var = theano.shared(M_emb, name='emb_matrix')
        _args.emb_matrix = emb_var
        _args.emb_dim = len(M_emb[0])
        _args.wemb1_out_dim = _args.emb_dim
        if _args.fine_tuning:
            print 'fine tuning!!!!!'
            _args.emb_matrix.is_regularizable = True
    run_wild_prediction(_args) 
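
A recurring pattern in these drivers is switching model variants on substrings of the circuit name. A minimal, self-contained sketch of that pattern (the Options class, set_circuit_flags, and the circuit name are illustrative, not part of the original code; the originals only set each flag to True when the substring is present and rely on defaults otherwise):

# Sketch of the circuit-name flag pattern used above; names are hypothetical.
class Options(object):
    pass

def set_circuit_flags(opts, circuit_name):
    # Each substring of the circuit name toggles one model variant.
    opts.graph = 'Graph' in circuit_name
    opts.add = 'Add' in circuit_name
    opts.weighted = 'Weighted' in circuit_name
    return opts

opts = set_circuit_flags(Options(), 'WeightedGraphLSTM')
print opts.graph, opts.add, opts.weighted  # True False True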
Example #2
def run_single_corpus(_args):

    _args.rng = numpy.random.RandomState(_args.seed)
    _args.loaddata = load_data_cv
    if 'Graph' in _args.circuit:
        _args.graph = True
    if 'Add' in _args.circuit:
        _args.add = True
    if 'Weighted' in _args.circuit:
        _args.weighted = True

    # load the dataset initially to build the word2vec dict
    _args.train_set, _args.valid_set, _args.test_set, _args.dicts, _args.corpus_size = _args.loaddata(
        _args.data_dir,
        _args.total_fold,
        _args.dev_fold,
        _args.test_fold,
        num_entities=_args.num_entity,
        dep=_args.graph,
        content_fname=_args.content_file,
        dep_fname=_args.dependent_file,
        add=_args.add,
        additional=_args.additional)
    # convert the data from Python lists to numpy arrays
    _args.train_set, _args.valid_set, _args.test_set = batch_run_func(
        (_args.train_set, _args.valid_set, _args.test_set), conv_data_graph,
        _args.win_l, _args.win_r)
    print 'word dict size:', len(_args.dicts['words2idx'])
    print 'checking training data!'
    check_input(_args.train_set[:3], len(_args.dicts['words2idx']))
    print 'checking validation data!'
    check_input(_args.valid_set[:3], len(_args.dicts['words2idx']))
    print 'finish check inputs!!!'
    word2idx = _args.dicts['words2idx']
    prepare_corpus(_args)

    #for k, v in word2idx.iteritems():
    #    print k, v
    if _args.emb_dir != 'RANDOM':
        print 'started loading embeddings from file', _args.emb_dir
        M_emb, _ = read_matrix_from_file(_args.emb_dir, word2idx)
        #M_emb, _ = read_matrix_from_gzip(_args.emb_dir, word2idx)
        print 'global map size:', len(
            M_emb)  #, count, 'of them are initialized from glove'
        emb_var = theano.shared(M_emb, name='emb_matrix')
        _args.emb_matrix = emb_var
        _args.emb_dim = len(M_emb[0])
        _args.wemb1_out_dim = _args.emb_dim
        if _args.fine_tuning:
            print 'fine tuning!!!!!'
            _args.emb_matrix.is_regularizable = True

    # compile the model and initialize the model
    cargs = compile_circuit(_args)

    run_epochs(_args, cargs)
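
load_data_cv takes total_fold, dev_fold, and test_fold indices, i.e., a cross-validation split where one fold serves as the dev set and one as the test set. A toy sketch of one plausible fold assignment (round-robin by example index; the real load_data_cv may split differently):

# Hypothetical fold split consistent with the (total_fold, dev_fold, test_fold)
# signature above; the actual assignment in load_data_cv may differ.
def split_by_fold(examples, total_fold, dev_fold, test_fold):
    train, dev, test = [], [], []
    for i, ex in enumerate(examples):
        fold = i % total_fold
        if fold == dev_fold:
            dev.append(ex)
        elif fold == test_fold:
            test.append(ex)
        else:
            train.append(ex)
    return train, dev, test

train, dev, test = split_by_fold(range(10), total_fold=5, dev_fold=0, test_fold=1)
print len(train), len(dev), len(test)  # 6 2 2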
Example #3
def init_emb(_args):

    if _args.emb_dir != 'RANDOM':
        print 'started loading embeddings from file', _args.emb_dir

        #print 'global map size:', len(M_emb), len(_args.global_word_map)
        ## load pretrained embeddings
        #_args.global_word_map = _args.dicts['objs2idx']
        Q_emb, _ = read_matrix_from_file(_args.emb_dir, _args.dicts['words2idx'])
        _args.qemb_matrix = theano.shared(Q_emb, name='qemb_matrix')
        O_emb, _ = read_matrix_from_file(_args.emb_dir, _args.dicts['objs2idx'])
        _args.oemb_matrix = theano.shared(O_emb, name='oemb_matrix')
        _args.emb_dim = len(Q_emb[0])
        _args.emb_out_dim = _args.emb_dim
        _args.question_emb_out_dim = _args.emb_dim
        _args.object_emb_out_dim = _args.emb_dim
        _args.attention_out_dim = _args.question_lstm_out_dim*2 + _args.object_emb_out_dim
        if _args.fine_tuning:
            print 'fine tuning!!!!!'
            _args.qemb_matrix.is_trainable = True
            _args.oemb_matrix.is_trainable = True
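
read_matrix_from_file itself is not shown in any of these examples; all that its call sites require is a matrix whose rows line up with the given word-to-index dict (len(M_emb[0]) is used as the embedding dimension). A hypothetical stand-in with that contract (the real file format, OOV handling, and second return value may differ):

import numpy

# Illustrative stand-in for read_matrix_from_file, NOT the real implementation.
# Assumes one "word v1 v2 ..." line per pretrained vector; words missing from
# the file keep a small random initialization.
def read_matrix_from_file_sketch(fname, word2idx):
    pretrained = {}
    with open(fname) as f:
        for line in f:
            parts = line.rstrip().split()
            pretrained[parts[0]] = numpy.asarray(parts[1:], dtype='float64')
    dim = len(next(iter(pretrained.values())))
    M = numpy.random.uniform(-0.1, 0.1, (len(word2idx), dim))
    hits = 0  # how many rows were initialized from the file
    for word, idx in word2idx.iteritems():
        if word in pretrained:
            M[idx] = pretrained[word]
            hits += 1
    return M, hits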
Example #4
def train():
    ''' load vocabulary, datasets (train, dev, test) and the maximum degree '''
    num_entities = int(sys.argv[2])
    train_set, dev_set, dicts, max_degree, arc_type_dict = data_utils.read_RE_dataset_graph(DIR, 5, int(sys.argv[1]), num_entities)

    #train_set, dev_set, test_set = data['train'], data['dev'], data['test']
    print 'train', len(train_set)
    print 'dev', len(dev_set)
    #print 'test', len(test_set)

    num_emb = len(dicts['words2idx'])
    num_labels = len(dicts['labels2idx']) #5 if FINE_GRAINED else 3
    #for _, dataset in data.items():
    #    labels = [label for _, label in dataset]
    #    assert set(labels) <= set(xrange(num_labels)), set(labels)
    print 'num emb', num_emb
    print 'num labels', num_labels

    random.seed(SEED)
    np.random.seed(SEED)
    ''' Initialize the model '''
    model = get_model(num_emb, num_labels, max_degree, num_entities, len(arc_type_dict))
    print 'Finish initializing the model!!!'
    ''' initialize model embeddings to glove '''
    #embeddings = model.embeddings.get_value()
    #glove_vecs = np.load(os.path.join(GLOVE_DIR, 'glove.npy'))
    #glove_words = np.load(os.path.join(GLOVE_DIR, 'words.npy'))
    #glove_word2idx = dict((word, i) for i, word in enumerate(glove_words))
    #for i, word in enumerate(vocab.words):
    #for word, i in dicts['words2idx'].iteritems():
    #    if word in glove_word2idx:
    #        embeddings[i] = glove_vecs[glove_word2idx[word]]
    #glove_vecs, glove_words, glove_word2idx = [], [], []
    M_emb, _ = read_matrix_from_file(EMB_DIR, dicts['words2idx'])
    model.embeddings.set_value(M_emb)
    
    model.vocab = dict((v, k) for k,v in dicts['words2idx'].iteritems())
    print 'Finish loading embeddings!!!'
    for epoch in xrange(NUM_EPOCHS):
        print 'epoch', epoch
        tic = time.time() 
        avg_loss = train_dataset(model, train_set)
        print '\n>> Epoch completed in %.2f (sec) << avg loss: %.2f' % (time.time() - tic,  avg_loss)
        train_score = evaluate_dataset(model, train_set)
        print 'train score', train_score
        dev_score = evaluate_dataset(model, dev_set)
        print 'dev score', dev_score

    print 'finished training'
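
Unlike the other drivers, this one injects pretrained vectors into an already-built model by overwriting the shared embedding variable with set_value, and decodes predictions through an inverted words2idx map. Both moves in isolation (the vocabulary and matrix sizes here are made up):

import numpy
import theano

# Sketch: overwrite a shared embedding matrix in one shot, then invert
# a word->index dict for decoding. Sizes and words are illustrative.
word2idx = {'drug': 0, 'gene': 1, 'variant': 2}
embeddings = theano.shared(numpy.zeros((3, 4)), name='embeddings')

M_emb = numpy.random.rand(3, 4)  # stands in for read_matrix_from_file output
embeddings.set_value(M_emb)      # replaces the initial values wholesale

idx2word = dict((v, k) for k, v in word2idx.iteritems())
print idx2word[1]  # gene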
Example #5
def init_emb_multi(_args):
    _args.emb_matrices = []
    # initialize word embeddings
    if _args.emb_dir != 'RANDOM':
        print 'started loading embeddings from file', _args.emb_dir
        M_emb, _ = read_matrix_from_file(_args.emb_dir, _args.global_word_map)
        print 'global map size:', len(M_emb), len(_args.global_word_map)
        ## load pretrained embeddings
    else:
        print 'random initialize the embeddings!'
        M_emb = numpy.random.rand(len(_args.global_word_map)+2, _args.emb_out_dim)
    _args.emb_matrices.append(theano.shared(M_emb, name='wemb_matrix') )
    # add pos embeddings
    P_emb = numpy.random.rand(len(_args.dicts['poss2idx']), _args.pos_emb_dim)
    _args.emb_matrices.append(theano.shared(P_emb, name='pemb_matrix') )
    _args.emb_out_dim = M_emb.shape[1] + P_emb.shape[1]
    if _args.fine_tuning:
        print 'fine tuning!!!!!'
        for matrix in _args.emb_matrices:
            matrix.is_trainable = True
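
Here the word and POS tables stay separate and emb_out_dim becomes the sum of their widths, which implies the two lookups are concatenated per token downstream. A numpy-only sketch of that concatenation (all dimensions are illustrative):

import numpy

# Per-token concatenation of word and POS embeddings, matching
# emb_out_dim = M_emb.shape[1] + P_emb.shape[1] above. Sizes are made up.
word_emb = numpy.random.rand(100, 50)  # vocab_size x word_dim
pos_emb = numpy.random.rand(20, 10)    # num_pos_tags x pos_dim

word_ids = numpy.array([3, 7, 42])     # one id per token
pos_ids = numpy.array([1, 5, 2])
token_repr = numpy.concatenate([word_emb[word_ids], pos_emb[pos_ids]], axis=1)
print token_repr.shape  # (3, 60), i.e. (seq_len, word_dim + pos_dim)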
Example #6
def load_all_data_multitask(_args):
    # load 3 corpora
    _args.loaddata = load_data_cv
    dataSets = []
    dataset_map = dict()
    lr_arr = []
    _args.num_entity_d0 = 2
    arc_type_dict = dict()
    _args.prediction_files = [_args.drug_gene_prediction_file,
                              _args.drug_var_prediction_file,
                              _args.triple_prediction_file]
    dataSets.append(_args.loaddata(
        _args.drug_gene_dir, _args.total_fold, _args.dev_fold, _args.test_fold,
        arc_type_dict, _args.num_entity_d0, dep=_args.graph,
        content_fname=_args.content_file, dep_fname=_args.dependent_file,
        add=_args.add))
    dataset_map['drug_gene'] = len(dataset_map)
    lr_arr.append(_args.dg_lr)
    _args.num_entity_d1 = 2
    dataSets.append(_args.loaddata(
        _args.drug_variant_dir, _args.total_fold, _args.dev_fold, _args.test_fold,
        arc_type_dict, _args.num_entity_d1, dep=_args.graph,
        content_fname=_args.content_file, dep_fname=_args.dependent_file,
        add=_args.add))
    dataset_map['drug_variant'] = len(dataset_map)
    lr_arr.append(_args.dv_lr)
    _args.num_entity_d2 = 3
    dataSets.append(_args.loaddata(
        _args.drug_gene_variant_dir, _args.total_fold, _args.dev_fold, _args.test_fold,
        arc_type_dict, _args.num_entity_d2, dep=_args.graph,
        content_fname=_args.content_file, dep_fname=_args.dependent_file,
        add=_args.add))
    dataset_map['drug_gene_variant'] = len(dataset_map)
    lr_arr.append(_args.dgv_lr)
    # load embedding
    _args.global_word_map = dict()
    for ds in dataSets:
        _args.global_word_map = combine_word_dicts(_args.global_word_map, ds[-1]['words2idx']) 
    if _args.emb_dir != 'RANDOM':
        print 'started loading embeddings from file', _args.emb_dir        
        M_emb, _ = read_matrix_from_file(_args.emb_dir, _args.global_word_map) 
        print 'global map size:', len(M_emb), len(_args.global_word_map)
        ## load pretrained embeddings
        _args.emb_matrix = theano.shared(M_emb, name='emb_matrix')
        _args.emb_dim = len(M_emb[0])
        _args.wemb1_out_dim = _args.emb_dim
        if _args.fine_tuning:
            print 'fine tuning!!!!!'
            _args.emb_matrix.is_regularizable = True
    print 'loading data dataset map:', dataset_map
    return dataSets, lr_arr, dataset_map
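
combine_word_dicts merges the three corpora's vocabularies into one global word map so that a single embedding matrix can serve all tasks. A plausible minimal version (illustrative only; the real helper may assign indices differently):

# Hypothetical merge of word->index dicts into one shared index space.
def combine_word_dicts(base, other):
    merged = dict(base)
    for word in other:
        if word not in merged:
            merged[word] = len(merged)  # next free index
    return merged

d = combine_word_dicts({'drug': 0, 'gene': 1}, {'gene': 0, 'variant': 1})
print sorted(d.items())  # [('drug', 0), ('gene', 1), ('variant', 2)]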
Example #7
def main(_args):
    if _args.only_test:
        tester = Tester(_args)
        res_test, pred_test = tester.run(_args)
        exit(0)

    print "loading data from:", _args.training_data, _args.valid_data, _args.test_data
    train_set, valid_set, test_set, dic = loaddata(
        _args.training_data,
        _args.valid_data,
        _args.test_data,
        feature_thresh=_args.ner_feature_thresh,
        mode=_args.emb_type,
        test_label=_args.eval_test)  #, anno=SEG)
    _args.label2idx = dic['labels2idx']
    _args.word2idx = dic['words2idx']
    _args.feature2idx = dic['features2idx']
    _args.win_l = -(_args.win // 2)
    _args.win_r = _args.win // 2
    train_set, valid_set, test_set = batch_run_func(
        (train_set, valid_set, test_set), conv_data, _args.win_l, _args.win_r,
        len(_args.feature2idx), len(_args.label2idx))
    _args.wemb1_win = _args.win
    print _args.label2idx
    nclasses = len(_args.label2idx)
    nsentences = len(train_set[1])
    numpy.random.seed(_args.seed)
    random.seed(_args.seed)
    _args.y_dim = nclasses
    _args.vocsize = len(_args.word2idx)  #ufnum #vocsize
    _args.featsize = len(
        _args.feature2idx)  #+ 1  #!!!! Important: maybe should + 1
    _args.feature_size = _args.featsize + 1  #3
    _args.voc_size = _args.vocsize  #+ 2
    if _args.circuit == 'plainOrderOneCRF':
        _args.emission_trans_out_dim = nclasses
        _args.emb_output_transform_out_dim = nclasses
        _args.model = 'crf'
        print 'emission_trans_out_dim:', _args.emission_trans_out_dim
    else:
        raise NotImplementedError
    _args.nsentences = nsentences
    # eval all training and topology related parameters
    for a in TOPO_PARAM + TRAIN_PARAM:
        try:
            _args.__dict__[a] = eval(_args.__dict__[a])
        except:
            pass
    # This way we can inject code from command line.
    if _args.use_emb and _args.emb_init != 'RANDOM':
        M_emb, idx_map = read_matrix_from_file(_args.emb_file, _args.word2idx)
        emb_var = theano.shared(M_emb, name='emb_matrix')
        _args.emb_matrix = emb_var
        '''print 'printing ner embedding matrix:'
        for row in emb_var.get_value():
            for num in row:
                print num,
            print ''
        '''
        _args.emb_dim = len(M_emb[0])
        print 'embedding size:', _args.emb_dim
        _args.emb_matrix.is_regularizable = False
        if _args.fine_tuning:
            print 'fine tuning!!!!!'
            _args.emb_matrix.is_regularizable = True
    best_f1 = -numpy.inf
    param = dict(clr=_args.lr, ce=0, be=0)
    # Create Circuit
    (
        _args.f_cost,
        _args.f_update,
        _args.f_classify,  #_args.f_debug, 
        cargs) = create_circuit(_args, StackConfig)
    #params_to_save = {k:v for k,v in cargs.items() if (hasattr(v, 'is_regularizable') and v.is_regularizable and k.startswith('tparam'))}
    #print _args
    _args.idx2label = dict((k, v) for v, k in _args.label2idx.iteritems())
    _args.idx2word = dict((k, v) for v, k in _args.word2idx.iteritems())
    groundtruth_valid = convert_id_to_word(valid_set[2], _args.idx2label)
    groundtruth_test = None
    if _args.eval_test:
        groundtruth_test = convert_id_to_word(test_set[2], _args.idx2label)
    epoch_id = -1
    while epoch_id + 1 < _args.nepochs:
        epoch_id += 1
        #print 'train_f', train_set[0]
        train_seq(train_set[1], train_set[0], train_set[2], _args,
                  _args.f_cost, _args.f_update, epoch_id, param['clr'])
        # Train and Evaluate
        if epoch_id % _args.neval_epochs == 0:
            groundtruth_train = convert_id_to_word(train_set[2],
                                                   _args.idx2label)
            #print 'evaluate train!!!'
            res_train, pred_train = predict(train_set[0], train_set[1], _args,
                                            groundtruth_train)
            #print 'evaluate valid!!!'
            res_valid, pred_valid = predict(valid_set[0], valid_set[1], _args,
                                            groundtruth_valid)
            print(
                'TEST: epoch',
                epoch_id,
                'train F1',
                res_train['f1'],
                'valid F1',
                res_valid['f1'],
                #'test F1'   , res_test['f1']
            )
            if _args.eval_test:
                res_test, pred_test = predict(test_set[0], test_set[1], _args,
                                              groundtruth_test)
                print 'test F1', res_test['f1']
            # If this update created a 'new best' model then save it.
            if res_valid['f1'] > best_f1:
                best_f1 = res_valid['f1']
                param['be'] = epoch_id
                param['vf1'] = res_valid['f1']  #res_train['f1'], res_test['f1']
                param['vp'] = res_valid['p']  #res_train['p'], res_test['p']
                param['vr'] = res_valid['r']  #res_train['r'], res_test['r']
                if _args.eval_test:
                    param['tf1'] = (res_test['f1'])
                    param['tp'] = (res_test['p'])
                    param['tr'] = (res_test['r'])
                print "saving parameters!"
                cargs['f_classify'] = _args.f_classify
                save_parameters(_args.save_model_param, cargs)
                #error_analysis(valid_set[1], pred_valid, groundtruth_valid, _args.idx2word)
        else:
            pass
        # decay learning rate if no improvement for decay_epochs epochs
        if _args.decay and (epoch_id - param['be']) >= _args.decay_epochs and (
                epoch_id - param['be']) % _args.decay_epochs == 0:
            param['clr'] *= 0.5
        # If learning rate goes down to minimum then break.
        if param['clr'] < _args.minimum_lr:
            print "\nLearning rate became too small, breaking out of training"
            break

    print(
        'BEST RESULT: epoch',
        param['be'],
        'valid F1',
        param['vf1'],
        param['vp'],
        param['vr'],
        #'best test F1', param['tf1'], param['tp'], param['tr']
    )
    if _args.eval_test:
        print 'best test F1', param['tf1'], param['tp'], param['tr']
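
The decay rule above halves the learning rate every decay_epochs epochs without a new best validation F1 and stops training once the rate falls below minimum_lr. The same schedule in isolation (all parameter values are made up):

# The halving-with-patience schedule from the loop above, extracted.
def decayed_lr(lr, epoch_id, best_epoch, decay_epochs, minimum_lr):
    since_best = epoch_id - best_epoch
    if since_best >= decay_epochs and since_best % decay_epochs == 0:
        lr *= 0.5
    return lr, lr >= minimum_lr  # (new rate, keep training?)

lr = 0.1
for epoch in xrange(10):
    lr, keep = decayed_lr(lr, epoch, best_epoch=0, decay_epochs=3, minimum_lr=0.02)
    if not keep:
        print 'stopping at epoch', epoch, 'lr', lr  # stopping at epoch 9 lr 0.0125
        break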
Example #8
def main(_args):
    if _args.only_test:
        cargs = {}
        print "loading parameters!"
        load_params(_args.save_model_param, cargs)
        test_feat, test_lex_orig, test_y = get_data(
            _args.test_data, cargs['feature2idx'], cargs['word2idx'],
            cargs['label2idx'], cargs['emb_type'], anno=None,
            has_label=_args.eval_test)
        test_feat, test_lex, test_y = conv_data(test_feat, test_lex_orig,
                                                test_y, cargs['win'],
                                                cargs['vocsize'])
        idx2label = dict((k, v) for v, k in cargs['label2idx'].iteritems())
        idx2word = dict((k, v) for v, k in cargs['word2idx'].iteritems())
        groundtruth_test = None
        if _args.eval_test:
            groundtruth_test = convert_id_to_word(test_y, idx2label)
        original_text = convert_id_to_word(test_lex_orig, idx2word)
        f_classify = cargs['f_classify']
        res_test, pred_test = predict(test_feat, test_lex, idx2label, idx2word, _args, f_classify, groundtruth_test)
        write_prediction(_args.test_data+'.prediction', _args.output_dir, original_text, pred_test)
        exit(0)
    
    print "loading data from:", _args.training_data, _args.valid_data, _args.test_data
    train_set, valid_set, test_set, dicts = loaddata(_args.training_data, _args.valid_data, _args.test_data, feature_thresh=_args.ner_feature_thresh, mode=_args.emb_type, test_label=_args.eval_test) 
    train_feat, train_lex_orig, train_y = train_set 
    valid_feat, valid_lex_orig, valid_y = valid_set 
    test_feat, test_lex_orig, test_y = test_set 
    feature2idx = dicts['features2idx'] 
    word2idx = dicts['words2idx'] 
    label2idx = dicts['labels2idx']
    #idx2feature = dict((k, v) for v, k in feature2idx.iteritems())
    _args.label2idx = label2idx
    _args.word2idx = word2idx
    _args.feature2idx = feature2idx
    nclasses = len(label2idx)
    nsentences = len(train_lex_orig)
    numpy.random.seed(_args.seed)
    random.seed(_args.seed)
    _args.y_dim = nclasses
    _args.vocsize = len(feature2idx) #ufnum #vocsize
    _args.in_dim = _args.vocsize #+ 2
    if _args.circuit == 'plainOrderOneCRF':
        _args.emission_trans_out_dim = nclasses
    _args.nsentences = nsentences
    # eval all training and topology related parameters
    for a in TOPO_PARAM + TRAIN_PARAM:
        try:
            _args.__dict__[a] = eval(_args.__dict__[a])
        except:
            pass
    # This way we can inject code from command line.
    if _args.use_emb == 'true':
        M_emb, idx_map = read_matrix_from_file(_args.emb_file, word2idx)
        emb_var = theano.shared(M_emb, name='emb_matrix')
        _args.emb_matrix = emb_var
        _args.emb_dim = len(M_emb[0])
        print 'embedding size:', _args.emb_dim
        if _args.fine_tuning == 'true':
            print 'fine tuning!!!!!'
            _args.emb_matrix.is_regularizable = True
    train_feat, train_lex, train_y = conv_data(train_feat, train_lex_orig, train_y, _args.win, _args.vocsize)
    valid_feat, valid_lex, valid_y = conv_data(valid_feat, valid_lex_orig, valid_y, _args.win, _args.vocsize)
    test_feat, test_lex, test_y = conv_data(test_feat, test_lex_orig, test_y, _args.win, _args.vocsize)
    best_f1 = -numpy.inf
    param = dict(clr=_args.lr, ce=0, be=0)
    # Create Circuit
    (f_cost, f_update, f_classify, f_debug, cargs) = create_circuit(_args, StackConfig)
    #params_to_save = {k:v for k,v in cargs.items() if (hasattr(v, 'is_regularizable') and v.is_regularizable and k.startswith('tparam'))}
    #print _args
    idx2label = dict((k, v) for v, k in _args.label2idx.iteritems())
    idx2word = dict((k, v) for v, k in _args.word2idx.iteritems())
    groundtruth_valid = convert_id_to_word(valid_y, idx2label)
    groundtruth_test = None
    if _args.eval_test:
        groundtruth_test = convert_id_to_word(test_y, idx2label)
    epoch_id = -1
    while epoch_id + 1 < _args.nepochs:
        epoch_id += 1
        train(train_feat, train_lex, train_y, _args, f_cost, f_update, f_debug,
              epoch_id, param['clr'])
        # Train and Evaluate
        if epoch_id % _args.neval_epochs == 0:
            groundtruth_train = convert_id_to_word(train_y, idx2label)
            #print 'evaluate train!!!'
            res_train, pred_train = predict(train_feat, train_lex, idx2label,
                                            idx2word, _args, f_classify,
                                            groundtruth_train)
            #print 'evaluate valid!!!'
            res_valid, pred_valid = predict(valid_feat, valid_lex, idx2label,
                                            idx2word, _args, f_classify,
                                            groundtruth_valid)
            res_test, pred_test = predict(test_feat, test_lex, idx2label,
                                          idx2word, _args, f_classify,
                                          groundtruth_test)
            print('TEST: epoch', epoch_id,
                  'train F1', res_train['f1'],
                  'valid F1', res_valid['f1'],
                  #'test F1', res_test['f1']
                  )
            if _args.eval_test:
                print 'test F1', res_test['f1']
            # If this update created a 'new best' model then save it.
            if res_valid['f1'] > best_f1:
                best_f1 = res_valid['f1']
                param['be'] = epoch_id
                param['last_decay'] = epoch_id
                param['vf1'] = res_valid['f1']  #res_train['f1'], res_test['f1']
                param['vp'] = res_valid['p']  #res_train['p'], res_test['p']
                param['vr'] = res_valid['r']  #res_train['r'], res_test['r']
                if _args.eval_test:
                    param['tf1'] = res_test['f1']
                    param['tp'] = res_test['p']
                    param['tr'] = res_test['r']
                print "saving parameters!"
                cargs['f_classify'] = f_classify
                save_parameters(_args.save_model_param, cargs)
                '''
                print "loading parameters!"
                load_params(_args.save_model_param, cargs)
                res_test, pred_test = predict(test_feat, test_lex, idx2label, idx2word, _args, f_classify, groundtruth_test)
                print 'test F1:', res_test['f1']
                '''
            else:
                pass
        # decay learning rate if no improvement for decay_epochs epochs
        if _args.decay and (epoch_id - param['last_decay']) >= _args.decay_epochs: #and (epoch_id - param['be']) % _args.decay_epochs == 0:
            print 'learning rate decay at epoch', epoch_id
            param['last_decay'] = epoch_id
            param['clr'] *= 0.5
        # If learning rate goes down to minimum then break.
        if param['clr'] < _args.minimum_lr:
            print "\nLearning rate became too small, breaking out of training"
            break

    print('BEST RESULT: epoch', param['be'],
          'valid F1', param['vf1'], param['vp'], param['vr'],
          #'best test F1', param['tf1'], param['tp'], param['tr'] 
          )
    if _args.eval_test:
        print 'best test F1', param['tf1'], param['tp'], param['tr']
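
Both CRF drivers window their token sequences before training: Example #7 derives win_l = -(win // 2) and win_r = win // 2, so each token is expanded into a symmetric context window by conv_data. A toy sketch of that windowing; the edge handling here (clamping to the sequence ends) is an assumption, since the real conv_data may use a dedicated padding index instead:

# Hypothetical context windowing consistent with win_l/win_r above.
def context_windows(seq, win):
    win_l, win_r = -(win // 2), win // 2
    windows = []
    for i in range(len(seq)):
        # Clamp out-of-range positions to the ends (an assumption).
        row = [seq[min(max(i + off, 0), len(seq) - 1)]
               for off in range(win_l, win_r + 1)]
        windows.append(row)
    return windows

print context_windows([10, 11, 12, 13], win=3)
# [[10, 10, 11], [10, 11, 12], [11, 12, 13], [12, 13, 13]]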