    def build_model(self, pc, best_model_path):

        if best_model_path:
            print('Loading model from: {}'.format(best_model_path))
            self.RNN, self.VOCAB_LOOKUP, self.R, self.bias = dy.load(best_model_path, pc)
        else:
            # LSTM
            self.RNN  = dy.CoupledLSTMBuilder(self.hyperparams['LAYERS'], self.hyperparams['INPUT_DIM'], self.hyperparams['HIDDEN_DIM'], pc)
            
            # embedding lookups for vocabulary
            self.VOCAB_LOOKUP  = pc.add_lookup_parameters((self.hyperparams['VOCAB_SIZE'], self.hyperparams['INPUT_DIM']))

            # softmax parameters
            self.R = pc.add_parameters((self.hyperparams['VOCAB_SIZE'], self.hyperparams['HIDDEN_DIM']))
            self.bias = pc.add_parameters(self.hyperparams['VOCAB_SIZE'])
        
        
        print('Model dimensions:')
        print(' * VOCABULARY EMBEDDING LAYER: IN-DIM: {}, OUT-DIM: {}'.format(self.hyperparams['VOCAB_SIZE'], self.hyperparams['INPUT_DIM']))
        print()
        print(' * LSTM: IN-DIM: {}, OUT-DIM: {}'.format(self.hyperparams['INPUT_DIM'], self.hyperparams['HIDDEN_DIM']))
        print(' LSTM has {} layer(s)'.format(self.hyperparams['LAYERS']))
        print()
        print(' * SOFTMAX: IN-DIM: {}, OUT-DIM: {}'.format(self.hyperparams['HIDDEN_DIM'], self.hyperparams['VOCAB_SIZE']))
        print()
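None of the parameters created above are exercised in this snippet, so here is a minimal sketch of how such an LSTM language model is typically run for one step in DyNet. It is not the author's code: the function name, the `m` argument (an object on which build_model was called) and the loss choice are assumptions.

import dynet as dy

def lm_step_loss(m, context_ids, next_id):
    # assumed helper, not part of the original class: score the next word given a context
    dy.renew_cg()
    state = m.RNN.initial_state()
    for wid in context_ids:
        state = state.add_input(m.VOCAB_LOOKUP[wid])        # feed word embeddings into the LSTM
    scores = dy.parameter(m.R) * state.output() + dy.parameter(m.bias)
    return dy.pickneglogsoftmax(scores, next_id)            # negative log-likelihood of the next word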
    def __init__(self, params, source_alphabet_size, embedding_size, hidden_units,
                 stack_embedding_size):
        input_size = source_alphabet_size + 2
        output_size = source_alphabet_size + 1
        self.stack_embedding_size = stack_embedding_size
        self.input_embeddings = params.add_lookup_parameters(
            (input_size, embedding_size),
            name='input-embeddings')
        self.output_embeddings = params.add_lookup_parameters(
            (output_size, embedding_size),
            name='output-embeddings')
        self.controller = dy.CoupledLSTMBuilder(
            1, embedding_size + stack_embedding_size, hidden_units, params)
        # Intentionally set the gain for the sigmoid layers low, since this
        # seems to work better
        gain = 0.5
        self.pop_strength_layer = add_layer(
            params, hidden_units, 1, sigmoid,
            weights_initializer=dy.GlorotInitializer(False, gain=gain),
            # Initialize the pop bias to -1 to allow information to propagate
            # through the stack
            bias_initializer=dy.ConstInitializer(-1.0),
            name='pop-strength')
        self.push_strength_layer = add_layer(
            params, hidden_units, 1, sigmoid,
            weights_initializer=dy.GlorotInitializer(False, gain=gain),
            bias_initializer=dy.GlorotInitializer(False, gain=gain),
            name='push-strength')
        self.push_value_layer = add_layer(
            params, hidden_units, stack_embedding_size, tanh, name='push-value')
        self.output_layer = combine_layers([
            add_layer(params, hidden_units, hidden_units, tanh, name='output'),
            # This adds an extra affine layer between the tanh and the softmax
            add_layer(params, hidden_units, output_size, linear, name='softmax')
        ])
Example #3
    def _init_params(self):
        """
        Defines all model parameters.
        """

        self.model = dy.Model()
        self.trainer = dy.AdamTrainer(self.model)

        self.word_lookup = self.model.add_lookup_parameters(
            (self.num_words, self.word_embedding_size))
        # self.chars_lookup = self.model.add_lookup_parameters((self.num_chars, self.char_embedding_size))

        # word-level LSTMs
        self.word_lstm_input_size = self.word_embedding_size  # + 2 * self.char_embedding_size

        self.fwd_word_rnn = dy.CoupledLSTMBuilder(
            self.word_num_hidden_layers,  # number of layers
            self.word_lstm_input_size,  # input dimension
            self.word_hidden_output_size,  # output dimension
            self.model)
        self.bwd_word_rnn = dy.CoupledLSTMBuilder(self.word_num_hidden_layers,
                                                  self.word_lstm_input_size,
                                                  self.word_hidden_output_size,
                                                  self.model)

        # char-level LSTMs
        # self.fwd_char_rnn = dy.CoupledLSTMBuilder(self.char_num_hidden_layers,
        #                                          self.char_embedding_size,
        #                                          self.char_embedding_size,
        #                                          self.model)
        # self.bwd_char_rnn = dy.CoupledLSTMBuilder(self.char_num_hidden_layers,
        #                                          self.char_embedding_size,
        #                                          self.char_embedding_size,
        #                                          self.model)

        # set variational dropout
        if self.word_dropout:
            self.fwd_word_rnn.set_dropout(0.2)
            self.bwd_word_rnn.set_dropout(0.2)
        # if self._char_dropout:
        #    self.fwd_char_rnn.set_dropout(0.2)
        #    self.bwd_char_rnn.set_dropout(0.2)

        self.softmax_weight = self.model.add_parameters(
            (self.num_labels, self.word_hidden_output_size * 2))
        self.softmax_bias = self.model.add_parameters((self.num_labels, ))
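As a complement, a minimal sketch of how the two word-level LSTMs and the softmax parameters defined in _init_params are typically combined to score one sentence. This is not part of the original class; the helper name and the `m` argument are assumptions.

import dynet as dy

def tag_scores(m, word_ids):
    # assumed helper: m is an instance on which _init_params() above was called
    dy.renew_cg()
    embs = [m.word_lookup[w] for w in word_ids]
    fwd = m.fwd_word_rnn.initial_state().transduce(embs)        # left-to-right states
    bwd = m.bwd_word_rnn.initial_state().transduce(embs[::-1])  # right-to-left states
    W = dy.parameter(m.softmax_weight)
    b = dy.parameter(m.softmax_bias)
    return [W * dy.concatenate([f, bk]) + b for f, bk in zip(fwd, bwd[::-1])]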
Example #4
    def build_model(self, pc, best_model_path):
        
        if best_model_path:
            print('Loading model from: {}'.format(best_model_path))
            self.fbuffRNN, self.bbuffRNN, self.VOCAB_LOOKUP, self.decoder, self.R, self.bias, self.W_c, self.W__a, self.U__a, self.v__a = dy.load(best_model_path, pc)
        else:
            # BiLSTM for input
            self.fbuffRNN  = dy.CoupledLSTMBuilder(self.hyperparams['LAYERS'], self.hyperparams['INPUT_DIM'], self.hyperparams['HIDDEN_DIM'], pc)
            self.bbuffRNN  = dy.CoupledLSTMBuilder(self.hyperparams['LAYERS'], self.hyperparams['INPUT_DIM'], self.hyperparams['HIDDEN_DIM'], pc)
            
            # embedding lookups for vocabulary
            self.VOCAB_LOOKUP  = pc.add_lookup_parameters((self.hyperparams['VOCAB_SIZE'], self.hyperparams['INPUT_DIM']))

            # decoder LSTM
            self.decoder = dy.CoupledLSTMBuilder(self.hyperparams['LAYERS'], self.hyperparams['INPUT_DIM'], self.hyperparams['HIDDEN_DIM'], pc)

            # softmax parameters
            self.R = pc.add_parameters((self.hyperparams['VOCAB_SIZE'], 3 * self.hyperparams['HIDDEN_DIM']))
            self.bias = pc.add_parameters(self.hyperparams['VOCAB_SIZE'])
            
            # attention MLPs - Luong-style with extra v_a from Bahdanau
            
            # concatenation layer for h (hidden dim), c (2 * hidden_dim)
            self.W_c = pc.add_parameters((3 * self.hyperparams['HIDDEN_DIM'], 3 * self.hyperparams['HIDDEN_DIM']))
            
            # attention MLPs - Bahdanau-style
            # applied to the decoder state h_output (hidden_dim)
            self.W__a = pc.add_parameters((self.hyperparams['HIDDEN_DIM'], self.hyperparams['HIDDEN_DIM']))
            
            # applied to the encoder biLSTM states h_input (2 * hidden_dim)
            self.U__a = pc.add_parameters((self.hyperparams['HIDDEN_DIM'], 2 * self.hyperparams['HIDDEN_DIM']))
            
            # maps the combined attention vector (hidden_dim) to a scalar score
            self.v__a = pc.add_parameters((1, self.hyperparams['HIDDEN_DIM']))
        
        
        print('Model dimensions:')
        print(' * VOCABULARY EMBEDDING LAYER: IN-DIM: {}, OUT-DIM: {}'.format(self.hyperparams['VOCAB_SIZE'], self.hyperparams['INPUT_DIM']))
        print()
        print(' * ENCODER biLSTM: IN-DIM: {}, OUT-DIM: {}'.format(2*self.hyperparams['INPUT_DIM'], 2*self.hyperparams['HIDDEN_DIM']))
        print(' * DECODER LSTM: IN-DIM: {}, OUT-DIM: {}'.format(self.hyperparams['INPUT_DIM'], self.hyperparams['HIDDEN_DIM']))
        print(' All LSTMs have {} layer(s)'.format(self.hyperparams['LAYERS']))
        print()
        print(' * SOFTMAX: IN-DIM: {}, OUT-DIM: {}'.format(self.hyperparams['HIDDEN_DIM'], self.hyperparams['VOCAB_SIZE']))
        print()
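For reference, a minimal sketch of the attention step these parameters imply (Bahdanau-style scoring with W__a, U__a, v__a; Luong-style output with W_c). This is not the author's code: the function name, the `m` argument and the exact wiring are assumptions.

import dynet as dy

def attend_and_score(m, enc_states, h_dec):
    # assumed helper: enc_states are biLSTM states of size 2*HIDDEN_DIM, h_dec is a HIDDEN_DIM decoder state
    W_a, U_a, v_a = dy.parameter(m.W__a), dy.parameter(m.U__a), dy.parameter(m.v__a)
    scores = dy.concatenate([v_a * dy.tanh(W_a * h_dec + U_a * h_j) for h_j in enc_states])
    alphas = dy.softmax(scores)                                   # attention weights
    context = dy.concatenate_cols(enc_states) * alphas            # (2*HIDDEN_DIM,)
    h_tilde = dy.tanh(dy.parameter(m.W_c) * dy.concatenate([h_dec, context]))  # (3*HIDDEN_DIM,)
    return dy.parameter(m.R) * h_tilde + dy.parameter(m.bias)     # vocabulary scores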
Example #5
    def initialize_graph(self, num_words=None, num_chars=None):
        """
        build graph and link to parameters
        """
        num_words = num_words if num_words is not None else len(self.w2i)
        num_chars = num_chars if num_chars is not None else len(self.c2i)
        if num_words == 0 or num_chars == 0:
            raise ValueError('Word2id and char2id have to be loaded before '
                             'initializing the graph.')
        print('Initializing the graph...')

        # initialize the word embeddings and the parameters
        self.cembeds = None
        if self.embeds_file:
            print("loading embeddings", file=sys.stderr)
            embeddings, emb_dim = load_embeddings_file(self.embeds_file)
            assert (emb_dim == self.in_dim)
            num_words = len(
                set(embeddings.keys()).union(set(
                    self.w2i.keys())))  # initialize all with embeddings
            # init model parameters and initialize them
            self.wembeds = self.model.add_lookup_parameters(
                (num_words, self.in_dim),
                init=dynet.ConstInitializer(0.01),
                name="wembeds".encode("utf-8"))

            if self.c_in_dim > 0:
                self.cembeds = self.model.add_lookup_parameters(
                    (num_chars, self.c_in_dim),
                    init=dynet.ConstInitializer(0.01),
                    name="cembeds".encode("utf-8"))

            init = 0
            l = len(embeddings.keys())
            for word in embeddings.keys():
                # for words already in w2i, update their vector; otherwise add them to w2i (data is kept as integer IDs)
                if word in self.w2i:
                    self.wembeds.init_row(self.w2i[word], embeddings[word])
                else:
                    self.w2i[word] = len(self.w2i.keys())  # add new word
                    self.wembeds.init_row(self.w2i[word], embeddings[word])
                init += 1
            print("initialized: {}".format(init), file=sys.stderr)

        else:
            self.wembeds = self.model.add_lookup_parameters(
                (num_words, self.in_dim),
                init=dynet.ConstInitializer(0.01),
                name="wembeds".encode("utf-8"))
            if self.c_in_dim > 0:
                self.cembeds = self.model.add_lookup_parameters(
                    (num_chars, self.c_in_dim),
                    init=dynet.ConstInitializer(0.01),
                    name="cembeds".encode("utf-8"))

        # build the stacked biLSTM layers; the number of layers is given by the h_layers parameter
        layers = []  # inner layers

        for layer_num in range(0, self.h_layers):

            if layer_num == 0:
                if self.c_in_dim > 0:
                    f_builder = dynet.CoupledLSTMBuilder(
                        1, self.in_dim + self.c_in_dim * 2, self.h_dim,
                        self.model)  # in_dim: size of each layer
                    b_builder = dynet.CoupledLSTMBuilder(
                        1, self.in_dim + self.c_in_dim * 2, self.h_dim,
                        self.model)
                else:
                    f_builder = dynet.CoupledLSTMBuilder(
                        1, self.in_dim, self.h_dim, self.model)
                    b_builder = dynet.CoupledLSTMBuilder(
                        1, self.in_dim, self.h_dim, self.model)
                layers.append(BiRNNSequencePredictor(
                    f_builder,
                    b_builder))  #returns forward and backward sequence
            else:
                # add inner layers (if h_layers >1)
                f_builder = dynet.LSTMBuilder(1, self.h_dim, self.h_dim,
                                              self.model)
                b_builder = dynet.LSTMBuilder(1, self.h_dim, self.h_dim,
                                              self.model)
                layers.append(BiRNNSequencePredictor(f_builder, b_builder))

        # store at which layer to predict task
        task_num_labels = len(self.tag2idx)
        output_layer = FFSequencePredictor(
            Layer(self.model, self.h_dim * 2, task_num_labels, dynet.softmax))

        if self.c_in_dim > 0:
            self.char_rnn = BiRNNSequencePredictor(
                dynet.CoupledLSTMBuilder(1, self.c_in_dim, self.c_in_dim,
                                         self.model),
                dynet.CoupledLSTMBuilder(1, self.c_in_dim, self.c_in_dim,
                                         self.model))
        else:
            self.char_rnn = None

        self.predictors = dict()
        self.predictors["inner"] = layers
        self.predictors["output_layers_dict"] = output_layer
        self.predictors["task_expected_at"] = self.h_layers
Example #6
    def build_computation_graph(self, num_words, num_chars):
        """
        build graph and link to parameters
        """
        # initialize the word embeddings and the parameters
        cembeds = None
        if self.embeds_file:
            print("loading embeddings", file=sys.stderr)
            embeddings, emb_dim = load_embeddings_file(self.embeds_file)
            assert(emb_dim==self.in_dim)
            num_words=len(set(embeddings.keys()).union(set(self.w2i.keys()))) # initialize all with embeddings
            # init model parameters and initialize them
            wembeds = self.model.add_lookup_parameters((num_words, self.in_dim),init=dynet.ConstInitializer(0.01))

            if self.c_in_dim > 0:
                cembeds = self.model.add_lookup_parameters((num_chars, self.c_in_dim),init=dynet.ConstInitializer(0.01))
               
            init=0
            l = len(embeddings.keys())
            for word in embeddings.keys():
                # for words already in w2i, update their vector; otherwise add them to w2i (data is kept as integer IDs)
                if word in self.w2i:
                    wembeds.init_row(self.w2i[word], embeddings[word])
                else:
                    self.w2i[word]=len(self.w2i.keys()) # add new word
                    wembeds.init_row(self.w2i[word], embeddings[word])
                init+=1
            print("initialized: {}".format(init), file=sys.stderr)

        else:
            wembeds = self.model.add_lookup_parameters((num_words, self.in_dim),init=dynet.ConstInitializer(0.01))
            if self.c_in_dim > 0:
                cembeds = self.model.add_lookup_parameters((num_chars, self.c_in_dim),init=dynet.ConstInitializer(0.01))

        # build the stacked biLSTM layers; the number of layers is given by the h_layers parameter
        layers = [] # inner layers

        for layer_num in range(0,self.h_layers):

            if layer_num == 0:
                if self.c_in_dim > 0:
                    f_builder = dynet.CoupledLSTMBuilder(1, self.in_dim+self.c_in_dim*2, self.h_dim, self.model) # in_dim: size of each layer
                    b_builder = dynet.CoupledLSTMBuilder(1, self.in_dim+self.c_in_dim*2, self.h_dim, self.model) 
                else:
                    f_builder = dynet.CoupledLSTMBuilder(1, self.in_dim, self.h_dim, self.model)
                    b_builder = dynet.CoupledLSTMBuilder(1, self.in_dim, self.h_dim, self.model)
                layers.append(BiRNNSequencePredictor(f_builder, b_builder)) #returns forward and backward sequence
            else:
                # add inner layers (if h_layers >1)
                f_builder = dynet.LSTMBuilder(1, self.h_dim, self.h_dim, self.model)
                b_builder = dynet.LSTMBuilder(1, self.h_dim, self.h_dim, self.model)
                layers.append(BiRNNSequencePredictor(f_builder,b_builder))

        # store at which layer to predict task
        task_num_labels = len(self.tag2idx)
        output_layer = FFSequencePredictor(Layer(self.model, self.h_dim*2, task_num_labels, dynet.softmax))

        if self.c_in_dim > 0:
            char_rnn = BiRNNSequencePredictor(dynet.CoupledLSTMBuilder(1, self.c_in_dim, self.c_in_dim, self.model), dynet.CoupledLSTMBuilder(1, self.c_in_dim, self.c_in_dim, self.model))
        else:
            char_rnn = None

        predictors = {}
        predictors["inner"] = layers
        predictors["output_layers_dict"] = output_layer
        predictors["task_expected_at"] = self.h_layers

        return predictors, char_rnn, wembeds, cembeds
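A minimal sketch of how the returned lookup tables and the char biLSTM builders are typically combined into a word representation when c_in_dim > 0. This helper and its arguments are assumptions, not part of the original code.

import dynet

def word_representation(wembeds, cembeds, char_fwd_builder, char_bwd_builder, w_idx, char_ids):
    # assumed helper: concatenate the word embedding with the final forward/backward character states
    char_embs = [cembeds[c] for c in char_ids]
    f_states = char_fwd_builder.initial_state().transduce(char_embs)
    b_states = char_bwd_builder.initial_state().transduce(char_embs[::-1])
    # result has size in_dim + 2*c_in_dim, matching the input size of the first biLSTM layer above
    return dynet.concatenate([wembeds[w_idx], f_states[-1], b_states[-1]])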
Example #7
    def setUp(self):
        # create model
        self.m = dy.ParameterCollection()
        self.rnn = dy.CoupledLSTMBuilder(2, 10, 10, self.m)
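A possible companion test that exercises the builder created in setUp by transducing a short dummy sequence. It is not part of the original test case and assumes the usual `import dynet as dy` of the test module.

    def test_transduce(self):
        # not in the original test case: transduce four 10-dimensional dummy inputs
        dy.renew_cg()
        inputs = [dy.inputVector([0.5] * 10) for _ in range(4)]
        outputs = self.rnn.initial_state().transduce(inputs)
        self.assertEqual(len(outputs), 4)
        self.assertEqual(outputs[0].dim()[0], (10,))   # hidden size is 10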
Example #8
    def add_parameters(self,
                       dropout,
                       lstm_size,
                       optimizer,
                       model_type,
                       include_embeddings,
                       gru=True):

        if model_type == "gru":
            self.encoder_rnn = dy.GRUBuilder(NUM_LAYERS, EMBEDDING_SIZE,
                                             lstm_size, self.model)
            self.encoder_rnn.set_dropout(dropout)
            self.encoder_rnn2 = dy.GRUBuilder(NUM_LAYERS, EMBEDDING_SIZE,
                                              lstm_size, self.model)
            self.encoder_rnn2.set_dropout(dropout)
            self.decoder_rnn = dy.GRUBuilder(NUM_LAYERS,
                                             EMBEDDING_SIZE + lstm_size,
                                             lstm_size, self.model)
            self.decoder_rnn.set_dropout(dropout)
        else:

            self.encoder_rnn = dy.CoupledLSTMBuilder(NUM_LAYERS,
                                                     EMBEDDING_SIZE, lstm_size,
                                                     self.model)
            self.encoder_rnn.set_dropout(dropout)
            self.encoder_rnn2 = dy.CoupledLSTMBuilder(NUM_LAYERS,
                                                      EMBEDDING_SIZE,
                                                      lstm_size, self.model)
            self.encoder_rnn2.set_dropout(dropout)
            self.decoder_rnn = dy.CoupledLSTMBuilder(
                NUM_LAYERS, EMBEDDING_SIZE + lstm_size, lstm_size, self.model)
            self.decoder_rnn.set_dropout(dropout)

        global DROPOUT
        DROPOUT = dropout

        self.W1 = self.model.add_parameters((200, lstm_size))
        self.b1 = self.model.add_parameters((200, 1))
        self.W2 = self.model.add_parameters((100, 200))
        self.b2 = self.model.add_parameters((100, 1))
        self.W3 = self.model.add_parameters((len(self.C2I), 100))
        self.b3 = self.model.add_parameters((len(self.C2I), 1))
        self.W_query = self.model.add_parameters((lstm_size, lstm_size))
        self.W_key = self.model.add_parameters((lstm_size, lstm_size))
        self.W_val = self.model.add_parameters((lstm_size, lstm_size))
        self.W_att = self.model.add_parameters((1, EMBEDDING_SIZE))
        self.W_c_s = self.model.add_parameters((lstm_size, EMBEDDING_SIZE))
        self.W_direct = self.model.add_parameters((len(self.C2I), lstm_size))
        self.b_att = self.model.add_parameters((lstm_size, 1))
        self.b_direct = self.model.add_parameters((len(self.C2I), 1))
        self.E_lang = self.model.add_lookup_parameters((7, EMBEDDING_SIZE))
        self.latin_semantic_rep = {}

        #self.W_latin_embeddings = self.model.add_lookup_parameters((EMBEDDING_SIZE, 100))
        #self.W_latin_embeddings2 = self.model.add_lookup_parameters((EMBEDDING_SIZE, EMBEDDING_SIZE))

        if optimizer == "sgd":
            self.trainer = dy.SimpleSGDTrainer(self.model)
        elif optimizer == "rms":
            self.trainer = dy.RMSPropTrainer(self.model)
        if optimizer == "cyclic":
            self.trainer = dy.CyclicalSGDTrainer(self.model)
        elif optimizer == "adam":
            self.trainer = dy.AdamTrainer(self.model)
        else:
            self.trainer = dy.AdagradTrainer(self.model)
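The W_query/W_key/W_val projections above suggest dot-product attention over the encoder states. The sketch below shows that pattern only; it is not the author's decoding code, and the method name and arguments are assumptions.

    def attention_context(self, encoded, decoder_state):
        # assumed helper: encoded is a list of encoder output vectors (lstm_size), decoder_state is an lstm_size vector
        keys = dy.concatenate_cols([dy.parameter(self.W_key) * h for h in encoded])
        values = dy.concatenate_cols([dy.parameter(self.W_val) * h for h in encoded])
        query = dy.parameter(self.W_query) * decoder_state
        weights = dy.softmax(dy.transpose(keys) * query)   # one weight per encoder position
        return values * weights                            # weighted sum of projected states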
Example #9
def process(options,args):
    """Do the processing..."""
    # sorry for the ugly global variables... it's research code...
    global train, dev, train_labelset_goldtags , dev_labelset_goldtags, test1, test2
    global vw, vt, vc, UNK, nwords, ntags, nchars, model, trainer
    global WORDS_LOOKUP, CHARS_LOOKUP, p_t1, pH,pO, fwdRNN,bwdRNN,cFwdRNN,cBwdRNN

    global DEVSET_EVAL_INTERVAL, SKIP_NON_RELEVANT, PATIENCE, BEST_DEV_F1, CHARACTER_THRESHOLD, BEST_MODEL, DUMMYTAGSET
    global CHAR_EMBEDDING_SIZE, WORD_EMBEDDING_SIZE, HIDDEN_OUTPUT_SIZE, STOP_LABELSET_EVAL_F1, NBR_OF_CLASSES, CLASSDIST
    global general_stats, dev_confusion


    MAX_EPOCHS = options.max_epochs
    DEVSET_EVAL_INTERVAL = 5000
    SKIP_NON_RELEVANT = False

    PATIENCE = 5
    BEST_DEV_F1 = 0.0
    MINIMUM_DEV_F1_SCORE = options.minimum_dev_f1_score
    BEST_DEV_F1_SCORE = 0
    CHARACTER_THRESHOLD = 5
    BEST_MODEL = None
    DUMMYTAGSET = set(['O'])

    CHAR_EMBEDDING_SIZE = 20
    WORD_EMBEDDING_SIZE = 64  # must be even number
    HIDDEN_OUTPUT_SIZE = 64  # must be even number

    STOP_LABELSET_EVAL_F1 = 0.7100

    NBR_OF_CLASSES = 20

    CLASSDIST = {
        "Allgemein": 14066,
        "Zugfahrt": 3583,
        "Sonstige_Unregelm\u00e4ssigkeiten": 3361,
        "Atmosph\u00e4re": 2135,
        "Sicherheit": 1140,
        "Ticketkauf": 1005,
        "Service_und_Kundenbetreuung": 670,
        "DB_App_und_Website": 570,
        "Informationen": 491,
        "Connectivity": 441,
        "Auslastung_und_Platzangebot": 431,
        "Komfort_und_Ausstattung": 214,
        "Gastronomisches_Angebot": 131,
        "Barrierefreiheit": 103,
        "Image": 93,
        "Reisen_mit_Kindern": 68,
        "Design": 60,
        "Toiletten": 56,
        "Gep\u00e4ck": 16
    }

    # some general stats on the data
    general_stats = defaultdict(Counter)

    dev_confusion = Counter()

    # format of files: each line is "word1/tag1 word2/tag2 ..."
    train_file = options.train_file
    dev_file = options.dev_file
    test1_file = options.test1_file
    test2_file = options.test2_file




    train = list(read(train_file))
    print('#TRAINING SET SEQUENCE SIZE', len(train), file=sys.stderr)
    print('#TRAINING: NUMBER OF CLASSES', len(CLASSDIST), file=sys.stderr)

    train_labelset_goldtags = seqlabel2labelset(train)

    dev = list(read(dev_file, dataset="DEV"))
    print('#DEV SET SEQUENCE SIZE', len(dev), file=sys.stderr)

    dev_labelset_goldtags = seqlabel2labelset(dev)

    output_dataset(dev, filename='gold' + '__' + 'devset' + '.tsv', meta={})

    test1 = list(read(test1_file, dataset="TEST"))
    print('#TEST1 SET SEQUENCE SIZE', len(test1), file=sys.stderr)

    output_dataset(test1, filename='gold' + '__' + 'test1set' + '.tsv', meta={})

    test2 = list(read(test2_file, dataset="TEST"))
    print('#TEST2 SET SEQUENCE SIZE', len(test2), file=sys.stderr)
    global words, tags, wc
    output_dataset(test2, filename='gold' + '__' + 'test2set' + '.tsv', meta={})
    words = []
    tags = []
    chars_counter = Counter()
    wc = Counter()

    for sent in train:
        for w, p in sent:
            words.append(w)
            tags.append(p)
            chars_counter.update(w)
            wc[w] += 1
    words.append("_UNK_")
    words.append("__D__")  # Dummy words for sentences without an explicit cateogorie

    for c in chars_counter.keys():
        if chars_counter[c] < CHARACTER_THRESHOLD:
            del chars_counter[c]

    chars = set(chars_counter)
    chars.add("<*>")

    vw = Vocab.from_corpus([words])
    vt = Vocab.from_corpus([tags])
    vc = Vocab.from_corpus([chars])

    UNK = vw.w2i["_UNK_"]

    nwords = vw.size()
    ntags = vt.size()
    nchars = vc.size()
    print('# NUMBER OF DIFFERENT WORDS', nwords, file=sys.stderr)
    print('# NUMBER OF DIFFERENT TAGS', ntags, vt.w2i, file=sys.stderr)
    print('# NUMBER OF DIFFERENT CHARACTERS', nchars, vc.w2i, file=sys.stderr)

    for statistics in general_stats:
        print('#STATISTICS', file=sys.stderr)
        for k, c in general_stats[statistics].most_common():
            print("%s\t%s\t%d" % (statistics, k, c), file=sys.stderr)

    # DyNet starts here
    model = dy.Model()
    trainer = dy.AdamTrainer(model)
    #trainer = dy.AdadeltaTrainer(model)

    WORDS_LOOKUP = model.add_lookup_parameters((nwords, WORD_EMBEDDING_SIZE))
    CHARS_LOOKUP = model.add_lookup_parameters((nchars, CHAR_EMBEDDING_SIZE))
    p_t1 = model.add_lookup_parameters((ntags, NBR_OF_CLASSES))

    # MLP on top of biLSTM outputs 2*HIDDEN_OUTPUT_SIZE -> HIDDEN_OUTPUT_SIZE -> ntags
    pH = model.add_parameters((HIDDEN_OUTPUT_SIZE, HIDDEN_OUTPUT_SIZE * 2))
    pO = model.add_parameters((ntags, HIDDEN_OUTPUT_SIZE))

    # word-level LSTMs
    fwdRNN = dy.CoupledLSTMBuilder(2, WORD_EMBEDDING_SIZE, HIDDEN_OUTPUT_SIZE, model)  # layers, in-dim, out-dim, model
    bwdRNN = dy.CoupledLSTMBuilder(2, WORD_EMBEDDING_SIZE, HIDDEN_OUTPUT_SIZE, model)

    # char-level LSTMs
    cFwdRNN = dy.CoupledLSTMBuilder(2, CHAR_EMBEDDING_SIZE, WORD_EMBEDDING_SIZE // 2, model)
    cBwdRNN = dy.CoupledLSTMBuilder(2, CHAR_EMBEDDING_SIZE, WORD_EMBEDDING_SIZE // 2, model)

    num_tagged = cum_loss = 0
    sample_iter_count = best_sample_iter_count = 0
    for ITER in range(MAX_EPOCHS):
        random.shuffle(train)
        for i,s in enumerate(train,1):
            sample_iter_count += 1
            best_sample_iter_count += 1
            if i > 0 and i % (DEVSET_EVAL_INTERVAL // 2) == 0:   # print status
                #trainer.status()
                print('AVERAGE LOSS: %.4f' % (cum_loss / num_tagged), file=sys.stderr)
                cum_loss = num_tagged = 0

            if i % DEVSET_EVAL_INTERVAL == 0 or i == len(train)-1: # eval on dev
                dev_system = tag_dataset(dev,'DEV')
                dev_labelset_eval_dict = eval_dataset(dev_system, dev, 'DEV')
                dev_labelset_eval_dict['ITERATION'] = i
                if dev_labelset_eval_dict['F'] >  MINIMUM_DEV_F1_SCORE  and  dev_labelset_eval_dict['F'] > BEST_DEV_F1_SCORE:
                    BEST_DEV_F1_SCORE = dev_labelset_eval_dict['F']
                    PATIENCE = 5
                    output_dataset(dev_system, filename=options.model_identifier +'__'+'devset' + '.tsv',meta=dev_labelset_eval_dict)
                    system_test1 = tag_dataset(test1, 'TEST')
                    system_test2 = tag_dataset(test2, 'TEST')
                    output_dataset(system_test1, filename=options.model_identifier +'__'+'testset1_' + '.tsv', meta=dev_labelset_eval_dict)
                    output_dataset(system_test2, filename=options.model_identifier +'__'+'testset2_' + '.tsv', meta=dev_labelset_eval_dict)
                if BEST_DEV_F1_SCORE > 0.0:
                    if PATIENCE > 0:
                        PATIENCE -= 1
                    else:
                        exit(0)
            # train on sent
            words = [w for w,t in s]
            golds = [t for w,t in s]

            loss_exp =  sent_loss(words, golds)
            cum_loss += loss_exp.scalar_value()
            num_tagged += len(golds)
            loss_exp.backward()
            trainer.update()
        print >> sys.stderr, "epoch %r finished" % ITER
Example #10
    def __init__(self,
                 characters_vocab,
                 tag_vocab,
                 LSTM_NUM_OF_LAYERS=1,
                 EMBEDDINGS_SIZE=32,
                 STATE_SIZE=100,
                 ATTENTION_SIZE=100,
                 MINIBATCH_SIZE=1,
                 COPY_WEIGHT=0.8,
                 DROPOUT_PROB=0.2,
                 EOS="<EOS>",
                 NULL="<NULL>",
                 MAX_PREDICTION_LEN_DEF=20,
                 LENGTH_NORM_WEIGHT=0.1,
                 USE_ATT_REG=False,
                 USE_TAG_ATT_REG=False,
                 PREDICT_LANG=False):
        self.model = dy.Model()

        self.characters = characters_vocab
        self.tags = tag_vocab
        self.int2char = list(self.characters)
        self.char2int = {c: i for i, c in enumerate(self.characters)}

        self.int2tag = list(self.tags)
        self.tag2int = {c: i for i, c in enumerate(self.tags)}

        self.VOCAB_SIZE = len(self.characters)
        self.TAG_VOCAB_SIZE = len(self.tags)

        self.LSTM_NUM_OF_LAYERS = LSTM_NUM_OF_LAYERS
        self.EMBEDDINGS_SIZE = EMBEDDINGS_SIZE
        self.STATE_SIZE = STATE_SIZE
        self.ATTENTION_SIZE = ATTENTION_SIZE
        self.MINIBATCH_SIZE = MINIBATCH_SIZE
        self.COPY_WEIGHT = COPY_WEIGHT
        self.DROPOUT_PROB = DROPOUT_PROB
        self.MAX_PREDICTION_LEN_DEF = MAX_PREDICTION_LEN_DEF
        self.LENGTH_NORM_WEIGHT = LENGTH_NORM_WEIGHT
        self.USE_ATT_REG = USE_ATT_REG
        self.USE_TAG_ATT_REG = USE_TAG_ATT_REG
        self.PREDICT_LANG = PREDICT_LANG

        self.EOS = EOS
        self.NULL = NULL

        self.enc_fwd_lstm = dy.CoupledLSTMBuilder(self.LSTM_NUM_OF_LAYERS,
                                                  self.EMBEDDINGS_SIZE,
                                                  self.STATE_SIZE, self.model)
        self.enc_bwd_lstm = dy.CoupledLSTMBuilder(self.LSTM_NUM_OF_LAYERS,
                                                  self.EMBEDDINGS_SIZE,
                                                  self.STATE_SIZE, self.model)

        self.dec_lstm = dy.CoupledLSTMBuilder(
            self.LSTM_NUM_OF_LAYERS,
            self.STATE_SIZE * 3 + self.EMBEDDINGS_SIZE, self.STATE_SIZE,
            self.model)

        self.input_lookup = self.model.add_lookup_parameters(
            (self.VOCAB_SIZE, self.EMBEDDINGS_SIZE))
        self.tag_input_lookup = self.model.add_lookup_parameters(
            (self.TAG_VOCAB_SIZE, self.EMBEDDINGS_SIZE))
        self.attention_w1 = self.model.add_parameters(
            (self.ATTENTION_SIZE, self.STATE_SIZE * 2))
        self.attention_w2 = self.model.add_parameters(
            (self.ATTENTION_SIZE,
             self.STATE_SIZE * self.LSTM_NUM_OF_LAYERS * 2))
        self.attention_w3 = self.model.add_parameters((self.ATTENTION_SIZE, 5))
        self.attention_v = self.model.add_parameters((1, self.ATTENTION_SIZE))

        self.decoder_w = self.model.add_parameters(
            (self.VOCAB_SIZE, self.STATE_SIZE))
        self.decoder_b = self.model.add_parameters((self.VOCAB_SIZE))
        #output_lookup = model.add_lookup_parameters((VOCAB_SIZE, EMBEDDINGS_SIZE))
        self.output_lookup = self.input_lookup

        self.enc_tag_lstm = dy.CoupledLSTMBuilder(self.LSTM_NUM_OF_LAYERS,
                                                  self.EMBEDDINGS_SIZE,
                                                  self.STATE_SIZE, self.model)
        self.tag_attention_w1 = self.model.add_parameters(
            (self.ATTENTION_SIZE, self.STATE_SIZE))
        self.tag_attention_w2 = self.model.add_parameters(
            (self.ATTENTION_SIZE,
             self.STATE_SIZE * self.LSTM_NUM_OF_LAYERS * 2))
        self.tag_attention_v = self.model.add_parameters(
            (1, self.ATTENTION_SIZE))
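To round off the example, a minimal sketch of the usual encoding step for this kind of model: embed the input characters, run the forward and backward encoder LSTMs, and concatenate their states position by position. The method name is an assumption and the code is not part of the original __init__.

    def encode(self, char_ids):
        # assumed helper; expects `import dynet as dy` as in the surrounding module
        embs = [self.input_lookup[c] for c in char_ids]
        fwd = self.enc_fwd_lstm.initial_state().transduce(embs)
        bwd = self.enc_bwd_lstm.initial_state().transduce(embs[::-1])
        # one 2*STATE_SIZE vector per input position
        return [dy.concatenate([f, b]) for f, b in zip(fwd, bwd[::-1])]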