Example no. 1
    def build_computation_graph(self, num_words, num_chars):
        """
        build graph and link to parameters
        self.predictors, self.char_rnn, self.wembeds, self.cembeds =
        """
        ## initialize word embeddings
        if self.embeds_file:
            print("loading embeddings")
            embeddings, emb_dim = load_embeddings_file(self.embeds_file)
            assert(emb_dim==self.in_dim)
            num_words=len(set(embeddings.keys()).union(set(self.w2i.keys()))) # initialize all with embeddings
            # init model parameters and initialize them
            self.wembeds = self.model.add_lookup_parameters((num_words, self.in_dim), init=self.initializer)

            init = 0
            for word in embeddings.keys():
                if word not in self.w2i:
                    self.w2i[word] = len(self.w2i.keys()) # add new word
                    self.wembeds.init_row(self.w2i[word], embeddings[word])
                    init += 1
                else:
                    self.wembeds.init_row(self.w2i[word], embeddings[word])
                    init += 1
            print("initialized: {}".format(init))
            del embeddings # clean up
        else:
            self.wembeds = self.model.add_lookup_parameters((num_words, self.in_dim), init=self.initializer)

        ## initialize character embeddings
        self.cembeds = None
        if self.c_in_dim > 0:
            self.cembeds = self.model.add_lookup_parameters((num_chars, self.c_in_dim), init=self.initializer)
        if self.lex_dim > 0 and self.embed_lex:
            # +1 for UNK property
            self.lembeds = self.model.add_lookup_parameters((len(self.dictionary_values)+1, self.lex_dim), init=dynet.GlorotInitializer()) #init=self.initializer)

        # make it more flexible to add number of layers as specified by parameter
        layers = [] # inner layers
        output_layers_dict = {}   # from task_id to actual softmax predictor
        for layer_num in range(0,self.h_layers):
            if layer_num == 0:
                if self.c_in_dim > 0:
                    # in_dim: size of each layer
                    if self.lex_dim > 0 and self.embed_lex:
                        lex_embed_size = self.lex_dim * len(self.dictionary_values)
                        f_builder = self.builder(1, self.in_dim+self.c_h_dim*2+lex_embed_size, self.h_dim, self.model)
                        b_builder = self.builder(1, self.in_dim+self.c_h_dim*2+lex_embed_size, self.h_dim, self.model)
                    else:
                        f_builder = self.builder(1, self.in_dim + self.c_h_dim * 2 + self.lex_dim, self.h_dim, self.model)
                        b_builder = self.builder(1, self.in_dim + self.c_h_dim * 2 + self.lex_dim, self.h_dim, self.model)
                else:
                    f_builder = self.builder(1, self.in_dim+self.lex_dim, self.h_dim, self.model)
                    b_builder = self.builder(1, self.in_dim+self.lex_dim, self.h_dim, self.model)

                layers.append(BiRNNSequencePredictor(f_builder, b_builder)) #returns forward and backward sequence
            else:
                # add inner layers (if h_layers >1)
                f_builder = self.builder(1, self.h_dim, self.h_dim, self.model)
                b_builder = self.builder(1, self.h_dim, self.h_dim, self.model)
                layers.append(BiRNNSequencePredictor(f_builder, b_builder))

        # store at which layer to predict task
        task2layer = {task_id: out_layer for task_id, out_layer in zip(self.task2tag2idx, self.pred_layer)}
        if len(task2layer) > 1:
            print("task2layer", task2layer)
        for task_id in task2layer:
            task_num_labels= len(self.task2tag2idx[task_id])
            if not self.crf:
                output_layers_dict[task_id] = FFSequencePredictor(self.task2tag2idx[task_id], Layer(self.model, self.h_dim*2, task_num_labels,
                                                                                                    dynet.softmax, mlp=self.mlp, mlp_activation=self.activation_mlp))
            else:
                print("CRF")
                output_layers_dict[task_id] = CRFSequencePredictor(self.model, task_num_labels,
                                                                   self.task2tag2idx[task_id],
                                                                   Layer(self.model, self.h_dim * 2, task_num_labels,
                                                                        None, mlp=self.mlp,
                                                                        mlp_activation=self.activation_mlp), viterbi_loss=self.viterbi_loss)

        self.char_rnn = BiRNNSequencePredictor(self.builder(1, self.c_in_dim, self.c_h_dim, self.model),
                                          self.builder(1, self.c_in_dim, self.c_h_dim, self.model))

        self.predictors = {}
        self.predictors["inner"] = layers
        self.predictors["output_layers_dict"] = output_layers_dict
        self.predictors["task_expected_at"] = task2layer
Example no. 2
    def build_computation_graph(self, num_words, num_chars):
        """
        build graph and link to parameters
        """
        # initialize the word embeddings and the parameters
        cembeds = None
        if self.embeds_file:
            print("loading embeddings", file=sys.stderr)
            embeddings, emb_dim = load_embeddings_file(self.embeds_file)
            assert(emb_dim==self.in_dim)
            num_words=len(set(embeddings.keys()).union(set(self.w2i.keys()))) # initialize all with embeddings
            # init model parameters and initialize them
            wembeds = self.model.add_lookup_parameters((num_words, self.in_dim),init=dynet.ConstInitializer(0.01))

            if self.c_in_dim > 0:
                cembeds = self.model.add_lookup_parameters((num_chars, self.c_in_dim),init=dynet.ConstInitializer(0.01))
               
            init = 0
            for word in embeddings.keys():
                # for words already in w2i, update the vector; otherwise add them to w2i (data is kept as integer indices)
                if word in self.w2i:
                    wembeds.init_row(self.w2i[word], embeddings[word])
                else:
                    self.w2i[word]=len(self.w2i.keys()) # add new word
                    wembeds.init_row(self.w2i[word], embeddings[word])
                init+=1
            print("initialized: {}".format(init), file=sys.stderr)

        else:
            wembeds = self.model.add_lookup_parameters((num_words, self.in_dim),init=dynet.ConstInitializer(0.01))
            if self.c_in_dim > 0:
                cembeds = self.model.add_lookup_parameters((num_chars, self.c_in_dim),init=dynet.ConstInitializer(0.01))

        #make it more flexible to add number of layers as specified by parameter
        layers = [] # inner layers

        for layer_num in range(0,self.h_layers):

            if layer_num == 0:
                if self.c_in_dim > 0:
                    f_builder = dynet.CoupledLSTMBuilder(1, self.in_dim+self.c_in_dim*2, self.h_dim, self.model) # in_dim: size of each layer
                    b_builder = dynet.CoupledLSTMBuilder(1, self.in_dim+self.c_in_dim*2, self.h_dim, self.model) 
                else:
                    f_builder = dynet.CoupledLSTMBuilder(1, self.in_dim, self.h_dim, self.model)
                    b_builder = dynet.CoupledLSTMBuilder(1, self.in_dim, self.h_dim, self.model)
                layers.append(BiRNNSequencePredictor(f_builder, b_builder)) #returns forward and backward sequence
            else:
                # add inner layers (if h_layers >1)
                f_builder = dynet.LSTMBuilder(1, self.h_dim, self.h_dim, self.model)
                b_builder = dynet.LSTMBuilder(1, self.h_dim, self.h_dim, self.model)
                layers.append(BiRNNSequencePredictor(f_builder,b_builder))

        # store at which layer to predict task
        task_num_labels = len(self.tag2idx)
        output_layer = FFSequencePredictor(Layer(self.model, self.h_dim*2, task_num_labels, dynet.softmax))

        if self.c_in_dim > 0:
            char_rnn = BiRNNSequencePredictor(dynet.CoupledLSTMBuilder(1, self.c_in_dim, self.c_in_dim, self.model), dynet.CoupledLSTMBuilder(1, self.c_in_dim, self.c_in_dim, self.model))
        else:
            char_rnn = None

        predictors = {}
        predictors["inner"] = layers
        predictors["output_layers_dict"] = output_layer
        predictors["task_expected_at"] = self.h_layers

        return predictors, char_rnn, wembeds, cembeds
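
For context, a hedged sketch of how a caller would consume the tuple returned above (the attribute names match the other examples in this file):

        # hedged sketch, inside the tagger's fit():
        self.predictors, self.char_rnn, self.wembeds, self.cembeds = \
            self.build_computation_graph(len(self.w2i), len(self.c2i))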
Example no. 3
    def build_computation_graph(self, num_words, num_chars):
        """
        build graph and link to parameters
        """
        # initialize the word embeddings and the parameters
        if self.embeds_file:
            print("loading embeddings", file=sys.stderr)
            embeddings, emb_dim = load_embeddings_file(self.embeds_file, lower=self.lower)
            assert(emb_dim==self.in_dim)
            num_words=len(set(embeddings.keys()).union(set(self.w2i.keys()))) # initialize all with embeddings
            # init model parameters and initialize them
            wembeds = self.model.add_lookup_parameters((num_words, self.in_dim))
            cembeds = self.model.add_lookup_parameters((num_chars, self.c_in_dim))
               
            init = 0
            for word in embeddings.keys():
                # for words already in w2i, update the vector; otherwise add them to w2i (data is kept as integer indices)
                if word in self.w2i:
                    wembeds.init_row(self.w2i[word], embeddings[word])
                else:
                    self.w2i[word]=len(self.w2i.keys()) # add new word
                    wembeds.init_row(self.w2i[word], embeddings[word])
                init+=1
            print("initialized: {}".format(init), file=sys.stderr)

        else:
            wembeds = self.model.add_lookup_parameters((num_words, self.in_dim))
            cembeds = self.model.add_lookup_parameters((num_chars, self.c_in_dim))
               

        #make it more flexible to add number of layers as specified by parameter
        layers = [] # inner layers
        output_layers_dict = {}   # from task_id to actual softmax predictor
        task_expected_at = {} # map task_id => output_layer_#

        # connect output layers to tasks
        for output_layer, task_id in zip(self.pred_layer, self.tasks_ids):
            if output_layer > self.h_layers:
                raise ValueError("cannot have a task at a layer which is beyond the model, increase h_layers")
            task_expected_at[task_id] = output_layer

        print("task expected at", task_expected_at, file=sys.stderr)

        nb_tasks = len(self.tasks_ids)

        print("h_layers:", self.h_layers, file=sys.stderr)
        for layer_num in range(0,self.h_layers):
            print(">>>", layer_num, "layer_num") 

            if layer_num == 0:
                builder = dynet.LSTMBuilder(1, self.in_dim+self.c_in_dim*2, self.h_dim, self.model) # in_dim: size of each layer
                layers.append(BiRNNSequencePredictor(builder)) #returns forward and backward sequence
            else:
                # add inner layers (if h_layers >1)
                builder = dynet.LSTMBuilder(1, self.h_dim, self.h_dim, self.model)
                layers.append(BiRNNSequencePredictor(builder))

        # store at which layer to predict task
        for task_id in self.tasks_ids:
            task_num_labels = len(self.task2tag2idx[task_id])
            output_layers_dict[task_id] = FFSequencePredictor(Layer(self.model, self.h_dim*2, task_num_labels, dynet.softmax))

        sys.stderr.write('# Output layers: ' + str(len(output_layers_dict)) + '\n')

        char_rnn = RNNSequencePredictor(dynet.LSTMBuilder(1, self.c_in_dim, self.c_in_dim, self.model))

        predictors = {}
        predictors["inner"] = layers
        predictors["output_layers_dict"] = output_layers_dict
        predictors["task_expected_at"] = task_expected_at

        return predictors, char_rnn, wembeds, cembeds
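
The zip over self.pred_layer and self.tasks_ids above rejects configurations that place a task beyond the last layer; a minimal illustration with hypothetical values:

    pred_layer = [2]  # task0 expected at layer 2
    h_layers = 1      # but only one stacked BiLSTM layer exists
    # task_expected_at["task0"] = 2 > h_layers -> ValueError is raised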
Example no. 4
class NNTagger(object):

    # turn dynamic allocation off by defining slots
    __slots__ = ['w2i', 'c2i', 'wcount', 'ccount','wtotal','ctotal','w2c_cache','w_dropout_rate','c_dropout_rate',
                  'task2tag2idx', 'model', 'in_dim', 'c_in_dim', 'c_h_dim','h_dim', 'activation',
                 'noise_sigma', 'pred_layer', 'mlp', 'activation_mlp', 'backprob_embeds', 'initializer',
                 'h_layers', 'predictors', 'wembeds', 'cembeds', 'embeds_file', 'char_rnn', 'trainer',
                 'builder', 'crf', 'viterbi_loss', 'mimickx_model_path', 'mimickx_model',
                 'dictionary',  'dictionary_values', 'path_to_dictionary', 'lex_dim', 'type_constraint',
                 'embed_lex', 'l2i', 'lembeds']

    def __init__(self,in_dim,h_dim,c_in_dim,c_h_dim,h_layers,pred_layer,learning_algo="sgd", learning_rate=0,
                 embeds_file=None,activation=ACTIVATION_MAP["tanh"],mlp=0,activation_mlp=ACTIVATION_MAP["rectify"],
                 backprob_embeds=True,noise_sigma=0.1, w_dropout_rate=0.25, c_dropout_rate=0.25,
                 initializer=INITIALIZER_MAP["glorot"], builder=BUILDERS["lstmc"], crf=False, viterbi_loss=False,
                 mimickx_model_path=None, dictionary=None, type_constraint=False,
                 lex_dim=0, embed_lex=False):
        self.w2i = {}  # word to index mapping
        self.c2i = {}  # char to index mapping
        self.w2c_cache = {} # word to char index cache for frequent words
        self.wcount = None # word count
        self.ccount = None # char count
        self.task2tag2idx = {} # need one dictionary per task
        self.pred_layer = [int(layer) for layer in pred_layer] # at which layer to predict each task
        self.model = dynet.ParameterCollection() #init model
        self.in_dim = in_dim
        self.h_dim = h_dim
        self.c_in_dim = c_in_dim
        self.c_h_dim = c_h_dim
        self.w_dropout_rate = w_dropout_rate
        self.c_dropout_rate = c_dropout_rate
        self.activation = activation
        self.mlp = mlp
        self.activation_mlp = activation_mlp
        self.noise_sigma = noise_sigma
        self.h_layers = h_layers
        self.predictors = {"inner": [], "output_layers_dict": {}, "task_expected_at": {} } # the inner layers and predictors
        self.wembeds = None # lookup: embeddings for words
        self.cembeds = None # lookup: embeddings for characters
        self.lembeds = None # lookup: embeddings for lexical features (optional)
        self.embeds_file = embeds_file
        trainer_algo = TRAINER_MAP[learning_algo]
        if learning_rate > 0:
            ### TODO: better handling of additional learning-specific parameters
            self.trainer = trainer_algo(self.model, learning_rate=learning_rate)
        else:
            # using default learning rate
            self.trainer = trainer_algo(self.model)
        self.backprob_embeds = backprob_embeds
        self.initializer = initializer
        self.char_rnn = None # biRNN for character input
        self.builder = builder # default biRNN is an LSTM
        self.crf = crf
        self.viterbi_loss = viterbi_loss
        self.mimickx_model_path = mimickx_model_path
        if mimickx_model_path: # load
            self.mimickx_model = load_model(mimickx_model_path)
        self.dictionary = None
        self.type_constraint = type_constraint
        self.embed_lex = False
        self.l2i = {UNK: 0}  # lex feature to index mapping
        if dictionary:
            self.dictionary, self.dictionary_values = load_dict(dictionary)
            self.path_to_dictionary = dictionary
            if type_constraint:
                self.lex_dim = 0
            else:
                if embed_lex:
                    self.lex_dim = lex_dim
                    self.embed_lex = True
                    print("Embed lexical features")
                    # register property indices
                    for prop in self.dictionary_values:
                        self.l2i[prop] = len(self.l2i)
                else:
                    self.lex_dim = len(self.dictionary_values) #n-hot encoding
                print("Lex_dim: {}".format(self.lex_dim), file=sys.stderr)
        else:
            self.dictionary = None
            self.path_to_dictionary = None
            self.lex_dim = 0
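
    # Hedged usage sketch (hypothetical dimensions; ACTIVATION_MAP,
    # INITIALIZER_MAP, BUILDERS and TRAINER_MAP are the module-level maps
    # referenced in __init__):
    #
    #     tagger = NNTagger(in_dim=64, h_dim=100, c_in_dim=100, c_h_dim=100,
    #                       h_layers=1, pred_layer=["1"], learning_algo="sgd")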

    def fit(self, train, num_iterations, dev=None, model_path=None, patience=0, minibatch_size=0, log_losses=False):
        """
        train the tagger
        """
        losses_log = {} # log losses

        print("init parameters")
        self.init_parameters(train)

        # init lookup parameters and define graph
        print("build graph")
        self.build_computation_graph(len(self.w2i),  len(self.c2i))

        update_embeds = True
        if not self.backprob_embeds: ## disable backprop into embeds
            print(">>> disable wembeds update <<<")
            update_embeds = False
            
        best_val_acc, epochs_no_improvement = 0.0, 0

        if dev and model_path is not None and patience > 0:
            print('Using early stopping with patience of {}...'.format(patience))

        batch = []
        print("train..")
        for iteration in range(num_iterations):

            total_loss=0.0
            total_tagged=0.0

            indices = [i for i in range(len(train.seqs))]
            random.shuffle(indices)

            loss_accum_loss = defaultdict(float)
            loss_accum_tagged = defaultdict(float)

            for idx in indices:
                seq = train.seqs[idx]

                if seq.task_id not in losses_log:
                    losses_log[seq.task_id] = [] #initialize

                if minibatch_size > 1:
                    # accumulate instances for minibatch update
                    loss1 = self.predict(seq, train=True, update_embeds=update_embeds)
                    total_tagged += len(seq.words)
                    batch.append(loss1)
                    if len(batch) == minibatch_size:
                        loss = dynet.esum(batch)
                        total_loss += loss.value()

                        # logging
                        loss_accum_tagged[seq.task_id] += len(seq.words)
                        loss_accum_loss[seq.task_id] += loss.value()

                        loss.backward()
                        self.trainer.update()
                        dynet.renew_cg()  # use new computational graph for each BATCH when batching is active
                        batch = []
                else:
                    dynet.renew_cg() # new graph per item
                    loss1 = self.predict(seq, train=True, update_embeds=update_embeds)
                    total_tagged += len(seq.words)
                    lv = loss1.value()
                    total_loss += lv

                    # logging
                    loss_accum_tagged[seq.task_id] += len(seq.words)
                    loss_accum_loss[seq.task_id] += loss1.value()

                    loss1.backward()
                    self.trainer.update()

            print("iter {2} {0:>12}: {1:.2f}".format("total loss", total_loss/total_tagged, iteration))

            # log losses
            for task_id in sorted(losses_log):
                losses_log[task_id].append(loss_accum_loss[task_id] / loss_accum_tagged[task_id])

            if log_losses:
                dill.dump(losses_log, open(model_path + ".model" + ".losses.pickle", "wb"))

            if dev:
                # evaluate after every epoch
                correct, total = self.evaluate(dev, "task0")
                val_accuracy = correct/total
                print("dev accuracy: {0:.4f}".format(val_accuracy))

                if val_accuracy > best_val_acc:
                    print('Accuracy {0:.4f} is better than best val accuracy '
                          '{1:.4f}.'.format(val_accuracy, best_val_acc))
                    best_val_acc = val_accuracy
                    epochs_no_improvement = 0
                    save(self, model_path)
                else:
                    print('Accuracy {0:.4f} is worse than best val accuracy {1:.4f}.'.format(val_accuracy, best_val_acc))
                    epochs_no_improvement += 1

                if patience > 0:
                    if epochs_no_improvement == patience:
                        print('No improvement for {} epochs. Early stopping...'.format(epochs_no_improvement))
                        break
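
    # Hedged usage sketch for fit() (train/dev are assumed corpus objects
    # exposing .seqs, as iterated above; paths and values are hypothetical):
    #
    #     tagger.fit(train, num_iterations=30, dev=dev, model_path="en-tagger",
    #                patience=2, minibatch_size=16)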

    def set_indices(self, w2i, c2i, task2t2i, w2c_cache, l2i=None):
        """ helper function for loading model"""
        for task_id in task2t2i:
            self.task2tag2idx[task_id] = task2t2i[task_id]
        self.w2i = w2i
        self.c2i = c2i
        self.w2c_cache = w2c_cache
        self.l2i = l2i

    def set_counts(self, wcount, wtotal, ccount, ctotal):
        """ helper function for loading model"""
        self.wcount = wcount
        self.wtotal = wtotal
        self.ccount = ccount
        self.ctotal = ctotal

    def build_computation_graph(self, num_words, num_chars):
        """
        build graph and link to parameters
        self.predictors, self.char_rnn, self.wembeds, self.cembeds =
        """
        ## initialize word embeddings
        if self.embeds_file:
            print("loading embeddings")
            embeddings, emb_dim = load_embeddings_file(self.embeds_file)
            assert(emb_dim==self.in_dim)
            num_words=len(set(embeddings.keys()).union(set(self.w2i.keys()))) # initialize all with embeddings
            # init model parameters and initialize them
            self.wembeds = self.model.add_lookup_parameters((num_words, self.in_dim), init=self.initializer)

            init = 0
            for word in embeddings.keys():
                if word not in self.w2i:
                    self.w2i[word] = len(self.w2i.keys()) # add new word
                    self.wembeds.init_row(self.w2i[word], embeddings[word])
                    init += 1
                else:
                    self.wembeds.init_row(self.w2i[word], embeddings[word])
                    init += 1
            print("initialized: {}".format(init))
            del embeddings # clean up
        else:
            self.wembeds = self.model.add_lookup_parameters((num_words, self.in_dim), init=self.initializer)

        ## initialize character embeddings
        self.cembeds = None
        if self.c_in_dim > 0:
            self.cembeds = self.model.add_lookup_parameters((num_chars, self.c_in_dim), init=self.initializer)
        if self.lex_dim > 0 and self.embed_lex:
            # +1 for UNK property
            self.lembeds = self.model.add_lookup_parameters((len(self.dictionary_values)+1, self.lex_dim), init=dynet.GlorotInitializer()) #init=self.initializer)

        # make it more flexible to add number of layers as specified by parameter
        layers = [] # inner layers
        output_layers_dict = {}   # from task_id to actual softmax predictor
        for layer_num in range(0,self.h_layers):
            if layer_num == 0:
                if self.c_in_dim > 0:
                    # in_dim: size of each layer
                    if self.lex_dim > 0 and self.embed_lex:
                        lex_embed_size = self.lex_dim * len(self.dictionary_values)
                        f_builder = self.builder(1, self.in_dim+self.c_h_dim*2+lex_embed_size, self.h_dim, self.model)
                        b_builder = self.builder(1, self.in_dim+self.c_h_dim*2+lex_embed_size, self.h_dim, self.model)
                    else:
                        f_builder = self.builder(1, self.in_dim + self.c_h_dim * 2 + self.lex_dim, self.h_dim, self.model)
                        b_builder = self.builder(1, self.in_dim + self.c_h_dim * 2 + self.lex_dim, self.h_dim, self.model)
                else:
                    f_builder = self.builder(1, self.in_dim+self.lex_dim, self.h_dim, self.model)
                    b_builder = self.builder(1, self.in_dim+self.lex_dim, self.h_dim, self.model)

                layers.append(BiRNNSequencePredictor(f_builder, b_builder)) #returns forward and backward sequence
            else:
                # add inner layers (if h_layers >1)
                f_builder = self.builder(1, self.h_dim, self.h_dim, self.model)
                b_builder = self.builder(1, self.h_dim, self.h_dim, self.model)
                layers.append(BiRNNSequencePredictor(f_builder, b_builder))

        # store at which layer to predict task
        task2layer = {task_id: out_layer for task_id, out_layer in zip(self.task2tag2idx, self.pred_layer)}
        if len(task2layer) > 1:
            print("task2layer", task2layer)
        for task_id in task2layer:
            task_num_labels= len(self.task2tag2idx[task_id])
            if not self.crf:
                output_layers_dict[task_id] = FFSequencePredictor(self.task2tag2idx[task_id], Layer(self.model, self.h_dim*2, task_num_labels,
                                                                                                    dynet.softmax, mlp=self.mlp, mlp_activation=self.activation_mlp))
            else:
                print("CRF")
                output_layers_dict[task_id] = CRFSequencePredictor(self.model, task_num_labels,
                                                                   self.task2tag2idx[task_id],
                                                                   Layer(self.model, self.h_dim * 2, task_num_labels,
                                                                        None, mlp=self.mlp,
                                                                        mlp_activation=self.activation_mlp), viterbi_loss=self.viterbi_loss)

        self.char_rnn = BiRNNSequencePredictor(self.builder(1, self.c_in_dim, self.c_h_dim, self.model),
                                          self.builder(1, self.c_in_dim, self.c_h_dim, self.model))

        self.predictors = {}
        self.predictors["inner"] = layers
        self.predictors["output_layers_dict"] = output_layers_dict
        self.predictors["task_expected_at"] = task2layer

        
    def get_features(self, words, train=False, update=True):
        """
        get feature representations
        """
        # word embeddings
        wfeatures = np.array([self.get_w_repr(word, train=train, update=update) for word in words])

        lex_features = []
        if self.dictionary and not self.type_constraint:
            ## add lexicon features
            lex_features = np.array([self.get_lex_repr(word) for word in words])
        # char embeddings
        if self.c_in_dim > 0:
            cfeatures = [self.get_c_repr(word, train=train) for word in words]
            if len(lex_features) > 0:
                lex_features = dynet.inputTensor(lex_features)
                features = [dynet.concatenate([w,c,l]) for w,c,l in zip(wfeatures,cfeatures,lex_features)]
            else:
                features = [dynet.concatenate([w, c]) for w, c in zip(wfeatures, cfeatures)]
        else:
            features = wfeatures
        if train: # only do at training time
            features = [dynet.noise(fe,self.noise_sigma) for fe in features]
        return features

    def predict(self, seq, train=False, output_confidences=False, unk_tag=None, update_embeds=True):
        """
        predict tags for a sentence represented as char+word embeddings and compute losses for this instance
        """
        if not train:
            dynet.renew_cg()
        features = self.get_features(seq.words, train=train, update=update_embeds)

        output_expected_at_layer = self.predictors["task_expected_at"][seq.task_id] - 1

        # go through layers
        # input is now combination of w + char emb
        prev = features
        prev_rev = features
        num_layers = self.h_layers

        for i in range(0,num_layers):
            predictor = self.predictors["inner"][i]
            forward_sequence, backward_sequence = predictor.predict_sequence(prev, prev_rev)        
            if i > 0 and self.activation:
                # activation between LSTM layers
                forward_sequence = [self.activation(s) for s in forward_sequence]
                backward_sequence = [self.activation(s) for s in backward_sequence]

            if i == output_expected_at_layer:
                output_predictor = self.predictors["output_layers_dict"][seq.task_id]
                concat_layer = [dynet.concatenate([f, b]) for f, b in zip(forward_sequence,reversed(backward_sequence))]

                if train and self.noise_sigma > 0.0:
                    concat_layer = [dynet.noise(fe,self.noise_sigma) for fe in concat_layer]
                # fill-in predictions and get loss per tag
                losses = output_predictor.predict_sequence(seq, concat_layer,
                                                           train=train, output_confidences=output_confidences,
                                                           unk_tag=unk_tag, dictionary=self.dictionary,
                                                           type_constraint=self.type_constraint)

            prev = forward_sequence
            prev_rev = backward_sequence 

        if train:
            # return losses
            return losses
        else:
            return seq.pred_tags, seq.tag_confidences

    def output_preds(self, seq, raw=False, output_confidences=False):
        """
        output predictions to a file
        """
        i = 0
        for w, g, p in zip(seq.words, seq.tags, seq.pred_tags):
            if raw:
                if output_confidences:
                    print(u"{0}\t{1}\t{2:.2f}".format(w, p, seq.tag_confidences[i]))
                else:
                    print(u"{}\t{}".format(w, p))  # do not print DUMMY tag when --raw is on
            else:
                if output_confidences:
                    print(u"{0}\t{1}\t{2}\t{3:.2f}".format(w, g, p, seq.tag_confidences[i]))
                else:
                    print(u"{}\t{}\t{}".format(w, g, p))
            i += 1
        print("")

    def evaluate(self, test_file, task_id, output_predictions=None, raw=False, output_confidences=False, unk_tag=None):
        """
        compute accuracy on a test file, optionally output to file
        """
        correct = 0
        total = 0

        for seq in test_file:
            if seq.task_id != task_id:
                continue # we evaluate only on a specific task
            self.predict(seq, output_confidences=output_confidences, unk_tag=unk_tag)
            if output_predictions:
                self.output_preds(seq, raw=raw, output_confidences=output_confidences)
            correct_inst, total_inst = seq.evaluate()
            correct+=correct_inst
            total+= total_inst
        return correct, total
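
    # Usage mirrors the call inside fit() (hedged sketch):
    #
    #     correct, total = tagger.evaluate(dev, "task0")
    #     print("dev accuracy: {0:.4f}".format(correct / total))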

    def get_w_repr(self, word, train=False, update=True):
        """
        Get representation of word (word embedding)
        """
        if train:
            if self.w_dropout_rate > 0.0:
                w_id = self.w2i[UNK] if drop(word, self.wcount, self.w_dropout_rate) else self.w2i.get(word, self.w2i[UNK])
            else:
                w_id = self.w2i.get(word, self.w2i[UNK])
        else:
            if self.mimickx_model_path: # if given use MIMICKX
                if word not in self.w2i: #
                    #print("predict with MIMICKX for: ", word)
                    return dynet.inputVector(self.mimickx_model.predict(word).npvalue())
            w_id = self.w2i.get(word, self.w2i[UNK])
        if not update:
            return dynet.nobackprop(self.wembeds[w_id])
        else:
            return self.wembeds[w_id] 
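
    # Note: drop() is imported from elsewhere in this module; a plausible
    # frequency-dependent definition (an assumption, not this module's actual
    # code) drops item x with probability rate / (count(x) + rate):
    #
    #     def drop(x, counts, rate):
    #         return random.random() > counts[x] / (counts[x] + rate)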

    def get_c_repr(self, word, train=False):
        """
        Get representation of word via characters sub-LSTMs
        """
        # get representation for words
        if word in self.w2c_cache:
            chars_of_token = self.w2c_cache[word]
            if train:
                chars_of_token = [drop(c, self.ccount, self.c_dropout_rate) for c in chars_of_token]
        else:
            chars_of_token = array.array('I',[self.c2i[WORD_START]]) + array.array('I',[self.get_c_idx(c, train=train) for c in word]) + array.array('I',[self.c2i[WORD_END]])

        char_feats = [self.cembeds[c_id] for c_id in chars_of_token]
        # use last state as word representation
        f_char, b_char = self.char_rnn.predict_sequence(char_feats, char_feats)
        return dynet.concatenate([f_char[-1], b_char[-1]])

    def get_c_idx(self, c, train=False):
        """ helper function to get index of character"""
        if self.c_dropout_rate > 0.0 and train and drop(c, self.ccount, self.c_dropout_rate):
            return self.c2i.get(UNK)
        else:
            return self.c2i.get(c, self.c2i[UNK])

    def get_lex_repr(self, word):
        """
        Get representation for lexical feature
        """
        if not self.embed_lex: ## n-hot representation
            n_hot = np.zeros(len(self.dictionary_values))
            values = is_in_dict(word, self.dictionary)
            if values:
                for v in values:
                    n_hot[self.dictionary_values.index(v)] = 1.0
            return n_hot
        else:
            lex_feats = []
            for property in self.dictionary_values:
                values = is_in_dict(word, self.dictionary)
                if values:
                    if property in values:
                        lex_feats.append(self.lembeds[self.l2i[property]].npvalue())
                    else:
                        lex_feats.append(self.lembeds[self.l2i[UNK]].npvalue())
                else:
                    lex_feats.append(self.lembeds[self.l2i[UNK]].npvalue()) # unknown word
            return np.concatenate(lex_feats)
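
    # Size note: the n-hot branch returns a vector of length
    # len(self.dictionary_values), while the embedded branch returns
    # lex_dim * len(self.dictionary_values) values -- matching lex_embed_size
    # in build_computation_graph above.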

    def init_parameters(self, train_data):
        """init parameters from training data"""
        # word 2 indices and tag 2 indices
        self.w2i = {}  # word to index
        self.c2i = {}  # char to index
        self.task2tag2idx = {}  # id of the task -> tag2idx

        self.w2i[UNK] = 0  # unk word / OOV
        self.c2i[UNK] = 0  # unk char
        self.c2i[WORD_START] = 1  # word start
        self.c2i[WORD_END] = 2  # word end index

        # word and char counters
        self.wcount = Counter()
        self.ccount = Counter()

        for seq in train_data:
            self.wcount.update([w for w in seq.words])
            self.ccount.update([c for w in seq.words for c in w])

            if seq.task_id not in self.task2tag2idx:
                self.task2tag2idx[seq.task_id] = {"<START>": START_TAG, "<END>": END_TAG}

            # record words and chars
            for word, tag in zip(seq.words, seq.tags):
                if word not in self.w2i:
                    self.w2i[word] = len(self.w2i)

                if self.c_in_dim > 0:
                    for char in word:
                        if char not in self.c2i:
                            self.c2i[char] = len(self.c2i)

                if tag not in self.task2tag2idx[seq.task_id]:
                    self.task2tag2idx[seq.task_id][tag] = len(self.task2tag2idx[seq.task_id])

        n = int(len(self.w2i) * 0.3) # top 30%
        print("Caching top {} words".format(n))
        for word, _ in self.wcount.most_common(n): # most_common yields (word, count) pairs
            self.w2c_cache[word] = array.array('I', [self.c2i[WORD_START]]) + array.array('I', [self.get_c_idx(c) for c in word]) + array.array('I', [self.c2i[WORD_END]])
        # get total counts
        self.wtotal = np.sum([self.wcount[w] for w in self.wcount])
        self.ctotal = np.sum([self.ccount[c] for c in self.ccount])
        print("{} w features, {} c features".format(len(self.w2i), len(self.c2i)))
        #print(self.wtotal, self.ctotal)


    def save_embeds(self, out_filename):
        """
        save final embeddings to file
        :param out_filename: filename
        """
        # construct reverse mapping
        i2w = {self.w2i[w]: w for w in self.w2i.keys()}

        OUT = open(out_filename+".w.emb","w")
        for word_id in i2w.keys():
            wembeds_expression = self.wembeds[word_id]
            word = i2w[word_id]
            OUT.write("{} {}\n".format(word," ".join([str(x) for x in wembeds_expression.npvalue()])))
        OUT.close()


    def save_lex_embeds(self, out_filename):
        """
        save final embeddings to file
        :param out_filename: filename
        """
        # construct reverse mapping
        i2l = {self.l2i[w]: w for w in self.l2i.keys()}

        OUT = open(out_filename+".l.emb","w")
        for lex_id in i2l.keys():
            lembeds_expression = self.lembeds[lex_id]
            lex = i2l[lex_id]
            OUT.write("{} {}\n".format(lex," ".join([str(x) for x in lembeds_expression.npvalue()])))
        OUT.close()


    def save_cw_embeds(self, out_filename):
        """
        save final character-based word-embeddings to file
        :param out_filename: filename
        """
        # construct reverse mapping using word embeddings
        i2cw = {self.w2i[w]: w for w in self.w2i.keys()}

        OUT = open(out_filename+".cw.emb","w")
        for word_id in i2cw.keys():
            word = i2cw[word_id]
            cwembeds = [v.npvalue()[0] for v in self.get_c_repr(word)]
            OUT.write("{} {}\n".format(word," ".join([str(x) for x in cwembeds])))
        OUT.close()


    def save_wordlex_map(self, out_filename):
        """
        save final word-to-lexicon-embedding map to file
        :param out_filename: filename
        """
        # construct reverse mapping using word embeddings
        i2wl = {self.w2i[w]: w for w in self.w2i.keys()}

        OUT = open(out_filename+".wlmap.emb","w")
        for word_id in i2wl.keys():
            word = i2wl[word_id]

            lex_feats = []
            for property in self.dictionary_values:
                values = is_in_dict(word, self.dictionary)
                if values:
                    if property in values:
                        lex_feats.append(property)
                    else:
                        lex_feats.append(UNK)
                else:
                    lex_feats.append(UNK) # unknown word

            OUT.write("{} {}\n".format(word," ".join([str(x) for x in lex_feats])))
        OUT.close()
        
    def save_transition_matrix(self, out_filename):
        """
        save transition matrix
        :param out_filename: filename
        """
        for task_id in self.predictors["output_layers_dict"].keys():
            output_predictor = self.predictors["output_layers_dict"][task_id]
            output_predictor.save_parameters(out_filename)
Example no. 5
    def build_computation_graph(self, num_words, num_chars):
        """
        build graph and link to parameters
        self.predictors, self.char_rnn, self.wembeds, self.cembeds =
        """
        ## initialize word embeddings
        if self.embeds_file:
            print("loading embeddings")
            embeddings, emb_dim = load_embeddings_file(self.embeds_file)
            assert (emb_dim == self.in_dim)
            num_words = len(
                set(embeddings.keys()).union(set(
                    self.w2i.keys())))  # initialize all with embeddings
            # init model parameters and initialize them
            self.wembeds = self.model.add_lookup_parameters(
                (num_words, self.in_dim), init=self.initializer)

            init = 0
            for word in embeddings.keys():
                if word not in self.w2i:
                    self.w2i[word] = len(self.w2i.keys())  # add new word
                    self.wembeds.init_row(self.w2i[word], embeddings[word])
                    init += 1
                else:
                    self.wembeds.init_row(self.w2i[word], embeddings[word])
                    init += 1
            print("initialized: {}".format(init))
            del embeddings  # clean up
        else:
            self.wembeds = self.model.add_lookup_parameters(
                (num_words, self.in_dim), init=self.initializer)

        ## initialize character embeddings
        self.cembeds = None
        if self.c_in_dim > 0:
            self.cembeds = self.model.add_lookup_parameters(
                (num_chars, self.c_in_dim), init=self.initializer)
        if self.lex_dim > 0 and self.embed_lex:
            # +1 for UNK property
            self.lembeds = self.model.add_lookup_parameters(
                (len(self.dictionary_values) + 1, self.lex_dim),
                init=dynet.GlorotInitializer())  #init=self.initializer)

        # make it more flexible to add number of layers as specified by parameter
        layers = []  # inner layers
        output_layers_dict = {}  # from task_id to actual softmax predictor
        for layer_num in range(0, self.h_layers):
            if layer_num == 0:
                if self.c_in_dim > 0:
                    # in_dim: size of each layer
                    if self.lex_dim > 0 and self.embed_lex:
                        lex_embed_size = self.lex_dim * len(
                            self.dictionary_values)
                        f_builder = self.builder(
                            1, self.in_dim + self.c_h_dim * 2 + lex_embed_size,
                            self.h_dim, self.model)
                        b_builder = self.builder(
                            1, self.in_dim + self.c_h_dim * 2 + lex_embed_size,
                            self.h_dim, self.model)
                    else:
                        f_builder = self.builder(
                            1, self.in_dim + self.c_h_dim * 2 + self.lex_dim,
                            self.h_dim, self.model)
                        b_builder = self.builder(
                            1, self.in_dim + self.c_h_dim * 2 + self.lex_dim,
                            self.h_dim, self.model)
                else:
                    f_builder = self.builder(1, self.in_dim + self.lex_dim,
                                             self.h_dim, self.model)
                    b_builder = self.builder(1, self.in_dim + self.lex_dim,
                                             self.h_dim, self.model)

                layers.append(BiRNNSequencePredictor(
                    f_builder,
                    b_builder))  #returns forward and backward sequence
            else:
                # add inner layers (if h_layers >1)
                f_builder = self.builder(1, self.h_dim, self.h_dim, self.model)
                b_builder = self.builder(1, self.h_dim, self.h_dim, self.model)
                layers.append(BiRNNSequencePredictor(f_builder, b_builder))

        # store at which layer to predict task
        task2layer = {
            task_id: out_layer
            for task_id, out_layer in zip(self.task2tag2idx, self.pred_layer)
        }
        if len(task2layer) > 1:
            print("task2layer", task2layer)
        for task_id in task2layer:
            task_num_labels = len(self.task2tag2idx[task_id])
            if not self.crf:
                output_layers_dict[task_id] = FFSequencePredictor(
                    self.task2tag2idx[task_id],
                    Layer(self.model,
                          self.h_dim * 2,
                          task_num_labels,
                          dynet.softmax,
                          mlp=self.mlp,
                          mlp_activation=self.activation_mlp))
            else:
                print("CRF")
                output_layers_dict[task_id] = CRFSequencePredictor(
                    self.model,
                    task_num_labels,
                    self.task2tag2idx[task_id],
                    Layer(self.model,
                          self.h_dim * 2,
                          task_num_labels,
                          None,
                          mlp=self.mlp,
                          mlp_activation=self.activation_mlp),
                    viterbi_loss=self.viterbi_loss)

        self.char_rnn = BiRNNSequencePredictor(
            self.builder(1, self.c_in_dim, self.c_h_dim, self.model),
            self.builder(1, self.c_in_dim, self.c_h_dim, self.model))

        self.predictors = {}
        self.predictors["inner"] = layers
        self.predictors["output_layers_dict"] = output_layers_dict
        self.predictors["task_expected_at"] = task2layer
Example no. 6
    for word in embeddings.keys():
        if word not in w2i:
            w2i[word] = len(w2i.keys())  # add new word
        wembeds.init_row(w2i[word], embeddings[word])
        init += 1
    print("initialized: {}".format(init), file=sys.stderr)
else:
    wembeds = model.add_lookup_parameters((len(w2i), args.in_dim))

layers = []

for layer_num in range(args.layers):
    if layer_num == 0:
        f_builder = dy.LSTMBuilder(1, args.in_dim, args.h_dim, model)
        b_builder = dy.LSTMBuilder(1, args.in_dim, args.h_dim, model)
        layers.append(BiRNNSequencePredictor(f_builder, b_builder))
    else:
        f_builder = dy.LSTMBuilder(1, args.h_dim, args.h_dim, model)
        b_builder = dy.LSTMBuilder(1, args.h_dim, args.h_dim, model)
        layers.append(BiRNNSequencePredictor(f_builder, b_builder))

predictors = {}
predictors["inner"] = layers
predictors["outer"] = {}
for task_id in tasks_ids:
    task_num_labels = len(task2t2i[task_id])
    predictors["outer"][task_id] = FFSequencePredictor(
        Layer(model, args.h_dim * 2, len(task_labels), dy.softmax))

# TRAINING
Example no. 7
class NNTagger(object):

    # turn dynamic allocation off by defining slots
    __slots__ = [
        'w2i', 'c2i', 'wcount', 'ccount', 'wtotal', 'ctotal', 'w2c_cache',
        'w_dropout_rate', 'c_dropout_rate', 'task2tag2idx', 'model', 'in_dim',
        'c_in_dim', 'c_h_dim', 'h_dim', 'activation', 'noise_sigma',
        'pred_layer', 'mlp', 'activation_mlp', 'backprob_embeds',
        'initializer', 'h_layers', 'predictors', 'wembeds', 'cembeds',
        'embeds_file', 'char_rnn', 'trainer', 'builder', 'crf', 'viterbi_loss',
        'mimickx_model_path', 'mimickx_model', 'dictionary',
        'dictionary_values', 'path_to_dictionary', 'lex_dim',
        'type_constraint', 'embed_lex', 'l2i', 'lembeds'
    ]

    def __init__(self,
                 in_dim,
                 h_dim,
                 c_in_dim,
                 c_h_dim,
                 h_layers,
                 pred_layer,
                 learning_algo="sgd",
                 learning_rate=0,
                 embeds_file=None,
                 activation=ACTIVATION_MAP["tanh"],
                 mlp=0,
                 activation_mlp=ACTIVATION_MAP["rectify"],
                 backprob_embeds=True,
                 noise_sigma=0.1,
                 w_dropout_rate=0.25,
                 c_dropout_rate=0.25,
                 initializer=INITIALIZER_MAP["glorot"],
                 builder=BUILDERS["lstmc"],
                 crf=False,
                 viterbi_loss=False,
                 mimickx_model_path=None,
                 dictionary=None,
                 type_constraint=False,
                 lex_dim=0,
                 embed_lex=False):
        self.w2i = {}  # word to index mapping
        self.c2i = {}  # char to index mapping
        self.w2c_cache = {}  # word to char index cache for frequent words
        self.wcount = None  # word count
        self.ccount = None  # char count
        self.task2tag2idx = {}  # need one dictionary per task
        self.pred_layer = [int(layer) for layer in pred_layer]  # at which layer to predict each task
        self.model = dynet.ParameterCollection()  #init model
        self.in_dim = in_dim
        self.h_dim = h_dim
        self.c_in_dim = c_in_dim
        self.c_h_dim = c_h_dim
        self.w_dropout_rate = w_dropout_rate
        self.c_dropout_rate = c_dropout_rate
        self.activation = activation
        self.mlp = mlp
        self.activation_mlp = activation_mlp
        self.noise_sigma = noise_sigma
        self.h_layers = h_layers
        self.predictors = {
            "inner": [],
            "output_layers_dict": {},
            "task_expected_at": {}
        }  # the inner layers and predictors
        self.wembeds = None  # lookup: embeddings for words
        self.cembeds = None  # lookup: embeddings for characters
        self.lembeds = None  # lookup: embeddings for lexical features (optional)
        self.embeds_file = embeds_file
        trainer_algo = TRAINER_MAP[learning_algo]
        if learning_rate > 0:
            ### TODO: better handling of additional learning-specific parameters
            self.trainer = trainer_algo(self.model,
                                        learning_rate=learning_rate)
        else:
            # using default learning rate
            self.trainer = trainer_algo(self.model)
        self.backprob_embeds = backprob_embeds
        self.initializer = initializer
        self.char_rnn = None  # biRNN for character input
        self.builder = builder  # default biRNN is an LSTM
        self.crf = crf
        self.viterbi_loss = viterbi_loss
        self.mimickx_model_path = mimickx_model_path
        if mimickx_model_path:  # load
            self.mimickx_model = load_model(mimickx_model_path)
        self.dictionary = None
        self.type_constraint = type_constraint
        self.embed_lex = False
        self.l2i = {UNK: 0}  # lex feature to index mapping
        if dictionary:
            self.dictionary, self.dictionary_values = load_dict(dictionary)
            self.path_to_dictionary = dictionary
            if type_constraint:
                self.lex_dim = 0
            else:
                if embed_lex:
                    self.lex_dim = lex_dim
                    self.embed_lex = True
                    print("Embed lexical features")
                    # register property indices
                    for prop in self.dictionary_values:
                        self.l2i[prop] = len(self.l2i)
                else:
                    self.lex_dim = len(self.dictionary_values)  #n-hot encoding
                print("Lex_dim: {}".format(self.lex_dim), file=sys.stderr)
        else:
            self.dictionary = None
            self.path_to_dictionary = None
            self.lex_dim = 0

    def fit(self,
            train,
            num_iterations,
            dev=None,
            model_path=None,
            patience=0,
            minibatch_size=0,
            log_losses=False):
        """
        train the tagger
        """
        losses_log = {}  # log losses

        print("init parameters")
        self.init_parameters(train)

        # init lookup parameters and define graph
        print("build graph")
        self.build_computation_graph(len(self.w2i), len(self.c2i))

        update_embeds = True
        if not self.backprob_embeds:  ## disable backprop into embeds
            print(">>> disable wembeds update <<<")
            update_embeds = False

        best_val_acc, epochs_no_improvement = 0.0, 0

        if dev and model_path is not None and patience > 0:
            print(
                'Using early stopping with patience of {}...'.format(patience))

        batch = []
        print("train..")
        for iteration in range(num_iterations):

            total_loss = 0.0
            total_tagged = 0.0

            indices = [i for i in range(len(train.seqs))]
            random.shuffle(indices)

            loss_accum_loss = defaultdict(float)
            loss_accum_tagged = defaultdict(float)

            for idx in indices:
                seq = train.seqs[idx]

                if seq.task_id not in losses_log:
                    losses_log[seq.task_id] = []  #initialize

                if minibatch_size > 1:
                    # accumulate instances for minibatch update
                    loss1 = self.predict(seq,
                                         train=True,
                                         update_embeds=update_embeds)
                    total_tagged += len(seq.words)
                    batch.append(loss1)
                    if len(batch) == minibatch_size:
                        loss = dynet.esum(batch)
                        total_loss += loss.value()

                        # logging
                        loss_accum_tagged[seq.task_id] += len(seq.words)
                        loss_accum_loss[seq.task_id] += loss.value()

                        loss.backward()
                        self.trainer.update()
                        dynet.renew_cg()  # use new computational graph for each BATCH when batching is active
                        batch = []
                else:
                    dynet.renew_cg()  # new graph per item
                    loss1 = self.predict(seq,
                                         train=True,
                                         update_embeds=update_embeds)
                    total_tagged += len(seq.words)
                    lv = loss1.value()
                    total_loss += lv

                    # logging
                    loss_accum_tagged[seq.task_id] += len(seq.words)
                    loss_accum_loss[seq.task_id] += loss1.value()

                    loss1.backward()
                    self.trainer.update()

            print("iter {2} {0:>12}: {1:.2f}".format("total loss",
                                                     total_loss / total_tagged,
                                                     iteration))

            # log losses
            for task_id in sorted(losses_log):
                losses_log[task_id].append(loss_accum_loss[task_id] /
                                           loss_accum_tagged[task_id])

            if log_losses:
                dill.dump(losses_log,
                          open(model_path + ".model" + ".losses.pickle", "wb"))

            if dev:
                # evaluate after every epoch
                correct, total = self.evaluate(dev, "task0")
                val_accuracy = correct / total
                print("dev accuracy: {0:.4f}".format(val_accuracy))

                if val_accuracy > best_val_acc:
                    print('Accuracy {0:.4f} is better than best val accuracy '
                          '{1:.4f}.'.format(val_accuracy, best_val_acc))
                    best_val_acc = val_accuracy
                    epochs_no_improvement = 0
                    save(self, model_path)
                else:
                    print('Accuracy {0:.4f} is worse than best val accuracy '
                          '{1:.4f}.'.format(val_accuracy, best_val_acc))
                    epochs_no_improvement += 1

                if patience > 0:
                    if epochs_no_improvement == patience:
                        print(
                            'No improvement for {} epochs. Early stopping...'.
                            format(epochs_no_improvement))
                        break

    def set_indices(self, w2i, c2i, task2t2i, w2c_cache, l2i=None):
        """ helper function for loading model"""
        for task_id in task2t2i:
            self.task2tag2idx[task_id] = task2t2i[task_id]
        self.w2i = w2i
        self.c2i = c2i
        self.w2c_cache = w2c_cache
        self.l2i = l2i

    def set_counts(self, wcount, wtotal, ccount, ctotal):
        """ helper function for loading model"""
        self.wcount = wcount
        self.wtotal = wtotal
        self.ccount = ccount
        self.ctotal = ctotal

    def build_computation_graph(self, num_words, num_chars):
        """
        build graph and link to parameters
        self.predictors, self.char_rnn, self.wembeds, self.cembeds =
        """
        ## initialize word embeddings
        if self.embeds_file:
            print("loading embeddings")
            embeddings, emb_dim = load_embeddings_file(self.embeds_file)
            assert (emb_dim == self.in_dim)
            num_words = len(
                set(embeddings.keys()).union(set(
                    self.w2i.keys())))  # initialize all with embeddings
            # init model parameters and initialize them
            self.wembeds = self.model.add_lookup_parameters(
                (num_words, self.in_dim), init=self.initializer)

            init = 0
            for word in embeddings.keys():
                if word not in self.w2i:
                    self.w2i[word] = len(self.w2i.keys())  # add new word
                    self.wembeds.init_row(self.w2i[word], embeddings[word])
                    init += 1
                else:
                    self.wembeds.init_row(self.w2i[word], embeddings[word])
                    init += 1
            print("initialized: {}".format(init))
            del embeddings  # clean up
        else:
            self.wembeds = self.model.add_lookup_parameters(
                (num_words, self.in_dim), init=self.initializer)

        ## initialize character embeddings
        self.cembeds = None
        if self.c_in_dim > 0:
            self.cembeds = self.model.add_lookup_parameters(
                (num_chars, self.c_in_dim), init=self.initializer)
        if self.lex_dim > 0 and self.embed_lex:
            # +1 for UNK property
            self.lembeds = self.model.add_lookup_parameters(
                (len(self.dictionary_values) + 1, self.lex_dim),
                init=dynet.GlorotInitializer())  #init=self.initializer)

        # make it more flexible to add number of layers as specified by parameter
        layers = []  # inner layers
        output_layers_dict = {}  # from task_id to actual softmax predictor
        for layer_num in range(0, self.h_layers):
            if layer_num == 0:
                if self.c_in_dim > 0:
                    # in_dim: size of each layer
                    if self.lex_dim > 0 and self.embed_lex:
                        lex_embed_size = self.lex_dim * len(
                            self.dictionary_values)
                        f_builder = self.builder(
                            1, self.in_dim + self.c_h_dim * 2 + lex_embed_size,
                            self.h_dim, self.model)
                        b_builder = self.builder(
                            1, self.in_dim + self.c_h_dim * 2 + lex_embed_size,
                            self.h_dim, self.model)
                    else:
                        f_builder = self.builder(
                            1, self.in_dim + self.c_h_dim * 2 + self.lex_dim,
                            self.h_dim, self.model)
                        b_builder = self.builder(
                            1, self.in_dim + self.c_h_dim * 2 + self.lex_dim,
                            self.h_dim, self.model)
                else:
                    f_builder = self.builder(1, self.in_dim + self.lex_dim,
                                             self.h_dim, self.model)
                    b_builder = self.builder(1, self.in_dim + self.lex_dim,
                                             self.h_dim, self.model)

                layers.append(BiRNNSequencePredictor(
                    f_builder,
                    b_builder))  #returns forward and backward sequence
            else:
                # add inner layers (if h_layers >1)
                f_builder = self.builder(1, self.h_dim, self.h_dim, self.model)
                b_builder = self.builder(1, self.h_dim, self.h_dim, self.model)
                layers.append(BiRNNSequencePredictor(f_builder, b_builder))

        # store at which layer to predict task
        task2layer = {
            task_id: out_layer
            for task_id, out_layer in zip(self.task2tag2idx, self.pred_layer)
        }
        if len(task2layer) > 1:
            print("task2layer", task2layer)
        for task_id in task2layer:
            task_num_labels = len(self.task2tag2idx[task_id])
            if not self.crf:
                output_layers_dict[task_id] = FFSequencePredictor(
                    self.task2tag2idx[task_id],
                    Layer(self.model,
                          self.h_dim * 2,
                          task_num_labels,
                          dynet.softmax,
                          mlp=self.mlp,
                          mlp_activation=self.activation_mlp))
            else:
                print("CRF")
                output_layers_dict[task_id] = CRFSequencePredictor(
                    self.model,
                    task_num_labels,
                    self.task2tag2idx[task_id],
                    Layer(self.model,
                          self.h_dim * 2,
                          task_num_labels,
                          None,
                          mlp=self.mlp,
                          mlp_activation=self.activation_mlp),
                    viterbi_loss=self.viterbi_loss)

        # guard the char RNN like the other variants do, so that a model
        # without character features does not build a zero-dim builder
        self.char_rnn = None
        if self.c_in_dim > 0:
            self.char_rnn = BiRNNSequencePredictor(
                self.builder(1, self.c_in_dim, self.c_h_dim, self.model),
                self.builder(1, self.c_in_dim, self.c_h_dim, self.model))

        self.predictors = {}
        self.predictors["inner"] = layers
        self.predictors["output_layers_dict"] = output_layers_dict
        self.predictors["task_expected_at"] = task2layer

    def get_features(self, words, train=False, update=True):
        """
        get feature representations
        """
        # word embeddings
        wfeatures = np.array([
            self.get_w_repr(word, train=train, update=update) for word in words
        ])

        lex_features = []
        if self.dictionary and not self.type_constraint:
            ## add lexicon features
            lex_features = np.array(
                [self.get_lex_repr(word) for word in words])
        # char embeddings
        if self.c_in_dim > 0:
            cfeatures = [self.get_c_repr(word, train=train) for word in words]
            if len(lex_features) > 0:
                lex_features = dynet.inputTensor(lex_features)
                features = [
                    dynet.concatenate([w, c, l])
                    for w, c, l in zip(wfeatures, cfeatures, lex_features)
                ]
            else:
                features = [
                    dynet.concatenate([w, c])
                    for w, c in zip(wfeatures, cfeatures)
                ]
        else:
            features = wfeatures
        if train:  # only do at training time
            features = [dynet.noise(fe, self.noise_sigma) for fe in features]
        return features

    def predict(self,
                seq,
                train=False,
                output_confidences=False,
                unk_tag=None,
                update_embeds=True):
        """
        predict tags for a sentence represented as char+word embeddings and compute losses for this instance
        """
        if not train:
            dynet.renew_cg()
        features = self.get_features(seq.words,
                                     train=train,
                                     update=update_embeds)

        output_expected_at_layer = self.predictors["task_expected_at"][
            seq.task_id]
        output_expected_at_layer -= 1

        # go through layers
        # input is now combination of w + char emb
        prev = features
        prev_rev = features
        num_layers = self.h_layers

        for i in range(0, num_layers):
            predictor = self.predictors["inner"][i]
            forward_sequence, backward_sequence = predictor.predict_sequence(
                prev, prev_rev)
            if i > 0 and self.activation:
                # activation between LSTM layers
                forward_sequence = [
                    self.activation(s) for s in forward_sequence
                ]
                backward_sequence = [
                    self.activation(s) for s in backward_sequence
                ]

            if i == output_expected_at_layer:
                output_predictor = self.predictors["output_layers_dict"][
                    seq.task_id]
                concat_layer = [
                    dynet.concatenate([f, b]) for f, b in zip(
                        forward_sequence, reversed(backward_sequence))
                ]

                if train and self.noise_sigma > 0.0:
                    concat_layer = [
                        dynet.noise(fe, self.noise_sigma)
                        for fe in concat_layer
                    ]
                # fill-in predictions and get loss per tag
                losses = output_predictor.predict_sequence(
                    seq,
                    concat_layer,
                    train=train,
                    output_confidences=output_confidences,
                    unk_tag=unk_tag,
                    dictionary=self.dictionary,
                    type_constraint=self.type_constraint)

            prev = forward_sequence
            prev_rev = backward_sequence

        if train:
            # return losses
            return losses
        else:
            return seq.pred_tags, seq.tag_confidences

    def output_preds(self, seq, raw=False, output_confidences=False):
        """
        output predictions to a file
        """
        for i, (w, g, p) in enumerate(zip(seq.words, seq.tags, seq.pred_tags)):
            if raw:
                if output_confidences:
                    print(u"{0}\t{1}\t{2:.2f}".format(w, p,
                                                      seq.tag_confidences[i]))
                else:
                    # do not print the DUMMY gold tag when --raw is on
                    print(u"{}\t{}".format(w, p))
            else:
                if output_confidences:
                    print(u"{0}\t{1}\t{2}\t{3:.2f}".format(
                        w, g, p, seq.tag_confidences[i]))
                else:
                    print(u"{}\t{}\t{}".format(w, g, p))
        print("")

    def evaluate(self,
                 test_file,
                 task_id,
                 output_predictions=None,
                 raw=False,
                 output_confidences=False,
                 unk_tag=None):
        """
        compute accuracy on a test file, optionally output to file
        """
        correct = 0
        total = 0

        for seq in test_file:
            if seq.task_id != task_id:
                continue  # we evaluate only on a specific task
            self.predict(seq,
                         output_confidences=output_confidences,
                         unk_tag=unk_tag)
            if output_predictions:
                self.output_preds(seq,
                                  raw=raw,
                                  output_confidences=output_confidences)
            correct_inst, total_inst = seq.evaluate()
            correct += correct_inst
            total += total_inst
        return correct, total

    def get_w_repr(self, word, train=False, update=True):
        """
        Get representation of word (word embedding)
        """
        if train:
            if self.w_dropout_rate > 0.0:
                w_id = self.w2i[UNK] if drop(
                    word, self.wcount, self.w_dropout_rate) else self.w2i.get(
                        word, self.w2i[UNK])
            else:
                # without word dropout, fall back to a plain lookup
                w_id = self.w2i.get(word, self.w2i[UNK])
        else:
            if self.mimickx_model_path:  # if given use MIMICKX
                if word not in self.w2i:  #
                    #print("predict with MIMICKX for: ", word)
                    return dynet.inputVector(
                        self.mimickx_model.predict(word).npvalue())
            w_id = self.w2i.get(word, self.w2i[UNK])
        if not update:
            return dynet.nobackprop(self.wembeds[w_id])
        else:
            return self.wembeds[w_id]
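        # Editorial note (assumption): drop() implements frequency-based word
        # dropout as spelled out in SimpleBiltyTagger.fit below, i.e. a word
        # seen c times in training is replaced by UNK with probability
        #   rate / (rate + c)
        # so with rate=0.25 a hapax (c=1) is dropped with p=0.2, while a word
        # seen 100 times is dropped with p~0.0025.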

    def get_c_repr(self, word, train=False):
        """
        Get representation of word via characters sub-LSTMs
        """
        # get representation for words
        if word in self.w2c_cache:
            chars_of_token = self.w2c_cache[word]
            if train:
                chars_of_token = [
                    drop(c, self.ccount, self.c_dropout_rate)
                    for c in chars_of_token
                ]
        else:
            chars_of_token = array.array(
                'I', [self.c2i[WORD_START]]) + array.array(
                    'I', [self.get_c_idx(c, train=train)
                          for c in word]) + array.array(
                              'I', [self.c2i[WORD_END]])

        char_feats = [self.cembeds[c_id] for c_id in chars_of_token]
        # use last state as word representation
        f_char, b_char = self.char_rnn.predict_sequence(char_feats, char_feats)
        return dynet.concatenate([f_char[-1], b_char[-1]])
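        # Editorial note: every token is wrapped as <w> c_1 ... c_n </w>
        # before the char BiRNN; init_parameters below pre-computes these
        # index arrays for the most frequent words (w2c_cache) so they are
        # not rebuilt on every training instance.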

    def get_c_idx(self, c, train=False):
        """ helper function to get index of character"""
        if self.c_dropout_rate > 0.0 and train and drop(
                c, self.ccount, self.c_dropout_rate):
            return self.c2i.get(UNK)
        else:
            return self.c2i.get(c, self.c2i[UNK])

    def get_lex_repr(self, word):
        """
        Get representation for lexical feature
        """
        if not self.embed_lex:  ## n-hot representation
            n_hot = np.zeros(len(self.dictionary_values))
            values = is_in_dict(word, self.dictionary)
            if values:
                for v in values:
                    n_hot[self.dictionary_values.index(v)] = 1.0
            return n_hot
        else:
            lex_feats = []
            values = is_in_dict(word, self.dictionary)
            for prop in self.dictionary_values:
                if values and prop in values:
                    lex_feats.append(self.lembeds[self.l2i[prop]].npvalue())
                else:
                    # unknown word, or property not listed for this word
                    lex_feats.append(self.lembeds[self.l2i[UNK]].npvalue())
            return np.concatenate(lex_feats)
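        # Editorial example (values assumed): with dictionary_values =
        # ["NOUN", "VERB"] and lex_dim=4, the n-hot branch returns a
        # length-2 vector such as [1.0, 0.0], while the embedded branch
        # concatenates one 4-dim embedding per property, i.e. a length-8
        # vector, matching lex_embed_size = lex_dim * len(dictionary_values)
        # in build_computation_graph above.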

    def init_parameters(self, train_data):
        """init parameters from training data"""
        # word 2 indices and tag 2 indices
        self.w2i = {}  # word to index
        self.c2i = {}  # char to index
        self.task2tag2idx = {}  # id of the task -> tag2idx

        self.w2i[UNK] = 0  # unk word / OOV
        self.c2i[UNK] = 0  # unk char
        self.c2i[WORD_START] = 1  # word start
        self.c2i[WORD_END] = 2  # word end index

        # word and char counters
        self.wcount = Counter()
        self.ccount = Counter()

        for seq in train_data:
            self.wcount.update([w for w in seq.words])
            self.ccount.update([c for w in seq.words for c in w])

            if seq.task_id not in self.task2tag2idx:
                self.task2tag2idx[seq.task_id] = {
                    "<START>": START_TAG,
                    "<END>": END_TAG
                }

            # record words and chars
            for word, tag in zip(seq.words, seq.tags):
                if word not in self.w2i:
                    self.w2i[word] = len(self.w2i)

                if self.c_in_dim > 0:
                    for char in word:
                        if char not in self.c2i:
                            self.c2i[char] = len(self.c2i)

                if tag not in self.task2tag2idx[seq.task_id]:
                    self.task2tag2idx[seq.task_id][tag] = len(
                        self.task2tag2idx[seq.task_id])

        n = int(len(self.w2i) * 0.3)  # top 30%
        print("Caching top {} words".format(n))
        # most_common returns (word, count) pairs, so unpack the tuple
        for word, _ in self.wcount.most_common(n):
            self.w2c_cache[word] = array.array(
                'I', [self.c2i[WORD_START]]) + array.array(
                    'I', [self.get_c_idx(c) for c in word]) + array.array(
                        'I', [self.c2i[WORD_END]])
        # get total counts
        self.wtotal = sum(self.wcount.values())
        self.ctotal = sum(self.ccount.values())
        print("{} w features, {} c features".format(len(self.w2i),
                                                    len(self.c2i)))
        #print(self.wtotal, self.ctotal)

    def save_embeds(self, out_filename):
        """
        save final embeddings to file
        :param out_filename: filename
        """
        # construct reverse mapping
        i2w = {self.w2i[w]: w for w in self.w2i.keys()}

        OUT = open(out_filename + ".w.emb", "w")
        for word_id in i2w.keys():
            wembeds_expression = self.wembeds[word_id]
            word = i2w[word_id]
            OUT.write("{} {}\n".format(
                word,
                " ".join([str(x) for x in wembeds_expression.npvalue()])))
        OUT.close()

    def save_lex_embeds(self, out_filename):
        """
        save final embeddings to file
        :param out_filename: filename
        """
        # construct reverse mapping
        i2l = {self.l2i[w]: w for w in self.l2i.keys()}

        OUT = open(out_filename + ".l.emb", "w")
        for lex_id in i2l.keys():
            lembeds_expression = self.lembeds[lex_id]
            lex = i2l[lex_id]
            OUT.write("{} {}\n".format(
                lex, " ".join([str(x) for x in lembeds_expression.npvalue()])))
        OUT.close()

    def save_cw_embeds(self, out_filename):
        """
        save final character-based word-embeddings to file
        :param out_filename: filename
        """
        # construct reverse mapping using word embeddings
        i2cw = {self.w2i[w]: w for w in self.w2i.keys()}

        OUT = open(out_filename + ".cw.emb", "w")
        for word_id in i2cw.keys():
            word = i2cw[word_id]
            # get_c_repr returns one concatenated expression; take its value
            cwembeds = self.get_c_repr(word).npvalue()
            OUT.write("{} {}\n".format(word,
                                       " ".join([str(x) for x in cwembeds])))
        OUT.close()

    def save_wordlex_map(self, out_filename):
        """
        save final word-to-lexicon-embedding map to file
        :param out_filename: filename
        """
        # construct reverse mapping using word embeddings
        i2wl = {self.w2i[w]: w for w in self.w2i.keys()}

        OUT = open(out_filename + ".wlmap.emb", "w")
        for word_id in i2wl.keys():
            word = i2wl[word_id]

            lex_feats = []
            values = is_in_dict(word, self.dictionary)
            for prop in self.dictionary_values:
                if values and prop in values:
                    lex_feats.append(prop)
                else:
                    lex_feats.append(UNK)  # unknown word or property

            OUT.write("{} {}\n".format(word,
                                       " ".join([str(x) for x in lex_feats])))
        OUT.close()

    def save_transition_matrix(self, out_filename):
        """
        save transition matrix
        :param out_filename: filename
        """
        for task_id in self.predictors["output_layers_dict"].keys():
            output_predictor = self.predictors["output_layers_dict"][task_id]
            output_predictor.save_parameters(out_filename)
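
To make the training and evaluation flow above concrete, here is a minimal
usage sketch. The class name MtlTagger, the read_data loader, the constructor
arguments and all file paths are hypothetical; the methods called are the
ones defined above:

    # a minimal sketch under the assumptions stated above
    train_data = list(read_data("train.conll"))   # hypothetical loader
    dev_data = list(read_data("dev.conll"))

    tagger = MtlTagger(in_dim=64, h_dim=100, c_in_dim=100, h_layers=1)
    tagger.init_parameters(train_data)            # builds w2i/c2i/tag maps
    tagger.build_computation_graph(len(tagger.w2i), len(tagger.c2i))

    correct, total = tagger.evaluate(dev_data, "task0")
    print("dev accuracy: {:.4f}".format(correct / total))
    tagger.save_embeds("model-out")               # writes model-out.w.emb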
Example n. 8
class SimpleBiltyTagger(object):

    # turn dynamic allocation off by defining slots
    __slots__ = [
        'w2i', 'c2i', 'tag2idx', 'model', 'in_dim', 'c_in_dim', 'h_dim',
        'activation', 'noise_sigma', 'h_layers', 'predictors', 'wembeds',
        'cembeds', 'embeds_file', 'char_rnn', 'trainer'
    ]

    def __init__(self,
                 in_dim,
                 h_dim,
                 c_in_dim,
                 h_layers,
                 embeds_file=None,
                 activation=dynet.tanh,
                 noise_sigma=0.1,
                 word2id=None,
                 trainer="adam",
                 clip_threshold=5.0,
                 learning_rate=0.001):
        # use default Adam learning rate - TODO: support other optimizer-specific options
        self.w2i = {} if word2id is None else word2id  # word to index mapping
        self.c2i = {}  # char to index mapping
        self.tag2idx = {}  # tag to tag_id mapping
        self.model = dynet.ParameterCollection()  #init model
        # init trainer
        train_algo = TRAINER_MAP[trainer]
        self.trainer = train_algo(self.model, learning_rate)
        if clip_threshold:
            self.trainer.set_clip_threshold(clip_threshold)
        self.in_dim = in_dim
        self.h_dim = h_dim
        self.c_in_dim = c_in_dim
        self.activation = activation
        self.noise_sigma = noise_sigma
        self.h_layers = h_layers
        self.predictors = {
            "inner": [],
            "output_layers_dict": {},
            "task_expected_at": {}
        }  # the inner layers and predictors
        self.wembeds = None  # lookup: embeddings for words
        self.cembeds = None  # lookup: embeddings for characters
        self.embeds_file = embeds_file
        self.char_rnn = None  # RNN for character input

    def pick_neg_log(self, pred, gold):
        if hasattr(gold, "__len__"):
            # calculate cross-entropy loss against the whole vector
            dy_gold = dynet.inputVector(gold)
            return -dynet.sum_elems(dynet.cmult(dy_gold, dynet.log(pred)))
        return -dynet.log(dynet.pick(pred, gold))
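        # Editorial illustration: with pred = [0.7, 0.2, 0.1] and an integer
        # gold index 0, this returns -log(0.7) ~ 0.357; with a soft gold
        # vector [0.5, 0.5, 0.0] it returns the full cross-entropy
        # -(0.5*log(0.7) + 0.5*log(0.2)) ~ 0.98.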

    def set_indices(self, w2i, c2i, tag2idx):
        self.tag2idx = tag2idx
        self.w2i = w2i
        self.c2i = c2i

    def cosine(self, e1, e2):
        # the product of squared norms must be square-rooted to obtain a
        # proper cosine similarity
        return dynet.cdiv(
            dynet.dot_product(e1, e2),
            dynet.sqrt(
                dynet.cmult(dynet.squared_norm(e1), dynet.squared_norm(e2))))

    def fit(self,
            train_X,
            train_Y,
            num_epochs,
            val_X=None,
            val_Y=None,
            patience=2,
            model_path=None,
            seed=None,
            word_dropout_rate=0.25,
            trg_vectors=None,
            unsup_weight=1.0,
            variance_weights=None,
            labeled_weight_proportion=1.0):
        """
        train the tagger
        :param trg_vectors: the prediction targets used for the unsupervised loss
                            in temporal ensembling
        :param unsup_weight: weight for the unsupervised consistency loss
                                    used in temporal ensembling
        :param labeled_weight_proportion: proportion of the unsupervised weight
                                          that should be assigned to labeled
                                          examples
        """
        print("read training data", file=sys.stderr)

        if variance_weights is not None:
            print('First 20 variance weights:', variance_weights[:20])

        if seed:
            print(">>> using seed: ", seed, file=sys.stderr)
            random.seed(seed)  #setting random seed

        # if we use word dropout keep track of counts
        if word_dropout_rate > 0.0:
            widCount = Counter()
            for sentence, _ in train_X:
                widCount.update([w for w in sentence])

        assert (len(train_X) == len(train_Y))
        train_data = list(zip(train_X, train_Y))

        # if we use target vectors, keep track of the targets per sentence
        if trg_vectors is not None:
            trg_start_id = 0
            sentence_trg_vectors = []
            sentence_var_weights = []
            for i, (example, y) in enumerate(train_data):
                sentence_trg_vectors.append(
                    trg_vectors[trg_start_id:trg_start_id +
                                len(example[0]), :])
                if variance_weights is not None:
                    sentence_var_weights.append(
                        variance_weights[trg_start_id:trg_start_id +
                                         len(example[0])])
                trg_start_id += len(example[0])
            assert trg_start_id == len(trg_vectors),\
                'Error: Idx {} is not at {}.'.format(trg_start_id, len(trg_vectors))
            assert len(sentence_trg_vectors) == len(train_X)
            if variance_weights is not None:
                assert trg_start_id == len(variance_weights)
                assert len(sentence_var_weights) == len(train_X)

        print('Starting training for {} epochs...'.format(num_epochs))
        best_val_acc, epochs_no_improvement = 0., 0
        if val_X is not None and val_Y is not None and model_path is not None:
            print(
                'Using early stopping with patience of {}...'.format(patience))

        for cur_iter in range(num_epochs):
            bar = Bar('Training epoch {}/{}...'.format(cur_iter + 1,
                                                       num_epochs),
                      max=len(train_data),
                      flush=True)
            total_loss = 0.0
            total_tagged = 0.0

            total_other_loss, total_other_loss_weighted = 0.0, 0.0

            random_indices = np.arange(len(train_data))
            random.shuffle(random_indices)

            for i, idx in enumerate(random_indices):
                (word_indices, char_indices), y = train_data[idx]

                if word_dropout_rate > 0.0:
                    word_indices = [
                        self.w2i["_UNK"] if
                        (random.random() >
                         (widCount.get(w) /
                          (word_dropout_rate + widCount.get(w)))) else w
                        for w in word_indices
                    ]
                output = self.predict(word_indices, char_indices, train=True)

                if len(y) == 1 and y[0] == 0:
                    # in temporal ensembling, we assign a dummy label of [0] for
                    # unlabeled sequences; we skip the supervised loss for these
                    loss = dynet.scalarInput(0)
                else:
                    loss = dynet.esum([
                        self.pick_neg_log(pred, gold)
                        for pred, gold in zip(output, y)
                    ])

                if trg_vectors is not None:
                    # the consistency loss in temporal ensembling is used for
                    # both supervised and unsupervised input
                    targets = sentence_trg_vectors[idx]
                    assert len(output) == len(targets)
                    if variance_weights is not None:
                        var_weights = sentence_var_weights[idx]
                        assert len(output) == len(var_weights)
                        # multiply the normalized mean variance with each loss
                        other_loss = dynet.esum([
                            v * dynet.squared_distance(o, dynet.inputVector(t))
                            for o, t, v in zip(output, targets, var_weights)
                        ])
                    else:
                        other_loss = dynet.esum([
                            dynet.squared_distance(o, dynet.inputVector(t))
                            for o, t in zip(output, targets)
                        ])

                    total_other_loss += other_loss.value()
                    if len(y) == 1 and y[0] == 0:  # unlabeled example
                        other_loss = other_loss * unsup_weight
                    else:  # labeled example
                        # scale the unsupervised weight for labeled examples
                        other_loss = other_loss * unsup_weight * labeled_weight_proportion
                    # keep track for logging
                    total_loss += loss.value()  # main loss
                    total_tagged += len(word_indices)
                    total_other_loss_weighted += other_loss.value()

                    # combine losses
                    loss += other_loss

                else:
                    # keep track for logging
                    total_loss += loss.value()
                    total_tagged += len(word_indices)

                loss.backward()
                self.trainer.update()
                bar.next()

            if trg_vectors is None:
                print("iter {2} {0:>12}: {1:.2f}".format(
                    "total loss", total_loss / total_tagged, cur_iter),
                      file=sys.stderr)
            else:
                print(
                    "iter {2} {0:>12}: {1:.2f} unsupervised loss: {3:.2f} (weighted: {4:.2f})"
                    .format("supervised loss", total_loss / total_tagged,
                            cur_iter, total_other_loss / total_tagged,
                            total_other_loss_weighted / total_tagged),
                    file=sys.stderr)

            if val_X is not None and val_Y is not None and model_path is not None:
                # get the best accuracy on the validation set
                val_correct, val_total = self.evaluate(val_X, val_Y)
                val_accuracy = val_correct / val_total

                if val_accuracy > best_val_acc:
                    print(
                        'Accuracy {:.4f} is better than best val accuracy {:.4f}'
                        .format(val_accuracy, best_val_acc))
                    best_val_acc = val_accuracy
                    epochs_no_improvement = 0
                    save_tagger(self, model_path)
                else:
                    print(
                        'Accuracy {:.4f} is worse than best val accuracy {:.4f}.'
                        .format(val_accuracy, best_val_acc))
                    epochs_no_improvement += 1
                if epochs_no_improvement == patience:
                    print('No improvement for {} epochs. Early stopping...'.
                          format(epochs_no_improvement))
                    break

    def initialize_graph(self, num_words=None, num_chars=None):
        """
        build graph and link to parameters
        """
        num_words = num_words if num_words is not None else len(self.w2i)
        num_chars = num_chars if num_chars is not None else len(self.c2i)
        if num_words == 0 or num_chars == 0:
            raise ValueError('Word2id and char2id have to be loaded before '
                             'initializing the graph.')
        print('Initializing the graph...')

        # initialize the word embeddings and the parameters
        self.cembeds = None
        if self.embeds_file:
            print("loading embeddings", file=sys.stderr)
            embeddings, emb_dim = load_embeddings_file(self.embeds_file)
            assert (emb_dim == self.in_dim)
            num_words = len(
                set(embeddings.keys()).union(set(
                    self.w2i.keys())))  # initialize all with embeddings
            # init model parameters and initialize them
            self.wembeds = self.model.add_lookup_parameters(
                (num_words, self.in_dim), init=dynet.ConstInitializer(0.01))

            if self.c_in_dim > 0:
                self.cembeds = self.model.add_lookup_parameters(
                    (num_chars, self.c_in_dim),
                    init=dynet.ConstInitializer(0.01))

            init = 0
            for word in embeddings:
                # for words already in w2i, update the vector; otherwise add
                # them to w2i (data is kept as integer indices)
                if word not in self.w2i:
                    self.w2i[word] = len(self.w2i)  # add new word
                self.wembeds.init_row(self.w2i[word], embeddings[word])
                init += 1
            print("initialized: {}".format(init), file=sys.stderr)

        else:
            self.wembeds = self.model.add_lookup_parameters(
                (num_words, self.in_dim), init=dynet.ConstInitializer(0.01))
            if self.c_in_dim > 0:
                self.cembeds = self.model.add_lookup_parameters(
                    (num_chars, self.c_in_dim),
                    init=dynet.ConstInitializer(0.01))

        # make it more flexible to add number of layers as specified by parameter
        layers = []  # inner layers

        for layer_num in range(0, self.h_layers):

            if layer_num == 0:
                if self.c_in_dim > 0:
                    f_builder = dynet.CoupledLSTMBuilder(
                        1, self.in_dim + self.c_in_dim * 2, self.h_dim,
                        self.model)  # in_dim: size of each layer
                    b_builder = dynet.CoupledLSTMBuilder(
                        1, self.in_dim + self.c_in_dim * 2, self.h_dim,
                        self.model)
                else:
                    f_builder = dynet.CoupledLSTMBuilder(
                        1, self.in_dim, self.h_dim, self.model)
                    b_builder = dynet.CoupledLSTMBuilder(
                        1, self.in_dim, self.h_dim, self.model)
                layers.append(BiRNNSequencePredictor(
                    f_builder,
                    b_builder))  #returns forward and backward sequence
            else:
                # add inner layers (if h_layers >1)
                f_builder = dynet.LSTMBuilder(1, self.h_dim, self.h_dim,
                                              self.model)
                b_builder = dynet.LSTMBuilder(1, self.h_dim, self.h_dim,
                                              self.model)
                layers.append(BiRNNSequencePredictor(f_builder, b_builder))

        # store at which layer to predict task
        task_num_labels = len(self.tag2idx)
        output_layer = FFSequencePredictor(
            Layer(self.model, self.h_dim * 2, task_num_labels, dynet.softmax))

        if self.c_in_dim > 0:
            self.char_rnn = BiRNNSequencePredictor(
                dynet.CoupledLSTMBuilder(1, self.c_in_dim, self.c_in_dim,
                                         self.model),
                dynet.CoupledLSTMBuilder(1, self.c_in_dim, self.c_in_dim,
                                         self.model))
        else:
            self.char_rnn = None

        self.predictors = dict()
        self.predictors["inner"] = layers
        self.predictors["output_layers_dict"] = output_layer
        self.predictors["task_expected_at"] = self.h_layers

    def get_features(self, words):
        """
        from a list of words, return the word and word char indices
        """
        word_indices = []
        word_char_indices = []
        for word in words:
            if word in self.w2i:
                word_indices.append(self.w2i[word])
            else:
                word_indices.append(self.w2i["_UNK"])

            if self.c_in_dim > 0:
                chars_of_word = [self.c2i["<w>"]]
                for char in word:
                    if char in self.c2i:
                        chars_of_word.append(self.c2i[char])
                    else:
                        chars_of_word.append(self.c2i["_UNK"])
                chars_of_word.append(self.c2i["</w>"])
                word_char_indices.append(chars_of_word)
        return word_indices, word_char_indices

    def __get_instances_from_file(self, file_name):
        """
        helper function to convert input file to lists of lists holding input words|tags
        """
        data = list(read_conll_file(file_name))
        words = [words for (words, _) in data]
        tags = [tags for (_, tags) in data]
        return words, tags

    def get_data_as_indices(self, file_name):
        """
        X = list of (word_indices, word_char_indices)
        Y = list of tag indices
        """
        words, tags = self.__get_instances_from_file(file_name)
        return self.get_data_as_indices_from_instances(words, tags)

    def get_data_as_indices_from_instances(self, dev_words, dev_tags):
        """
        Extension of get_data_as_indices. Use words and tags rather than a file as input.
        X = list of (word_indices, word_char_indices)
        Y = list of tag indices
        """
        X, Y = [], []
        org_X, org_Y = [], []

        for (words, tags) in zip(dev_words, dev_tags):
            word_indices, word_char_indices = self.get_features(words)
            # if tag does not exist in source domain tags, return as default
            # first idx outside of dictionary
            tag_indices = [
                self.tag2idx.get(tag, len(self.tag2idx)) for tag in tags
            ]
            X.append((word_indices, word_char_indices))
            Y.append(tag_indices)
            org_X.append(words)
            org_Y.append(tags)
        return X, Y  # , org_X, org_Y - for now don't use

    def predict(self,
                word_indices,
                char_indices,
                train=False,
                soft_labels=False,
                temperature=None):
        """
        predict tags for a sentence represented as char+word embeddings
        """
        dynet.renew_cg()  # new graph

        char_emb = []
        rev_char_emb = []

        wfeatures = [self.wembeds[w] for w in word_indices]

        if self.c_in_dim > 0:
            # get representation for words
            for chars_of_token in char_indices:
                char_feats = [self.cembeds[c] for c in chars_of_token]
                # use last state as word representation
                f_char, b_char = self.char_rnn.predict_sequence(
                    char_feats, char_feats)
                last_state = f_char[-1]
                rev_last_state = b_char[-1]
                char_emb.append(last_state)
                rev_char_emb.append(rev_last_state)

            features = [
                dynet.concatenate([w, c, rev_c])
                for w, c, rev_c in zip(wfeatures, char_emb, rev_char_emb)
            ]
        else:
            features = wfeatures

        if train:  # only do at training time
            features = [dynet.noise(fe, self.noise_sigma) for fe in features]

        output_expected_at_layer = self.h_layers
        output_expected_at_layer -= 1

        # go through layers
        prev = features
        prev_rev = features
        num_layers = self.h_layers
        for i in range(0, num_layers):
            predictor = self.predictors["inner"][i]
            forward_sequence, backward_sequence = predictor.predict_sequence(
                prev, prev_rev)
            if i > 0 and self.activation:
                # activation between LSTM layers
                forward_sequence = [
                    self.activation(s) for s in forward_sequence
                ]
                backward_sequence = [
                    self.activation(s) for s in backward_sequence
                ]

            if i == output_expected_at_layer:
                output_predictor = self.predictors["output_layers_dict"]
                concat_layer = [
                    dynet.concatenate([f, b]) for f, b in zip(
                        forward_sequence, reversed(backward_sequence))
                ]
                if train and self.noise_sigma > 0.0:
                    concat_layer = [
                        dynet.noise(fe, self.noise_sigma)
                        for fe in concat_layer
                    ]
                output = output_predictor.predict_sequence(
                    concat_layer,
                    soft_labels=soft_labels,
                    temperature=temperature)
                return output

            prev = forward_sequence
            prev_rev = backward_sequence

        raise Exception("output layer was never reached; check h_layers")

    def evaluate(self, test_X, test_Y):
        """
        compute accuracy on a test file
        """
        correct = 0
        total = 0.0

        for i, ((word_indices, word_char_indices),
                gold_tag_indices) in enumerate(zip(test_X, test_Y)):

            output = self.predict(word_indices, word_char_indices)
            predicted_tag_indices = [np.argmax(o.value()) for o in output]

            correct += sum([
                1 for (predicted,
                       gold) in zip(predicted_tag_indices, gold_tag_indices)
                if predicted == gold
            ])
            total += len(gold_tag_indices)

        return correct, total

    def get_predictions(self, test_X, soft_labels=False):
        """
        get flat list of predictions
        """
        predictions = []
        for word_indices, word_char_indices in test_X:
            output = self.predict(word_indices, word_char_indices)
            predictions += [
                o.value() if soft_labels else int(np.argmax(o.value()))
                for o in output
            ]
        return predictions

    def get_predictions_output(self, test_X, test_labels, output_filename):
        """
        get predictions to output to file
        assume test_labels are not indices (as target domain can have tags that are not in source)
        text_X: indices
        test_labels: original labels
        """
        i2w = {self.w2i[w]: w for w in self.w2i.keys()}
        i2t = {self.tag2idx[t]: t for t in self.tag2idx.keys()}

        OUT = open(output_filename, "w")
        for (word_indices,
             word_char_indices), gold_tags in zip(test_X, test_labels):
            output = self.predict(word_indices, word_char_indices)
            predicted_tag_ids = [int(np.argmax(o.value())) for o in output]

            for word_id, tag_id, gold_tag in zip(word_indices,
                                                 predicted_tag_ids, gold_tags):
                known_tag_prefix = "{}" if gold_tag in self.tag2idx else "*{}"
                word, pred_tag, gold_tag = i2w[word_id], i2t[
                    tag_id], known_tag_prefix.format(gold_tag)
                OUT.write("{}\t{}\t{}\n".format(word, gold_tag, pred_tag))
            OUT.write("\n")
        OUT.close()

    def get_train_data_from_instances(self, train_words, train_tags):
        """
        Extension of get_train_data method. Extracts training data from two arrays of word and label lists.
        transform training data to features (word indices)
        map tags to integers
        :param train_words: a numpy array containing lists of words
        :param train_tags: a numpy array containing lists of corresponding tags
        """
        X = []
        Y = []

        # check if we continue training
        continue_training = False
        if self.w2i and self.tag2idx:
            continue_training = True

        if continue_training:
            print("update existing vocabulary")
            # fetch already existing
            w2i = self.w2i.copy()
            c2i = self.c2i.copy()
            tag2idx = self.tag2idx

            assert w2i["_UNK"] == 0, "No _UNK found!"
        else:
            # word 2 indices and tag 2 indices
            w2i = self.w2i.copy()  # get a copy that refers to a different object
            c2i = {}  # char to index
            tag2idx = {}  # tag to index

            if len(w2i) > 0:
                assert w2i["_UNK"] == 0
            else:
                w2i["_UNK"] = 0  # unk word / OOV

            c2i["_UNK"] = 0  # unk char
            c2i["<w>"] = 1  # word start
            c2i["</w>"] = 2  # word end index

        num_sentences = 0
        num_tokens = 0
        for instance_idx, (words,
                           tags) in enumerate(zip(train_words, train_tags)):
            instance_word_indices = []  # sequence of word indices
            instance_char_indices = []  # sequence of char indices
            instance_tags_indices = []  # sequence of tag indices

            for i, (word, tag) in enumerate(zip(words, tags)):
                # map words and tags to indices
                if word not in w2i:
                    w2i[word] = len(w2i)
                instance_word_indices.append(w2i[word])

                chars_of_word = [c2i["<w>"]]
                for char in word:
                    if char not in c2i:
                        c2i[char] = len(c2i)
                    chars_of_word.append(c2i[char])
                chars_of_word.append(c2i["</w>"])
                instance_char_indices.append(chars_of_word)

                if tag not in tag2idx:
                    tag2idx[tag] = len(tag2idx)

                instance_tags_indices.append(tag2idx.get(tag))

                num_tokens += 1

            num_sentences += 1

            X.append(
                (instance_word_indices, instance_char_indices)
            )  # list of word indices, for every word list of char indices
            Y.append(instance_tags_indices)

        print("%s sentences %s tokens" % (num_sentences, num_tokens),
              file=sys.stderr)
        print("%s w features, %s c features " % (len(w2i), len(c2i)),
              file=sys.stderr)

        assert (len(X) == len(Y))

        # store mappings of words and tags to indices
        self.set_indices(w2i, c2i, tag2idx)

        return X, Y

    def get_train_data(self, train_data):
        """
        transform training data to features (word indices)
        map tags to integers
        """
        train_words, train_tags = self.__get_instances_from_file(train_data)
        return self.get_train_data_from_instances(train_words, train_tags)
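
A minimal end-to-end sketch for SimpleBiltyTagger, using only methods defined
above. File paths are hypothetical, and saving on improvement relies on the
save_tagger helper from the surrounding module:

    # a minimal sketch; train.conll / dev.conll are hypothetical paths
    tagger = SimpleBiltyTagger(in_dim=64, h_dim=100, c_in_dim=100, h_layers=1)
    train_X, train_Y = tagger.get_train_data("train.conll")  # builds index maps
    tagger.initialize_graph()              # sizes taken from w2i / c2i
    val_X, val_Y = tagger.get_data_as_indices("dev.conll")
    tagger.fit(train_X, train_Y, num_epochs=10,
               val_X=val_X, val_Y=val_Y, patience=2, model_path="bilty.model")
    correct, total = tagger.evaluate(val_X, val_Y)
    print("dev accuracy: {:.4f}".format(correct / total))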
Example n. 9
    def build_computation_graph(self, num_words, num_chars):
        """
        build graph and link to parameters
        """
        ## initialize word embeddings
        if self.embeds_file:
            print("loading embeddings", file=sys.stderr)
            embeddings, emb_dim = load_embeddings_file(self.embeds_file)
            assert(emb_dim==self.in_dim)
            num_words=len(set(embeddings.keys()).union(set(self.w2i.keys()))) # initialize all with embeddings
            # init model parameters and initialize them
            wembeds = self.model.add_lookup_parameters((num_words, self.in_dim), init=self.initializer)

            init = 0
            for word in embeddings:
                if word not in self.w2i:
                    self.w2i[word] = len(self.w2i)  # add new word
                wembeds.init_row(self.w2i[word], embeddings[word])
                init += 1
            print("initialized: {}".format(init), file=sys.stderr)

        else:
            wembeds = self.model.add_lookup_parameters((num_words, self.in_dim), init=self.initializer)


        ## initialize character embeddings
        cembeds = None
        if self.c_in_dim > 0:
            cembeds = self.model.add_lookup_parameters((num_chars, self.c_in_dim), init=self.initializer)
               

        # make it more flexible to add number of layers as specified by parameter
        layers = [] # inner layers
        output_layers_dict = {}   # from task_id to actual softmax predictor
        task_expected_at = {} # map task_id => output_layer_#

        # connect output layers to tasks
        for output_layer, task_id in zip(self.pred_layer, self.tasks_ids):
            if output_layer > self.h_layers:
                raise ValueError("cannot have a task at a layer (%d) which is "
                                 "beyond the model, increase h_layers (%d)"
                                 % (output_layer, self.h_layers))
            task_expected_at[task_id] = output_layer
        nb_tasks = len( self.tasks_ids )

        for layer_num in range(0,self.h_layers):
            if layer_num == 0:
                if self.c_in_dim > 0:
                    # in_dim: size of each layer
                    f_builder = self.builder(1, self.in_dim+self.c_in_dim*2, self.h_dim, self.model) 
                    b_builder = self.builder(1, self.in_dim+self.c_in_dim*2, self.h_dim, self.model) 
                else:
                    f_builder = self.builder(1, self.in_dim, self.h_dim, self.model)
                    b_builder = self.builder(1, self.in_dim, self.h_dim, self.model)

                layers.append(BiRNNSequencePredictor(f_builder, b_builder)) #returns forward and backward sequence
            else:
                # add inner layers (if h_layers >1)
                f_builder = self.builder(1, self.h_dim, self.h_dim, self.model)
                b_builder = self.builder(1, self.h_dim, self.h_dim, self.model)
                layers.append(BiRNNSequencePredictor(f_builder, b_builder))

        # store at which layer to predict task
        for task_id in self.tasks_ids:
            task_num_labels= len(self.task2tag2idx[task_id])
            output_layers_dict[task_id] = FFSequencePredictor(Layer(self.model, self.h_dim*2, task_num_labels, dynet.softmax, mlp=self.mlp, mlp_activation=self.activation_mlp))

        char_rnn = BiRNNSequencePredictor(self.builder(1, self.c_in_dim, self.c_in_dim, self.model),
                                          self.builder(1, self.c_in_dim, self.c_in_dim, self.model))

        predictors = {}
        predictors["inner"] = layers
        predictors["output_layers_dict"] = output_layers_dict
        predictors["task_expected_at"] = task_expected_at

        return predictors, char_rnn, wembeds, cembeds
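
Unlike the first variant, this build_computation_graph returns the components
instead of assigning them; per the docstring of the first example, the caller
is expected to wire them up along these lines (editorial sketch):

    # sketch of the caller, inside the tagger class (assumed):
    #     self.predictors, self.char_rnn, self.wembeds, self.cembeds = \
    #         self.build_computation_graph(len(self.w2i), len(self.c2i))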
Example n. 10
class Amt3Tagger(object):
    def __init__(self,
                 in_dim,
                 h_dim,
                 c_in_dim,
                 h_layers,
                 embeds_file=None,
                 activation=dynet.tanh,
                 noise_sigma=0.1,
                 word2id=None,
                 add_hidden=False,
                 trainer="adam",
                 clip_threshold=5.0,
                 learning_rate=0.001,
                 adversarial_domains=None):
        self.w2i = {} if word2id is None else word2id  # word to index mapping
        self.c2i = {}  # char to index mapping
        self.tag2idx = {}  # tag to tag_id mapping
        self.model = dynet.ParameterCollection()  # init model
        # init trainer
        train_algo = TRAINER_MAP[trainer]
        self.trainer = train_algo(self.model, learning_rate)
        if clip_threshold:
            self.trainer.set_clip_threshold(clip_threshold)
        self.in_dim = in_dim
        self.h_dim = h_dim
        self.c_in_dim = c_in_dim
        self.activation = activation
        self.noise_sigma = noise_sigma
        self.h_layers = h_layers
        self.predictors = {
            "inner": [],
            "output_layers_dict": {},
            "task_expected_at": {}
        }  # the inner layers and predictors
        self.wembeds = None  # lookup: embeddings for words
        self.cembeds = None  # lookup: embeddings for characters
        self.embeds_file = embeds_file
        self.char_rnn = None  # RNN for character input
        self.task_ids = ["F0", "F1", "Ft"]
        self.add_hidden = add_hidden
        self.adversarial_domains = adversarial_domains

    def add_adversarial_loss(self, num_domains=2):
        if not self.adversarial_domains:  # make sure the domain count is set here at the latest
            self.adversarial_domains = num_domains
        self.adv_layer = Layer(self.model,
                               2 * self.h_dim,
                               num_domains,
                               activation=dynet.softmax,
                               mlp=self.h_dim if self.add_hidden else 0)

    def pick_neg_log(self, pred, gold):
        if not isinstance(gold, int):
            # calculate cross-entropy loss against the whole vector
            dy_gold = dynet.inputVector(gold)
            return -dynet.sum_elems(dynet.cmult(dy_gold, dynet.log(pred)))
        return -dynet.log(dynet.pick(pred, gold))

    def set_indices(self, w2i, c2i, tag2idx):
        self.tag2idx = tag2idx
        self.w2i = w2i
        self.c2i = c2i

    def fit(self,
            train_dict,
            num_epochs,
            val_X=None,
            val_Y=None,
            patience=2,
            model_path=None,
            seed=None,
            word_dropout_rate=0.25,
            trg_vectors=None,
            unsup_weight=1.0,
            clip_threshold=5.0,
            orthogonality_weight=0.0,
            adversarial=False,
            adversarial_weight=1.0,
            ignore_src_Ft=False):
        """
        train the tagger
        :param trg_vectors: the prediction targets used for the unsupervised loss
                            in temporal ensembling
        :param unsup_weight: weight for the unsupervised consistency loss
                                    used in temporal ensembling
        :param adversarial: note: if we want to use adversarial, we have to
                            call add_adversarial_loss before;
        :param adversarial_weight: 1 by default (do not weigh adv loss)
        :param ignore_src_Ft: if asymm.tri. 2nd stage, do not further train Ft on 'src'
        :param train_dict: a dictionary mapping tasks ("F0", "F1", and "Ft")
                           to a dictionary
                           {"X": list of examples,
                            "Y": list of labels,
                            "domain": list of domain tag (0,1) of example}
        Three tasks are indexed as "F0", "F1" and "Ft"

        Note: if a task 'src' is given, then a single model with three heads
        is trained in which all data is fed to all outputs
        """
        print("read training data")

        widCount = Counter()
        train_data = []
        for task, task_dict in train_dict.items():  #task: eg. "F0"
            for key in ["X", "Y", "domain"]:
                assert key in task_dict, "Error: %s is not available." % key
            examples, labels, domain_tags = task_dict["X"], task_dict[
                "Y"], task_dict["domain"]
            assert len(examples) == len(labels)
            if word_dropout_rate > 0.0:
                # keep track of the counts for word dropout
                for sentence, _ in examples:
                    widCount.update([w for w in sentence])

            # train data is a list of 4-tuples: (example, label, task_id, domain_id)
            train_data += list(
                zip(examples, labels, [task] * len(labels), domain_tags))

        # if we use target vectors, keep track of the targets per sentence
        if trg_vectors is not None:
            trg_start_id = 0
            sentence_trg_vectors = []
            for i, (example, y) in enumerate(train_data):
                sentence_trg_vectors.append(
                    trg_vectors[trg_start_id:trg_start_id +
                                len(example[0]), :])
                trg_start_id += len(example[0])
            assert trg_start_id == len(trg_vectors),\
                'Error: Idx {} is not at {}.'.format(trg_start_id, len(trg_vectors))

        print('Starting training for {} epochs...'.format(num_epochs))
        best_val_acc, epochs_no_improvement = 0., 0
        if val_X is not None and val_Y is not None and model_path is not None:
            print(
                'Using early stopping with patience of {}...'.format(patience))

        if seed:
            random.seed(seed)

        for cur_iter in range(num_epochs):
            bar = Bar('Training epoch {}/{}...'.format(cur_iter + 1,
                                                       num_epochs),
                      max=len(train_data),
                      flush=True)

            random_indices = np.arange(len(train_data))
            random.shuffle(random_indices)

            total_loss, total_tagged, total_constraint, total_adversarial = 0.0, 0.0, 0.0, 0.0
            total_orth_constr = 0  # count how many updates

            # log separate losses
            log_losses = {}
            log_total = {}
            for task_id in self.task_ids:
                log_losses[task_id] = 0.0
                log_total[task_id] = 0

            for i, idx in enumerate(random_indices):
                (word_indices,
                 char_indices), y, task_id, domain_id = train_data[idx]

                if word_dropout_rate > 0.0:
                    word_indices = [
                        self.w2i["_UNK"] if
                        (random.random() >
                         (widCount.get(w) /
                          (word_dropout_rate + widCount.get(w)))) else w
                        for w in word_indices
                    ]

                output, constraint, adv = self.predict(
                    word_indices,
                    char_indices,
                    task_id,
                    train=True,
                    orthogonality_weight=orthogonality_weight,
                    domain_id=domain_id if adversarial else None)

                if task_id not in ['src', 'trg']:

                    if len(y) == 1 and y[0] == 0:
                        # in temporal ensembling, we assign a dummy label of [0] for
                        # unlabeled sequences; we skip the supervised loss for these
                        loss = dynet.scalarInput(0)
                    else:
                        loss = dynet.esum([
                            self.pick_neg_log(pred, gold)
                            for pred, gold in zip(output, y)
                        ])

                    if trg_vectors is not None:
                        # the consistency loss in temporal ensembling is used for
                        # both supervised and unsupervised input
                        targets = sentence_trg_vectors[idx]
                        assert len(output) == len(targets)
                        other_loss = unsup_weight * dynet.average([
                            dynet.squared_distance(o, dynet.inputVector(t))
                            for o, t in zip(output, targets)
                        ])
                        loss += other_loss
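                        # Sketch of the consistency term (notation assumed):
                        # with outputs o_1..o_n and stored targets t_1..t_n,
                        #   other_loss = unsup_weight * (1/n) * sum_i ||o_i - t_i||^2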

                    if orthogonality_weight != 0.0 and task_id != 'Ft':
                        # add the orthogonality constraint to the loss
                        total_constraint += constraint.value() * orthogonality_weight
                        total_orth_constr += 1
                        loss += constraint * orthogonality_weight

                    if adversarial:
                        total_adversarial += adv.value() * adversarial_weight
                        loss += adv * adversarial_weight

                    total_loss += loss.value()  # for output
                    total_tagged += len(word_indices)

                    # log per-example values (not the running totals), so the
                    # per-task averages printed below are meaningful
                    log_losses[task_id] += loss.value()
                    log_total[task_id] += len(word_indices)

                    loss.backward()
                    self.trainer.update()
                    bar.next()
                else:
                    # bootstrap=False: the output contains a list of outputs,
                    # one for each task
                    assert trg_vectors is None, 'temporal ensembling not implemented for bootstrap=False'
                    loss = dynet.scalarInput(0)  # initialize (0, so the constant does not inflate the logged loss)
                    if ignore_src_Ft:
                        # ignore the last output (= Ft) when further training with 'src'
                        output = output[:-1]

                    for t_i, output_t in enumerate(output):
                        # sum the loss of each task; log per-task values
                        # rather than the running totals
                        task_loss = dynet.esum([
                            self.pick_neg_log(pred, gold)
                            for pred, gold in zip(output_t, y)
                        ])
                        loss += task_loss
                        log_losses[self.task_ids[t_i]] += task_loss.value()
                        log_total[self.task_ids[t_i]] += len(word_indices)

                    if orthogonality_weight != 0.0:
                        # add the orthogonality constraint to the loss
                        total_constraint += constraint.value() * orthogonality_weight
                        total_orth_constr += 1
                        loss += constraint * orthogonality_weight

                    if adversarial:
                        total_adversarial += adv.value() * adversarial_weight
                        loss += adv * adversarial_weight

                    total_loss += loss.value()  # for output
                    total_tagged += len(word_indices)

                    loss.backward()
                    self.trainer.update()
                    bar.next()

            if adversarial and orthogonality_weight:
                print(
                    "iter {}. Total loss: {:.3f}, total penalty: {:.3f}, total weighted adv loss: {:.3f}"
                    .format(cur_iter, total_loss / total_tagged,
                            total_constraint / total_orth_constr,
                            total_adversarial / total_tagged),
                    file=sys.stderr)
            elif orthogonality_weight:
                print("iter {}. Total loss: {:.3f}, total penalty: {:.3f}".
                      format(cur_iter, total_loss / total_tagged,
                             total_constraint / total_orth_constr),
                      file=sys.stderr)
            else:
                print("iter {}. Total loss: {:.3f} ".format(
                    cur_iter, total_loss / total_tagged),
                      file=sys.stderr)

            for task_id in self.task_ids:
                if log_total[task_id] > 0:
                    print("{0}: {1:.3f}".format(
                        task_id, log_losses[task_id] / log_total[task_id]))

            if val_X is not None and val_Y is not None and model_path is not None:
                # get the best accuracy on the validation set
                val_correct, val_total = self.evaluate(val_X, val_Y)
                val_accuracy = val_correct / val_total

                if val_accuracy > best_val_acc:
                    print(
                        'Accuracy {:.4f} is better than best val accuracy {:.4f}.'
                        .format(val_accuracy, best_val_acc))
                    best_val_acc = val_accuracy
                    epochs_no_improvement = 0
                    save_tagger(self, model_path)
                else:
                    print(
                        'Accuracy {:.4f} is worse than best val accuracy {:.4f}.'
                        .format(val_accuracy, best_val_acc))
                    epochs_no_improvement += 1
                if epochs_no_improvement == patience:
                    print('No improvement for {} epochs. Early stopping...'.
                          format(epochs_no_improvement))
                    break

    def initialize_graph(self, num_words=None, num_chars=None):
        """
        build graph and link to parameters

        builds three output layers: F0 (main task) plus the auxiliary
        outputs F1 and Ft, which are always created even when unused

        """
        num_words = num_words if num_words is not None else len(self.w2i)
        num_chars = num_chars if num_chars is not None else len(self.c2i)
        if num_words == 0 or num_chars == 0:
            raise ValueError('Word2id and char2id have to be loaded before '
                             'initializing the graph.')
        print('Initializing the graph...')

        # initialize the word embeddings and the parameters
        self.cembeds = None
        if self.embeds_file:
            print("loading embeddings", file=sys.stderr)
            embeddings, emb_dim = load_embeddings_file(self.embeds_file)
            assert (emb_dim == self.in_dim)
            num_words = len(
                set(embeddings.keys()).union(set(
                    self.w2i.keys())))  # initialize all with embeddings
            # init model parameters and initialize them
            self.wembeds = self.model.add_lookup_parameters(
                (num_words, self.in_dim), init=dynet.ConstInitializer(0.01))

            if self.c_in_dim > 0:
                self.cembeds = self.model.add_lookup_parameters(
                    (num_chars, self.c_in_dim),
                    init=dynet.ConstInitializer(0.01))

            init = 0
            for word in embeddings.keys():
                # for those words we have already in w2i, update vector, otherwise add to w2i (since we keep data as integers)
                if word in self.w2i:
                    self.wembeds.init_row(self.w2i[word], embeddings[word])
                else:
                    self.w2i[word] = len(self.w2i)  # add new word
                    self.wembeds.init_row(self.w2i[word], embeddings[word])
                init += 1
            print("initialized: {}".format(init), file=sys.stderr)

        else:
            self.wembeds = self.model.add_lookup_parameters(
                (num_words, self.in_dim), init=dynet.ConstInitializer(0.01))
            if self.c_in_dim > 0:
                self.cembeds = self.model.add_lookup_parameters(
                    (num_chars, self.c_in_dim),
                    init=dynet.ConstInitializer(0.01))

        # make it more flexible to add number of layers as specified by parameter
        layers = []  # inner layers

        for layer_num in range(0, self.h_layers):

            if layer_num == 0:
                if self.c_in_dim > 0:
                    f_builder = dynet.CoupledLSTMBuilder(
                        1, self.in_dim + self.c_in_dim * 2, self.h_dim,
                        self.model)  # in_dim: size of each layer
                    b_builder = dynet.CoupledLSTMBuilder(
                        1, self.in_dim + self.c_in_dim * 2, self.h_dim,
                        self.model)
                else:
                    f_builder = dynet.CoupledLSTMBuilder(
                        1, self.in_dim, self.h_dim, self.model)
                    b_builder = dynet.CoupledLSTMBuilder(
                        1, self.in_dim, self.h_dim, self.model)
                layers.append(BiRNNSequencePredictor(
                    f_builder,
                    b_builder))  #returns forward and backward sequence
            else:
                # add inner layers (if h_layers >1)
                f_builder = dynet.LSTMBuilder(1, self.h_dim, self.h_dim,
                                              self.model)
                b_builder = dynet.LSTMBuilder(1, self.h_dim, self.h_dim,
                                              self.model)
                layers.append(BiRNNSequencePredictor(f_builder, b_builder))

        # store at which layer to predict task
        task_num_labels = len(self.tag2idx)
        output_layers_dict = {}
        output_layers_dict["F0"] = FFSequencePredictor(
            Layer(self.model,
                  self.h_dim * 2,
                  task_num_labels,
                  dynet.softmax,
                  mlp=self.h_dim if self.add_hidden else 0))

        # for simplicity always add additional outputs, even if they are then not used
        output_layers_dict["F1"] = FFSequencePredictor(
            Layer(self.model,
                  self.h_dim * 2,
                  task_num_labels,
                  dynet.softmax,
                  mlp=self.h_dim if self.add_hidden else 0))

        output_layers_dict["Ft"] = FFSequencePredictor(
            Layer(self.model,
                  self.h_dim * 2,
                  task_num_labels,
                  dynet.softmax,
                  mlp=self.h_dim if self.add_hidden else 0))
        if self.c_in_dim > 0:
            self.char_rnn = BiRNNSequencePredictor(
                dynet.CoupledLSTMBuilder(1, self.c_in_dim, self.c_in_dim,
                                         self.model),
                dynet.CoupledLSTMBuilder(1, self.c_in_dim, self.c_in_dim,
                                         self.model))
        else:
            self.char_rnn = None

        self.predictors = dict()
        self.predictors["inner"] = layers
        self.predictors["output_layers_dict"] = output_layers_dict
        self.predictors["task_expected_at"] = self.h_layers

    def get_features(self, words):
        """
        from a list of words, return the word and word char indices
        """
        word_indices = []
        word_char_indices = []
        for word in words:
            if word in self.w2i:
                word_indices.append(self.w2i[word])
            else:
                word_indices.append(self.w2i["_UNK"])

            if self.c_in_dim > 0:
                chars_of_word = [self.c2i["<w>"]]
                for char in word:
                    if char in self.c2i:
                        chars_of_word.append(self.c2i[char])
                    else:
                        chars_of_word.append(self.c2i["_UNK"])
                chars_of_word.append(self.c2i["</w>"])
                word_char_indices.append(chars_of_word)
        return word_indices, word_char_indices
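
    # Illustrative example (hypothetical vocabulary): for words = ["the", "cat"]
    # with "cat" out of vocabulary, get_features returns roughly
    #   word_indices      = [w2i["the"], w2i["_UNK"]]
    #   word_char_indices = [[c2i["<w>"], c2i["t"], c2i["h"], c2i["e"], c2i["</w>"]],
    #                        [c2i["<w>"], c2i["c"], c2i["a"], c2i["t"], c2i["</w>"]]]
    # note that the char indices still spell out the OOV word.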

    def __get_instances_from_file(self, file_name):
        """
        helper function to convert input file to lists of lists holding input words|tags
        """
        data = list(read_conll_file(file_name))
        words = [words for (words, _) in data]
        tags = [tags for (_, tags) in data]
        return words, tags

    def get_data_as_indices(self, file_name):
        """
        X = list of (word_indices, word_char_indices)
        Y = list of tag indices
        """
        words, tags = self.__get_instances_from_file(file_name)
        return self.get_data_as_indices_from_instances(words, tags)

    def get_data_as_indices_from_instances(self, dev_words, dev_tags):
        """
        Extension of get_data_as_indices. Use words and tags rather than a file as input.
        X = list of (word_indices, word_char_indices)
        Y = list of tag indices
        """
        X, Y = [], []
        org_X, org_Y = [], []

        for (words, tags) in zip(dev_words, dev_tags):
            word_indices, word_char_indices = self.get_features(words)
            # if a tag does not exist in the source domain tag set, fall back
            # to the first index outside of the dictionary
            tag_indices = [
                self.tag2idx.get(tag, len(self.tag2idx)) for tag in tags
            ]
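            # e.g. (hypothetical values) with tag2idx = {'O': 0, 'B': 1}, an
            # unseen target-domain tag 'I' is mapped to the out-of-dictionary
            # index 2.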
            X.append((word_indices, word_char_indices))
            Y.append(tag_indices)
            org_X.append(words)
            org_Y.append(tags)
        return X, Y  # , org_X, org_Y - for now don't use

    def predict(self,
                word_indices,
                char_indices,
                task_id,
                train=False,
                soft_labels=False,
                temperature=None,
                orthogonality_weight=0.0,
                domain_id=None):
        """
        predict tags for a sentence represented as char+word embeddings
        :param domain_id: Predict adversarial loss if domain id is provided.
        """
        dynet.renew_cg()  # new graph

        char_emb = []
        rev_char_emb = []

        wfeatures = [self.wembeds[w] for w in word_indices]

        if self.c_in_dim > 0:
            # get representation for words
            for chars_of_token in char_indices:
                char_feats = [self.cembeds[c] for c in chars_of_token]
                # use last state as word representation
                f_char, b_char = self.char_rnn.predict_sequence(
                    char_feats, char_feats)
                last_state = f_char[-1]
                rev_last_state = b_char[-1]
                char_emb.append(last_state)
                rev_char_emb.append(rev_last_state)

            features = [
                dynet.concatenate([w, c, rev_c])
                for w, c, rev_c in zip(wfeatures, char_emb, rev_char_emb)
            ]
        else:
            features = wfeatures

        if train:  # only do at training time
            features = [dynet.noise(fe, self.noise_sigma) for fe in features]

        output_expected_at_layer = self.h_layers - 1

        # go through layers
        prev = features
        prev_rev = features
        num_layers = self.h_layers
        constraint = 0
        adv_loss = 0
        for i in range(0, num_layers):
            predictor = self.predictors["inner"][i]
            forward_sequence, backward_sequence = predictor.predict_sequence(
                prev, prev_rev)
            if i > 0 and self.activation:
                # activation between LSTM layers
                forward_sequence = [
                    self.activation(s) for s in forward_sequence
                ]
                backward_sequence = [
                    self.activation(s) for s in backward_sequence
                ]

            if i == output_expected_at_layer:

                concat_layer = [
                    dynet.concatenate([f, b]) for f, b in zip(
                        forward_sequence, reversed(backward_sequence))
                ]
                if train and self.noise_sigma > 0.0:
                    concat_layer = [
                        dynet.noise(fe, self.noise_sigma)
                        for fe in concat_layer
                    ]

                if task_id not in ["src", "trg"]:
                    output_predictor = self.predictors["output_layers_dict"][
                        task_id]
                    output = output_predictor.predict_sequence(
                        concat_layer,
                        soft_labels=soft_labels,
                        temperature=temperature)
                else:
                    # one src example for all three outputs
                    output = []  # in this case it is a list
                    for t_id in self.task_ids:
                        output_predictor = self.predictors[
                            "output_layers_dict"][t_id]
                        output_t = output_predictor.predict_sequence(
                            concat_layer,
                            soft_labels=soft_labels,
                            temperature=temperature)
                        output.append(output_t)

                if orthogonality_weight != 0 and task_id != "Ft":
                    # put the orthogonality constraint either directly on the
                    # output layer or on the hidden layer if it's an MLP
                    # use orthogonality_weight only between F0 and F1
                    builder = self.predictors["output_layers_dict"][
                        "F0"].network_builder
                    task_param = builder.W_mlp if self.add_hidden else builder.W
                    task_W = dynet.parameter(task_param)

                    builder = self.predictors["output_layers_dict"][
                        "F1"].network_builder
                    other_param = builder.W_mlp if self.add_hidden else builder.W
                    other_task_W = dynet.parameter(other_param)

                    # calculate the matrix product of the task matrix with the other
                    matrix_product_1 = dynet.transpose(task_W) * other_task_W

                    # take the squared Frobenius norm by squaring
                    # every element and then summing them
                    squared_frobenius_norm = dynet.sum_elems(
                        dynet.square(matrix_product_1))
                    constraint = squared_frobenius_norm

                    #print('Constraint with first matrix:', squared_frobenius_norm.value())
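                    # Minimal numeric sketch (illustrative 2x2 matrices):
                    #   W_F0 = [[1, 0], [0, 1]], W_F1 = [[0, 1], [1, 0]]
                    # gives W_F0^T * W_F1 = [[0, 1], [1, 0]] and a squared
                    # Frobenius norm of 0 + 1 + 1 + 0 = 2; the penalty is 0
                    # only when the columns of the two matrices are mutually
                    # orthogonal.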

                if domain_id is not None:
                    # flip the gradient when back-propagating through here
                    adv_input = dynet.flip_gradient(concat_layer[-1])  # last state
                    adv_output = self.adv_layer(adv_input)
                    adv_loss = self.pick_neg_log(adv_output, domain_id)
                    #print('Adversarial loss:', adv_loss.value())

                # output is list if task_id = 'src'
                return output, constraint, adv_loss

            prev = forward_sequence
            prev_rev = backward_sequence

        raise Exception("oops should not be here")
        return None

    def evaluate(self, test_X, test_Y, task_id="F0"):
        """
        compute accuracy on a test file; by default use "F0" as predictor
        """
        correct = 0
        total = 0.0

        for (word_indices, word_char_indices), gold_tag_indices in zip(
                test_X, test_Y):

            output, _, _ = self.predict(word_indices, word_char_indices,
                                        task_id)
            predicted_tag_indices = [np.argmax(o.value()) for o in output]

            correct += sum([
                1 for (predicted,
                       gold) in zip(predicted_tag_indices, gold_tag_indices)
                if predicted == gold
            ])
            total += len(gold_tag_indices)

        return correct, total
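
    # Usage sketch (hypothetical file name and tagger instance):
    #   dev_X, dev_Y = tagger.get_data_as_indices('dev.conll')
    #   correct, total = tagger.evaluate(dev_X, dev_Y)
    #   print('accuracy: {:.4f}'.format(correct / total))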

    def get_predictions(self, test_X, soft_labels=False, task_id="F0"):
        """
        get flat list of predictions
        """
        predictions = []
        for word_indices, word_char_indices in test_X:
            output, _, _ = self.predict(word_indices, word_char_indices,
                                        task_id)
            predictions += [
                o.value() if soft_labels else int(np.argmax(o.value()))
                for o in output
            ]
        return predictions

    def get_train_data_from_instances(self, train_words, train_tags):
        """
        Extension of get_train_data method. Extracts training data from two arrays of word and label lists.
        transform training data to features (word indices)
        map tags to integers
        :param train_words: a numpy array containing lists of words
        :param train_tags: a numpy array containing lists of corresponding tags
        """
        X = []
        Y = []

        # check if we continue training
        continue_training = bool(self.w2i and self.tag2idx)

        if continue_training:
            print("update existing vocabulary")
            # fetch already existing
            w2i = self.w2i.copy()
            c2i = self.c2i.copy()
            tag2idx = self.tag2idx

            assert w2i["_UNK"] == 0, "No _UNK found!"
        else:
            # word 2 indices and tag 2 indices
            w2i = self.w2i.copy()  # get a copy that refers to a different object
            c2i = {}  # char to index
            tag2idx = {}  # tag2idx

            if len(w2i) > 0:
                assert w2i["_UNK"] == 0
            else:
                w2i["_UNK"] = 0  # unk word / OOV

            c2i["_UNK"] = 0  # unk char
            c2i["<w>"] = 1  # word start
            c2i["</w>"] = 2  # word end index

        num_sentences = 0
        num_tokens = 0
        for words, tags in zip(train_words, train_tags):
            instance_word_indices = []  # sequence of word indices
            instance_char_indices = []  # sequence of char indices
            instance_tags_indices = []  # sequence of tag indices

            for word, tag in zip(words, tags):
                # map words and tags to indices
                if word not in w2i:
                    w2i[word] = len(w2i)
                instance_word_indices.append(w2i[word])

                chars_of_word = [c2i["<w>"]]
                for char in word:
                    if char not in c2i:
                        c2i[char] = len(c2i)
                    chars_of_word.append(c2i[char])
                chars_of_word.append(c2i["</w>"])
                instance_char_indices.append(chars_of_word)

                if tag not in tag2idx:
                    tag2idx[tag] = len(tag2idx)

                instance_tags_indices.append(tag2idx[tag])

                num_tokens += 1

            num_sentences += 1

            X.append(
                (instance_word_indices, instance_char_indices)
            )  # list of word indices, for every word list of char indices
            Y.append(instance_tags_indices)

        print("%s sentences %s tokens" % (num_sentences, num_tokens),
              file=sys.stderr)
        print("%s w features, %s c features " % (len(w2i), len(c2i)),
              file=sys.stderr)

        assert (len(X) == len(Y))

        # store mappings of words and tags to indices
        self.set_indices(w2i, c2i, tag2idx)

        return X, Y
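
    # Illustrative result (hypothetical single sentence ["a", "b"] tagged
    # ["O", "B"], starting from empty vocabularies): X == [([1, 2],
    # [[1, 3, 2], [1, 4, 2]])] and Y == [[0, 1]], where 1/2 are the
    # <w>/</w> markers and 3/4 the indices of chars "a"/"b".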

    def get_train_data(self, train_data):
        """
        transform training data to features (word indices)
        map tags to integers
        """
        train_words, train_tags = self.__get_instances_from_file(train_data)
        return self.get_train_data_from_instances(train_words, train_tags)

    def get_predictions_output(self,
                               test_X,
                               test_labels,
                               output_filename,
                               task_id="F0"):
        """
        get predictions to output to file
        assume test_labels are not indices (as target domain can have tags that are not in source)
        text_X: indices
        test_labels: original labels
        """
        i2w = {i: w for w, i in self.w2i.items()}
        i2t = {i: t for t, i in self.tag2idx.items()}

        with open(output_filename, "w") as OUT:
            for (word_indices,
                 word_char_indices), gold_tags in zip(test_X, test_labels):
                output, _, _ = self.predict(word_indices, word_char_indices,
                                            task_id)
                predicted_tag_ids = [int(np.argmax(o.value())) for o in output]

                for word_id, tag_id, gold_tag in zip(word_indices,
                                                     predicted_tag_ids,
                                                     gold_tags):
                    known_tag_prefix = "{}" if gold_tag in self.tag2idx else "*{}"
                    word = i2w[word_id]
                    pred_tag = i2t[tag_id]
                    gold_tag = known_tag_prefix.format(gold_tag)
                    OUT.write("{}\t{}\t{}\n".format(word, gold_tag, pred_tag))
                OUT.write("\n")
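
    # Example output line (illustrative): "cat\t*XY\tNN" -- a leading '*' on
    # the gold tag marks a tag unknown to the source tag set.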