def build_computation_graph(self, num_words, num_chars):
    """ build graph and link to parameters """
    # initialize the word embeddings and the parameters
    cembeds = None
    if self.embeds_file:
        print("loading embeddings", file=sys.stderr)
        embeddings, emb_dim = load_embeddings_file(self.embeds_file)
        assert emb_dim == self.in_dim
        # initialize all with embeddings
        num_words = len(set(embeddings.keys()).union(set(self.w2i.keys())))
        # init model parameters and initialize them
        wembeds = self.model.add_lookup_parameters((num_words, self.in_dim),
                                                   init=dynet.ConstInitializer(0.01))
        if self.c_in_dim > 0:
            cembeds = self.model.add_lookup_parameters((num_chars, self.c_in_dim),
                                                       init=dynet.ConstInitializer(0.01))
        init = 0
        l = len(embeddings.keys())
        for word in embeddings.keys():
            # for words already in w2i, update the vector; otherwise add the word
            # to w2i (since we keep data as integers)
            if word in self.w2i:
                wembeds.init_row(self.w2i[word], embeddings[word])
            else:
                self.w2i[word] = len(self.w2i.keys())  # add new word
                wembeds.init_row(self.w2i[word], embeddings[word])
            init += 1
        print("initialized: {}".format(init), file=sys.stderr)
    else:
        wembeds = self.model.add_lookup_parameters((num_words, self.in_dim),
                                                   init=dynet.ConstInitializer(0.01))
        if self.c_in_dim > 0:
            cembeds = self.model.add_lookup_parameters((num_chars, self.c_in_dim),
                                                       init=dynet.ConstInitializer(0.01))

    # make it more flexible to add number of layers as specified by parameter
    layers = []  # inner layers
    for layer_num in range(0, self.h_layers):
        if layer_num == 0:
            if self.c_in_dim > 0:
                # in_dim: size of each layer
                f_builder = dynet.CoupledLSTMBuilder(1, self.in_dim + self.c_in_dim * 2, self.h_dim, self.model)
                b_builder = dynet.CoupledLSTMBuilder(1, self.in_dim + self.c_in_dim * 2, self.h_dim, self.model)
            else:
                f_builder = dynet.CoupledLSTMBuilder(1, self.in_dim, self.h_dim, self.model)
                b_builder = dynet.CoupledLSTMBuilder(1, self.in_dim, self.h_dim, self.model)
            layers.append(BiRNNSequencePredictor(f_builder, b_builder))  # returns forward and backward sequence
        else:  # add inner layers (if h_layers > 1)
            f_builder = dynet.LSTMBuilder(1, self.h_dim, self.h_dim, self.model)
            b_builder = dynet.LSTMBuilder(1, self.h_dim, self.h_dim, self.model)
            layers.append(BiRNNSequencePredictor(f_builder, b_builder))

    # store at which layer to predict the task
    task_num_labels = len(self.tag2idx)
    output_layer = FFSequencePredictor(Layer(self.model, self.h_dim * 2, task_num_labels, dynet.softmax))

    if self.c_in_dim > 0:
        char_rnn = BiRNNSequencePredictor(
            dynet.CoupledLSTMBuilder(1, self.c_in_dim, self.c_in_dim, self.model),
            dynet.CoupledLSTMBuilder(1, self.c_in_dim, self.c_in_dim, self.model))
    else:
        char_rnn = None

    predictors = {}
    predictors["inner"] = layers
    predictors["output_layers_dict"] = output_layer
    predictors["task_expected_at"] = self.h_layers

    return predictors, char_rnn, wembeds, cembeds
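# Illustration: a standalone sketch of the pre-trained-embedding initialisation
# step performed above, assuming `embeddings` is a dict mapping word -> 1-D
# vector (the shape of data consumed by wembeds.init_row). Words already in
# w2i keep their index and have their row overwritten; unseen words are
# appended to w2i first. The helper name below is hypothetical.
import numpy as np

def init_rows_from_pretrained(w2i, embeddings, emb_dim):
    rows = np.zeros((len(set(embeddings) | set(w2i)), emb_dim))
    for word, vec in embeddings.items():
        if word not in w2i:
            w2i[word] = len(w2i)   # extend the vocabulary with the new word
        rows[w2i[word]] = vec      # initialise this row from the pre-trained vector
    return rows
# each returned row i would then be passed on via wembeds.init_row(i, rows[i])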
def build_computation_graph(self, num_words, num_chars):
    """ build graph and link to parameters """
    # initialize the word embeddings and the parameters
    if self.embeds_file:
        print("loading embeddings", file=sys.stderr)
        embeddings, emb_dim = load_embeddings_file(self.embeds_file, lower=self.lower)
        assert emb_dim == self.in_dim
        # initialize all with embeddings
        num_words = len(set(embeddings.keys()).union(set(self.w2i.keys())))
        # init model parameters and initialize them
        wembeds = self.model.add_lookup_parameters((num_words, self.in_dim))
        cembeds = self.model.add_lookup_parameters((num_chars, self.c_in_dim))
        init = 0
        l = len(embeddings.keys())
        for word in embeddings.keys():
            # for words already in w2i, update the vector; otherwise add the word
            # to w2i (since we keep data as integers)
            if word in self.w2i:
                wembeds.init_row(self.w2i[word], embeddings[word])
            else:
                self.w2i[word] = len(self.w2i.keys())  # add new word
                wembeds.init_row(self.w2i[word], embeddings[word])
            init += 1
        print("initialized: {}".format(init), file=sys.stderr)
    else:
        wembeds = self.model.add_lookup_parameters((num_words, self.in_dim))
        cembeds = self.model.add_lookup_parameters((num_chars, self.c_in_dim))

    # make it more flexible to add number of layers as specified by parameter
    layers = []  # inner layers
    output_layers_dict = {}  # from task_id to actual softmax predictor
    task_expected_at = {}  # map task_id => output_layer_#

    # connect output layers to tasks
    for output_layer, task_id in zip(self.pred_layer, self.tasks_ids):
        if output_layer > self.h_layers:
            raise ValueError("cannot have a task at a layer which is beyond the model, increase h_layers")
        task_expected_at[task_id] = output_layer
    print("task expected at", task_expected_at, file=sys.stderr)

    nb_tasks = len(self.tasks_ids)
    print("h_layers:", self.h_layers, file=sys.stderr)

    for layer_num in range(0, self.h_layers):
        print(">>>", layer_num, "layer_num")
        if layer_num == 0:
            # in_dim: size of each layer
            builder = dynet.LSTMBuilder(1, self.in_dim + self.c_in_dim * 2, self.h_dim, self.model)
            layers.append(BiRNNSequencePredictor(builder))  # returns forward and backward sequence
        else:  # add inner layers (if h_layers > 1)
            builder = dynet.LSTMBuilder(1, self.h_dim, self.h_dim, self.model)
            layers.append(BiRNNSequencePredictor(builder))

    # store at which layer to predict each task
    for task_id in self.tasks_ids:
        task_num_labels = len(self.task2tag2idx[task_id])
        output_layers_dict[task_id] = FFSequencePredictor(
            Layer(self.model, self.h_dim * 2, task_num_labels, dynet.softmax))
    sys.stderr.write('#\nOutput layers' + str(len(output_layers_dict)) + '\n')

    char_rnn = RNNSequencePredictor(dynet.LSTMBuilder(1, self.c_in_dim, self.c_in_dim, self.model))

    predictors = {}
    predictors["inner"] = layers
    predictors["output_layers_dict"] = output_layers_dict
    predictors["task_expected_at"] = task_expected_at

    return predictors, char_rnn, wembeds, cembeds
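# Illustration: the multi-task variant above wires each task id to the index of
# the stacked BiLSTM layer whose states feed its output layer. A minimal sketch
# of that mapping logic in isolation (function name and example values are
# hypothetical; the logic mirrors the loop over pred_layer/tasks_ids above):
def map_tasks_to_layers(tasks_ids, pred_layer, h_layers):
    task_expected_at = {}
    for output_layer, task_id in zip(pred_layer, tasks_ids):
        if output_layer > h_layers:
            raise ValueError("cannot have a task at a layer which is beyond the model")
        task_expected_at[task_id] = output_layer
    return task_expected_at

# map_tasks_to_layers(["task0", "task1"], [1, 2], h_layers=2)
# -> {"task0": 1, "task1": 2}: task0 is predicted from the first BiLSTM layer,
#    task1 from the second.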
class NNTagger(object): # turn dynamic allocation off by defining slots __slots__ = ['w2i', 'c2i', 'wcount', 'ccount','wtotal','ctotal','w2c_cache','w_dropout_rate','c_dropout_rate', 'task2tag2idx', 'model', 'in_dim', 'c_in_dim', 'c_h_dim','h_dim', 'activation', 'noise_sigma', 'pred_layer', 'mlp', 'activation_mlp', 'backprob_embeds', 'initializer', 'h_layers', 'predictors', 'wembeds', 'cembeds', 'embeds_file', 'char_rnn', 'trainer', 'builder', 'crf', 'viterbi_loss', 'mimickx_model_path', 'mimickx_model', 'dictionary', 'dictionary_values', 'path_to_dictionary', 'lex_dim', 'type_constraint', 'embed_lex', 'l2i', 'lembeds'] def __init__(self,in_dim,h_dim,c_in_dim,c_h_dim,h_layers,pred_layer,learning_algo="sgd", learning_rate=0, embeds_file=None,activation=ACTIVATION_MAP["tanh"],mlp=0,activation_mlp=ACTIVATION_MAP["rectify"], backprob_embeds=True,noise_sigma=0.1, w_dropout_rate=0.25, c_dropout_rate=0.25, initializer=INITIALIZER_MAP["glorot"], builder=BUILDERS["lstmc"], crf=False, viterbi_loss=False, mimickx_model_path=None, dictionary=None, type_constraint=False, lex_dim=0, embed_lex=False): self.w2i = {} # word to index mapping self.c2i = {} # char to index mapping self.w2c_cache = {} # word to char index cache for frequent words self.wcount = None # word count self.ccount = None # char count self.task2tag2idx = {} # need one dictionary per task self.pred_layer = [int(layer) for layer in pred_layer] # at which layer to predict each task self.model = dynet.ParameterCollection() #init model self.in_dim = in_dim self.h_dim = h_dim self.c_in_dim = c_in_dim self.c_h_dim = c_h_dim self.w_dropout_rate = w_dropout_rate self.c_dropout_rate = c_dropout_rate self.activation = activation self.mlp = mlp self.activation_mlp = activation_mlp self.noise_sigma = noise_sigma self.h_layers = h_layers self.predictors = {"inner": [], "output_layers_dict": {}, "task_expected_at": {} } # the inner layers and predictors self.wembeds = None # lookup: embeddings for words self.cembeds = None # lookup: embeddings for characters self.lembeds = None # lookup: embeddings for lexical features (optional) self.embeds_file = embeds_file trainer_algo = TRAINER_MAP[learning_algo] if learning_rate > 0: ### TODO: better handling of additional learning-specific parameters self.trainer = trainer_algo(self.model, learning_rate=learning_rate) else: # using default learning rate self.trainer = trainer_algo(self.model) self.backprob_embeds = backprob_embeds self.initializer = initializer self.char_rnn = None # biRNN for character input self.builder = builder # default biRNN is an LSTM self.crf = crf self.viterbi_loss = viterbi_loss self.mimickx_model_path = mimickx_model_path if mimickx_model_path: # load self.mimickx_model = load_model(mimickx_model_path) self.dictionary = None self.type_constraint = type_constraint self.embed_lex = False self.l2i = {UNK: 0} # lex feature to index mapping if dictionary: self.dictionary, self.dictionary_values = load_dict(dictionary) self.path_to_dictionary = dictionary if type_constraint: self.lex_dim = 0 else: if embed_lex: self.lex_dim = lex_dim self.embed_lex = True print("Embed lexical features") # register property indices for prop in self.dictionary_values: self.l2i[prop] = len(self.l2i) else: self.lex_dim = len(self.dictionary_values) #n-hot encoding print("Lex_dim: {}".format(self.lex_dim), file=sys.stderr) else: self.dictionary = None self.path_to_dictionary = None self.lex_dim = 0 def fit(self, train, num_iterations, dev=None, model_path=None, patience=0, minibatch_size=0, log_losses=False): 
""" train the tagger """ losses_log = {} # log losses print("init parameters") self.init_parameters(train) # init lookup parameters and define graph print("build graph") self.build_computation_graph(len(self.w2i), len(self.c2i)) update_embeds = True if self.backprob_embeds == False: ## disable backprob into embeds print(">>> disable wembeds update <<<") update_embeds = False best_val_acc, epochs_no_improvement = 0.0, 0 if dev and model_path is not None and patience > 0: print('Using early stopping with patience of {}...'.format(patience)) batch = [] print("train..") for iteration in range(num_iterations): total_loss=0.0 total_tagged=0.0 indices = [i for i in range(len(train.seqs))] random.shuffle(indices) loss_accum_loss = defaultdict(float) loss_accum_tagged = defaultdict(float) for idx in indices: seq = train.seqs[idx] if seq.task_id not in losses_log: losses_log[seq.task_id] = [] #initialize if minibatch_size > 1: # accumulate instances for minibatch update loss1 = self.predict(seq, train=True, update_embeds=update_embeds) total_tagged += len(seq.words) batch.append(loss1) if len(batch) == minibatch_size: loss = dynet.esum(batch) total_loss += loss.value() # logging loss_accum_tagged[seq.task_id] += len(seq.words) loss_accum_loss[seq.task_id] += loss.value() loss.backward() self.trainer.update() dynet.renew_cg() # use new computational graph for each BATCH when batching is active batch = [] else: dynet.renew_cg() # new graph per item loss1 = self.predict(seq, train=True, update_embeds=update_embeds) total_tagged += len(seq.words) lv = loss1.value() total_loss += lv # logging loss_accum_tagged[seq.task_id] += len(seq.words) loss_accum_loss[seq.task_id] += loss1.value() loss1.backward() self.trainer.update() print("iter {2} {0:>12}: {1:.2f}".format("total loss", total_loss/total_tagged, iteration)) # log losses for task_id in sorted(losses_log): losses_log[task_id].append(loss_accum_loss[task_id] / loss_accum_tagged[task_id]) if log_losses: dill.dump(losses_log, open(model_path + ".model" + ".losses.pickle", "wb")) if dev: # evaluate after every epoch correct, total = self.evaluate(dev, "task0") val_accuracy = correct/total print("dev accuracy: {0:.4f}".format(val_accuracy)) if val_accuracy > best_val_acc: print('Accuracy {0:.4f} is better than best val accuracy ' '{1:.4f}.'.format(val_accuracy, best_val_acc)) best_val_acc = val_accuracy epochs_no_improvement = 0 save(self, model_path) else: print('Accuracy {0:.4f} is worse than best val loss {1:.4f}.'.format(val_accuracy, best_val_acc)) epochs_no_improvement += 1 if patience > 0: if epochs_no_improvement == patience: print('No improvement for {} epochs. 
Early stopping...'.format(epochs_no_improvement)) break def set_indices(self, w2i, c2i, task2t2i, w2c_cache, l2i=None): """ helper function for loading model""" for task_id in task2t2i: self.task2tag2idx[task_id] = task2t2i[task_id] self.w2i = w2i self.c2i = c2i self.w2c_cache = w2c_cache self.l2i = l2i def set_counts(self, wcount, wtotal, ccount, ctotal): """ helper function for loading model""" self.wcount = wcount self.wtotal = wtotal self.ccount = ccount self.ctotal = ctotal def build_computation_graph(self, num_words, num_chars): """ build graph and link to parameters self.predictors, self.char_rnn, self.wembeds, self.cembeds = """ ## initialize word embeddings if self.embeds_file: print("loading embeddings") embeddings, emb_dim = load_embeddings_file(self.embeds_file) assert(emb_dim==self.in_dim) num_words=len(set(embeddings.keys()).union(set(self.w2i.keys()))) # initialize all with embeddings # init model parameters and initialize them self.wembeds = self.model.add_lookup_parameters((num_words, self.in_dim), init=self.initializer) init=0 for word in embeddings.keys(): if word not in self.w2i: self.w2i[word]=len(self.w2i.keys()) # add new word self.wembeds.init_row(self.w2i[word], embeddings[word]) init +=1 elif word in embeddings: self.wembeds.init_row(self.w2i[word], embeddings[word]) init += 1 print("initialized: {}".format(init)) del embeddings # clean up else: self.wembeds = self.model.add_lookup_parameters((num_words, self.in_dim), init=self.initializer) ## initialize character embeddings self.cembeds = None if self.c_in_dim > 0: self.cembeds = self.model.add_lookup_parameters((num_chars, self.c_in_dim), init=self.initializer) if self.lex_dim > 0 and self.embed_lex: # +1 for UNK property self.lembeds = self.model.add_lookup_parameters((len(self.dictionary_values)+1, self.lex_dim), init=dynet.GlorotInitializer()) #init=self.initializer) # make it more flexible to add number of layers as specified by parameter layers = [] # inner layers output_layers_dict = {} # from task_id to actual softmax predictor for layer_num in range(0,self.h_layers): if layer_num == 0: if self.c_in_dim > 0: # in_dim: size of each layer if self.lex_dim > 0 and self.embed_lex: lex_embed_size = self.lex_dim * len(self.dictionary_values) f_builder = self.builder(1, self.in_dim+self.c_h_dim*2+lex_embed_size, self.h_dim, self.model) b_builder = self.builder(1, self.in_dim+self.c_h_dim*2+lex_embed_size, self.h_dim, self.model) else: f_builder = self.builder(1, self.in_dim + self.c_h_dim * 2 + self.lex_dim, self.h_dim, self.model) b_builder = self.builder(1, self.in_dim + self.c_h_dim * 2 + self.lex_dim, self.h_dim, self.model) else: f_builder = self.builder(1, self.in_dim+self.lex_dim, self.h_dim, self.model) b_builder = self.builder(1, self.in_dim+self.lex_dim, self.h_dim, self.model) layers.append(BiRNNSequencePredictor(f_builder, b_builder)) #returns forward and backward sequence else: # add inner layers (if h_layers >1) f_builder = self.builder(1, self.h_dim, self.h_dim, self.model) b_builder = self.builder(1, self.h_dim, self.h_dim, self.model) layers.append(BiRNNSequencePredictor(f_builder, b_builder)) # store at which layer to predict task task2layer = {task_id: out_layer for task_id, out_layer in zip(self.task2tag2idx, self.pred_layer)} if len(task2layer) > 1: print("task2layer", task2layer) for task_id in task2layer: task_num_labels= len(self.task2tag2idx[task_id]) if not self.crf: output_layers_dict[task_id] = FFSequencePredictor(self.task2tag2idx[task_id], Layer(self.model, self.h_dim*2, 
task_num_labels, dynet.softmax, mlp=self.mlp, mlp_activation=self.activation_mlp)) else: print("CRF") output_layers_dict[task_id] = CRFSequencePredictor(self.model, task_num_labels, self.task2tag2idx[task_id], Layer(self.model, self.h_dim * 2, task_num_labels, None, mlp=self.mlp, mlp_activation=self.activation_mlp), viterbi_loss=self.viterbi_loss) self.char_rnn = BiRNNSequencePredictor(self.builder(1, self.c_in_dim, self.c_h_dim, self.model), self.builder(1, self.c_in_dim, self.c_h_dim, self.model)) self.predictors = {} self.predictors["inner"] = layers self.predictors["output_layers_dict"] = output_layers_dict self.predictors["task_expected_at"] = task2layer def get_features(self, words, train=False, update=True): """ get feature representations """ # word embeddings wfeatures = np.array([self.get_w_repr(word, train=train, update=update) for word in words]) lex_features = [] if self.dictionary and not self.type_constraint: ## add lexicon features lex_features = np.array([self.get_lex_repr(word) for word in words]) # char embeddings if self.c_in_dim > 0: cfeatures = [self.get_c_repr(word, train=train) for word in words] if len(lex_features) > 0: lex_features = dynet.inputTensor(lex_features) features = [dynet.concatenate([w,c,l]) for w,c,l in zip(wfeatures,cfeatures,lex_features)] else: features = [dynet.concatenate([w, c]) for w, c in zip(wfeatures, cfeatures)] else: features = wfeatures if train: # only do at training time features = [dynet.noise(fe,self.noise_sigma) for fe in features] return features def predict(self, seq, train=False, output_confidences=False, unk_tag=None, update_embeds=True): """ predict tags for a sentence represented as char+word embeddings and compute losses for this instance """ if not train: dynet.renew_cg() features = self.get_features(seq.words, train=train, update=update_embeds) output_expected_at_layer = self.predictors["task_expected_at"][seq.task_id] output_expected_at_layer -=1 # go through layers # input is now combination of w + char emb prev = features prev_rev = features num_layers = self.h_layers for i in range(0,num_layers): predictor = self.predictors["inner"][i] forward_sequence, backward_sequence = predictor.predict_sequence(prev, prev_rev) if i > 0 and self.activation: # activation between LSTM layers forward_sequence = [self.activation(s) for s in forward_sequence] backward_sequence = [self.activation(s) for s in backward_sequence] if i == output_expected_at_layer: output_predictor = self.predictors["output_layers_dict"][seq.task_id] concat_layer = [dynet.concatenate([f, b]) for f, b in zip(forward_sequence,reversed(backward_sequence))] if train and self.noise_sigma > 0.0: concat_layer = [dynet.noise(fe,self.noise_sigma) for fe in concat_layer] # fill-in predictions and get loss per tag losses = output_predictor.predict_sequence(seq, concat_layer, train=train, output_confidences=output_confidences, unk_tag=unk_tag, dictionary=self.dictionary, type_constraint=self.type_constraint) prev = forward_sequence prev_rev = backward_sequence if train: # return losses return losses else: return seq.pred_tags, seq.tag_confidences def output_preds(self, seq, raw=False, output_confidences=False): """ output predictions to a file """ i = 0 for w, g, p in zip(seq.words, seq.tags, seq.pred_tags): if raw: if output_confidences: print(u"{0}\t{1}\t{2:.2f}".format(w, p, seq.tag_confidences[i])) else: print(u"{}\t{}".format(w, p)) # do not print DUMMY tag when --raw is on else: if output_confidences: print(u"{0}\t{1}\t{2}\t{3:.2f}".format(w, g, p, 
seq.tag_confidences[i])) else: print(u"{}\t{}\t{}".format(w, g, p)) i += 1 print("") def evaluate(self, test_file, task_id, output_predictions=None, raw=False, output_confidences=False, unk_tag=None): """ compute accuracy on a test file, optionally output to file """ correct = 0 total = 0 for seq in test_file: if seq.task_id != task_id: continue # we evaluate only on a specific task self.predict(seq, output_confidences=output_confidences, unk_tag=unk_tag) if output_predictions: self.output_preds(seq, raw=raw, output_confidences=output_confidences) correct_inst, total_inst = seq.evaluate() correct+=correct_inst total+= total_inst return correct, total def get_w_repr(self, word, train=False, update=True): """ Get representation of word (word embedding) """ if train: if self.w_dropout_rate > 0.0: w_id = self.w2i[UNK] if drop(word, self.wcount, self.w_dropout_rate) else self.w2i.get(word, self.w2i[UNK]) else: if self.mimickx_model_path: # if given use MIMICKX if word not in self.w2i: # #print("predict with MIMICKX for: ", word) return dynet.inputVector(self.mimickx_model.predict(word).npvalue()) w_id = self.w2i.get(word, self.w2i[UNK]) if not update: return dynet.nobackprop(self.wembeds[w_id]) else: return self.wembeds[w_id] def get_c_repr(self, word, train=False): """ Get representation of word via characters sub-LSTMs """ # get representation for words if word in self.w2c_cache: chars_of_token = self.w2c_cache[word] if train: chars_of_token = [drop(c, self.ccount, self.c_dropout_rate) for c in chars_of_token] else: chars_of_token = array.array('I',[self.c2i[WORD_START]]) + array.array('I',[self.get_c_idx(c, train=train) for c in word]) + array.array('I',[self.c2i[WORD_END]]) char_feats = [self.cembeds[c_id] for c_id in chars_of_token] # use last state as word representation f_char, b_char = self.char_rnn.predict_sequence(char_feats, char_feats) return dynet.concatenate([f_char[-1], b_char[-1]]) def get_c_idx(self, c, train=False): """ helper function to get index of character""" if self.c_dropout_rate > 0.0 and train and drop(c, self.ccount, self.c_dropout_rate): return self.c2i.get(UNK) else: return self.c2i.get(c, self.c2i[UNK]) def get_lex_repr(self, word): """ Get representation for lexical feature """ if not self.embed_lex: ## n-hot representation n_hot = np.zeros(len(self.dictionary_values)) values = is_in_dict(word, self.dictionary) if values: for v in values: n_hot[self.dictionary_values.index(v)] = 1.0 return n_hot else: lex_feats = [] for property in self.dictionary_values: values = is_in_dict(word, self.dictionary) if values: if property in values: lex_feats.append(self.lembeds[self.l2i[property]].npvalue()) else: lex_feats.append(self.lembeds[self.l2i[UNK]].npvalue()) else: lex_feats.append(self.lembeds[self.l2i[UNK]].npvalue()) # unknown word return np.concatenate(lex_feats) def init_parameters(self, train_data): """init parameters from training data""" # word 2 indices and tag 2 indices self.w2i = {} # word to index self.c2i = {} # char to index self.task2tag2idx = {} # id of the task -> tag2idx self.w2i[UNK] = 0 # unk word / OOV self.c2i[UNK] = 0 # unk char self.c2i[WORD_START] = 1 # word start self.c2i[WORD_END] = 2 # word end index # word and char counters self.wcount = Counter() self.ccount = Counter() for seq in train_data: self.wcount.update([w for w in seq.words]) self.ccount.update([c for w in seq.words for c in w]) if seq.task_id not in self.task2tag2idx: self.task2tag2idx[seq.task_id] = {"<START>": START_TAG, "<END>": END_TAG} # record words and chars for word, tag in 
zip(seq.words, seq.tags): if word not in self.w2i: self.w2i[word] = len(self.w2i) if self.c_in_dim > 0: for char in word: if char not in self.c2i: self.c2i[char] = len(self.c2i) if tag not in self.task2tag2idx[seq.task_id]: self.task2tag2idx[seq.task_id][tag] = len(self.task2tag2idx[seq.task_id]) n = int(len(self.w2i) * 0.3) # top 30% print("Caching top {} words".format(n)) for word in self.wcount.most_common(n): self.w2c_cache[word] = array.array('I', [self.c2i[WORD_START]]) + array.array('I', [self.get_c_idx(c) for c in word]) + array.array('I', [self.c2i[WORD_END]]) # get total counts self.wtotal = np.sum([self.wcount[w] for w in self.wcount]) self.ctotal = np.sum([self.ccount[c] for c in self.ccount]) print("{} w features, {} c features".format(len(self.w2i), len(self.c2i))) #print(self.wtotal, self.ctotal) def save_embeds(self, out_filename): """ save final embeddings to file :param out_filename: filename """ # construct reverse mapping i2w = {self.w2i[w]: w for w in self.w2i.keys()} OUT = open(out_filename+".w.emb","w") for word_id in i2w.keys(): wembeds_expression = self.wembeds[word_id] word = i2w[word_id] OUT.write("{} {}\n".format(word," ".join([str(x) for x in wembeds_expression.npvalue()]))) OUT.close() def save_lex_embeds(self, out_filename): """ save final embeddings to file :param out_filename: filename """ # construct reverse mapping i2l = {self.l2i[w]: w for w in self.l2i.keys()} OUT = open(out_filename+".l.emb","w") for lex_id in i2l.keys(): lembeds_expression = self.lembeds[lex_id] lex = i2l[lex_id] OUT.write("{} {}\n".format(lex," ".join([str(x) for x in lembeds_expression.npvalue()]))) OUT.close() def save_cw_embeds(self, out_filename): """ save final character-based word-embeddings to file :param out_filename: filename """ # construct reverse mapping using word embeddings i2cw = {self.w2i[w]: w for w in self.w2i.keys()} OUT = open(out_filename+".cw.emb","w") for word_id in i2cw.keys(): word = i2cw[word_id] cwembeds = [v.npvalue()[0] for v in self.get_c_repr(word)] OUT.write("{} {}\n".format(word," ".join([str(x) for x in cwembeds]))) OUT.close() def save_wordlex_map(self, out_filename): """ save final word-to-lexicon-embedding map to file :param out_filename: filename """ # construct reverse mapping using word embeddings i2wl = {self.w2i[w]: w for w in self.w2i.keys()} OUT = open(out_filename+".wlmap.emb","w") for word_id in i2wl.keys(): word = i2wl[word_id] lex_feats = [] for property in self.dictionary_values: values = is_in_dict(word, self.dictionary) if values: if property in values: lex_feats.append(property) else: lex_feats.append(UNK) else: lex_feats.append(UNK) # unknown word OUT.write("{} {}\n".format(word," ".join([str(x) for x in lex_feats]))) OUT.close() def save_transition_matrix(self, out_filename): """ save transition matrix :param out_filename: filename """ for task_id in self.predictors["output_layers_dict"].keys(): output_predictor = self.predictors["output_layers_dict"][task_id] output_predictor.save_parameters(out_filename)
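# Illustration: a minimal usage sketch for NNTagger, kept as comments because
# the corpus loader is hypothetical. fit()/evaluate() expect corpus objects
# exposing .seqs, where each sequence carries .words, .tags and .task_id.
#
#   tagger = NNTagger(in_dim=64, h_dim=100, c_in_dim=100, c_h_dim=100,
#                     h_layers=1, pred_layer=[1], learning_algo="sgd")
#   train_data = load_corpus("train.conll")   # hypothetical loader
#   dev_data = load_corpus("dev.conll")       # hypothetical loader
#   tagger.fit(train_data, num_iterations=20, dev=dev_data,
#              model_path="model_out", patience=3)
#   correct, total = tagger.evaluate(dev_data, "task0")
#   print("dev accuracy: {0:.4f}".format(correct / total))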
class SimpleBiltyTagger(object): # turn dynamic allocation off by defining slots __slots__ = [ 'w2i', 'c2i', 'tag2idx', 'model', 'in_dim', 'c_in_dim', 'h_dim', 'activation', 'noise_sigma', 'h_layers', 'predictors', 'wembeds', 'cembeds', 'embeds_file', 'char_rnn', 'trainer' ] def __init__(self, in_dim, h_dim, c_in_dim, h_layers, embeds_file=None, activation=dynet.tanh, noise_sigma=0.1, word2id=None, trainer="adam", clip_threshold=5.0, learning_rate=0.001): # use default Adam learning rate - TODO: support other optimizer-specific options self.w2i = {} if word2id is None else word2id # word to index mapping self.c2i = {} # char to index mapping self.tag2idx = {} # tag to tag_id mapping self.model = dynet.ParameterCollection() #init model # init trainer train_algo = TRAINER_MAP[trainer] self.trainer = train_algo(self.model, learning_rate) if clip_threshold: self.trainer.set_clip_threshold(clip_threshold) self.in_dim = in_dim self.h_dim = h_dim self.c_in_dim = c_in_dim self.activation = activation self.noise_sigma = noise_sigma self.h_layers = h_layers self.predictors = { "inner": [], "output_layers_dict": {}, "task_expected_at": {} } # the inner layers and predictors self.wembeds = None # lookup: embeddings for words self.cembeds = None # lookup: embeddings for characters self.embeds_file = embeds_file self.char_rnn = None # RNN for character input def pick_neg_log(self, pred, gold): if hasattr(gold, "__len__"): # calculate cross-entropy loss against the whole vector dy_gold = dynet.inputVector(gold) return -dynet.sum_elems(dynet.cmult(dy_gold, dynet.log(pred))) return -dynet.log(dynet.pick(pred, gold)) def set_indices(self, w2i, c2i, tag2idx): self.tag2idx = tag2idx self.w2i = w2i self.c2i = c2i def cosine(self, e1, e2): return dynet.cdiv( dynet.dot_product(e1, e2), (dynet.cmult(dynet.squared_norm(e1), dynet.squared_norm(e2)))) def fit(self, train_X, train_Y, num_epochs, val_X=None, val_Y=None, patience=2, model_path=None, seed=None, word_dropout_rate=0.25, trg_vectors=None, unsup_weight=1.0, variance_weights=None, labeled_weight_proportion=1.0): """ train the tagger :param trg_vectors: the prediction targets used for the unsupervised loss in temporal ensembling :param unsup_weight: weight for the unsupervised consistency loss used in temporal ensembling :param clip_threshold: use gradient clipping with threshold (on if >0; default: 5.0) :param labeled_weight_proportion: proportion of the unsupervised weight that should be assigned to labeled examples """ print("read training data", file=sys.stderr) if variance_weights is not None: print('First 20 variance weights:', variance_weights[:20]) if seed: print(">>> using seed: ", seed, file=sys.stderr) random.seed(seed) #setting random seed # if we use word dropout keep track of counts if word_dropout_rate > 0.0: widCount = Counter() for sentence, _ in train_X: widCount.update([w for w in sentence]) assert (len(train_X) == len(train_Y)) train_data = list(zip(train_X, train_Y)) # if we use target vectors, keep track of the targets per sentence if trg_vectors is not None: trg_start_id = 0 sentence_trg_vectors = [] sentence_var_weights = [] for i, (example, y) in enumerate(train_data): sentence_trg_vectors.append( trg_vectors[trg_start_id:trg_start_id + len(example[0]), :]) if variance_weights is not None: sentence_var_weights.append( variance_weights[trg_start_id:trg_start_id + len(example[0])]) trg_start_id += len(example[0]) assert trg_start_id == len(trg_vectors),\ 'Error: Idx {} is not at {}.'.format(trg_start_id, len(trg_vectors)) assert 
len(sentence_trg_vectors) == len(train_X) if variance_weights is not None: assert trg_start_id == len(variance_weights) assert len(sentence_var_weights) == len(train_X) print('Starting training for {} epochs...'.format(num_epochs)) best_val_acc, epochs_no_improvement = 0., 0 if val_X is not None and val_Y is not None and model_path is not None: print( 'Using early stopping with patience of {}...'.format(patience)) for cur_iter in range(num_epochs): bar = Bar('Training epoch {}/{}...'.format(cur_iter + 1, num_epochs), max=len(train_data), flush=True) total_loss = 0.0 total_tagged = 0.0 total_other_loss, total_other_loss_weighted = 0.0, 0.0 random_indices = np.arange(len(train_data)) random.shuffle(random_indices) for i, idx in enumerate(random_indices): (word_indices, char_indices), y = train_data[idx] if word_dropout_rate > 0.0: word_indices = [ self.w2i["_UNK"] if (random.random() > (widCount.get(w) / (word_dropout_rate + widCount.get(w)))) else w for w in word_indices ] output = self.predict(word_indices, char_indices, train=True) if len(y) == 1 and y[0] == 0: # in temporal ensembling, we assign a dummy label of [0] for # unlabeled sequences; we skip the supervised loss for these loss = dynet.scalarInput(0) else: loss = dynet.esum([ self.pick_neg_log(pred, gold) for pred, gold in zip(output, y) ]) if trg_vectors is not None: # the consistency loss in temporal ensembling is used for # both supervised and unsupervised input targets = sentence_trg_vectors[idx] assert len(output) == len(targets) if variance_weights is not None: var_weights = sentence_var_weights[idx] assert len(output) == len(var_weights) # multiply the normalized mean variance with each loss other_loss = dynet.esum([ v * dynet.squared_distance(o, dynet.inputVector(t)) for o, t, v in zip(output, targets, var_weights) ]) else: other_loss = dynet.esum([ dynet.squared_distance(o, dynet.inputVector(t)) for o, t in zip(output, targets) ]) total_other_loss += other_loss.value() if len(y) == 1 and y[0] == 0: #unlab_ex other_loss += other_loss * unsup_weight else: #lab_ex # assign the unsupervised weight for labeled examples other_loss += other_loss * unsup_weight * labeled_weight_proportion # keep track for logging total_loss += loss.value() # main loss total_tagged += len(word_indices) total_other_loss_weighted += other_loss.value() # combine losses loss += other_loss else: # keep track for logging total_loss += loss.value() total_tagged += len(word_indices) loss.backward() self.trainer.update() bar.next() if trg_vectors is None: print("iter {2} {0:>12}: {1:.2f}".format( "total loss", total_loss / total_tagged, cur_iter), file=sys.stderr) else: print( "iter {2} {0:>12}: {1:.2f} unsupervised loss: {3:.2f} (weighted: {4:.2f})" .format("supervised loss", total_loss / total_tagged, cur_iter, total_other_loss / total_tagged, total_other_loss_weighted / total_tagged), file=sys.stderr) if val_X is not None and val_Y is not None and model_path is not None: # get the best accuracy on the validation set val_correct, val_total = self.evaluate(val_X, val_Y) val_accuracy = val_correct / val_total if val_accuracy > best_val_acc: print( 'Accuracy {:.4f} is better than best val accuracy {:.4f}' .format(val_accuracy, best_val_acc)) best_val_acc = val_accuracy epochs_no_improvement = 0 save_tagger(self, model_path) else: print( 'Accuracy {:.4f} is worse than best val loss {:.4f}.'. format(val_accuracy, best_val_acc)) epochs_no_improvement += 1 if epochs_no_improvement == patience: print('No improvement for {} epochs. Early stopping...'. 
format(epochs_no_improvement)) break def initialize_graph(self, num_words=None, num_chars=None): """ build graph and link to parameters """ num_words = num_words if num_words is not None else len(self.w2i) num_chars = num_chars if num_chars is not None else len(self.c2i) if num_words == 0 or num_chars == 0: raise ValueError('Word2id and char2id have to be loaded before ' 'initializing the graph.') print('Initializing the graph...') # initialize the word embeddings and the parameters self.cembeds = None if self.embeds_file: print("loading embeddings", file=sys.stderr) embeddings, emb_dim = load_embeddings_file(self.embeds_file) assert (emb_dim == self.in_dim) num_words = len( set(embeddings.keys()).union(set( self.w2i.keys()))) # initialize all with embeddings # init model parameters and initialize them self.wembeds = self.model.add_lookup_parameters( (num_words, self.in_dim), init=dynet.ConstInitializer(0.01)) if self.c_in_dim > 0: self.cembeds = self.model.add_lookup_parameters( (num_chars, self.c_in_dim), init=dynet.ConstInitializer(0.01)) init = 0 l = len(embeddings.keys()) for word in embeddings.keys(): # for those words we have already in w2i, update vector, otherwise add to w2i (since we keep data as integers) if word in self.w2i: self.wembeds.init_row(self.w2i[word], embeddings[word]) else: self.w2i[word] = len(self.w2i.keys()) # add new word self.wembeds.init_row(self.w2i[word], embeddings[word]) init += 1 print("initialized: {}".format(init), file=sys.stderr) else: self.wembeds = self.model.add_lookup_parameters( (num_words, self.in_dim), init=dynet.ConstInitializer(0.01)) if self.c_in_dim > 0: self.cembeds = self.model.add_lookup_parameters( (num_chars, self.c_in_dim), init=dynet.ConstInitializer(0.01)) # make it more flexible to add number of layers as specified by parameter layers = [] # inner layers for layer_num in range(0, self.h_layers): if layer_num == 0: if self.c_in_dim > 0: f_builder = dynet.CoupledLSTMBuilder( 1, self.in_dim + self.c_in_dim * 2, self.h_dim, self.model) # in_dim: size of each layer b_builder = dynet.CoupledLSTMBuilder( 1, self.in_dim + self.c_in_dim * 2, self.h_dim, self.model) else: f_builder = dynet.CoupledLSTMBuilder( 1, self.in_dim, self.h_dim, self.model) b_builder = dynet.CoupledLSTMBuilder( 1, self.in_dim, self.h_dim, self.model) layers.append(BiRNNSequencePredictor( f_builder, b_builder)) #returns forward and backward sequence else: # add inner layers (if h_layers >1) f_builder = dynet.LSTMBuilder(1, self.h_dim, self.h_dim, self.model) b_builder = dynet.LSTMBuilder(1, self.h_dim, self.h_dim, self.model) layers.append(BiRNNSequencePredictor(f_builder, b_builder)) # store at which layer to predict task task_num_labels = len(self.tag2idx) output_layer = FFSequencePredictor( Layer(self.model, self.h_dim * 2, task_num_labels, dynet.softmax)) if self.c_in_dim > 0: self.char_rnn = BiRNNSequencePredictor( dynet.CoupledLSTMBuilder(1, self.c_in_dim, self.c_in_dim, self.model), dynet.CoupledLSTMBuilder(1, self.c_in_dim, self.c_in_dim, self.model)) else: self.char_rnn = None self.predictors = dict() self.predictors["inner"] = layers self.predictors["output_layers_dict"] = output_layer self.predictors["task_expected_at"] = self.h_layers def get_features(self, words): """ from a list of words, return the word and word char indices """ word_indices = [] word_char_indices = [] for word in words: if word in self.w2i: word_indices.append(self.w2i[word]) else: word_indices.append(self.w2i["_UNK"]) if self.c_in_dim > 0: chars_of_word = [self.c2i["<w>"]] for char in 
word: if char in self.c2i: chars_of_word.append(self.c2i[char]) else: chars_of_word.append(self.c2i["_UNK"]) chars_of_word.append(self.c2i["</w>"]) word_char_indices.append(chars_of_word) return word_indices, word_char_indices def __get_instances_from_file(self, file_name): """ helper function to convert input file to lists of lists holding input words|tags """ data = [(words, tags) for (words, tags) in list(read_conll_file(file_name))] words = [words for (words, _) in data] tags = [tags for (_, tags) in data] return words, tags def get_data_as_indices(self, file_name): """ X = list of (word_indices, word_char_indices) Y = list of tag indices """ words, tags = self.__get_instances_from_file(file_name) return self.get_data_as_indices_from_instances(words, tags) def get_data_as_indices_from_instances(self, dev_words, dev_tags): """ Extension of get_data_as_indices. Use words and tags rather than a file as input. X = list of (word_indices, word_char_indices) Y = list of tag indices """ X, Y = [], [] org_X, org_Y = [], [] for (words, tags) in zip(dev_words, dev_tags): word_indices, word_char_indices = self.get_features(words) # if tag does not exist in source domain tags, return as default # first idx outside of dictionary tag_indices = [ self.tag2idx.get(tag, len(self.tag2idx)) for tag in tags ] X.append((word_indices, word_char_indices)) Y.append(tag_indices) org_X.append(words) org_Y.append(tags) return X, Y # , org_X, org_Y - for now don't use def predict(self, word_indices, char_indices, train=False, soft_labels=False, temperature=None): """ predict tags for a sentence represented as char+word embeddings """ dynet.renew_cg() # new graph char_emb = [] rev_char_emb = [] wfeatures = [self.wembeds[w] for w in word_indices] if self.c_in_dim > 0: # get representation for words for chars_of_token in char_indices: char_feats = [self.cembeds[c] for c in chars_of_token] # use last state as word representation f_char, b_char = self.char_rnn.predict_sequence( char_feats, char_feats) last_state = f_char[-1] rev_last_state = b_char[-1] char_emb.append(last_state) rev_char_emb.append(rev_last_state) features = [ dynet.concatenate([w, c, rev_c]) for w, c, rev_c in zip(wfeatures, char_emb, rev_char_emb) ] else: features = wfeatures if train: # only do at training time features = [dynet.noise(fe, self.noise_sigma) for fe in features] output_expected_at_layer = self.h_layers output_expected_at_layer -= 1 # go through layers prev = features prev_rev = features num_layers = self.h_layers for i in range(0, num_layers): predictor = self.predictors["inner"][i] forward_sequence, backward_sequence = predictor.predict_sequence( prev, prev_rev) if i > 0 and self.activation: # activation between LSTM layers forward_sequence = [ self.activation(s) for s in forward_sequence ] backward_sequence = [ self.activation(s) for s in backward_sequence ] if i == output_expected_at_layer: output_predictor = self.predictors["output_layers_dict"] concat_layer = [ dynet.concatenate([f, b]) for f, b in zip( forward_sequence, reversed(backward_sequence)) ] if train and self.noise_sigma > 0.0: concat_layer = [ dynet.noise(fe, self.noise_sigma) for fe in concat_layer ] output = output_predictor.predict_sequence( concat_layer, soft_labels=soft_labels, temperature=temperature) return output prev = forward_sequence prev_rev = backward_sequence raise Exception("oops should not be here") return None def evaluate(self, test_X, test_Y): """ compute accuracy on a test file """ correct = 0 total = 0.0 for i, ((word_indices, word_char_indices), 
gold_tag_indices) in enumerate(zip(test_X, test_Y)): output = self.predict(word_indices, word_char_indices) predicted_tag_indices = [np.argmax(o.value()) for o in output] correct += sum([ 1 for (predicted, gold) in zip(predicted_tag_indices, gold_tag_indices) if predicted == gold ]) total += len(gold_tag_indices) return correct, total def get_predictions(self, test_X, soft_labels=False): """ get flat list of predictions """ predictions = [] for word_indices, word_char_indices in test_X: output = self.predict(word_indices, word_char_indices) predictions += [ o.value() if soft_labels else int(np.argmax(o.value())) for o in output ] return predictions def get_predictions_output(self, test_X, test_labels, output_filename): """ get predictions to output to file assume test_labels are not indices (as target domain can have tags that are not in source) text_X: indices test_labels: original labels """ i2w = {self.w2i[w]: w for w in self.w2i.keys()} i2t = {self.tag2idx[t]: t for t in self.tag2idx.keys()} OUT = open(output_filename, "w") for (word_indices, word_char_indices), gold_tags in zip(test_X, test_labels): output = self.predict(word_indices, word_char_indices) predicted_tag_ids = [int(np.argmax(o.value())) for o in output] for word_id, tag_id, gold_tag in zip(word_indices, predicted_tag_ids, gold_tags): known_tag_prefix = "{}" if gold_tag in self.tag2idx else "*{}" word, pred_tag, gold_tag = i2w[word_id], i2t[ tag_id], known_tag_prefix.format(gold_tag) OUT.write("{}\t{}\t{}\n".format(word, gold_tag, pred_tag)) OUT.write("\n") OUT.close() def get_train_data_from_instances(self, train_words, train_tags): """ Extension of get_train_data method. Extracts training data from two arrays of word and label lists. transform training data to features (word indices) map tags to integers :param train_words: a numpy array containing lists of words :param train_tags: a numpy array containing lists of corresponding tags """ X = [] Y = [] # check if we continue training continue_training = False if self.w2i and self.tag2idx: continue_training = True if continue_training: print("update existing vocabulary") # fetch already existing w2i = self.w2i.copy() c2i = self.c2i.copy() tag2idx = self.tag2idx assert w2i["_UNK"] == 0, "No _UNK found!" 
else: # word 2 indices and tag 2 indices w2i = self.w2i.copy( ) # get a copy that refers to a different object c2i = {} # char to index tag2idx = {} # tag2idx if len(w2i) > 0: assert w2i["_UNK"] == 0 else: w2i["_UNK"] = 0 # unk word / OOV c2i["_UNK"] = 0 # unk char c2i["<w>"] = 1 # word start c2i["</w>"] = 2 # word end index num_sentences = 0 num_tokens = 0 for instance_idx, (words, tags) in enumerate(zip(train_words, train_tags)): instance_word_indices = [] # sequence of word indices instance_char_indices = [] # sequence of char indices instance_tags_indices = [] # sequence of tag indices for i, (word, tag) in enumerate(zip(words, tags)): # map words and tags to indices if word not in w2i: w2i[word] = len(w2i) instance_word_indices.append(w2i[word]) else: instance_word_indices.append(w2i[word]) chars_of_word = [c2i["<w>"]] for char in word: if char not in c2i: c2i[char] = len(c2i) chars_of_word.append(c2i[char]) chars_of_word.append(c2i["</w>"]) instance_char_indices.append(chars_of_word) if tag not in tag2idx: tag2idx[tag] = len(tag2idx) instance_tags_indices.append(tag2idx.get(tag)) num_tokens += 1 num_sentences += 1 X.append( (instance_word_indices, instance_char_indices) ) # list of word indices, for every word list of char indices Y.append(instance_tags_indices) print("%s sentences %s tokens" % (num_sentences, num_tokens), file=sys.stderr) print("%s w features, %s c features " % (len(w2i), len(c2i)), file=sys.stderr) assert (len(X) == len(Y)) # store mappings of words and tags to indices self.set_indices(w2i, c2i, tag2idx) return X, Y def get_train_data(self, train_data): """ transform training data to features (word indices) map tags to integers """ train_words, train_tags = self.__get_instances_from_file(train_data) return self.get_train_data_from_instances(train_words, train_tags)
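# --- Added usage sketch (not part of the original source) --------------------
# Minimal end-to-end sketch of the SimpleBiltyTagger API defined above.
# The dimensions and the CoNLL file paths ("train.conll", "dev.conll",
# "test.conll") are illustrative assumptions, not values used in this repo.
def _simple_bilty_usage_example():
    tagger = SimpleBiltyTagger(in_dim=64, h_dim=100, c_in_dim=100,
                               h_layers=1, trainer="adam")
    train_X, train_Y = tagger.get_train_data("train.conll")  # builds w2i/c2i/tag2idx
    tagger.initialize_graph()                                # sizes taken from the mappings
    dev_X, dev_Y = tagger.get_data_as_indices("dev.conll")
    tagger.fit(train_X, train_Y, num_epochs=10,
               val_X=dev_X, val_Y=dev_Y, patience=2,
               model_path="bilty_model", word_dropout_rate=0.25)
    test_X, test_Y = tagger.get_data_as_indices("test.conll")
    correct, total = tagger.evaluate(test_X, test_Y)
    print("test accuracy: {:.4f}".format(correct / total))
# ------------------------------------------------------------------------------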
def build_computation_graph(self, num_words, num_chars): """ build graph and link to parameters """ ## initialize word embeddings if self.embeds_file: print("loading embeddings", file=sys.stderr) embeddings, emb_dim = load_embeddings_file(self.embeds_file) assert(emb_dim==self.in_dim) num_words=len(set(embeddings.keys()).union(set(self.w2i.keys()))) # initialize all with embeddings # init model parameters and initialize them wembeds = self.model.add_lookup_parameters((num_words, self.in_dim), init=self.initializer) init=0 for word in embeddings: if word not in self.w2i: self.w2i[word]=len(self.w2i.keys()) # add new word wembeds.init_row(self.w2i[word], embeddings[word]) init +=1 elif word in embeddings: wembeds.init_row(self.w2i[word], embeddings[word]) init += 1 print("initialized: {}".format(init), file=sys.stderr) else: wembeds = self.model.add_lookup_parameters((num_words, self.in_dim), init=self.initializer) ## initialize character embeddings cembeds = None if self.c_in_dim > 0: cembeds = self.model.add_lookup_parameters((num_chars, self.c_in_dim), init=self.initializer) # make it more flexible to add number of layers as specified by parameter layers = [] # inner layers output_layers_dict = {} # from task_id to actual softmax predictor task_expected_at = {} # map task_id => output_layer_# # connect output layers to tasks for output_layer, task_id in zip(self.pred_layer, self.tasks_ids): if output_layer > self.h_layers: raise ValueError("cannot have a task at a layer (%d) which is " "beyond the model, increase h_layers (%d)" % (output_layer, self.h_layers)) task_expected_at[task_id] = output_layer nb_tasks = len( self.tasks_ids ) for layer_num in range(0,self.h_layers): if layer_num == 0: if self.c_in_dim > 0: # in_dim: size of each layer f_builder = self.builder(1, self.in_dim+self.c_in_dim*2, self.h_dim, self.model) b_builder = self.builder(1, self.in_dim+self.c_in_dim*2, self.h_dim, self.model) else: f_builder = self.builder(1, self.in_dim, self.h_dim, self.model) b_builder = self.builder(1, self.in_dim, self.h_dim, self.model) layers.append(BiRNNSequencePredictor(f_builder, b_builder)) #returns forward and backward sequence else: # add inner layers (if h_layers >1) f_builder = self.builder(1, self.h_dim, self.h_dim, self.model) b_builder = self.builder(1, self.h_dim, self.h_dim, self.model) layers.append(BiRNNSequencePredictor(f_builder, b_builder)) # store at which layer to predict task for task_id in self.tasks_ids: task_num_labels= len(self.task2tag2idx[task_id]) output_layers_dict[task_id] = FFSequencePredictor(Layer(self.model, self.h_dim*2, task_num_labels, dynet.softmax, mlp=self.mlp, mlp_activation=self.activation_mlp)) char_rnn = BiRNNSequencePredictor(self.builder(1, self.c_in_dim, self.c_in_dim, self.model), self.builder(1, self.c_in_dim, self.c_in_dim, self.model)) predictors = {} predictors["inner"] = layers predictors["output_layers_dict"] = output_layers_dict predictors["task_expected_at"] = task_expected_at return predictors, char_rnn, wembeds, cembeds
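# --- Added explanatory sketch (not part of the original source) --------------
# The fit() methods in this file apply frequency-dependent word dropout: a
# training token w is replaced by "_UNK" with probability
#     alpha / (count(w) + alpha),   i.e. it is kept with probability
#     count(w) / (count(w) + alpha),
# where alpha is `word_dropout_rate` and count(w) comes from `widCount`.
# The helper below restates that rule in isolation; the name
# `drop_word_to_unk` is illustrative only.
def drop_word_to_unk(word_idx, count, unk_idx, alpha=0.25, rng=None):
    """Return unk_idx instead of word_idx with probability alpha / (count + alpha)."""
    import random
    rng = rng or random
    keep_prob = count / (alpha + count)
    return word_idx if rng.random() <= keep_prob else unk_idx
# ------------------------------------------------------------------------------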
class Amt3Tagger(object): def __init__(self, in_dim, h_dim, c_in_dim, h_layers, embeds_file=None, activation=dynet.tanh, noise_sigma=0.1, word2id=None, add_hidden=False, trainer="adam", clip_threshold=5.0, learning_rate=0.001, adversarial_domains=None): self.w2i = {} if word2id is None else word2id # word to index mapping self.c2i = {} # char to index mapping self.tag2idx = {} # tag to tag_id mapping self.model = dynet.ParameterCollection() # init model # init trainer train_algo = TRAINER_MAP[trainer] self.trainer = train_algo(self.model, learning_rate) if clip_threshold: self.trainer.set_clip_threshold(clip_threshold) self.in_dim = in_dim self.h_dim = h_dim self.c_in_dim = c_in_dim self.activation = activation self.noise_sigma = noise_sigma self.h_layers = h_layers self.predictors = { "inner": [], "output_layers_dict": {}, "task_expected_at": {} } # the inner layers and predictors self.wembeds = None # lookup: embeddings for words self.cembeds = None # lookup: embeddings for characters self.embeds_file = embeds_file self.char_rnn = None # RNN for character input self.task_ids = ["F0", "F1", "Ft"] self.add_hidden = add_hidden self.adversarial_domains = adversarial_domains def add_adversarial_loss(self, num_domains=2): if not self.adversarial_domains: # make sure they are set the latest here self.adversarial_domains = num_domains self.adv_layer = Layer(self.model, 2 * self.h_dim, num_domains, activation=dynet.softmax, mlp=self.h_dim if self.add_hidden else 0) def pick_neg_log(self, pred, gold): if not isinstance(gold, int): # calculate cross-entropy loss against the whole vector dy_gold = dynet.inputVector(gold) return -dynet.sum_elems(dynet.cmult(dy_gold, dynet.log(pred))) return -dynet.log(dynet.pick(pred, gold)) def set_indices(self, w2i, c2i, tag2idx): self.tag2idx = tag2idx self.w2i = w2i self.c2i = c2i def fit(self, train_dict, num_epochs, val_X=None, val_Y=None, patience=2, model_path=None, seed=None, word_dropout_rate=0.25, trg_vectors=None, unsup_weight=1.0, clip_threshold=5.0, orthogonality_weight=0.0, adversarial=False, adversarial_weight=1.0, ignore_src_Ft=False): """ train the tagger :param trg_vectors: the prediction targets used for the unsupervised loss in temporal ensembling :param unsup_weight: weight for the unsupervised consistency loss used in temporal ensembling :param adversarial: note: if we want to use adversarial, we have to call add_adversarial_loss before; :param adversarial_weight: 1 by default (do not weigh adv loss) :param ignore_src_Ft: if asymm.tri. 2nd stage, do not further train Ft on 'src' :param train_dict: a dictionary mapping tasks ("F0", "F1", and "Ft") to a dictionary {"X": list of examples, "Y": list of labels, "domain": list of domain tag (0,1) of example} Three tasks are indexed as "F0", "F1" and "Ft" Note: if a task 'src' is given than a single model with three heads is trained where all data is given to all outputs """ print("read training data") widCount = Counter() train_data = [] for task, task_dict in train_dict.items(): #task: eg. "F0" for key in ["X", "Y", "domain"]: assert key in task_dict, "Error: %s is not available." 
% key examples, labels, domain_tags = task_dict["X"], task_dict[ "Y"], task_dict["domain"] assert len(examples) == len(labels) if word_dropout_rate > 0.0: # keep track of the counts for word dropout for sentence, _ in examples: widCount.update([w for w in sentence]) # train data is a list of 4-tuples: (example, label, task_id, domain_id) train_data += list( zip(examples, labels, [[task] * len(labels)][0], domain_tags)) # if we use target vectors, keep track of the targets per sentence if trg_vectors is not None: trg_start_id = 0 sentence_trg_vectors = [] for i, (example, y) in enumerate(train_data): sentence_trg_vectors.append( trg_vectors[trg_start_id:trg_start_id + len(example[0]), :]) trg_start_id += len(example[0]) assert trg_start_id == len(trg_vectors),\ 'Error: Idx {} is not at {}.'.format(trg_start_id, len(trg_vectors)) print('Starting training for {} epochs...'.format(num_epochs)) best_val_acc, epochs_no_improvement = 0., 0 if val_X is not None and val_Y is not None and model_path is not None: print( 'Using early stopping with patience of {}...'.format(patience)) if seed: random.seed(seed) for cur_iter in range(num_epochs): bar = Bar('Training epoch {}/{}...'.format(cur_iter + 1, num_epochs), max=len(train_data), flush=True) random_indices = np.arange(len(train_data)) random.shuffle(random_indices) total_loss, total_tagged, total_constraint, total_adversarial = 0.0, 0.0, 0.0, 0.0 total_orth_constr = 0 # count how many updates # log separate losses log_losses = {} log_total = {} for task_id in self.task_ids: log_losses[task_id] = 0.0 log_total[task_id] = 0 for i, idx in enumerate(random_indices): (word_indices, char_indices), y, task_id, domain_id = train_data[idx] if word_dropout_rate > 0.0: word_indices = [ self.w2i["_UNK"] if (random.random() > (widCount.get(w) / (word_dropout_rate + widCount.get(w)))) else w for w in word_indices ] output, constraint, adv = self.predict( word_indices, char_indices, task_id, train=True, orthogonality_weight=orthogonality_weight, domain_id=domain_id if adversarial else None) if task_id not in ['src', 'trg']: if len(y) == 1 and y[0] == 0: # in temporal ensembling, we assign a dummy label of [0] for # unlabeled sequences; we skip the supervised loss for these loss = dynet.scalarInput(0) else: loss = dynet.esum([ self.pick_neg_log(pred, gold) for pred, gold in zip(output, y) ]) if trg_vectors is not None: # the consistency loss in temporal ensembling is used for # both supervised and unsupervised input targets = sentence_trg_vectors[idx] assert len(output) == len(targets) other_loss = unsup_weight * dynet.average([ dynet.squared_distance(o, dynet.inputVector(t)) for o, t in zip(output, targets) ]) loss += other_loss if orthogonality_weight != 0.0 and task_id != 'Ft': # add the orthogonality constraint to the loss total_constraint += constraint.value( ) * orthogonality_weight total_orth_constr += 1 loss += constraint * orthogonality_weight if adversarial: total_adversarial += adv.value() * adversarial_weight loss += adv * adversarial_weight total_loss += loss.value() # for output log_losses[task_id] += total_loss total_tagged += len(word_indices) log_total[task_id] += total_tagged loss.backward() self.trainer.update() bar.next() else: # bootstrap=False, the output contains list of outputs one for each task assert trg_vectors is None, 'temporal ensembling not implemented for bootstrap=False' loss = dynet.scalarInput(1) #initialize if ignore_src_Ft: output = output[: -1] # ignore last = Ft when further training with 'src' for t_i, output_t in 
enumerate( output): # get loss for each task loss += dynet.esum([ self.pick_neg_log(pred, gold) for pred, gold in zip(output_t, y) ]) task_id = self.task_ids[t_i] log_losses[task_id] += total_loss log_total[task_id] += total_tagged if orthogonality_weight != 0.0: # add the orthogonality constraint to the loss total_constraint += constraint.value( ) * orthogonality_weight total_orth_constr += 1 loss += constraint * orthogonality_weight if adversarial: total_adversarial += adv.value() * adversarial_weight loss += adv * adversarial_weight total_loss += loss.value() # for output total_tagged += len(word_indices) loss.backward() self.trainer.update() bar.next() if adversarial and orthogonality_weight: print( "iter {}. Total loss: {:.3f}, total penalty: {:.3f}, total weighted adv loss: {:.3f}" .format(cur_iter, total_loss / total_tagged, total_constraint / total_orth_constr, total_adversarial / total_tagged), file=sys.stderr) elif orthogonality_weight: print("iter {}. Total loss: {:.3f}, total penalty: {:.3f}". format(cur_iter, total_loss / total_tagged, total_constraint / total_orth_constr), file=sys.stderr) else: print("iter {}. Total loss: {:.3f} ".format( cur_iter, total_loss / total_tagged), file=sys.stderr) for task_id in self.task_ids: if log_total[task_id] > 0: print("{0}: {1:.3f}".format( task_id, log_losses[task_id] / log_total[task_id])) if val_X is not None and val_Y is not None and model_path is not None: # get the best accuracy on the validation set val_correct, val_total = self.evaluate(val_X, val_Y) val_accuracy = val_correct / val_total if val_accuracy > best_val_acc: print( 'Accuracy {:.4f} is better than best val accuracy {:.4f}.' .format(val_accuracy, best_val_acc)) best_val_acc = val_accuracy epochs_no_improvement = 0 save_tagger(self, model_path) else: print( 'Accuracy {:.4f} is worse than best val loss {:.4f}.'. format(val_accuracy, best_val_acc)) epochs_no_improvement += 1 if epochs_no_improvement == patience: print('No improvement for {} epochs. Early stopping...'. 
format(epochs_no_improvement)) break def initialize_graph(self, num_words=None, num_chars=None): """ build graph and link to parameters F2=True: activate second auxiliary output Ft=True: activate third auxiliary output """ num_words = num_words if num_words is not None else len(self.w2i) num_chars = num_chars if num_chars is not None else len(self.c2i) if num_words == 0 or num_chars == 0: raise ValueError('Word2id and char2id have to be loaded before ' 'initializing the graph.') print('Initializing the graph...') # initialize the word embeddings and the parameters self.cembeds = None if self.embeds_file: print("loading embeddings", file=sys.stderr) embeddings, emb_dim = load_embeddings_file(self.embeds_file) assert (emb_dim == self.in_dim) num_words = len( set(embeddings.keys()).union(set( self.w2i.keys()))) # initialize all with embeddings # init model parameters and initialize them self.wembeds = self.model.add_lookup_parameters( (num_words, self.in_dim), init=dynet.ConstInitializer(0.01)) if self.c_in_dim > 0: self.cembeds = self.model.add_lookup_parameters( (num_chars, self.c_in_dim), init=dynet.ConstInitializer(0.01)) init = 0 l = len(embeddings.keys()) for word in embeddings.keys(): # for those words we have already in w2i, update vector, otherwise add to w2i (since we keep data as integers) if word in self.w2i: self.wembeds.init_row(self.w2i[word], embeddings[word]) else: self.w2i[word] = len(self.w2i.keys()) # add new word self.wembeds.init_row(self.w2i[word], embeddings[word]) init += 1 print("initialized: {}".format(init), file=sys.stderr) else: self.wembeds = self.model.add_lookup_parameters( (num_words, self.in_dim), init=dynet.ConstInitializer(0.01)) if self.c_in_dim > 0: self.cembeds = self.model.add_lookup_parameters( (num_chars, self.c_in_dim), init=dynet.ConstInitializer(0.01)) # make it more flexible to add number of layers as specified by parameter layers = [] # inner layers for layer_num in range(0, self.h_layers): if layer_num == 0: if self.c_in_dim > 0: f_builder = dynet.CoupledLSTMBuilder( 1, self.in_dim + self.c_in_dim * 2, self.h_dim, self.model) # in_dim: size of each layer b_builder = dynet.CoupledLSTMBuilder( 1, self.in_dim + self.c_in_dim * 2, self.h_dim, self.model) else: f_builder = dynet.CoupledLSTMBuilder( 1, self.in_dim, self.h_dim, self.model) b_builder = dynet.CoupledLSTMBuilder( 1, self.in_dim, self.h_dim, self.model) layers.append(BiRNNSequencePredictor( f_builder, b_builder)) #returns forward and backward sequence else: # add inner layers (if h_layers >1) f_builder = dynet.LSTMBuilder(1, self.h_dim, self.h_dim, self.model) b_builder = dynet.LSTMBuilder(1, self.h_dim, self.h_dim, self.model) layers.append(BiRNNSequencePredictor(f_builder, b_builder)) # store at which layer to predict task task_num_labels = len(self.tag2idx) output_layers_dict = {} output_layers_dict["F0"] = FFSequencePredictor( Layer(self.model, self.h_dim * 2, task_num_labels, dynet.softmax, mlp=self.h_dim if self.add_hidden else 0)) # for simplicity always add additional outputs, even if they are then not used output_layers_dict["F1"] = FFSequencePredictor( Layer(self.model, self.h_dim * 2, task_num_labels, dynet.softmax, mlp=self.h_dim if self.add_hidden else 0)) output_layers_dict["Ft"] = FFSequencePredictor( Layer(self.model, self.h_dim * 2, task_num_labels, dynet.softmax, mlp=self.h_dim if self.add_hidden else 0)) if self.c_in_dim > 0: self.char_rnn = BiRNNSequencePredictor( dynet.CoupledLSTMBuilder(1, self.c_in_dim, self.c_in_dim, self.model), dynet.CoupledLSTMBuilder(1, 
self.c_in_dim, self.c_in_dim, self.model)) else: self.char_rnn = None self.predictors = dict() self.predictors["inner"] = layers self.predictors["output_layers_dict"] = output_layers_dict self.predictors["task_expected_at"] = self.h_layers def get_features(self, words): """ from a list of words, return the word and word char indices """ word_indices = [] word_char_indices = [] for word in words: if word in self.w2i: word_indices.append(self.w2i[word]) else: word_indices.append(self.w2i["_UNK"]) if self.c_in_dim > 0: chars_of_word = [self.c2i["<w>"]] for char in word: if char in self.c2i: chars_of_word.append(self.c2i[char]) else: chars_of_word.append(self.c2i["_UNK"]) chars_of_word.append(self.c2i["</w>"]) word_char_indices.append(chars_of_word) return word_indices, word_char_indices def __get_instances_from_file(self, file_name): """ helper function to convert input file to lists of lists holding input words|tags """ data = [(words, tags) for (words, tags) in list(read_conll_file(file_name))] words = [words for (words, _) in data] tags = [tags for (_, tags) in data] return words, tags def get_data_as_indices(self, file_name): """ X = list of (word_indices, word_char_indices) Y = list of tag indices """ words, tags = self.__get_instances_from_file(file_name) return self.get_data_as_indices_from_instances(words, tags) def get_data_as_indices_from_instances(self, dev_words, dev_tags): """ Extension of get_data_as_indices. Use words and tags rather than a file as input. X = list of (word_indices, word_char_indices) Y = list of tag indices """ X, Y = [], [] org_X, org_Y = [], [] for (words, tags) in zip(dev_words, dev_tags): word_indices, word_char_indices = self.get_features(words) # if tag does not exist in source domain tags, return as default # first idx outside of dictionary tag_indices = [ self.tag2idx.get(tag, len(self.tag2idx)) for tag in tags ] X.append((word_indices, word_char_indices)) Y.append(tag_indices) org_X.append(words) org_Y.append(tags) return X, Y # , org_X, org_Y - for now don't use def predict(self, word_indices, char_indices, task_id, train=False, soft_labels=False, temperature=None, orthogonality_weight=0.0, domain_id=None): """ predict tags for a sentence represented as char+word embeddings :param domain_id: Predict adversarial loss if domain id is provided. 
""" dynet.renew_cg() # new graph char_emb = [] rev_char_emb = [] wfeatures = [self.wembeds[w] for w in word_indices] if self.c_in_dim > 0: # get representation for words for chars_of_token in char_indices: char_feats = [self.cembeds[c] for c in chars_of_token] # use last state as word representation f_char, b_char = self.char_rnn.predict_sequence( char_feats, char_feats) last_state = f_char[-1] rev_last_state = b_char[-1] char_emb.append(last_state) rev_char_emb.append(rev_last_state) features = [ dynet.concatenate([w, c, rev_c]) for w, c, rev_c in zip(wfeatures, char_emb, rev_char_emb) ] else: features = wfeatures if train: # only do at training time features = [dynet.noise(fe, self.noise_sigma) for fe in features] output_expected_at_layer = self.h_layers output_expected_at_layer -= 1 # go through layers prev = features prev_rev = features num_layers = self.h_layers constraint = 0 adv_loss = 0 for i in range(0, num_layers): predictor = self.predictors["inner"][i] forward_sequence, backward_sequence = predictor.predict_sequence( prev, prev_rev) if i > 0 and self.activation: # activation between LSTM layers forward_sequence = [ self.activation(s) for s in forward_sequence ] backward_sequence = [ self.activation(s) for s in backward_sequence ] if i == output_expected_at_layer: concat_layer = [ dynet.concatenate([f, b]) for f, b in zip( forward_sequence, reversed(backward_sequence)) ] if train and self.noise_sigma > 0.0: concat_layer = [ dynet.noise(fe, self.noise_sigma) for fe in concat_layer ] if task_id not in ["src", "trg"]: output_predictor = self.predictors["output_layers_dict"][ task_id] output = output_predictor.predict_sequence( concat_layer, soft_labels=soft_labels, temperature=temperature) else: # one src example for all three outputs output = [] # in this case it is a list for t_id in self.task_ids: output_predictor = self.predictors[ "output_layers_dict"][t_id] output_t = output_predictor.predict_sequence( concat_layer, soft_labels=soft_labels, temperature=temperature) output.append(output_t) if orthogonality_weight != 0 and task_id != "Ft": # put the orthogonality constraint either directly on the # output layer or on the hidden layer if it's an MLP # use orthogonality_weight only between F0 and F1 builder = self.predictors["output_layers_dict"][ "F0"].network_builder task_param = builder.W_mlp if self.add_hidden else builder.W task_W = dynet.parameter(task_param) builder = self.predictors["output_layers_dict"][ "F1"].network_builder other_param = builder.W_mlp if self.add_hidden else builder.W other_task_W = dynet.parameter(other_param) # calculate the matrix product of the task matrix with the other matrix_product_1 = dynet.transpose(task_W) * other_task_W # take the squared Frobenius norm by squaring # every element and then summing them squared_frobenius_norm = dynet.sum_elems( dynet.square(matrix_product_1)) constraint = squared_frobenius_norm #print('Constraint with first matrix:', squared_frobenius_norm.value()) if domain_id is not None: # flip the gradient when back-propagating through here adv_input = dynet.flip_gradient( concat_layer[-1]) # last state adv_output = self.adv_layer(adv_input) adv_loss = self.pick_neg_log(adv_output, domain_id) #print('Adversarial loss:', avg_adv_loss.value()) # output is list if task_id = 'src' return output, constraint, adv_loss prev = forward_sequence prev_rev = backward_sequence raise Exception("oops should not be here") return None def evaluate(self, test_X, test_Y, task_id="F0"): """ compute accuracy on a test file; by default use 
"F0" as predictor """ correct = 0 total = 0.0 for i, ((word_indices, word_char_indices), gold_tag_indices) in enumerate(zip(test_X, test_Y)): output, _, _ = self.predict(word_indices, word_char_indices, task_id) predicted_tag_indices = [np.argmax(o.value()) for o in output] correct += sum([ 1 for (predicted, gold) in zip(predicted_tag_indices, gold_tag_indices) if predicted == gold ]) total += len(gold_tag_indices) return correct, total def get_predictions(self, test_X, soft_labels=False, task_id="F0"): """ get flat list of predictions """ predictions = [] for word_indices, word_char_indices in test_X: output, _, _ = self.predict(word_indices, word_char_indices, task_id) predictions += [ o.value() if soft_labels else int(np.argmax(o.value())) for o in output ] return predictions def get_train_data_from_instances(self, train_words, train_tags): """ Extension of get_train_data method. Extracts training data from two arrays of word and label lists. transform training data to features (word indices) map tags to integers :param train_words: a numpy array containing lists of words :param train_tags: a numpy array containing lists of corresponding tags """ X = [] Y = [] # check if we continue training continue_training = False if self.w2i and self.tag2idx: continue_training = True if continue_training: print("update existing vocabulary") # fetch already existing w2i = self.w2i.copy() c2i = self.c2i.copy() tag2idx = self.tag2idx assert w2i["_UNK"] == 0, "No _UNK found!" else: # word 2 indices and tag 2 indices w2i = self.w2i.copy( ) # get a copy that refers to a different object c2i = {} # char to index tag2idx = {} # tag2idx if len(w2i) > 0: assert w2i["_UNK"] == 0 else: w2i["_UNK"] = 0 # unk word / OOV c2i["_UNK"] = 0 # unk char c2i["<w>"] = 1 # word start c2i["</w>"] = 2 # word end index num_sentences = 0 num_tokens = 0 for instance_idx, (words, tags) in enumerate(zip(train_words, train_tags)): instance_word_indices = [] # sequence of word indices instance_char_indices = [] # sequence of char indices instance_tags_indices = [] # sequence of tag indices for i, (word, tag) in enumerate(zip(words, tags)): # map words and tags to indices if word not in w2i: w2i[word] = len(w2i) instance_word_indices.append(w2i[word]) else: instance_word_indices.append(w2i[word]) chars_of_word = [c2i["<w>"]] for char in word: if char not in c2i: c2i[char] = len(c2i) chars_of_word.append(c2i[char]) chars_of_word.append(c2i["</w>"]) instance_char_indices.append(chars_of_word) if tag not in tag2idx: tag2idx[tag] = len(tag2idx) instance_tags_indices.append(tag2idx.get(tag)) num_tokens += 1 num_sentences += 1 X.append( (instance_word_indices, instance_char_indices) ) # list of word indices, for every word list of char indices Y.append(instance_tags_indices) print("%s sentences %s tokens" % (num_sentences, num_tokens), file=sys.stderr) print("%s w features, %s c features " % (len(w2i), len(c2i)), file=sys.stderr) assert (len(X) == len(Y)) # store mappings of words and tags to indices self.set_indices(w2i, c2i, tag2idx) return X, Y def get_train_data(self, train_data): """ transform training data to features (word indices) map tags to integers """ train_words, train_tags = self.__get_instances_from_file(train_data) return self.get_train_data_from_instances(train_words, train_tags) def get_predictions_output(self, test_X, test_labels, output_filename, task_id="F0"): """ get predictions to output to file assume test_labels are not indices (as target domain can have tags that are not in source) text_X: indices test_labels: 
original labels """ i2w = {self.w2i[w]: w for w in self.w2i.keys()} i2t = {self.tag2idx[t]: t for t in self.tag2idx.keys()} OUT = open(output_filename, "w") for (word_indices, word_char_indices), gold_tags in zip(test_X, test_labels): output, _, _ = self.predict(word_indices, word_char_indices, task_id) predicted_tag_ids = [int(np.argmax(o.value())) for o in output] for word_id, tag_id, gold_tag in zip(word_indices, predicted_tag_ids, gold_tags): known_tag_prefix = "{}" if gold_tag in self.tag2idx else "*{}" word, pred_tag, gold_tag = i2w[word_id], i2t[ tag_id], known_tag_prefix.format(gold_tag) OUT.write("{}\t{}\t{}\n".format(word, gold_tag, pred_tag)) OUT.write("\n") OUT.close()
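# --- Added usage sketch (not part of the original source) --------------------
# Amt3Tagger.fit() above expects `train_dict` to map each task head ("F0",
# "F1", "Ft", and optionally "src") to {"X": ..., "Y": ..., "domain": ...}.
# The sketch below shows that wiring for two heads; file names, sizes, and
# weights are illustrative assumptions, and an "Ft" entry with pseudo-labeled
# target data would normally be added as well.
def _amt3_usage_example():
    tagger = Amt3Tagger(in_dim=64, h_dim=100, c_in_dim=100, h_layers=1,
                        add_hidden=True, trainer="adam")
    X0, Y0 = tagger.get_train_data("f0_train.conll")       # builds the vocabularies
    X1, Y1 = tagger.get_data_as_indices("f1_train.conll")  # reuses the same mappings
    tagger.initialize_graph()
    tagger.add_adversarial_loss(num_domains=2)  # must be called before adversarial training
    train_dict = {
        "F0": {"X": X0, "Y": Y0, "domain": [0] * len(X0)},
        "F1": {"X": X1, "Y": Y1, "domain": [1] * len(X1)},
    }
    tagger.fit(train_dict, num_epochs=5,
               orthogonality_weight=0.01, adversarial=True)
# ------------------------------------------------------------------------------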
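# --- Added explanatory sketch (not part of the original source) --------------
# The orthogonality term used in Amt3Tagger.predict() above is the squared
# Frobenius norm of W_F0^T W_F1, computed over the (possibly MLP) output
# weights of the F0 and F1 heads and scaled by `orthogonality_weight`.
# A plain-numpy restatement of that penalty (illustrative only):
def orthogonality_penalty(W_f0, W_f1):
    """Squared Frobenius norm || W_f0^T W_f1 ||_F^2 of two weight matrices."""
    import numpy as np
    product = W_f0.T @ W_f1             # measures overlap between the two heads
    return float(np.sum(product ** 2))  # 0 when their column spaces are orthogonal
# Example:
#   import numpy as np
#   penalty = orthogonality_penalty(np.random.randn(10, 5), np.random.randn(10, 5))
# ------------------------------------------------------------------------------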