import sys
import codecs
import collections


def read_file(filename, w2i, t2i, c2i, max_iter=sys.maxsize,
              processing_word=get_processing_word(lowercase=False)):
    """
    Read in a dataset and turn it into a list of instances.
    Modifies the w2i, t2i and c2i dicts, adding new words/tags/chars
    as it sees them.
    """
    instances = []
    vocab_counter = collections.Counter()
    niter = 0
    with codecs.open(filename, "r", "utf-8") as f:
        words, tags = [], []
        for line in f:
            line = line.strip()
            if line == 'BMES_BREAK' or line.startswith("-DOCSTART-"):
                # Sentence boundary: flush the accumulated sentence
                if len(words) != 0:
                    niter += 1
                    if max_iter is not None and niter > max_iter:
                        break
                    instances.append(Instance(words, tags))
                    words, tags = [], []
            else:
                word, tag = line.split()
                word = processing_word(word)
                vocab_counter[word] += 1
                if word not in w2i:
                    w2i[word] = len(w2i)
                if tag not in t2i:
                    t2i[tag] = len(t2i)
                if is_dataset_tag(word):
                    # A dataset tag token maps to a single char id
                    if word not in c2i:
                        c2i[word] = len(c2i)
                else:
                    for c in word:
                        if c not in c2i:
                            c2i[c] = len(c2i)
                words.append(w2i[word])
                tags.append(t2i[tag])
    return instances, vocab_counter
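# Usage sketch (illustrative, not from the repo): grow the shared vocab dicts
# on the training split, then reuse them on dev so ids stay consistent.
# "train.txt" and "dev.txt" are hypothetical paths.
w2i, t2i, c2i = {}, {}, {}
train_instances, train_vocab = read_file("train.txt", w2i, t2i, c2i)
dev_instances, _ = read_file("dev.txt", w2i, t2i, c2i)
print("{} train sentences, {} words, {} tags, {} chars".format(
    len(train_instances), len(w2i), len(t2i), len(c2i)))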
def __init__(self, tagset_size, num_lstm_layers, hidden_dim, word_embeddings,
             no_we_update, use_char_rnn, char_embeddings, char_hidden_dim,
             margins, lowercase_words, vocab_size=None,
             word_embedding_dim=DEFAULT_WORD_EMBEDDING_SIZE, charset_size=None,
             char_embedding_dim=50, tie_two_embeddings=False, use_we=True):
    self.dropout = None
    self.model = dy.Model()
    self.tagset_size = tagset_size
    self.margins = margins
    self.we_update = not no_we_update
    self.lowercase_words = lowercase_words

    # Word embedding parameters
    self.use_we = use_we
    if use_we:
        if word_embeddings is not None:  # use pretrained embeddings
            vocab_size = word_embeddings.shape[0]
            word_embedding_dim = word_embeddings.shape[1]
        self.words_lookup = self.model.add_lookup_parameters(
            (vocab_size, word_embedding_dim))
        if word_embeddings is not None:
            self.words_lookup.init_from_array(word_embeddings)
    else:
        self.words_lookup = None

    # Bigram embeddings (options, b2i and bigram_embeddings are
    # module-level globals here, not constructor arguments)
    if options.bigram:
        self.bigram_lookup = self.model.add_lookup_parameters(
            (len(b2i), word_embedding_dim))
        self.bigram_lookup.init_from_array(bigram_embeddings)

    # Char LSTM parameters
    self.use_char_rnn = use_char_rnn
    if use_char_rnn:
        if char_embeddings is not None:
            charset_size = char_embeddings.shape[0]
            char_embedding_dim = char_embeddings.shape[1]
        self.char_embedding_dim = char_embedding_dim
        if tie_two_embeddings:
            self.char_lookup = self.words_lookup
        else:
            self.char_lookup = self.model.add_lookup_parameters(
                (charset_size, self.char_embedding_dim))
            if char_embeddings is not None:
                self.char_lookup.init_from_array(char_embeddings)
        self.char_bi_lstm = dy.BiRNNBuilder(1, self.char_embedding_dim,
                                            char_hidden_dim, self.model,
                                            dy.LSTMBuilder)
        # Cache the char ids of every word for speed
        self.word_to_char_ids = dict()
        for word, word_id in w2i.items():
            # Note: use original casing ("word") for characters
            if utils.is_dataset_tag(word):
                char_ids = [c2i[word]]
            else:
                char_ids = [c2i[c] for c in word]
            self.word_to_char_ids[word_id] = char_ids

    # Word LSTM parameters
    if use_char_rnn:
        if use_we:
            input_dim = word_embedding_dim + char_hidden_dim
        else:
            input_dim = char_hidden_dim
    else:
        input_dim = word_embedding_dim
    self.bi_lstm = dy.BiRNNBuilder(num_lstm_layers, input_dim, hidden_dim,
                                   self.model, dy.LSTMBuilder)

    # Matrix that maps from Bi-LSTM output to num tags
    if options.bigram:
        self.lstm_to_tags_params = self.model.add_parameters(
            (tagset_size, hidden_dim + word_embedding_dim * 2))
    else:
        self.lstm_to_tags_params = self.model.add_parameters(
            (tagset_size, hidden_dim))
    self.lstm_to_tags_bias = self.model.add_parameters(tagset_size)
    self.mlp_out = self.model.add_parameters((tagset_size, tagset_size))
    self.mlp_out_bias = self.model.add_parameters(tagset_size)

    # Transition matrix for the tagging layer; [i, j] is the score of
    # transitioning to tag i from tag j
    self.transitions = self.model.add_lookup_parameters(
        (tagset_size, tagset_size))
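# Illustration only: how a transition table of this shape is typically used
# in the CRF scoring step. transitions[i][j] scores moving to tag i from
# tag j, so scoring a gold sequence adds one lookup per adjacent tag pair.
# This is a sketch, not the repo's own scoring method.
def _score_transitions_sketch(self, tag_ids):
    score = dy.scalarInput(0.0)
    for prev_tag, curr_tag in zip(tag_ids, tag_ids[1:]):
        # self.transitions[curr_tag] is the row of scores into curr_tag
        score = score + dy.pick(self.transitions[curr_tag], prev_tag)
    return score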
for batch_id, batch in enumerate(
        utils.minibatches(dev_instances, dev_batch_size)):
    for idx, instance in enumerate(batch):
        sentence = instance.sentence
        if len(sentence) == 0:
            continue
        gold_tags = instance.tags
        losses = model.neg_log_loss(sentence, gold_tags)
        total_loss += losses.scalar_value()  # accumulate dev loss
        _, out_tags = model.viterbi_loss(sentence, gold_tags,
                                         use_margins=False)
        sentence = utils.restore_sentence(sentence)
        dataset_name = None
        if utils.is_dataset_tag(i2w[sentence[0]]):
            # Strip the dataset tag tokens from both ends of the sentence
            dataset_name = i2w[sentence[0]][1:-1]
            if dataset_name not in prf_dataset:
                prf_dataset[dataset_name] = utils.CWSEvaluator(t2i)
            sentence = sentence[1:-1]
            gold_tags = gold_tags[1:-1]
            out_tags = out_tags[1:-1]
            prf_dataset[dataset_name].add_instance(gold_tags, out_tags)
        prf.add_instance(gold_tags, out_tags)
        gold_strings = utils.to_tag_strings(i2t, gold_tags)
        obs_strings = utils.to_tag_strings(i2t, out_tags)
        dev_total_instance += 1
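# For context, a plausible sketch of utils.minibatches as used above
# (assumption: it chunks the instance list in order; the real helper may
# also shuffle or bucket by length).
def minibatches(instances, batch_size):
    for i in range(0, len(instances), batch_size):
        yield instances[i:i + batch_size]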
def tester(model, test_batch, write_out=False):
    res = []
    prf = utils.CWSEvaluator(i2t)
    prf_dataset = {}
    oov_dataset = {}

    model.eval()
    for batch_x, batch_y in test_batch:
        with torch.no_grad():
            if bigram_embedding is not None:
                out = model(batch_x["task"], batch_x["uni"],
                            batch_x["seq_len"], batch_x["bi1"], batch_x["bi2"])
            else:
                out = model(batch_x["task"], batch_x["uni"],
                            batch_x["seq_len"])
        out = out["pred"]
        num = out.size(0)
        out = out.detach().cpu().numpy()
        for i in range(num):
            length = int(batch_x["seq_len"][i])
            # Skip position 0, which holds the dataset tag token
            out_tags = out[i, 1:length].tolist()
            sentence = batch_x["ori_words"][i]
            gold_tags = batch_y["tags"][i][1:length].numpy().tolist()
            dataset_name = sentence[0]
            sentence = sentence[1:]

            assert utils.is_dataset_tag(dataset_name)
            assert len(gold_tags) == len(out_tags) == len(sentence)

            if dataset_name not in prf_dataset:
                prf_dataset[dataset_name] = utils.CWSEvaluator(i2t)
                oov_dataset[dataset_name] = utils.CWS_OOV(
                    word_dic[dataset_name[1:-1]])
            prf_dataset[dataset_name].add_instance(gold_tags, out_tags)
            prf.add_instance(gold_tags, out_tags)

            if write_out:
                gold_strings = utils.to_tag_strings(i2t, gold_tags)
                obs_strings = utils.to_tag_strings(i2t, out_tags)
                word_list = utils.bmes_to_words(sentence, obs_strings)
                oov_dataset[dataset_name].update(
                    utils.bmes_to_words(sentence, gold_strings), word_list)
                raw_string = ' '.join(word_list)
                res.append(dataset_name + " " + raw_string + " " + dataset_name)

    # Per-dataset and averaged precision/recall/F1, plus OOV recall when
    # predictions are written out
    Ap = 0.0
    Ar = 0.0
    Af = 0.0
    Aoov = 0.0
    tot = 0
    nw = 0.0
    for dataset_name, performance in sorted(prf_dataset.items()):
        p = performance.result()
        if write_out:
            nw = oov_dataset[dataset_name].oov()
            logger.info('{}\t{:04.2f}\t{:04.2f}\t{:04.2f}\t{:04.2f}'.format(
                dataset_name, p[0], p[1], p[2], nw))
        else:
            logger.info('{}\t{:04.2f}\t{:04.2f}\t{:04.2f}'.format(
                dataset_name, p[0], p[1], p[2]))
        Ap += p[0]
        Ar += p[1]
        Af += p[2]
        Aoov += nw
        tot += 1

    prf = prf.result()
    logger.info('{}\t{:04.2f}\t{:04.2f}\t{:04.2f}'.format(
        'TOT', prf[0], prf[1], prf[2]))
    if write_out:
        logger.info('{}\t{:04.2f}\t{:04.2f}\t{:04.2f}\t{:04.2f}'.format(
            'AVG', Ap / tot, Ar / tot, Af / tot, Aoov / tot))
    else:
        logger.info('{}\t{:04.2f}\t{:04.2f}\t{:04.2f}'.format(
            'AVG', Ap / tot, Ar / tot, Af / tot))
    return prf[-1], res
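# Hypothetical call site: report the overall test F1 and, with write_out=True,
# dump the segmented sentences ("pred.txt" is an assumed output path).
test_f1, segmented = tester(model, test_batch, write_out=True)
logger.info("test F1: {:04.2f}".format(test_f1))
with open("pred.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(segmented))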