def read_file(filename, w2i, t2i, mt2i, c2i):
    """
    Read in a dataset and turn it into a list of instances.

    Modifies the w2i, t2i, mt2i and c2i dicts in place, adding new
    words/tags/morphotags/chars as it sees them.

    :param filename: path to a UTF-8, tab-separated CoNLL-style file
    :param w2i: word -> index dict (extended in place)
    :param t2i: POS-tag -> index dict (extended in place)
    :param mt2i: morphological-tag -> index dict (extended in place)
    :param c2i: character -> index dict (extended in place)
    :return: (instances, vocab_counter) where vocab_counter counts word tokens

    NOTE(review): reads the module-level ``options`` and ``Instance`` /
    ``split_tagstring`` names defined elsewhere in this file.
    """
    instances = []
    vocab_counter = collections.Counter()
    with codecs.open(filename, "r", "utf-8") as f:
        sentence = []
        tags = []
        mtags = []
        for i, line in enumerate(f):
            if line.startswith("#"):
                # Some files like Italian have comments
                continue
            elif line.isspace():
                # Reached the end of a sentence
                instances.append(Instance(sentence, tags, mtags))
                sentence = []
                tags = []
                mtags = []
            else:
                data = line.split("\t")
                if '-' in data[0]:
                    # Italian has contractions on a separate line, we don't
                    # want to include them also
                    continue
                word = data[1]
                tag = data[3] if options.ud_tags else data[4]
                morphotags = split_tagstring(
                    data[5], uni_key=True) if options.morphotags else {}

                vocab_counter[word] += 1
                if word not in w2i:
                    w2i[word] = len(w2i)
                if tag not in t2i:
                    t2i[tag] = len(t2i)
                for c in word:
                    if c not in c2i:
                        c2i[c] = len(c2i)
                for mtag in morphotags:
                    if mtag not in mt2i:
                        mt2i[mtag] = len(mt2i)

                sentence.append(w2i[word])
                tags.append(t2i[tag])
                mtags.append([mt2i[t] for t in morphotags])
        # BUGFIX: flush the last sentence when the file does not end with a
        # blank line; previously it was silently dropped.
        if sentence:
            instances.append(Instance(sentence, tags, mtags))
    return instances, vocab_counter
def evaluate_raw(model,
                 instances,
                 t2is,
                 c2i,
                 i2w,
                 i2ts,
                 training_vocab,
                 log_dir_name=None,
                 use_bar=False):
    """
    Evaluate a tagging model on a list of instances.

    Computes per-attribute accuracy, OOV accuracy, micro/macro F1 and the
    average per-token loss, optionally writing the per-sentence outputs to
    ``<log_dir_name>/testout.txt``.

    :param model: tagger exposing disable_dropout/attributes/loss/tag_sentence
    :param instances: iterable of Instance objects (sentence + gold tags)
    :param t2is: dict of attribute -> {tag -> index}
    :param c2i: character -> index dict
    :param i2w: index -> word list/dict (inverse of w2i)
    :param i2ts: dict of attribute -> {index -> tag} (inverse of t2is)
    :param training_vocab: collection of words seen in training (OOV test)
    :param log_dir_name: if set, directory into which testout.txt is written
    :param use_bar: if True, wrap iteration in a progress bar
    :return: dict of evaluation results (accuracies, F1s, loss, raw outputs)
    """
    logging.info("\n")
    logging.info("Number instances: {}".format(len(instances)))
    model.disable_dropout()
    test_correct = Counter()
    test_total = Counter()
    test_oov_total = Counter()
    test_loss = 0.0
    bar = progressbar.ProgressBar()
    total_wrong = Counter()
    total_wrong_oov = Counter()
    f1_eval = Evaluator(m='att')
    test_outputs = []
    for instance in (bar(instances) if use_bar else instances):
        if len(instance.sentence) == 0:
            continue
        gold_tags = instance.tags
        word_chars = get_word_chars(instance.sentence, i2w, c2i)
        for att in model.attributes:
            if att not in instance.tags:
                # Pad unobserved attributes with NONE so every attribute has
                # a gold sequence of full sentence length.
                gold_tags[att] = [t2is[att][NONE_TAG]] * len(instance.sentence)
        losses = model.loss(word_chars, gold_tags)
        total_loss = sum([l.scalar_value() for l in list(losses.values())])
        out_tags_set, _ = model.tag_sentence(word_chars)

        gold_strings = utils.morphotag_strings(i2ts, gold_tags)
        obs_strings = utils.morphotag_strings(i2ts, out_tags_set)
        for g, o in zip(gold_strings, obs_strings):
            f1_eval.add_instance(utils.split_tagstring(g, has_pos=True),
                                 utils.split_tagstring(o, has_pos=True))
        for att, tags in gold_tags.items():
            out_tags = out_tags_set[att]
            oov_strings = []
            for word, gold, out in zip(instance.sentence, tags, out_tags):
                if gold == out:
                    test_correct[att] += 1
                else:
                    # Got the wrong tag
                    total_wrong[att] += 1
                    if i2w[word] not in training_vocab:
                        total_wrong_oov[att] += 1
                if i2w[word] not in training_vocab:
                    test_oov_total[att] += 1
                    oov_strings.append("OOV")
                else:
                    oov_strings.append("")
            test_total[att] += len(tags)
        test_loss += (total_loss / len(instance.sentence))
        test_outputs.append(("\n" + "\n".join([
            "\t".join(z) for z in zip([i2w[w] for w in instance.sentence],
                                      gold_strings, obs_strings, oov_strings)
        ]) + "\n").encode('utf8'))
    test_loss = test_loss / len(instances)

    if log_dir_name:
        # BUGFIX: test_outputs holds utf-8 encoded bytes, so the file must be
        # opened in binary mode; 'w' raised TypeError on Python 3.
        with open("{}/testout.txt".format(log_dir_name), 'wb') as test_writer:
            for output in test_outputs:
                test_writer.write(output)

    attr_f1s = {}
    for attr in t2is.keys():
        attr_f1s[attr] = f1_eval.mic_f1(att=attr)
    results = {
        'pos_acc': (test_correct[POS_KEY] / test_total[POS_KEY]),
        # BUGFIX: guard the division like pos_wrong_oov below; previously a
        # test set with zero OOV tokens raised ZeroDivisionError.
        'pos_oov_accuracy':
        ((test_oov_total[POS_KEY] - total_wrong_oov[POS_KEY]) /
         test_oov_total[POS_KEY]) if test_oov_total[POS_KEY] > 0 else None,
        'pos_wrong_oov': (total_wrong_oov[POS_KEY] / total_wrong[POS_KEY])
        if total_wrong[POS_KEY] > 0 else None,
        'f1_scores': attr_f1s,
        'micro_f1': f1_eval.mic_f1(),
        'macro_f1': f1_eval.mac_f1(),
        'total_tokens': test_total[POS_KEY],
        'total_oov': test_oov_total[POS_KEY],
        'oov_percent': test_oov_total[POS_KEY] / test_total[POS_KEY],
        'loss': test_loss,
        'outputs': test_outputs
    }
    return results
# NOTE(review): truncated fragment of a test/dev output-writing loop — its
# enclosing function header and the tail of the final `if` are not visible
# here, so the code is left untouched. It appears to mirror evaluate_raw's
# accuracy/OOV bookkeeping but tags from raw word indices instead of chars
# (model.tag_sentence(instance.sentence)) — confirm against the full file.
with open("{}/{}out.txt".format(options.out_dir, devortest), 'w') as test_writer: for instance in bar(t_instances): if len(instance.sentence) == 0: continue gold_tags = instance.tags for att in model.attributes: if att not in instance.tags: gold_tags[att] = [t2is[att][NONE_TAG]] * len(instance.sentence) out_tags_set = model.tag_sentence(instance.sentence) gold_strings = utils.morphotag_strings(i2ts, gold_tags, not options.all_same_col) obs_strings = utils.morphotag_strings(i2ts, out_tags_set, not options.all_same_col) for g, o in zip(gold_strings, obs_strings): f1_eval.add_instance(utils.split_tagstring(g, has_pos=True), utils.split_tagstring(o, has_pos=True)) for att, tags in gold_tags.items(): out_tags = out_tags_set[att] oov_strings = [] for word, gold, out in zip(instance.sentence, tags, out_tags): if gold == out: test_correct[att] += 1 else: # Got the wrong tag total_wrong[att] += 1 if i2w[word] not in training_vocab: total_wrong_oov[att] += 1 if i2w[word] not in training_vocab:
def read_file(filename, w2i, t2is, c2i, options):
    """
    Read in a dataset and turn it into a list of instances.

    Modifies the w2i, t2is and c2i dicts in place, adding new
    words/attributes/tags/chars as it sees them.

    :param filename: path to a UTF-8, tab-separated CoNLL-U style file
    :param w2i: word -> index dict (extended in place)
    :param t2is: dict of attribute -> {tag -> index} (extended in place;
        a POS_KEY table is created if missing)
    :param c2i: character -> index dict (extended in place)
    :param options: parsed options; reads .ud_tags and .no_morphotags
    :return: (instances, vocab_counter) where vocab_counter counts word tokens
    """
    # populate mandatory t2i tables
    if POS_KEY not in t2is:
        t2is[POS_KEY] = {}

    # build dataset
    instances = []
    vocab_counter = collections.Counter()
    with codecs.open(filename, "r", "utf-8") as f:
        # running sentence buffers (lines are tokens)
        sentence = []
        tags = defaultdict(list)

        # main file reading loop
        for i, line in enumerate(f):
            # discard comments
            if line.startswith("#"):
                continue
            # parse sentence end
            elif line.isspace():
                # pad tag lists to sentence end
                slen = len(sentence)
                for seq in tags.values():
                    if len(seq) < slen:
                        # 0 guaranteed below to represent NONE_TAG
                        seq.extend([0] * (slen - len(seq)))
                # add sentence to dataset
                instances.append(Instance(sentence, tags))
                sentence = []
                tags = defaultdict(list)
            else:
                # parse token information in line
                data = line.split("\t")
                if '-' in data[0]:
                    # Some UD languages have contractions on a separate line,
                    # we don't want to include them also
                    continue
                try:
                    idx = int(data[0])
                except ValueError:
                    # BUGFIX: narrowed from a bare except, which also
                    # swallowed KeyboardInterrupt/SystemExit.
                    continue
                word = data[1]
                postag = data[3] if options.ud_tags else data[4]
                morphotags = {} if options.no_morphotags else split_tagstring(
                    data[5], uni_key=False)

                # ensure counts and dictionary population
                vocab_counter[word] += 1
                if word not in w2i:
                    w2i[word] = len(w2i)
                pt2i = t2is[POS_KEY]
                if postag not in pt2i:
                    pt2i[postag] = len(pt2i)
                for c in word:
                    if c not in c2i:
                        c2i[c] = len(c2i)
                for key, val in morphotags.items():
                    if key not in t2is:
                        t2is[key] = {NONE_TAG: 0}
                    mt2i = t2is[key]
                    if val not in mt2i:
                        mt2i[val] = len(mt2i)

                # add data to sentence buffer
                sentence.append(w2i[word])
                tags[POS_KEY].append(t2is[POS_KEY][postag])
                for k, v in morphotags.items():
                    mtags = tags[k]
                    # pad backwards to latest seen
                    missing_tags = idx - len(mtags) - 1
                    # 0 guaranteed above to represent NONE_TAG
                    mtags.extend([0] * missing_tags)
                    mtags.append(t2is[k][v])

        # BUGFIX: flush the last sentence when the file does not end with a
        # blank line; previously it was silently dropped.
        if sentence:
            slen = len(sentence)
            for seq in tags.values():
                if len(seq) < slen:
                    seq.extend([0] * (slen - len(seq)))
            instances.append(Instance(sentence, tags))

    return instances, vocab_counter