Code Example #1
def read_file(filename, w2i, t2i, mt2i, c2i):
    """
    Read in a dataset and turn it into a list of instances.
    Modifies the w2i, t2i, mt2i and c2i dicts, adding new words/tags/chars
    as it sees them.
    """
    instances = []
    vocab_counter = collections.Counter()
    with codecs.open(filename, "r", "utf-8") as f:
        sentence = []
        tags = []
        mtags = []
        for i, line in enumerate(f):
            if line.startswith("#"):
                continue  # Some files like Italian have comments
            elif line.isspace():
                # Reached the end of a sentence
                instances.append(Instance(sentence, tags, mtags))
                sentence = []
                tags = []
                mtags = []
            else:
                data = line.split("\t")
                if '-' in data[0]:
                    # Italian puts contractions on a separate line; skip them
                    continue
                word = data[1]
                tag = data[3] if options.ud_tags else data[4]
                morphotags = split_tagstring(data[5], uni_key=True) if options.morphotags else {}
                vocab_counter[word] += 1
                if word not in w2i:
                    w2i[word] = len(w2i)
                if tag not in t2i:
                    t2i[tag] = len(t2i)
                for c in word:
                    if c not in c2i:
                        c2i[c] = len(c2i)
                for mtag in morphotags:
                    if mtag not in mt2i:
                        mt2i[mtag] = len(mt2i)
                sentence.append(w2i[word])
                tags.append(t2i[tag])
                mtags.append([mt2i[t] for t in morphotags])
    return instances, vocab_counter
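
For context, a minimal usage sketch of this reader. `Instance`, `options`, and `split_tagstring` are module-level names in the original project; the stand-ins below are assumptions for illustration only.

import codecs        # needed by read_file above
import collections

# Hypothetical stand-ins for the project's module-level names; the real
# Instance and options objects live elsewhere in the original file.
Instance = collections.namedtuple("Instance", ["sentence", "tags", "mtags"])

class _Options:
    ud_tags = True      # read the universal POS column (data[3])
    morphotags = True   # also parse the FEATS column (data[5])

options = _Options()

# The index dicts are passed in empty and populated as a side effect.
w2i, t2i, mt2i, c2i = {}, {}, {}, {}
instances, vocab = read_file("train.conllu", w2i, t2i, mt2i, c2i)  # hypothetical path
print(len(instances), "sentences;", len(w2i), "word types")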
Code Example #2
File: test.py Project: ruyimarone/character-eyes
def evaluate_raw(model,
                 instances,
                 t2is,
                 c2i,
                 i2w,
                 i2ts,
                 training_vocab,
                 log_dir_name=None,
                 use_bar=False):
    logging.info("\n")
    logging.info("Number instances: {}".format(len(instances)))

    model.disable_dropout()
    test_correct = Counter()
    test_total = Counter()
    test_oov_total = Counter()
    test_loss = 0.0
    bar = progressbar.ProgressBar()
    total_wrong = Counter()
    total_wrong_oov = Counter()
    f1_eval = Evaluator(m='att')

    test_outputs = []

    for instance in (bar(instances) if use_bar else instances):
        if len(instance.sentence) == 0: continue

        gold_tags = instance.tags
        word_chars = get_word_chars(instance.sentence, i2w, c2i)

        for att in model.attributes:
            if att not in instance.tags:
                gold_tags[att] = [t2is[att][NONE_TAG]] * len(instance.sentence)

        losses = model.loss(word_chars, gold_tags)
        total_loss = sum(l.scalar_value() for l in losses.values())
        out_tags_set, _ = model.tag_sentence(word_chars)

        gold_strings = utils.morphotag_strings(i2ts, gold_tags)
        obs_strings = utils.morphotag_strings(i2ts, out_tags_set)

        for g, o in zip(gold_strings, obs_strings):
            f1_eval.add_instance(utils.split_tagstring(g, has_pos=True),
                                 utils.split_tagstring(o, has_pos=True))

        for att, tags in gold_tags.items():
            out_tags = out_tags_set[att]
            oov_strings = []

            for word, gold, out in zip(instance.sentence, tags, out_tags):
                if gold == out:
                    test_correct[att] += 1
                else:
                    # Got the wrong tag
                    total_wrong[att] += 1
                    if i2w[word] not in training_vocab:
                        total_wrong_oov[att] += 1

                if i2w[word] not in training_vocab:
                    test_oov_total[att] += 1
                    oov_strings.append("OOV")
                else:
                    oov_strings.append("")
            test_total[att] += len(tags)

        test_loss += (total_loss / len(instance.sentence))

        test_outputs.append("\n" + "\n".join(
            "\t".join(z) for z in zip([i2w[w] for w in instance.sentence],
                                      gold_strings, obs_strings, oov_strings)
        ) + "\n")

    test_loss = test_loss / len(instances)

    if log_dir_name:
        with open("{}/testout.txt".format(log_dir_name), 'w') as test_writer:
            for output in test_outputs:
                test_writer.write(output)

    attr_f1s = {}
    for attr in t2is.keys():
        attr_f1s[attr] = f1_eval.mic_f1(att=attr)

    results = {
        'pos_acc': test_correct[POS_KEY] / test_total[POS_KEY],
        'pos_oov_accuracy': (test_oov_total[POS_KEY] - total_wrong_oov[POS_KEY])
                            / test_oov_total[POS_KEY],
        'pos_wrong_oov': (total_wrong_oov[POS_KEY] / total_wrong[POS_KEY])
                         if total_wrong[POS_KEY] > 0 else None,
        'f1_scores': attr_f1s,
        'micro_f1': f1_eval.mic_f1(),
        'macro_f1': f1_eval.mac_f1(),
        'total_tokens': test_total[POS_KEY],
        'total_oov': test_oov_total[POS_KEY],
        'oov_percent': test_oov_total[POS_KEY] / test_total[POS_KEY],
        'loss': test_loss,
        'outputs': test_outputs
    }

    return results
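
A sketch of how the returned results dict might be consumed. The model, data, and index maps are assumed to be built by the surrounding project; only the dict keys below come from the code above.

# Hypothetical driver; all inputs are assumed to be loaded already.
results = evaluate_raw(model, instances, t2is, c2i, i2w, i2ts,
                       training_vocab, log_dir_name="logs", use_bar=True)

print("POS accuracy:     {:.4f}".format(results['pos_acc']))
print("POS OOV accuracy: {:.4f}".format(results['pos_oov_accuracy']))
print("micro / macro F1: {:.4f} / {:.4f}".format(results['micro_f1'],
                                                 results['macro_f1']))
print("OOV rate:         {:.2%}".format(results['oov_percent']))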
Code Example #3
File: test_model.py Project: thuanthan147/Mimick
with open("{}/{}out.txt".format(options.out_dir, devortest),
          'w') as test_writer:
    for instance in bar(t_instances):
        if len(instance.sentence) == 0: continue
        gold_tags = instance.tags
        for att in model.attributes:
            if att not in instance.tags:
                gold_tags[att] = [t2is[att][NONE_TAG]] * len(instance.sentence)
        out_tags_set = model.tag_sentence(instance.sentence)

        gold_strings = utils.morphotag_strings(i2ts, gold_tags,
                                               not options.all_same_col)
        obs_strings = utils.morphotag_strings(i2ts, out_tags_set,
                                              not options.all_same_col)
        for g, o in zip(gold_strings, obs_strings):
            f1_eval.add_instance(utils.split_tagstring(g, has_pos=True),
                                 utils.split_tagstring(o, has_pos=True))
        for att, tags in gold_tags.items():
            out_tags = out_tags_set[att]

            oov_strings = []
            for word, gold, out in zip(instance.sentence, tags, out_tags):
                if gold == out:
                    test_correct[att] += 1
                else:
                    # Got the wrong tag
                    total_wrong[att] += 1
                    if i2w[word] not in training_vocab:
                        total_wrong_oov[att] += 1

                if i2w[word] not in training_vocab:
                    # the source excerpt is truncated here; the continuation
                    # below mirrors the same loop in evaluate_raw above
                    test_oov_total[att] += 1
                    oov_strings.append("OOV")
                else:
                    oov_strings.append("")
            test_total[att] += len(tags)
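
The `split_tagstring` helper used throughout these excerpts is not shown. Below is a plausible minimal reconstruction, assuming the standard CoNLL-U FEATS format (`Att=Val|Att=Val`) and the keyword behavior implied by the call sites; treat the details as assumptions rather than the project's actual code.

def split_tagstring(s, uni_key=False, has_pos=False):
    """Hypothetical reconstruction: parse a UD FEATS-style string
    ("Case=Nom|Number=Sing") into attribute/value form."""
    if has_pos:
        s = s.split("\t")[-1]  # assumed: features follow a tab-separated POS field
    ret = [] if uni_key else {}
    if "=" not in s:  # "_" or an empty field: no features
        return ret
    for attval in s.split("|"):
        att, val = attval.strip().split("=", 1)
        if uni_key:
            ret.append(att + "=" + val)  # fused "Att=Val" keys, as example #1 expects
        else:
            ret[att] = val  # plain attribute -> value dict, as example #4 expects
    return ret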
Code Example #4
File: make_dataset.py Project: thuanthan147/Mimick
def read_file(filename, w2i, t2is, c2i, options):
    """
    Read in a dataset and turn it into a list of instances.
    Modifies the w2i, t2is and c2i dicts, adding new words/attributes/tags/chars
    as it sees them.
    """

    # populate mandatory t2i tables
    if POS_KEY not in t2is:
        t2is[POS_KEY] = {}

    # build dataset
    instances = []
    vocab_counter = collections.Counter()
    with codecs.open(filename, "r", "utf-8") as f:

        # running sentence buffers (lines are tokens)
        sentence = []
        tags = defaultdict(list)

        # main file reading loop
        for i, line in enumerate(f):

            # discard comments
            if line.startswith("#"):
                continue

            # parse sentence end
            elif line.isspace():

                # pad tag lists to sentence end
                slen = len(sentence)
                for seq in tags.values():
                    if len(seq) < slen:
                        seq.extend([0] * (slen - len(seq)))  # 0 guaranteed below to represent NONE_TAG

                # add sentence to dataset
                instances.append(Instance(sentence, tags))
                sentence = []
                tags = defaultdict(list)

            else:

                # parse token information in line
                data = line.split("\t")
                if '-' in data[0]:
                    # Some UD languages put contractions on a separate line; skip them
                    continue
                try:
                    idx = int(data[0])
                except ValueError:  # non-integer ID (e.g. "8.1" empty nodes); skip
                    continue
                word = data[1]
                postag = data[3] if options.ud_tags else data[4]
                morphotags = {} if options.no_morphotags else split_tagstring(data[5], uni_key=False)

                # ensure counts and dictionary population
                vocab_counter[word] += 1
                if word not in w2i:
                    w2i[word] = len(w2i)
                pt2i = t2is[POS_KEY]
                if postag not in pt2i:
                    pt2i[postag] = len(pt2i)
                for c in word:
                    if c not in c2i:
                        c2i[c] = len(c2i)
                for key, val in morphotags.items():
                    if key not in t2is:
                        t2is[key] = {NONE_TAG: 0}
                    mt2i = t2is[key]
                    if val not in mt2i:
                        mt2i[val] = len(mt2i)

                # add data to sentence buffer
                sentence.append(w2i[word])
                tags[POS_KEY].append(t2is[POS_KEY][postag])
                for k, v in morphotags.items():
                    mtags = tags[k]
                    # pad backwards to latest seen
                    missing_tags = idx - len(mtags) - 1
                    mtags.extend([0] * missing_tags)  # 0 guaranteed above to represent NONE_TAG
                    mtags.append(t2is[k][v])

    return instances, vocab_counter
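
The back-padding at the end of read_file is the subtle part: a feature's tag list grows only when that feature occurs, so it is padded with index 0 (NONE_TAG) up to the current token, and again out to full sentence length at sentence end. A small standalone trace of that logic, with made-up tag indices:

from collections import defaultdict

NONE_TAG_IDX = 0  # index 0 is reserved for NONE_TAG, as in read_file
tags = defaultdict(list)

# Suppose the feature "Case" occurs only on tokens 2 and 5 (1-based idx),
# mapping to the made-up tag indices 3 and 7.
for idx, tag_idx in [(2, 3), (5, 7)]:
    mtags = tags["Case"]
    mtags.extend([NONE_TAG_IDX] * (idx - len(mtags) - 1))  # pad back to idx-1
    mtags.append(tag_idx)

# Sentence-end padding out to the full sentence length (6 tokens):
slen = 6
for seq in tags.values():
    seq.extend([NONE_TAG_IDX] * (slen - len(seq)))

print(tags["Case"])  # -> [0, 3, 0, 0, 7, 0]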