Code example #1
File: train_ud.py  Project: geovedi/spaCy
def score_model(vocab, tagger, parser, gold_docs, verbose=False):
    scorer = Scorer()
    for _, gold_doc in gold_docs:
        for (ids, words, tags, heads, deps, entities), _ in gold_doc:
            doc = Doc(vocab, words=words)
            tagger(doc)
            parser(doc)
            PseudoProjectivity.deprojectivize(doc)
            gold = GoldParse(doc, tags=tags, heads=heads, deps=deps)
            scorer.score(doc, gold, verbose=verbose)
    return scorer
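
These snippets omit their imports. A minimal header that makes score_model() runnable, assuming the spaCy 1.x-era module layout these examples target (the paths changed in later releases, so treat them as assumptions):

# Assumed imports for the score_model() snippets (spaCy 1.x-era layout).
from spacy.vocab import Vocab
from spacy.tokens import Doc
from spacy.tagger import Tagger
from spacy.scorer import Scorer
from spacy.gold import GoldParse
from spacy.syntax.nonproj import PseudoProjectivity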
Code example #2
File: test_nonproj.py  Project: Christo44/spaCy
def deprojectivize(proj_heads, deco_labels, EN):
    slen = len(proj_heads)
    sent = EN.tokenizer.tokens_from_list(["whatever"] * slen)
    rel_proj_heads = [head - i for i, head in enumerate(proj_heads)]
    labelids = [EN.vocab.strings[label] for label in deco_labels]
    pairs = list(zip(rel_proj_heads, labelids))
    parse = numpy.asarray(pairs, dtype=numpy.int32)
    sent.from_array([HEAD, DEP], parse)
    PseudoProjectivity.deprojectivize(sent)
    parse = sent.to_array([HEAD, DEP])
    deproj_heads = [i + head for i, head in enumerate(parse[:, 0])]
    undeco_labels = [EN.vocab.strings[int(labelid)] for labelid in parse[:, 1]]
    return deproj_heads, undeco_labels
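
The helper round-trips a parse through Doc.from_array()/Doc.to_array(): HEAD values are stored relative to each token (head - i) and DEP values are string-store IDs. A dependency-free sanity check of that relative-head encoding, using the three-token tree from the tests further down:

# The HEAD column holds offsets: absolute head index minus token index.
# deprojectivize() above converts back by adding the index again.
proj_heads = [1, 2, 2]                       # absolute heads; the root points to itself
rel_heads = [h - i for i, h in enumerate(proj_heads)]
assert rel_heads == [1, 1, 0]
assert [i + r for i, r in enumerate(rel_heads)] == proj_heads  # round-trip restores heads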
Code example #3
def main(train_loc, dev_loc, model_dir, tag_map_loc=None):
    if tag_map_loc:
        with open(tag_map_loc) as file_:
            tag_map = json.loads(file_.read())
    else:
        tag_map = DEFAULT_TAG_MAP
    train_sents = list(read_conllx(train_loc))
    train_sents = PseudoProjectivity.preprocess_training_data(train_sents)

    actions = ArcEager.get_actions(gold_parses=train_sents)
    features = get_templates('basic')

    model_dir = pathlib.Path(model_dir)
    if not (model_dir / 'deps').exists():
        (model_dir / 'deps').mkdir()
    with (model_dir / 'deps' / 'config.json').open('wb') as file_:
        file_.write(
            json.dumps(
                {'pseudoprojective': True, 'labels': actions, 'features': features}).encode('utf8'))
    vocab = Vocab(lex_attr_getters=Language.Defaults.lex_attr_getters, tag_map=tag_map)
    # Populate vocab
    for _, doc_sents in train_sents:
        for (ids, words, tags, heads, deps, ner), _ in doc_sents:
            for word in words:
                _ = vocab[word]
            for dep in deps:
                _ = vocab[dep]
            for tag in tags:
                _ = vocab[tag]
            if tag_map:
                for tag in tags:
                    assert tag in tag_map, repr(tag)
    tagger = Tagger(vocab, tag_map=tag_map)
    parser = DependencyParser(vocab, actions=actions, features=features, L1=0.0)

    for itn in range(15):
        loss = 0.
        for _, doc_sents in train_sents:
            for (ids, words, tags, heads, deps, ner), _ in doc_sents:
                doc = Doc(vocab, words=words)
                gold = GoldParse(doc, tags=tags, heads=heads, deps=deps)
                tagger(doc)
                loss += parser.update(doc, gold, itn=itn)
                doc = Doc(vocab, words=words)
                tagger.update(doc, gold)
        random.shuffle(train_sents)
        scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc))
        print('%d:\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.tags_acc))
    nlp = Language(vocab=vocab, tagger=tagger, parser=parser)
    nlp.end_training(model_dir)
    scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc))
    print('%d:\t%.3f\t%.3f\t%.3f' % (itn, scorer.uas, scorer.las, scorer.tags_acc))
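
Both train_ud.py variants lean on a read_conllx() helper that is not shown. Below is a hypothetical sketch of what it must yield, namely (raw_text, sentences) pairs whose sentences are ((ids, words, tags, heads, deps, ents), brackets) tuples matching the loops above; the column choices are assumptions about the CoNLL-X/U layout:

# Hypothetical reader; yields gold tuples in the nested shape the training
# loops above unpack. The real helper may differ in detail.
def read_conllx(loc):
    with open(loc, encoding='utf8') as file_:
        text = file_.read()
    for sent in text.strip().split('\n\n'):
        lines = [l for l in sent.split('\n') if l and not l.startswith('#')]
        ids, words, tags, heads, deps, ents = [], [], [], [], [], []
        for line in lines:
            cols = line.split('\t')
            if '-' in cols[0]:                 # skip multi-word token ranges
                continue
            i = int(cols[0]) - 1
            ids.append(i)
            words.append(cols[1])
            tags.append(cols[4])               # fine-grained tag column (an assumption)
            heads.append(int(cols[6]) - 1 if cols[6] != '0' else i)  # root points to itself
            deps.append(cols[7])
            ents.append('O')                   # UD data carries no NER annotation
        yield None, [((ids, words, tags, heads, deps, ents), [])]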
Code example #4
File: train_ud.py  Project: adamhadani/spaCy
def main(train_loc, dev_loc, model_dir, tag_map_loc):
    with open(tag_map_loc) as file_:
        tag_map = json.loads(file_.read())
    train_sents = list(read_conllx(train_loc))
    train_sents = PseudoProjectivity.preprocess_training_data(train_sents)

    actions = ArcEager.get_actions(gold_parses=train_sents)
    features = get_templates('basic')
    
    model_dir = pathlib.Path(model_dir)
    # create the deps directory if needed before writing the config
    if not (model_dir / 'deps').exists():
        (model_dir / 'deps').mkdir()
    with (model_dir / 'deps' / 'config.json').open('w') as file_:
        json.dump({'pseudoprojective': True, 'labels': actions, 'features': features}, file_)

    vocab = Vocab(lex_attr_getters=Language.Defaults.lex_attr_getters, tag_map=tag_map)
    # Populate vocab
    for _, doc_sents in train_sents:
        for (ids, words, tags, heads, deps, ner), _ in doc_sents:
            for word in words:
                _ = vocab[word]
            for dep in deps:
                _ = vocab[dep]
            for tag in tags:
                _ = vocab[tag]
            for tag in tags:
                assert tag in tag_map, repr(tag)
    tagger = Tagger(vocab, tag_map=tag_map)
    parser = DependencyParser(vocab, actions=actions, features=features)
    
    for itn in range(15):
        for _, doc_sents in train_sents:
            for (ids, words, tags, heads, deps, ner), _ in doc_sents:
                doc = Doc(vocab, words=words)
                gold = GoldParse(doc, tags=tags, heads=heads, deps=deps)
                tagger(doc)
                parser.update(doc, gold)
                doc = Doc(vocab, words=words)
                tagger.update(doc, gold)
        random.shuffle(train_sents)
        scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc))
        print('%d:\t%.3f\t%.3f' % (itn, scorer.uas, scorer.tags_acc))
    nlp = Language(vocab=vocab, tagger=tagger, parser=parser)
    nlp.end_training(model_dir)
    scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc))
    print('%d:\t%.3f\t%.3f\t%.3f' % (itn, scorer.uas, scorer.las, scorer.tags_acc))
Code example #5
File: train_ud.py  Project: geovedi/spaCy
def main(lang_name, train_loc, dev_loc, model_dir, clusters_loc=None):
    LangClass = spacy.util.get_lang_class(lang_name)
    train_sents = list(read_conllx(train_loc))
    train_sents = PseudoProjectivity.preprocess_training_data(train_sents)

    actions = ArcEager.get_actions(gold_parses=train_sents)
    features = get_templates('basic')

    model_dir = pathlib.Path(model_dir)
    if not model_dir.exists():
        model_dir.mkdir()
    if not (model_dir / 'deps').exists():
        (model_dir / 'deps').mkdir()
    if not (model_dir / 'pos').exists():
        (model_dir / 'pos').mkdir()
    with (model_dir / 'deps' / 'config.json').open('wb') as file_:
        file_.write(
            json.dumps(
                {'pseudoprojective': True, 'labels': actions, 'features': features}).encode('utf8'))

    vocab = LangClass.Defaults.create_vocab()
    if not (model_dir / 'vocab').exists():
        (model_dir / 'vocab').mkdir()
    else:
        if (model_dir / 'vocab' / 'strings.json').exists():
            with (model_dir / 'vocab' / 'strings.json').open() as file_:
                vocab.strings.load(file_)
            if (model_dir / 'vocab' / 'lexemes.bin').exists():
                vocab.load_lexemes(model_dir / 'vocab' / 'lexemes.bin')

    if clusters_loc is not None:
        clusters_loc = pathlib.Path(clusters_loc)
        with clusters_loc.open() as file_:
            for line in file_:
                try:
                    cluster, word, freq = line.split()
                except ValueError:
                    continue
                lex = vocab[word]
                lex.cluster = int(cluster[::-1], 2)
    # Populate vocab
    for _, doc_sents in train_sents:
        for (ids, words, tags, heads, deps, ner), _ in doc_sents:
            for word in words:
                _ = vocab[word]
            for dep in deps:
                _ = vocab[dep]
            for tag in tags:
                _ = vocab[tag]
            if vocab.morphology.tag_map:
                for tag in tags:
                    assert tag in vocab.morphology.tag_map, repr(tag)
    tagger = Tagger(vocab)
    parser = DependencyParser(vocab, actions=actions, features=features, L1=0.0)

    for itn in range(30):
        loss = 0.
        for _, doc_sents in train_sents:
            for (ids, words, tags, heads, deps, ner), _ in doc_sents:
                doc = Doc(vocab, words=words)
                gold = GoldParse(doc, tags=tags, heads=heads, deps=deps)
                tagger(doc)
                loss += parser.update(doc, gold, itn=itn)
                doc = Doc(vocab, words=words)
                tagger.update(doc, gold)
        random.shuffle(train_sents)
        scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc))
        print('%d:\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.tags_acc))
    nlp = LangClass(vocab=vocab, tagger=tagger, parser=parser)
    nlp.end_training(model_dir)
    scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc))
    print('%d:\t%.3f\t%.3f\t%.3f' % (itn, scorer.uas, scorer.las, scorer.tags_acc))
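
The clusters file is parsed as Brown-cluster output, one "<bit-string> <word> <freq>" triple per line, with malformed lines skipped. The snippet does not say why the bit string is reversed before int(..., 2); a plausible reading (an assumption) is that reversing puts the root of the cluster tree in the low bits, so shorter prefixes survive integer truncation. The parsing itself is easy to verify:

line = '0010 the 419648'              # a typical Brown-cluster line
cluster, word, freq = line.split()
assert int(cluster[::-1], 2) == 4     # '0010' reversed is '0100', i.e. 4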
Code example #6
File: train.py  Project: anukat2015/spaCy
def train(Language,
          gold_tuples,
          model_dir,
          n_iter=15,
          feat_set=u'basic',
          seed=0,
          gold_preproc=False,
          n_sents=0,
          corruption_level=0,
          beam_width=1,
          verbose=False,
          use_orig_arc_eager=False,
          pseudoprojective=False):
    dep_model_dir = path.join(model_dir, 'deps')
    ner_model_dir = path.join(model_dir, 'ner')
    pos_model_dir = path.join(model_dir, 'pos')
    if path.exists(dep_model_dir):
        shutil.rmtree(dep_model_dir)
    if path.exists(ner_model_dir):
        shutil.rmtree(ner_model_dir)
    if path.exists(pos_model_dir):
        shutil.rmtree(pos_model_dir)
    os.mkdir(dep_model_dir)
    os.mkdir(ner_model_dir)
    os.mkdir(pos_model_dir)

    if pseudoprojective:
        # preprocess training data here before ArcEager.get_labels() is called
        gold_tuples = PseudoProjectivity.preprocess_training_data(gold_tuples)

    Config.write(dep_model_dir,
                 'config',
                 features=feat_set,
                 seed=seed,
                 labels=ArcEager.get_labels(gold_tuples),
                 beam_width=beam_width,
                 projectivize=pseudoprojective)
    Config.write(ner_model_dir,
                 'config',
                 features='ner',
                 seed=seed,
                 labels=BiluoPushDown.get_labels(gold_tuples),
                 beam_width=0)

    if n_sents > 0:
        gold_tuples = gold_tuples[:n_sents]

    nlp = Language(data_dir=model_dir,
                   tagger=False,
                   parser=False,
                   entity=False)
    if nlp.lang == 'de':
        nlp.vocab.morphology.lemmatizer = lambda string, pos: set([string])
    nlp.tagger = Tagger.blank(nlp.vocab, Tagger.default_templates())
    nlp.parser = Parser.from_dir(dep_model_dir, nlp.vocab.strings, ArcEager)
    nlp.entity = Parser.from_dir(ner_model_dir, nlp.vocab.strings,
                                 BiluoPushDown)
    print("Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %")
    for itn in range(n_iter):
        scorer = Scorer()
        loss = 0
        for raw_text, sents in gold_tuples:
            if gold_preproc:
                raw_text = None
            else:
                sents = _merge_sents(sents)
            for annot_tuples, ctnt in sents:
                if len(annot_tuples[1]) == 1:
                    continue
                score_model(scorer,
                            nlp,
                            raw_text,
                            annot_tuples,
                            verbose=verbose if itn >= 2 else False)
                if raw_text is None:
                    words = add_noise(annot_tuples[1], corruption_level)
                    tokens = nlp.tokenizer.tokens_from_list(words)
                else:
                    raw_text = add_noise(raw_text, corruption_level)
                    tokens = nlp.tokenizer(raw_text)
                nlp.tagger(tokens)
                gold = GoldParse(tokens, annot_tuples)
                if not gold.is_projective:
                    raise Exception("Non-projective sentence in training: %s" %
                                    annot_tuples)
                loss += nlp.parser.train(tokens, gold)
                nlp.entity.train(tokens, gold)
                nlp.tagger.train(tokens, gold.tags)
        random.shuffle(gold_tuples)
        print('%d:\t%d\t%.3f\t%.3f\t%.3f\t%.3f' %
              (itn, loss, scorer.uas, scorer.ents_f, scorer.tags_acc,
               scorer.token_acc))
    print('end training')
    nlp.end_training(model_dir)
    print('done')
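
train() takes a language class rather than an instance, plus gold tuples in the same nested format as above. A hypothetical invocation under spaCy 1.x (the loader name is an assumption; English was importable as spacy.en.English in that era):

# Hypothetical call; load_treebank() is a placeholder for whatever produces
# gold tuples in the (raw_text, [(annot_tuples, ctnt), ...]) shape.
from spacy.en import English

gold_tuples = list(load_treebank('train.json'))
train(English, gold_tuples, 'models/en',
      n_iter=15, pseudoprojective=True)   # projectivize before label extraction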
Code example #7
File: test_nonproj.py  Project: Christo44/spaCy
def test_pseudoprojectivity(EN):
    tree = [1, 2, 2]
    nonproj_tree = [1, 2, 2, 4, 5, 2, 7, 4, 2]
    labels = ["det", "nsubj", "root", "det", "dobj", "aux", "nsubj", "acl", "punct"]
    nonproj_tree2 = [9, 1, 3, 1, 5, 6, 9, 8, 6, 1, 6, 12, 13, 10, 1]
    labels2 = [
        "advmod",
        "root",
        "det",
        "nsubj",
        "advmod",
        "det",
        "dobj",
        "det",
        "nmod",
        "aux",
        "nmod",
        "advmod",
        "det",
        "amod",
        "punct",
    ]

    assert PseudoProjectivity.decompose("X||Y") == ("X", "Y")
    assert PseudoProjectivity.decompose("X") == ("X", "")

    assert PseudoProjectivity.is_decorated("X||Y") == True
    assert PseudoProjectivity.is_decorated("X") == False

    PseudoProjectivity._lift(0, tree)
    assert tree == [2, 2, 2]

    np_arc = PseudoProjectivity._get_smallest_nonproj_arc(nonproj_tree)
    assert np_arc == 7

    np_arc = PseudoProjectivity._get_smallest_nonproj_arc(nonproj_tree2)
    assert np_arc == 10

    proj_heads, deco_labels = PseudoProjectivity.projectivize(nonproj_tree, labels)
    assert proj_heads == [1, 2, 2, 4, 5, 2, 7, 5, 2]
    assert deco_labels == ["det", "nsubj", "root", "det", "dobj", "aux", "nsubj", "acl||dobj", "punct"]
    deproj_heads, undeco_labels = deprojectivize(proj_heads, deco_labels, EN)
    assert deproj_heads == nonproj_tree
    assert undeco_labels == labels

    proj_heads, deco_labels = PseudoProjectivity.projectivize(nonproj_tree2, labels2)
    assert proj_heads == [1, 1, 3, 1, 5, 6, 9, 8, 6, 1, 9, 12, 13, 10, 1]
    assert deco_labels == [
        "advmod||aux",
        "root",
        "det",
        "nsubj",
        "advmod",
        "det",
        "dobj",
        "det",
        "nmod",
        "aux",
        "nmod||dobj",
        "advmod",
        "det",
        "amod",
        "punct",
    ]
    deproj_heads, undeco_labels = deprojectivize(proj_heads, deco_labels, EN)
    assert deproj_heads == nonproj_tree2
    assert undeco_labels == labels2

    # if decoration is wrong such that there is no head with the desired label
    # the structure is kept and the label is undecorated
    proj_heads = [1, 2, 2, 4, 5, 2, 7, 5, 2]
    deco_labels = ["det", "nsubj", "root", "det", "dobj", "aux", "nsubj", "acl||iobj", "punct"]
    deproj_heads, undeco_labels = deprojectivize(proj_heads, deco_labels, EN)
    assert deproj_heads == proj_heads
    assert undeco_labels == ["det", "nsubj", "root", "det", "dobj", "aux", "nsubj", "acl", "punct"]

    # if there are two potential new heads, the first one is chosen even if it's wrong
    proj_heads = [1, 1, 3, 1, 5, 6, 9, 8, 6, 1, 9, 12, 13, 10, 1]
    deco_labels = [
        "advmod||aux",
        "root",
        "det",
        "aux",
        "advmod",
        "det",
        "dobj",
        "det",
        "nmod",
        "aux",
        "nmod||dobj",
        "advmod",
        "det",
        "amod",
        "punct",
    ]
    deproj_heads, undeco_labels = deprojectivize(proj_heads, deco_labels, EN)
    assert deproj_heads == [3, 1, 3, 1, 5, 6, 9, 8, 6, 1, 6, 12, 13, 10, 1]
    assert undeco_labels == [
        "advmod",
        "root",
        "det",
        "aux",
        "advmod",
        "det",
        "dobj",
        "det",
        "nmod",
        "aux",
        "nmod",
        "advmod",
        "det",
        "amod",
        "punct",
    ]
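
The trees in this test are plain head-index lists, so projectivity can be checked without spaCy: a tree is non-projective when two of its arcs cross. A standalone check (the helper name is hypothetical) that agrees with the fixtures above:

def is_projective(heads):
    # Treat each dependency as the span (min(i, head), max(i, head)); the tree
    # is projective iff no two spans cross. The root (head == i) is skipped.
    arcs = [(min(i, h), max(i, h)) for i, h in enumerate(heads) if i != h]
    return not any(l1 < l2 < r1 < r2 for l1, r1 in arcs for l2, r2 in arcs)

assert is_projective([1, 2, 2])                        # `tree` above
assert not is_projective([1, 2, 2, 4, 5, 2, 7, 4, 2])  # `nonproj_tree` above
assert is_projective([1, 2, 2, 4, 5, 2, 7, 5, 2])      # its projectivized heads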
Code example #8
File: train.py  Project: Develer/spaCy
def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
          seed=0, gold_preproc=False, n_sents=0, corruption_level=0,
          beam_width=1, verbose=False,
          use_orig_arc_eager=False, pseudoprojective=False):
    dep_model_dir = path.join(model_dir, 'deps')
    ner_model_dir = path.join(model_dir, 'ner')
    pos_model_dir = path.join(model_dir, 'pos')
    if path.exists(dep_model_dir):
        shutil.rmtree(dep_model_dir)
    if path.exists(ner_model_dir):
        shutil.rmtree(ner_model_dir)
    if path.exists(pos_model_dir):
        shutil.rmtree(pos_model_dir)
    os.mkdir(dep_model_dir)
    os.mkdir(ner_model_dir)
    os.mkdir(pos_model_dir)

    if pseudoprojective:
        # preprocess training data here before ArcEager.get_labels() is called
        gold_tuples = PseudoProjectivity.preprocess_training_data(gold_tuples)

    Config.write(dep_model_dir, 'config', features=feat_set, seed=seed,
                 labels=ArcEager.get_labels(gold_tuples),
                 beam_width=beam_width,projectivize=pseudoprojective)
    Config.write(ner_model_dir, 'config', features='ner', seed=seed,
                 labels=BiluoPushDown.get_labels(gold_tuples),
                 beam_width=0)

    if n_sents > 0:
        gold_tuples = gold_tuples[:n_sents]

    nlp = Language(data_dir=model_dir, tagger=False, parser=False, entity=False)
    if nlp.lang == 'de':
        nlp.vocab.morphology.lemmatizer = lambda string,pos: set([string])
    nlp.tagger = Tagger.blank(nlp.vocab, Tagger.default_templates())
    nlp.parser = Parser.from_dir(dep_model_dir, nlp.vocab.strings, ArcEager)
    nlp.entity = Parser.from_dir(ner_model_dir, nlp.vocab.strings, BiluoPushDown)
    print("Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %")
    for itn in range(n_iter):
        scorer = Scorer()
        loss = 0
        for raw_text, sents in gold_tuples:
            if gold_preproc:
                raw_text = None
            else:
                sents = _merge_sents(sents)
            for annot_tuples, ctnt in sents:
                if len(annot_tuples[1]) == 1:
                    continue
                score_model(scorer, nlp, raw_text, annot_tuples,
                            verbose=verbose if itn >= 2 else False)
                if raw_text is None:
                    words = add_noise(annot_tuples[1], corruption_level)
                    tokens = nlp.tokenizer.tokens_from_list(words)
                else:
                    raw_text = add_noise(raw_text, corruption_level)
                    tokens = nlp.tokenizer(raw_text)
                nlp.tagger(tokens)
                gold = GoldParse(tokens, annot_tuples)
                if not gold.is_projective:
                    raise Exception("Non-projective sentence in training: %s" % annot_tuples[1])
                loss += nlp.parser.train(tokens, gold)
                nlp.entity.train(tokens, gold)
                nlp.tagger.train(tokens, gold.tags)
        random.shuffle(gold_tuples)
        print('%d:\t%d\t%.3f\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.ents_f,
                                                   scorer.tags_acc,
                                                   scorer.token_acc))
    print('end training')
    nlp.end_training(model_dir)
    print('done')