Esempio n. 1
0
def get_features(ptree: nltk.ParentedTree, dtree: List[DepRel], indices, sense, offset):
    """Build one feature dict per token of a parsed sentence.

    Args:
        ptree: Constituency tree of the sentence; the tokens and their POS
            tags are read from ``ptree.pos()``.
        dtree: Dependency relations of the sentence. The position of the
            first relation whose ``rel`` equals ``'root'`` (case-insensitive)
            is used as the main-verb token index; 0 is used when no such
            relation exists.
            NOTE(review): this assumes ``dtree`` is aligned one-to-one with
            the leaves of ``ptree`` -- confirm against the caller.
        indices: Container of ``offset``-based token indices; tokens whose
            index is in it get a non-empty ``conn`` feature.
        sense: Sense label; the part before the first ``'.'`` becomes the
            ``conn`` feature of the tokens listed in ``indices``. It is only
            accessed for those tokens.
        offset: Index assigned to the first token of this sentence; token
            ``i`` gets ``idx = offset + i``.

    Returns:
        A list with one feature dict per token, in sentence order. A tree
        without leaves yields an empty list.

    Raises:
        IndexError: If the root position found in ``dtree`` lies beyond the
            last token of ``ptree``.
    """
    # Traverse the tree once; the tagged tokens are needed both for the
    # main-verb lookup and for the per-token loop.
    tagged_tokens = ptree.pos()
    if not tagged_tokens:
        # Nothing to describe. Returning early also avoids indexing into an
        # empty token list during the main-verb lookup below.
        return []

    mv_position = next(
        (pos for pos, drel in enumerate(dtree) if drel.rel.lower() == 'root'),
        0,
    )
    # Loop-invariant: the lemma of the main verb is shared by every token.
    main_verb = lemmatizer.lemmatize(tagged_tokens[mv_position][0]).lower()

    features_sentence = []
    for i, (word, tag) in enumerate(tagged_tokens):
        # Drop the last two path steps (the leaf itself and its POS node) so
        # that the path ends at the node directly above the POS node.
        tree_pos = ptree.treeposition_spanning_leaves(i, i + 1)[:-2]
        # Labels along that path, starting one level below the root of ptree.
        chain = [ptree[tree_pos[:depth]].label() for depth in range(1, len(tree_pos) + 1)]
        chain = ['S' if label == 'SBAR' else label for label in chain]
        if chain:
            chain = "-".join(get_compressed_chain(chain))
        # NOTE: when the path is empty, ``chain`` intentionally stays an empty
        # list (not a string) so the emitted feature value is unchanged.
        stem = stemmer.stem(word).lower()

        features_sentence.append({
            'idx': offset + i,
            'BOS': i == 0,
            'word': word.lower(),
            'pos': tag,
            'lemma': lemmatizer.lemmatize(word).lower(),
            'stem': stem,
            'chain': chain,
            # ``sense`` is only evaluated for marked tokens.
            'conn': sense.split('.')[0] if offset + i in indices else "",
            # Whatever follows the first len(stem) characters of the word.
            'inflection': word[len(stem):],
            'is_main_verb': i == mv_position,
            'main_verb': main_verb,
        })
    return features_sentence
Esempio n. 2
0
def get_features(ptree: nltk.ParentedTree, conn_idxs: List[int]):
    """Describe every token of ``ptree`` with a small feature dict.

    Args:
        ptree: Parse tree whose ``pos()`` method supplies the
            ``(word, tag)`` pairs of the sentence.
        conn_idxs: Token positions (0-based within ``ptree``) whose ``conn``
            feature should be ``True``.

    Returns:
        A list with one dict per token, in sentence order, holding the keys
        ``BOS``, ``word``, ``pos``, ``lemma``, ``stem`` and ``conn``.
    """
    return [
        {
            'BOS': position == 0,
            'word': token,
            'pos': pos_tag,
            'lemma': lemmatizer.lemmatize(token),
            'stem': stemmer.stem(token),
            'conn': position in conn_idxs,
        }
        for position, (token, pos_tag) in enumerate(ptree.pos())
    ]