def build_vocab(dataset):
    """Build word and part-of-speech vocabularies from a dataset.

    ``dataset`` is iterated two levels deep: each of its elements is itself
    an iterable of paragraph objects, and every paragraph must expose an
    ``edus()`` method yielding units that carry ``words`` and ``tags``
    attributes (presumably token and POS-tag sequences -- confirm against
    the EDU definition).

    Returns:
        A ``(word_vocab, pos_vocab)`` tuple of ``Vocab`` objects built from
        the accumulated frequency counters.
    """
    word_counts, tag_counts = Counter(), Counter()
    for group in dataset:
        for paragraph in group:
            for unit in paragraph.edus():
                word_counts.update(unit.words)
                tag_counts.update(unit.tags)
    return Vocab("word", word_counts), Vocab("part of speech", tag_counts)
Beispiel #2
0
def build_vocab(dataset):
    """Build word, part-of-speech and GCN-tag vocabularies from a dataset.

    ``dataset`` is flattened by one level to obtain paragraph objects; each
    paragraph's ``edus()`` are visited and their ``words`` / ``tags`` are
    fed into frequency counters. The GCN vocabulary does not depend on the
    data: it always holds the three fixed tags "dep", "head" and "self",
    each with a count of one.

    Returns:
        A ``(word_vocab, pos_vocab, gcn_vocab)`` tuple of ``Vocab`` objects.
    """
    gcn_tags = ("dep", "head", "self")
    word_counts = Counter()
    tag_counts = Counter()

    for paragraph in chain.from_iterable(dataset):
        for unit in paragraph.edus():
            word_counts.update(unit.words)
            tag_counts.update(unit.tags)

    return (
        Vocab("word", word_counts),
        Vocab("part of speech", tag_counts),
        Vocab("gcn tag", Counter(gcn_tags)),
    )
Beispiel #3
0
def build_vocab(instances):
    """Build word/POS vocabularies and a transition label set from instances.

    Each element of ``instances`` is unpacked as ``(words, poses, trans)``.
    ``words`` and ``poses`` are flattened by one level before counting
    (presumably one inner sequence per EDU -- confirm against the caller),
    while ``trans`` is counted as-is.

    Returns:
        A ``(word_vocab, pos_vocab, trans_label)`` tuple: two ``Vocab``
        objects and one ``Label`` object.
    """
    word_counts, tag_counts, action_counts = Counter(), Counter(), Counter()

    for nested_words, nested_tags, actions in instances:
        word_counts.update(chain.from_iterable(nested_words))
        tag_counts.update(chain.from_iterable(nested_tags))
        action_counts.update(actions)

    return (
        Vocab("word", word_counts),
        Vocab("part of speech", tag_counts),
        Label("transition", action_counts),
    )
def build_vocab(trees, trans):
    """Build word/POS vocabularies from trees and a label set from transitions.

    Args:
        trees: iterable of tree objects exposing an ``edus()`` method; each
            yielded unit must carry ``words`` and ``tags`` attributes that
            are themselves iterable.
        trans: iterable of iterables of transitions; it is flattened by one
            level and every element is counted.

    Returns:
        A ``(word_vocab, pos_vocab, trans_label)`` tuple: two ``Vocab``
        objects and one ``Label`` object.
    """
    trans_label = Label("transition", Counter(chain.from_iterable(trans)))

    words_counter = Counter()
    poses_counter = Counter()
    for tree in trees:
        # Materialise once: the units are traversed twice below and
        # ``edus()`` may return a one-shot iterator.
        edus = list(tree.edus())
        words_counter.update(chain.from_iterable(edu.words for edu in edus))
        poses_counter.update(chain.from_iterable(edu.tags for edu in edus))

    word_vocab = Vocab("word", words_counter)
    pos_vocab = Vocab("part of speech", poses_counter)
    return word_vocab, pos_vocab, trans_label
def build_vocab(dataset):
    """Build word/POS vocabularies and nuclear/relation label sets.

    ``dataset`` is flattened by one level to obtain paragraph objects. Each
    paragraph is searched with ``iterfind`` for ``EDU`` and ``Relation``
    nodes:

    * for an ``EDU`` node, its ``words`` and ``tags`` are counted;
    * for a ``Relation`` node, its ``nuclear`` and ``ftype`` values are each
      counted once.

    Returns:
        A ``(word_vocab, pos_vocab, nuc_label, rel_label)`` tuple: two
        ``Vocab`` objects followed by two ``Label`` objects.
    """
    word_counts = Counter()
    tag_counts = Counter()
    nuclear_counts = Counter()
    relation_counts = Counter()

    for paragraph in chain.from_iterable(dataset):
        matches = paragraph.iterfind(filter=node_type_filter([EDU, Relation]))
        for node in matches:
            # EDU is tested first, so a node matching both types is
            # treated as an EDU only.
            if isinstance(node, EDU):
                word_counts.update(node.words)
                tag_counts.update(node.tags)
                continue
            if isinstance(node, Relation):
                nuclear_counts[node.nuclear] += 1
                relation_counts[node.ftype] += 1

    return (
        Vocab("word", word_counts),
        Vocab("part of speech", tag_counts),
        Label("nuclear", nuclear_counts),
        Label("relation", relation_counts),
    )