Example #1
import re
from itertools import chain

from nltk.tree import ParentedTree  # assumed: the constituency parses behave like NLTK trees here

# Sentence, EDU and node_type_filter are provided by the surrounding discourse-parsing package.


def gen_instances(dataset, parses, model):
    instances = []
    labels = []
    candidate_re = re.compile("[%s]" % model.candidate)
    for paragraph in chain(*dataset):
        root = paragraph.root_relation()
        if root:
            sentences = list(root.iterfind(filter=node_type_filter(Sentence)))
            for sentence in sentences:
                segments = set()    # offsets on both sides of each split point (EDU boundary)
                candidates = set()  # offsets of the candidate segmentation words
                edus = list(sentence.iterfind(filter=node_type_filter(EDU)))
                offset = 0
                for edu in edus:
                    segments.add(offset)
                    segments.add(offset+len(edu.text)-1)
                    offset += len(edu.text)
                # convert the tree to a ParentedTree for feature extraction
                parse = ParentedTree.fromstring(parses[sentence.sid].pformat())
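                # label each candidate character 1 if its offset falls on an EDU boundary, else 0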
                for m in candidate_re.finditer(sentence.text):
                    candidate = m.start()
                    instances.append(model.extract_features(candidate, parse))
                    labels.append(1 if candidate in segments else 0)
    return instances, labels
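A minimal, self-contained sketch of the offset bookkeeping above, using made-up EDU texts and a hypothetical candidate pattern (the real character set comes from model.candidate):

import re

edu_texts = ["天黑了，", "他说，现在出发。"]   # made-up sentence split into two EDUs
sentence_text = "".join(edu_texts)

segments, offset = set(), 0
for text in edu_texts:
    segments.add(offset)                  # offset of the first character of the EDU
    segments.add(offset + len(text) - 1)  # offset of the last character of the EDU
    offset += len(text)

candidate_re = re.compile("[，。；]")      # hypothetical candidate characters
for m in candidate_re.finditer(sentence_text):
    # 1 = the candidate coincides with an EDU boundary offset, 0 = sentence-internal
    print(m.start(), m.group(), 1 if m.start() in segments else 0)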
Example #2
from itertools import chain
from collections import Counter

# EDU, Relation, Sentence, Paragraph, Vocab, Label, RNNSegmenter, node_type_filter
# and edu_eval are provided by the surrounding discourse-parsing package.


def gen_train_instances(dataset):
    instances = []
    tags = []
    for paragraph in chain(*dataset):
        for sentence in paragraph.sentences():
            edus = list(sentence.iterfind(node_type_filter(EDU)))
            if edus:
                sent_words = []
                sent_poses = []
                sent_tags = []
                graph = []
                for i, edu in enumerate(edus):
                    words = edu.words
                    poses = edu.tags
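                    # the last word of every non-final EDU is tagged 'B', all other words 'O'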
                    label = ['O'] * (len(words) - 1)
                    label += ['B'] if i < len(edus) - 1 else ['O']
                    sent_words.extend(words)
                    sent_poses.extend(poses)
                    sent_tags.extend(label)
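                # dependency-graph edges: a self-loop per token plus head/dep arcs (token.head is 1-based, 0 = root)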
                for i, token in enumerate(sentence.dependency):
                    graph.append((i, i, "self"))
                    if token.head > 0:
                        graph.append((i, token.head - 1, "head"))
                        graph.append((token.head - 1, i, "dep"))
                instances.append((sent_words, sent_poses, graph))
                tags.append(sent_tags)
    return instances, tags
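A small self-contained illustration of the B/O labelling and of the dependency-graph edges built above, with made-up words and 1-based head indices:

from collections import namedtuple

Token = namedtuple("Token", "head")        # stand-in for the tokens in sentence.dependency

edus = [["天黑了", "，"], ["他", "决定", "出发", "。"]]   # made-up sentence with two EDUs
sent_words, sent_tags = [], []
for i, words in enumerate(edus):
    label = ['O'] * (len(words) - 1)
    label += ['B'] if i < len(edus) - 1 else ['O']     # last word of a non-final EDU -> 'B'
    sent_words.extend(words)
    sent_tags.extend(label)
print(list(zip(sent_words, sent_tags)))

heads = [0, 1, 4, 4, 1, 4]                 # made-up 1-based heads (0 = root)
graph = []
for i, token in enumerate(Token(h) for h in heads):
    graph.append((i, i, "self"))           # every token gets a self-loop
    if token.head > 0:
        graph.append((i, token.head - 1, "head"))   # arc to its governor
        graph.append((token.head - 1, i, "dep"))    # and the reverse arc
print(graph)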
def evaluate(dataset, model):
    model.eval()
    segmenter = RNNSegmenter(model)
    golds = []
    segs = []
    for paragraph in chain(*dataset):
        seged_sents = []
        for sentence in paragraph.sentences():
            # keep only sentences that actually contain gold EDUs
            if list(sentence.iterfind(node_type_filter(EDU))):
                seged_sents.append(Sentence(segmenter.cut_edu(sentence)))
        if seged_sents:
            segs.append(Paragraph(seged_sents))
            golds.append(paragraph)
    return edu_eval(segs, golds)
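Illustration only: the actual scoring is done by the project's edu_eval, but a boundary-level precision/recall/F1 over EDU segmentations (one plausible reading of the metric) can be sketched like this:

def boundary_prf(gold_edus, pred_edus):
    """Compare the character offsets at which two segmentations end their EDUs."""
    def boundaries(edus):
        offsets, pos = set(), 0
        for text in edus[:-1]:             # the sentence end itself is not counted
            pos += len(text)
            offsets.add(pos)
        return offsets

    gold, pred = boundaries(gold_edus), boundaries(pred_edus)
    tp = len(gold & pred)
    p = tp / len(pred) if pred else 0.0
    r = tp / len(gold) if gold else 0.0
    f1 = 2 * p * r / (p + r) if p + r else 0.0
    return p, r, f1

print(boundary_prf(["天黑了，", "他决定", "出发。"],
                   ["天黑了，", "他决定出发。"]))       # (1.0, 0.5, 0.666...)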
def build_vocab(dataset):
    word_freq = Counter()
    pos_freq = Counter()
    nuc_freq = Counter()
    rel_freq = Counter()
    for paragraph in chain(*dataset):
        for node in paragraph.iterfind(filter=node_type_filter([EDU, Relation])):
            if isinstance(node, EDU):
                word_freq.update(node.words)
                pos_freq.update(node.tags)
            elif isinstance(node, Relation):
                nuc_freq[node.nuclear] += 1
                rel_freq[node.ftype] += 1

    word_vocab = Vocab("word", word_freq)
    pos_vocab = Vocab("part of speech", pos_freq)
    nuc_label = Label("nuclear", nuc_freq)
    rel_label = Label("relation", rel_freq)
    return word_vocab, pos_vocab, nuc_label, rel_label
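A minimal sketch of what a frequency-based vocabulary could look like; the real Vocab and Label classes belong to the project and may differ (special tokens, frequency cut-offs, etc.):

from collections import Counter

class SimpleVocab:
    """Hypothetical stand-in: maps tokens to integer ids, unknown tokens to <UNK>."""
    def __init__(self, name, freq, min_count=1):
        self.name = name
        self.itos = ["<PAD>", "<UNK>"] + [w for w, c in freq.most_common() if c >= min_count]
        self.stoi = {w: i for i, w in enumerate(self.itos)}

    def __getitem__(self, token):
        return self.stoi.get(token, self.stoi["<UNK>"])

word_vocab = SimpleVocab("word", Counter({"出发": 2, "他": 3, "。": 5}))
print(word_vocab["出发"], word_vocab["没见过的词"])    # a known id vs. the <UNK> id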
def gen_train_instances(dataset):
    instances = []
    tags = []
    for paragraph in chain(*dataset):
        for sentence in paragraph.sentences():
            edus = list(sentence.iterfind(node_type_filter(EDU)))
            if edus:
                sent_words = []
                sent_poses = []
                sent_tags = []
                for i, edu in enumerate(edus):
                    words = edu.words
                    poses = edu.tags
                    label = ['O'] * (len(words) - 1)
                    label += ['B'] if i < len(edus) - 1 else ['O']
                    sent_words.extend(words)
                    sent_poses.extend(poses)
                    sent_tags.extend(label)
                instances.append((sent_words, sent_poses))
                tags.append(sent_tags)
    return instances, tags
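For completeness, the inverse step (not shown above): recovering EDU word spans from a predicted B/O tag sequence, which is what a trained segmenter needs at inference time.

def tags_to_edus(words, tags):
    """'B' closes an EDU; the sentence-final EDU is closed implicitly at the end."""
    edus, current = [], []
    for word, tag in zip(words, tags):
        current.append(word)
        if tag == 'B':
            edus.append(current)
            current = []
    if current:
        edus.append(current)
    return edus

print(tags_to_edus(["天黑了", "，", "他", "决定", "出发", "。"],
                   ['O', 'B', 'O', 'O', 'O', 'O']))
# -> [['天黑了', '，'], ['他', '决定', '出发', '。']]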