Beispiel #1
0
def cal_class_distribution(data_dir, level):
    """
    calculate the class distribution
    :param data_dir:
    :param level: 0 for inner-sentence, 1 for inter-sentence but inner paragraph, 2 for inter-paragraph, 3 for different depth
    :return: None
    """
    rst_trees = DataHelper.read_rst_trees(data_dir)
    all_nodes = [node for rst_tree in rst_trees for node in rst_tree.postorder_DFT(rst_tree.tree, [])]
    if level in [0, 1, 2]:
        valid_relations = [RstTree.extract_relation(node.child_relation) for node in all_nodes if
                           node.level == level and node.child_relation is not None]
        distribution = Counter(valid_relations)
        for cla in class2rel:
            if cla not in distribution:
                distribution[cla] = 0
        return distribution
    if level == 3:
        depth_relation_distributions = {}
        for node in all_nodes:
            if node.lnode is None and node.rnode is None:
                continue
            if node.depth in depth_relation_distributions:
                depth_relation_distributions[node.depth][RstTree.extract_relation(node.child_relation)] += 1
            else:
                depth_relation_distributions[node.depth] = Counter()
                depth_relation_distributions[node.depth][RstTree.extract_relation(node.child_relation)] = 1
        for depth, distribution in depth_relation_distributions.items():
            for cla in class2rel:
                if cla not in distribution:
                    distribution[cla] = 0
        return depth_relation_distributions
Beispiel #2
0
    def eval_parser(self,
                    path='./examples',
                    report=False,
                    bcvocab=None,
                    draw=True):
        """ Test the parsing performance"""
        # Evaluation
        met = Metrics(levels=['span', 'nuclearity', 'relation'])
        # ----------------------------------------
        # Read all files from the given path
        doclist = [
            os.path.join(path, fname) for fname in os.listdir(path)
            if fname.endswith('.merge')
        ]
        pred_forms = []
        gold_forms = []
        depth_per_relation = {}
        for fmerge in doclist:
            # ----------------------------------------
            # Read *.merge file
            doc = Doc()
            doc.read_from_fmerge(fmerge)
            # ----------------------------------------
            # Parsing
            pred_rst = self.parser.sr_parse(doc, bcvocab)
            if draw:
                pred_rst.draw_rst(fmerge.replace(".merge", ".ps"))
            # Get brackets from parsing results
            pred_brackets = pred_rst.bracketing()
            fbrackets = fmerge.replace('.merge', '.brackets')
            # Write brackets into file
            Evaluator.writebrackets(fbrackets, pred_brackets)
            # ----------------------------------------
            # Evaluate with gold RST tree
            if report:
                fdis = fmerge.replace('.merge', '.dis')
                gold_rst = RstTree(fdis, fmerge)
                gold_rst.build()
                met.eval(gold_rst, pred_rst)
                for node in pred_rst.postorder_DFT(pred_rst.tree, []):
                    pred_forms.append(node.form)
                for node in gold_rst.postorder_DFT(gold_rst.tree, []):
                    gold_forms.append(node.form)

                nodes = gold_rst.postorder_DFT(gold_rst.tree, [])
                inner_nodes = [
                    node for node in nodes
                    if node.lnode is not None and node.rnode is not None
                ]
                for idx, node in enumerate(inner_nodes):
                    relation = node.rnode.relation if node.form == 'NS' else node.lnode.relation
                    rela_class = RstTree.extract_relation(relation)
                    if rela_class in depth_per_relation:
                        depth_per_relation[rela_class].append(node.depth)
                    else:
                        depth_per_relation[rela_class] = [node.depth]
                    lnode_text = ' '.join([
                        gold_rst.doc.token_dict[tid].word
                        for tid in node.lnode.text
                    ])
                    lnode_lemmas = ' '.join([
                        gold_rst.doc.token_dict[tid].lemma
                        for tid in node.lnode.text
                    ])
                    rnode_text = ' '.join([
                        gold_rst.doc.token_dict[tid].word
                        for tid in node.rnode.text
                    ])
                    rnode_lemmas = ' '.join([
                        gold_rst.doc.token_dict[tid].lemma
                        for tid in node.rnode.text
                    ])
                    # if rela_class == 'Topic-Change':
                    #     print(fmerge)
                    #     print(relation)
                    #     print(lnode_text)
                    #     print(rnode_text)
                    #     print()

        if report:
            met.report()