Beispiel #1
0
 def _process_parsed_conn(self, articles, which='test'):
     """
     Generate an Explicit relation for each true discourse connective.

     The candidate connectives of every article are written as features
     to ``tmp/conn.feat``, classified through ``Corpus.test_with_opennlp``
     into ``tmp/conn.pred``, and every candidate whose predicted label is
     '1' is appended to its article's ``exp_relations`` as a ``Relation``
     of type 'Explicit'.

     :param articles: sized sequence of articles; it is iterated twice and
         each article's ``exp_relations`` list is extended in place.
     :param which: passed through unchanged to
         ``Connective.print_features``.
     :raises AssertionError: if the number of predictions read back does
         not equal the number of candidate connectives written out.
     """
     connParser = Connective()
     conn_feat_name = FILE_PATH + '/../tmp/conn.feat'
     # The context manager closes (and flushes) the feature file even when
     # feature extraction raises part-way through.
     with codecs.open(conn_feat_name, 'w', 'utf-8') as conn_feat_file:
         checked_conns = [
             connParser.print_features(art, which, conn_feat_file)
             for art in articles
         ]

     conn_pred_name = FILE_PATH + '/../tmp/conn.pred'
     Corpus.test_with_opennlp(conn_feat_name, connParser.model_file, conn_pred_name)

     # The label is the last whitespace-separated token of each line.
     with codecs.open(conn_pred_name, 'r', 'utf-8') as conn_pred_file:
         conn_res = [line.strip().split()[-1] for line in conn_pred_file]

     assert len(checked_conns) == len(articles), 'article size not match'

     # Predictions are stored flat in article order; ``offset`` marks where
     # the current article's slice of ``conn_res`` begins.
     offset = 0
     for art, cand_conns in zip(articles, checked_conns):
         length = len(cand_conns)
         cand_res = conn_res[offset:offset + length]
         offset += length
         for conn, label in zip(cand_conns, cand_res):
             if label == '1':
                 rel = Relation()
                 rel.doc_id = art.id
                 rel.rel_type = 'Explicit'
                 rel.article = art
                 rel.conn_leaves = conn
                 rel.conn_addr = [n.leaf_id for n in conn]
                 art.exp_relations.append(rel)
     assert offset == len(conn_res), 'conn size not match'
Beispiel #2
0
    def generate_nonexp_relations(self, article):
        """
        Build candidate non-explicit relations between adjacent sentences.

        Inside every paragraph, each pair of consecutive sentences for
        which ``article.has_exp_relation`` is false on the first sentence
        yields a new ``Relation`` (Arg1 = first sentence, Arg2 = second
        sentence) that is appended to ``article.nonexp_relations``.
        """
        for paragraph in article.paragraphs:
            sentences = paragraph.sentences
            for first, second in zip(sentences[:-1], sentences[1:]):
                if article.has_exp_relation(first.id):
                    continue

                # TODO: Add detail implementation
                relation = Relation()
                relation.article = article
                relation.doc_id = article.id

                # Arg1: the first sentence, punctuation trimmed at both ends.
                if first.tree.is_null():
                    relation.arg1s['parsed'] = []
                else:
                    relation.arg1s['parsed'] = [first.tree.root]
                arg1 = self.remove_leading_tailing_punc(first.leaves)
                relation.arg1_leaves = arg1
                relation.arg1_addr = [leaf.leaf_id for leaf in arg1]
                if len(arg1) > 0:
                    relation.arg1_sid = arg1[-1].goto_tree().sent_id
                else:
                    relation.arg1_sid = -1
                relation.arg1_text = ' '.join(leaf.value for leaf in arg1)

                # Arg2: the second sentence, handled the same way.
                if second.tree.is_null():
                    relation.arg2s['parsed'] = []
                else:
                    relation.arg2s['parsed'] = [second.tree.root]
                arg2 = self.remove_leading_tailing_punc(second.leaves)
                relation.arg2_leaves = arg2
                relation.arg2_addr = [leaf.leaf_id for leaf in arg2]
                if len(arg2) > 0:
                    relation.arg2_sid = arg2[0].goto_tree().sent_id
                else:
                    relation.arg2_sid = -1
                relation.arg2_text = ' '.join(leaf.value for leaf in arg2)

                article.nonexp_relations.append(relation)
Beispiel #3
0
 def _process_parsed_conn(self, articles, which='test'):
     """
     Generate an Explicit relation for each true discourse connective.

     Features for the candidate connectives of all articles go to
     ``tmp/conn.feat``; ``Corpus.test_with_opennlp`` classifies them into
     ``tmp/conn.pred``; each candidate predicted as '1' becomes a
     ``Relation`` of type 'Explicit' appended to the owning article's
     ``exp_relations``.

     :param articles: sized sequence of articles; iterated twice, and
         each article's ``exp_relations`` list is extended in place.
     :param which: forwarded unchanged to ``Connective.print_features``.
     :raises AssertionError: if the number of predictions read back does
         not equal the number of candidate connectives written out.
     """
     connParser = Connective()
     conn_feat_name = FILE_PATH + '/../tmp/conn.feat'
     # ``with`` guarantees the feature file is flushed and closed before
     # the classifier reads it, including when print_features raises.
     with codecs.open(conn_feat_name, 'w', 'utf-8') as conn_feat_file:
         checked_conns = [
             connParser.print_features(art, which, conn_feat_file)
             for art in articles
         ]

     conn_pred_name = FILE_PATH + '/../tmp/conn.pred'
     Corpus.test_with_opennlp(conn_feat_name, connParser.model_file,
                              conn_pred_name)

     # Only the last whitespace-separated token of a line is the label.
     with codecs.open(conn_pred_name, 'r', 'utf-8') as conn_pred_file:
         conn_res = [
             line.strip().split()[-1]
             for line in conn_pred_file
         ]

     assert len(checked_conns) == len(articles), 'article size not match'

     # ``conn_res`` is flat and in article order; ``start`` is the index of
     # the first prediction belonging to the current article.
     start = 0
     for art, cand_conns in zip(articles, checked_conns):
         length = len(cand_conns)
         cand_res = conn_res[start:start + length]
         start += length
         for conn, label in zip(cand_conns, cand_res):
             if label == '1':
                 rel = Relation()
                 rel.doc_id = art.id
                 rel.rel_type = 'Explicit'
                 rel.article = art
                 rel.conn_leaves = conn
                 rel.conn_addr = [n.leaf_id for n in conn]
                 art.exp_relations.append(rel)
     assert start == len(conn_res), 'conn size not match'
Beispiel #4
0
    def generate_nonexp_relations(self, article):
        """
        Build candidate non-explicit relations for an article.

        Two passes append ``Relation`` objects to
        ``article.nonexp_relations``:

        1. Inter-sentence: each pair of consecutive sentences for which
           ``article.has_exp_inter_relation`` is false on the first
           sentence.
        2. Intra-sentence: each pair of consecutive clauses of a sentence
           with more than one clause, when
           ``article.has_exp_intra_relation`` is false for that sentence.
        """
        # sentence inter nonexp relation
        sentences = article.sentences
        for first, second in zip(sentences[:-1], sentences[1:]):
            if article.has_exp_inter_relation(first.id):
                continue

            # TODO: Add detail implementation
            relation = Relation()
            relation.article = article
            relation.doc_id = article.id

            if first.tree.is_null():
                relation.arg1s['parsed'] = []
            else:
                relation.arg1s['parsed'] = [first.tree.root]
            arg1 = self.remove_leading_tailing_punc(first.leaves)
            relation.arg1_leaves = arg1
            relation.arg1_addr = [leaf.leaf_id for leaf in arg1]
            if len(arg1) > 0:
                relation.arg1_sid = arg1[-1].goto_tree().sent_id
            else:
                relation.arg1_sid = -1
            relation.arg1_text = ' '.join(leaf.value for leaf in arg1)

            if second.tree.is_null():
                relation.arg2s['parsed'] = []
            else:
                relation.arg2s['parsed'] = [second.tree.root]
            arg2 = self.remove_leading_tailing_punc(second.leaves)
            relation.arg2_leaves = arg2
            relation.arg2_addr = [leaf.leaf_id for leaf in arg2]
            if len(arg2) > 0:
                relation.arg2_sid = arg2[0].goto_tree().sent_id
            else:
                relation.arg2_sid = -1
            relation.arg2_text = ' '.join(leaf.value for leaf in arg2)

            article.nonexp_relations.append(relation)

        # sentence intra nonexp relation
        for sentence in article.sentences:
            parse_tree = sentence.tree
            if len(sentence.clauses) <= 1:
                continue
            clause_pairs = zip(sentence.clauses[:-1], sentence.clauses[1:])
            for left, right in clause_pairs:
                if article.has_exp_intra_relation(sentence.id):
                    continue

                relation = Relation()
                relation.article = article
                relation.doc_id = article.id

                # Both arguments live in the same sentence, so both sids
                # are the sentence id.
                relation.arg1s['parsed'] = parse_tree.find_subtrees(left)
                arg1 = self.remove_leading_tailing_punc(left)
                relation.arg1_leaves = arg1
                relation.arg1_addr = [leaf.leaf_id for leaf in arg1]
                relation.arg1_sid = sentence.id
                relation.arg1_text = ' '.join(leaf.value for leaf in arg1)

                relation.arg2s['parsed'] = parse_tree.find_subtrees(right)
                arg2 = self.remove_leading_tailing_punc(right)
                relation.arg2_leaves = arg2
                relation.arg2_addr = [leaf.leaf_id for leaf in arg2]
                relation.arg2_sid = sentence.id
                relation.arg2_text = ' '.join(leaf.value for leaf in arg2)

                article.nonexp_relations.append(relation)
Beispiel #5
0
    def prepare_data(self, parse_path, rel_path, which, to_file):
        """
        Write classifier features for all non-explicit relations.

        Every relation read from the corpus whose type is not 'Explicit'
        is emitted with its sense labels (spaces replaced by underscores).
        When ``which`` is 'test' the senses of a relation are merged into
        a single '|'-separated label. Afterwards a 'NoRel' instance is
        emitted for each pair of consecutive sentences for which
        ``art.has_inter_relation`` is false on the first sentence.

        :param parse_path: handed to ``Corpus.read_parses``.
        :param rel_path: handed to ``Corpus.read_relations``.
        :param which: 'test' merges the senses into one label; any other
            value passes all labels of a relation on together.
        :param to_file: passed through to ``self.print_features``.
        """
        rel_dict = Corpus.read_relations(rel_path)
        articles = []
        for art in Corpus.read_parses(parse_path, rel_dict):
            articles.append(art)
            for rel in art.relations:
                rel.article = art
                rel.get_arg_leaves()
                if rel.rel_type == 'Explicit':
                    continue
                # De-duplicate through a set, then sort: set iteration
                # order is not stable across runs, which made the joined
                # test label (and the label order) non-reproducible.
                labels = sorted({s.replace(' ', '_') for s in rel.sense})
                if which == 'test':
                    labels = ['|'.join(labels)]

                self.print_features(rel, labels, to_file)

        # add NoRel relations
        for art in articles:
            for s1, s2 in zip(art.sentences[:-1], art.sentences[1:]):
                if art.has_inter_relation(s1.id):
                    continue
                rel = Relation()
                rel.article = art
                rel.doc_id = art.id

                # Arg1 is the first sentence; its sid comes from its last
                # remaining leaf, or -1 when trimming left nothing.
                rel.arg1s['parsed'] = (
                    [s1.tree.root] if not s1.tree.is_null() else [])
                rel.arg1_leaves = self.remove_leading_tailing_punc(
                    s1.leaves)
                rel.arg1_addr = [n.leaf_id for n in rel.arg1_leaves]
                rel.arg1_sid = (
                    rel.arg1_leaves[-1].goto_tree().sent_id
                    if len(rel.arg1_leaves) > 0 else -1)
                rel.arg1_text = ' '.join(n.value for n in rel.arg1_leaves)

                # Arg2 is the second sentence; its sid comes from its
                # first remaining leaf, or -1 when trimming left nothing.
                rel.arg2s['parsed'] = (
                    [s2.tree.root] if not s2.tree.is_null() else [])
                rel.arg2_leaves = self.remove_leading_tailing_punc(
                    s2.leaves)
                rel.arg2_addr = [n.leaf_id for n in rel.arg2_leaves]
                rel.arg2_sid = (
                    rel.arg2_leaves[0].goto_tree().sent_id
                    if len(rel.arg2_leaves) > 0 else -1)
                rel.arg2_text = ' '.join(n.value for n in rel.arg2_leaves)
                self.print_features(rel, ['NoRel'], to_file)
Beispiel #6
0
    def generate_nonexp_relations(self, article):
        """
        Create candidate non-explicit relations for adjacent sentences.

        For each paragraph, consecutive sentence pairs are visited in
        order; whenever ``article.has_exp_relation`` is false on the first
        sentence of a pair, a ``Relation`` with Arg1 = first sentence and
        Arg2 = second sentence is appended to
        ``article.nonexp_relations``.
        """
        for paragraph in article.paragraphs:
            sents = paragraph.sentences
            for prev_sent, next_sent in zip(sents[:-1], sents[1:]):
                if article.has_exp_relation(prev_sent.id):
                    continue

                # TODO: Add detail implementation
                new_rel = Relation()
                new_rel.article = article
                new_rel.doc_id = article.id

                # Arg1 side: sid is taken from the last remaining leaf.
                if prev_sent.tree.is_null():
                    new_rel.arg1s['parsed'] = []
                else:
                    new_rel.arg1s['parsed'] = [prev_sent.tree.root]
                leaves1 = self.remove_leading_tailing_punc(prev_sent.leaves)
                new_rel.arg1_leaves = leaves1
                new_rel.arg1_addr = [node.leaf_id for node in leaves1]
                if len(leaves1) > 0:
                    new_rel.arg1_sid = leaves1[-1].goto_tree().sent_id
                else:
                    new_rel.arg1_sid = -1
                new_rel.arg1_text = ' '.join(node.value for node in leaves1)

                # Arg2 side: sid is taken from the first remaining leaf.
                if next_sent.tree.is_null():
                    new_rel.arg2s['parsed'] = []
                else:
                    new_rel.arg2s['parsed'] = [next_sent.tree.root]
                leaves2 = self.remove_leading_tailing_punc(next_sent.leaves)
                new_rel.arg2_leaves = leaves2
                new_rel.arg2_addr = [node.leaf_id for node in leaves2]
                if len(leaves2) > 0:
                    new_rel.arg2_sid = leaves2[0].goto_tree().sent_id
                else:
                    new_rel.arg2_sid = -1
                new_rel.arg2_text = ' '.join(node.value for node in leaves2)

                article.nonexp_relations.append(new_rel)
Beispiel #7
0
    def prepare_data(self, parse_path, rel_path, which, to_file):
        """
        Write classifier features for all non-explicit relations.

        Relations read from the corpus that are not of type 'Explicit'
        are emitted with their sense labels (spaces replaced by
        underscores); with ``which == 'test'`` the senses of a relation
        are merged into one '|'-separated label. A 'NoRel' instance is
        then emitted for each pair of consecutive sentences for which
        ``art.has_inter_relation`` is false on the first sentence.

        :param parse_path: handed to ``Corpus.read_parses``.
        :param rel_path: handed to ``Corpus.read_relations``.
        :param which: 'test' merges the senses into one label; any other
            value passes all labels of a relation on together.
        :param to_file: passed through to ``self.print_features``.
        """
        rel_dict = Corpus.read_relations(rel_path)
        articles = []
        for art in Corpus.read_parses(parse_path, rel_dict):
            articles.append(art)
            for rel in art.relations:
                rel.article = art
                rel.get_arg_leaves()
                if rel.rel_type == 'Explicit':
                    continue
                # Sorting the de-duplicated senses keeps the output
                # reproducible; plain set iteration order can change
                # from one interpreter run to the next.
                labels = sorted({s.replace(' ', '_') for s in rel.sense})
                if which == 'test':
                    labels = ['|'.join(labels)]

                self.print_features(rel, labels, to_file)

        # add NoRel relations
        for art in articles:
            for s1, s2 in zip(art.sentences[:-1], art.sentences[1:]):
                if art.has_inter_relation(s1.id):
                    continue
                rel = Relation()
                rel.article = art
                rel.doc_id = art.id

                # Arg1: first sentence of the pair; sid is -1 when no
                # leaves remain after trimming.
                rel.arg1s['parsed'] = [s1.tree.root] if not s1.tree.is_null() else []
                rel.arg1_leaves = self.remove_leading_tailing_punc(s1.leaves)
                rel.arg1_addr = [n.leaf_id for n in rel.arg1_leaves]
                rel.arg1_sid = rel.arg1_leaves[-1].goto_tree().sent_id if len(rel.arg1_leaves) > 0 else -1
                rel.arg1_text = ' '.join(n.value for n in rel.arg1_leaves)

                # Arg2: second sentence of the pair, handled likewise.
                rel.arg2s['parsed'] = [s2.tree.root] if not s2.tree.is_null() else []
                rel.arg2_leaves = self.remove_leading_tailing_punc(s2.leaves)
                rel.arg2_addr = [n.leaf_id for n in rel.arg2_leaves]
                rel.arg2_sid = rel.arg2_leaves[0].goto_tree().sent_id if len(rel.arg2_leaves) > 0 else -1
                rel.arg2_text = ' '.join(n.value for n in rel.arg2_leaves)
                self.print_features(rel, ['NoRel'], to_file)