Beispiel #1
0
 def _get_features(self, method='treedlib'):
     """Index candidates by the relation features they emit.

     The ``method`` argument is accepted but not consulted in this body;
     the relation feature generator is always used.

     Returns a ``defaultdict(list)`` mapping each feature to the positions
     (within ``self._candidates``) of the candidates that produced it. A
     position is appended once per emission of the feature.
     """
     relation_feats = compile_relation_feature_generator()
     positions_by_feature = defaultdict(list)
     for position, candidate in enumerate(self._candidates):
         emitted = relation_feats(
             candidate.root, candidate.e1_idxs, candidate.e2_idxs)
         for feature in emitted:
             positions_by_feature[feature].append(position)
     return positions_by_feature
Beispiel #2
0
 def _get_features(self, method='treedlib'):
   """Build an inverted index from feature to candidate positions.

   ``method`` is part of the signature but is not read here. Each
   candidate in ``self._candidates`` is passed (as root, e1_idxs, e2_idxs)
   to the compiled relation feature generator; every emitted feature
   records the candidate's position in the returned ``defaultdict(list)``.
   """
   feature_gen = compile_relation_feature_generator()
   # Lazily stream (feature, candidate position) pairs in candidate order.
   hits = (
     (feature, position)
     for position, candidate in enumerate(self._candidates)
     for feature in feature_gen(
       candidate.root, candidate.e1_idxs, candidate.e2_idxs)
   )
   index = defaultdict(list)
   for feature, position in hits:
     index[feature].append(position)
   return index
Beispiel #3
0
def get_binary_span_feats(sidxs, sentence, stopwords):
    """Yield TreeDLib relation features for a pair of spans.

    :param sidxs: Pair ``(s1_idxs, s2_idxs)`` of word-index sequences, one
        per span.
    :param sentence: Sentence object; converted with ``get_as_dict`` before
        the XML tree is built.
    :param stopwords: Passed through to the feature generator as its
        ``stopwords`` keyword argument.
    :return: Generator of ``('TDL_' + feature, 1)`` tuples. Nothing is
        yielded when either index sequence is empty.
    """
    relation_feats = compile_relation_feature_generator()
    sentence_dict = get_as_dict(sentence)
    tree = corenlp_to_xmltree(sentence_dict)
    first_idxs, second_idxs = sidxs

    # Both spans must cover at least one word to produce relation features.
    if len(first_idxs) == 0 or len(second_idxs) == 0:
        return

    emitted = relation_feats(
        tree.root, first_idxs, second_idxs, stopwords=stopwords)
    for feat in emitted:
        yield 'TDL_' + feat, 1
Beispiel #4
0
def get_span_feats(candidate):
    """Yield ``(feature_name, 1)`` pairs for a unary or binary candidate.

    Unary candidates produce ``DDL_``-prefixed features from
    ``get_ddlib_feats`` followed by ``TDL_``-prefixed features from the
    entity feature generator. Binary candidates produce ``TDL_``-prefixed
    features from the relation feature generator.

    Because this is a generator, the exceptions below are raised when
    iteration starts, not when the function is called.

    :param candidate: Object whose ``get_arguments()`` returns the spans.
    :raises ValueError: If the first argument is not a ``Span``.
    :raises NotImplementedError: If the candidate has neither one nor two
        arguments.
    """
    args = candidate.get_arguments()
    if not isinstance(args[0], Span):
        # Fill in the placeholder so the message names the offending type.
        raise ValueError(
            "Accepts Span-type arguments, %s-type found."
            % type(args[0]).__name__)

    # Unary candidates
    if len(args) == 1:
        get_tdl_feats = compile_entity_feature_generator()
        span = args[0]
        sent = get_as_dict(span.parent)
        xmltree = corenlp_to_xmltree(sent)
        sidxs = range(span.get_word_start(), span.get_word_end() + 1)
        if len(sidxs) > 0:

            # Add DDLIB entity features
            for f in get_ddlib_feats(sent, sidxs):
                yield 'DDL_' + f, 1

            # Add TreeDLib entity features
            for f in get_tdl_feats(xmltree.root, sidxs):
                yield 'TDL_' + f, 1

    # Binary candidates
    elif len(args) == 2:
        get_tdl_feats = compile_relation_feature_generator()
        span1, span2 = args
        # The tree is built from span1's parent only.
        # NOTE(review): presumably both spans share a parent -- confirm.
        xmltree = corenlp_to_xmltree(get_as_dict(span1.parent))
        s1_idxs = range(span1.get_word_start(), span1.get_word_end() + 1)
        s2_idxs = range(span2.get_word_start(), span2.get_word_end() + 1)
        if len(s1_idxs) > 0 and len(s2_idxs) > 0:

            # Apply TDL features
            for f in get_tdl_feats(xmltree.root, s1_idxs, s2_idxs):
                yield 'TDL_' + f, 1
    else:
        raise NotImplementedError(
            "Only handles unary and binary candidates currently")
Beispiel #5
0
def get_content_feats(candidates):
    """Yield content features for one candidate or a list of candidates.

    Each yielded item is ``(candidate.id, feature_name, DEF_VALUE)``.
    For candidates whose sentences are all lingual, ``DDL_`` features come
    from ``get_ddlib_feats`` and ``TDL_`` features come from the TreeDLib
    generators; TreeDLib results are memoized in the module-level
    ``unary_tdl_feats`` / ``binary_tdl_feats`` mappings. Otherwise only
    ``BASIC_`` features from ``get_word_feats`` are produced.

    Because this is a generator, the exceptions below are raised during
    iteration, not when the function is called.

    :param candidates: A single candidate or a list of candidates.
    :raises ValueError: If a candidate's first mention context is not a
        ``TemporarySpanMention``.
    :raises NotImplementedError: If a candidate has more than two mentions.
    """
    candidates = candidates if isinstance(candidates, list) else [candidates]
    for candidate in candidates:
        args = tuple(m.context for m in candidate.get_mentions())
        if not isinstance(args[0], TemporarySpanMention):
            # Report the type of the object that failed the check, not the
            # type of the enclosing candidate.
            raise ValueError(
                f"Accepts Span-type arguments, {type(args[0])}-type found.")

        # Unary candidates
        if len(args) == 1:
            span = args[0]
            if span.sentence.is_lingual():
                get_tdl_feats = compile_entity_feature_generator()
                sent = get_as_dict(span.sentence)
                xmltree = corenlp_to_xmltree(sent)
                sidxs = list(
                    range(span.get_word_start_index(),
                          span.get_word_end_index() + 1))
                if len(sidxs) > 0:
                    # Add DDLIB entity features
                    for f in get_ddlib_feats(span, sent, sidxs):
                        yield candidate.id, f"DDL_{f}", DEF_VALUE
                    # Add TreeDLib entity features (memoized per span)
                    if span.stable_id not in unary_tdl_feats:
                        unary_tdl_feats[span.stable_id] = set()
                        for f in get_tdl_feats(xmltree.root, sidxs):
                            unary_tdl_feats[span.stable_id].add(f)
                    for f in unary_tdl_feats[span.stable_id]:
                        yield candidate.id, f"TDL_{f}", DEF_VALUE
            else:
                for f in get_word_feats(span):
                    yield candidate.id, f"BASIC_{f}", DEF_VALUE

        # Binary candidates
        elif len(args) == 2:
            span1, span2 = args
            if span1.sentence.is_lingual() and span2.sentence.is_lingual():
                get_tdl_feats = compile_relation_feature_generator()
                sent1 = get_as_dict(span1.sentence)
                sent2 = get_as_dict(span2.sentence)
                # Reuse sent1 instead of converting span1's sentence twice,
                # the same way the unary branch shares its dict.
                xmltree = corenlp_to_xmltree(sent1)
                s1_idxs = list(
                    range(span1.get_word_start_index(),
                          span1.get_word_end_index() + 1))
                s2_idxs = list(
                    range(span2.get_word_start_index(),
                          span2.get_word_end_index() + 1))
                if len(s1_idxs) > 0 and len(s2_idxs) > 0:

                    # Add DDLIB entity features for relation
                    for f in get_ddlib_feats(span1, sent1, s1_idxs):
                        yield candidate.id, f"DDL_e1_{f}", DEF_VALUE

                    for f in get_ddlib_feats(span2, sent2, s2_idxs):
                        yield candidate.id, f"DDL_e2_{f}", DEF_VALUE

                    # Add TreeDLib relation features (memoized per candidate)
                    if candidate.id not in binary_tdl_feats:
                        binary_tdl_feats[candidate.id] = set()
                        for f in get_tdl_feats(xmltree.root, s1_idxs, s2_idxs):
                            binary_tdl_feats[candidate.id].add(f)
                    for f in binary_tdl_feats[candidate.id]:
                        yield candidate.id, f"TDL_{f}", DEF_VALUE
            else:
                for f in get_word_feats(span1):
                    yield candidate.id, f"BASIC_e1_{f}", DEF_VALUE

                for f in get_word_feats(span2):
                    yield candidate.id, f"BASIC_e2_{f}", DEF_VALUE

        else:
            raise NotImplementedError(
                "Only handles unary and binary candidates currently")
0
def extract_textual_features(
    candidates: Union[Candidate, List[Candidate]],
) -> Iterator[Tuple[int, str, int]]:
    """Yield textual features for unary and multinary candidates.

    Every item is ``(candidate.id, feature_name, DEF_VALUE)``. ``DDL_`` and
    ``TDL_`` features are produced only when all involved sentences are
    lingual and every span covers at least one word; ``BASIC_`` word
    features are always produced. TreeDLib results are memoized in the
    module-level ``unary_tdl_feats`` / ``multinary_tdl_feats`` mappings.

    :param candidates: A candidate or a list of candidates to extract
        features from
    :raises ValueError: During iteration, if a candidate's first mention
        context is neither a ``SpanMention`` nor an ``ImplicitSpanMention``.
    """
    if not isinstance(candidates, list):
        candidates = [candidates]

    for candidate in candidates:
        contexts = tuple(
            mention.context for mention in candidate.get_mentions())
        if not isinstance(contexts[0], (SpanMention, ImplicitSpanMention)):
            raise ValueError(
                f"Accepts Span/ImplicitSpan-type mentions, {type(contexts[0])}-type found."
            )

        # Unary candidates
        if len(contexts) == 1:
            span: Union[SpanMention, ImplicitSpanMention] = contexts[0]
            if span.sentence.is_lingual():
                entity_feats = _compile_entity_feature_generator()
                tree = corenlp_to_xmltree(span.sentence)
                first_word = span.get_word_start_index()
                last_word = span.get_word_end_index()
                word_idxs = list(range(first_word, last_word + 1))
                if word_idxs:
                    # DDLIB entity features
                    sentence_dict = get_as_dict(span.sentence)
                    for feat in _get_ddlib_feats(span, sentence_dict,
                                                 word_idxs):
                        yield candidate.id, f"DDL_{feat}", DEF_VALUE
                    # TreeDLib entity features, computed once per span
                    if span.stable_id not in unary_tdl_feats:
                        unary_tdl_feats[span.stable_id] = set()
                        unary_tdl_feats[span.stable_id].update(
                            entity_feats(tree.root, word_idxs))
                    for feat in unary_tdl_feats[span.stable_id]:
                        yield candidate.id, f"TDL_{feat}", DEF_VALUE
            for feat in _get_word_feats(span):
                yield candidate.id, f"BASIC_{feat}", DEF_VALUE
            continue

        # Multinary candidates
        spans = contexts
        lingual_flags = [member.sentence.is_lingual() for member in spans]
        if all(lingual_flags):
            relation_feats = compile_relation_feature_generator(
                is_multary=True)
            sentence_dicts = [
                get_as_dict(member.sentence) for member in spans
            ]
            # The tree is built from the first span's sentence only.
            tree = corenlp_to_xmltree(spans[0].sentence)
            idx_lists = [
                list(
                    range(member.get_word_start_index(),
                          member.get_word_end_index() + 1))
                for member in spans
            ]
            if all(idx_lists):
                # DDLIB entity features for each member of the relation
                members = zip(spans, sentence_dicts, idx_lists)
                for i, (member, sent, idxs) in enumerate(members):
                    for feat in _get_ddlib_feats(member, sent, idxs):
                        yield candidate.id, f"DDL_e{i}_{feat}", DEF_VALUE

                # TreeDLib relation features, computed once per candidate
                if candidate.id not in multinary_tdl_feats:
                    multinary_tdl_feats[candidate.id] = set()
                    multinary_tdl_feats[candidate.id].update(
                        relation_feats(tree.root, idx_lists))
                for feat in multinary_tdl_feats[candidate.id]:
                    yield candidate.id, f"TDL_{feat}", DEF_VALUE
        for i, member in enumerate(spans):
            for feat in _get_word_feats(member):
                yield candidate.id, f"BASIC_e{i}_{feat}", DEF_VALUE
Beispiel #7
0
def extract_textual_features(
    candidates: Union[Candidate, List[Candidate]],
) -> Iterator[Tuple[int, str, int]]:
    """Yield textual features for unary and binary candidates.

    Every item is ``(candidate.id, feature_name, DEF_VALUE)``. ``DDL_`` and
    ``TDL_`` features are produced only when the involved sentences are
    lingual and each span covers at least one word; ``BASIC_`` word
    features are always produced. TreeDLib results are memoized in the
    module-level ``unary_tdl_feats`` / ``binary_tdl_feats`` mappings.

    :param candidates: A candidate or a list of candidates to extract
        features from
    :type candidates: list
    :raises ValueError: During iteration, if a candidate's first mention
        context is neither a ``SpanMention`` nor an ``ImplicitSpanMention``.
    :raises NotImplementedError: During iteration, if a candidate has more
        than two mentions.
    """
    candidate_list = candidates if isinstance(candidates, list) else [candidates]
    for candidate in candidate_list:
        args = tuple(mention.context for mention in candidate.get_mentions())
        if not isinstance(args[0], (SpanMention, ImplicitSpanMention)):
            raise ValueError(
                f"Accepts Span/ImplicitSpan-type mentions, {type(args[0])}-type found."
            )

        arity = len(args)

        # Unary candidates
        if arity == 1:
            span: Union[SpanMention, ImplicitSpanMention] = args[0]
            if span.sentence.is_lingual():
                entity_feats = _compile_entity_feature_generator()
                tree = corenlp_to_xmltree(span.sentence)
                word_idxs = list(
                    range(span.get_word_start_index(), span.get_word_end_index() + 1)
                )
                if word_idxs:
                    # DDLIB entity features
                    sentence_dict = get_as_dict(span.sentence)
                    for feat in _get_ddlib_feats(span, sentence_dict, word_idxs):
                        yield candidate.id, f"DDL_{feat}", DEF_VALUE
                    # TreeDLib entity features, computed once per span
                    if span.stable_id not in unary_tdl_feats:
                        unary_tdl_feats[span.stable_id] = set()
                        unary_tdl_feats[span.stable_id].update(
                            entity_feats(tree.root, word_idxs)
                        )
                    for feat in unary_tdl_feats[span.stable_id]:
                        yield candidate.id, f"TDL_{feat}", DEF_VALUE
            for feat in _get_word_feats(span):
                yield candidate.id, f"BASIC_{feat}", DEF_VALUE
            continue

        if arity != 2:
            raise NotImplementedError(
                "Only handles unary and binary candidates currently"
            )

        # Binary candidates
        span1, span2 = args
        if span1.sentence.is_lingual() and span2.sentence.is_lingual():
            relation_feats = compile_relation_feature_generator()
            sent1 = get_as_dict(span1.sentence)
            sent2 = get_as_dict(span2.sentence)
            # The tree is built from the first span's sentence only.
            tree = corenlp_to_xmltree(span1.sentence)
            idxs1 = list(
                range(span1.get_word_start_index(), span1.get_word_end_index() + 1)
            )
            idxs2 = list(
                range(span2.get_word_start_index(), span2.get_word_end_index() + 1)
            )
            if idxs1 and idxs2:
                # DDLIB entity features for each side of the relation
                sides = ((1, span1, sent1, idxs1), (2, span2, sent2, idxs2))
                for n, member, sent, idxs in sides:
                    for feat in _get_ddlib_feats(member, sent, idxs):
                        yield candidate.id, f"DDL_e{n}_{feat}", DEF_VALUE

                # TreeDLib relation features, computed once per candidate
                if candidate.id not in binary_tdl_feats:
                    binary_tdl_feats[candidate.id] = set()
                    binary_tdl_feats[candidate.id].update(
                        relation_feats(tree.root, idxs1, idxs2)
                    )
                for feat in binary_tdl_feats[candidate.id]:
                    yield candidate.id, f"TDL_{feat}", DEF_VALUE

        for n, member in enumerate((span1, span2), start=1):
            for feat in _get_word_feats(member):
                yield candidate.id, f"BASIC_e{n}_{feat}", DEF_VALUE
Beispiel #8
0
# Plot one ROC curve per model. Each entry of roc_models is indexed as
# model[0] (x values), model[1] (y values) and model[2] (area shown in the
# legend); colors and names are looked up by position.
for i, model in enumerate(roc_models):
    plt.plot(model[0], model[1], color=color_vals[i], lw=2,
             label='%s (area = %0.2f)' % (model_names[i], model[2]))
# The chance diagonal does not depend on the model, so draw it once instead
# of re-drawing it on every loop iteration.
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc="lower right")
plt.show()


# # Parse Tree Visualization

# In[ ]:

# Fetch one candidate by its hard-coded id and render its parse tree with
# the word ranges of its first two members highlighted.
cand = session.query(Candidate).filter(Candidate.id == 19885).one()
# print(...) with a single argument behaves the same on Python 2 and 3.
print(cand)
xmltree = corenlp_to_xmltree(get_as_dict(cand.get_parent()))
highlight_ranges = [
    range(cand[0].get_word_start(), cand[0].get_word_end() + 1),
    range(cand[1].get_word_start(), cand[1].get_word_end() + 1),
]
xmltree.render_tree(highlight=highlight_ranges)


# In[ ]:

# List the TreeDLib relation features for the candidate rendered above.
# Relies on `cand` and `xmltree` defined by the preceding cell.
get_tdl_feats = compile_relation_feature_generator()
# One inclusive word-index range per context of the candidate.
sids = [range(a.get_word_start(), a.get_word_end() + 1) for a in cand.get_contexts()]
tags = list(get_tdl_feats(xmltree.root, sids[0], sids[1]))
# print(...) with a single argument behaves the same on Python 2 and 3.
print(tags)