Code example #1
0
def _get_seq_features(sentence, span):
    """Yield the sequence features in a Span

    These include:
        - words sequence in the span
        - lemmas sequence in the span
        - NER tags sequence in the span
        - POS tags sequence in the span

    Args:
        sentence: a list of Word objects
        span: the Span
    """
    # One (prefix, accessor) pair per feature kind, emitted in a fixed order.
    extractors = [
        ("WORD_SEQ_[", lambda x: x.word),
        ("LEMMA_SEQ_[", lambda x: str(x.lemma)),
        ("NER_SEQ_[", lambda x: str(x.ner)),
        ("POS_SEQ_[", lambda x: str(x.pos)),
    ]
    for prefix, accessor in extractors:
        tokens = materialize_span(sentence, span, accessor)
        yield prefix + " ".join(tokens) + "]"
Code example #2
0
def get_forbidden_word(sentence, span, span1):
    """Check a 3-word window around *span* and the text of *span1* for
    forbidden (Vietnamese) words.

    Args:
        sentence: a list of Word objects
        span: the Span whose 3-word left/right context is inspected
        span1: the Span whose own words are checked against a smaller list

    Returns:
        "SENTENCE_HAVE_FORBIDDEN_WORD" if any forbidden word occurs in the
        context window or in span1's text, otherwise "SENTENCE_HAVE_CLEAR".
    """
    forbidden_word = [
        'nếu', "đối_với", "trường_hợp", 'phải', 'đó', 'không', 'được', 'đã',
        'đồng_thời', 'cần', 'chỉ', 'cụ_thể', 'coi', 'đây', 'ai'
    ]
    sent = ""
    # Smaller list applied to span1's own (lower-cased) text.
    f2 = ["đối_với", "trường_hợp", "nếu"]
    sent_exp = " ".join(materialize_span(sentence, span1, lambda x: x.word))
    sent_exp = toLowerCase(sent_exp)
    # Up to 3 words to the left of the span. Guard explicitly against
    # negative indices: Python's negative indexing would otherwise silently
    # wrap around to the END of the sentence (no IndexError is raised), so
    # the old try/except approach collected unrelated words.
    for i in range(1, 4):
        idx = span.begin_word_id - i
        if idx < 0:
            break
        sent += toLowerCase(str(sentence[idx].word)) + ' '
    # Up to 3 words to the right of the span's first word; stop at the end
    # of the sentence instead of relying on IndexError.
    for i in range(1, 4):
        idx = span.begin_word_id + i
        if idx >= len(sentence):
            break
        sent += toLowerCase(str(sentence[idx].word)) + ' '
    for word in forbidden_word:
        if word in sent:
            return "SENTENCE_HAVE_FORBIDDEN_WORD"
    for word in f2:
        if word in sent_exp:
            return "SENTENCE_HAVE_FORBIDDEN_WORD"
    return "SENTENCE_HAVE_CLEAR"
Code example #3
0
def get_generic_def_mention(sentence, span, length_bin_size=5):
    """Yield dictionary-indicator, keyword dependency-path, and length
    features for a mention.

    Args:
        sentence: a list of Word objects
        span: the mention Span
        length_bin_size: the size of the bins for the length feature
    """
    # Is (substring of) the mention in a dictionary?
    for dict_indicator_feat in _get_dictionary_indicator_features(
            sentence, span):
        yield dict_indicator_feat
    # Dependency path(s) from mention to keyword(s). Various transformations of
    # the dependency path are done.
    for (i, j) in _get_substring_indices(len(sentence), MAX_KW_LENGTH):
        # Skip candidate keyword substrings that start or end inside the
        # mention itself.
        if i >= span.begin_word_id and i < span.begin_word_id + span.length:
            continue
        if j > span.begin_word_id and j < span.begin_word_id + span.length:
            continue
        is_in_dictionary = False
        for dict_id in dictionaries:
            if " ".join(map(lambda x: str(x.lemma), sentence[i:j])) in \
                    dictionaries[dict_id]:
                is_in_dictionary = True
                yield "KW_IND_[" + dict_id + "]"
                break
        if is_in_dictionary:
            kw_span = Span(begin_word_id=i, length=j - i)
            for dep_path_feature in _get_min_dep_path_features(
                    sentence, span, kw_span, "KW"):
                yield dep_path_feature
    # Length of the mention text (in characters), binned by length_bin_size.
    length = len(" ".join(materialize_span(sentence, span, lambda x: x.word)))
    bin_id = length // length_bin_size
    length_feat = "LENGTH_" + str(bin_id)
    yield length_feat
Code example #4
0
File: gen_feats.py  Project: threefoldo/deepdive
def _get_seq_features(sentence, span):
    """Yield the sequence features in a Span

    These include:
        - words sequence in the span
        - lemmas sequence in the span
        - NER tags sequence in the span
        - POS tags sequence in the span

    Args:
        sentence: a list of Word objects
        span: the Span
    """
    # Insertion order of the dict fixes the emission order of the features.
    accessors = {
        "WORD": lambda x: x.word,
        "LEMMA": lambda x: str(x.lemma),
        "NER": lambda x: str(x.ner),
        "POS": lambda x: str(x.pos),
    }
    for kind, accessor in accessors.items():
        joined = " ".join(materialize_span(sentence, span, accessor))
        yield f"{kind}_SEQ_[{joined}]"
Code example #5
0
def get_generic_features_mention(sentence, span, length_bin_size=5):
    """Yield 'generic' features for a mention in a sentence.

    Args:
        sentence: a list of Word objects
        span: a Span namedtuple
        length_bin_size: the size of the bins for the length feature
    """
    # Mention sequence features (words, lemmas, ners, and poses)
    yield from _get_seq_features(sentence, span)
    # Window (left and right, up to size 3, with combinations) around the
    # mention
    yield from _get_window_features(sentence, span)
    # Is (substring of) the mention in a dictionary?
    yield from _get_dictionary_indicator_features(sentence, span)
    # Dependency path(s) from mention to keyword(s). Various transformations
    # of the dependency path are done.
    span_end = span.begin_word_id + span.length
    for start, stop in _get_substring_indices(len(sentence), MAX_KW_LENGTH):
        if span.begin_word_id <= start < span_end:
            continue
        if span.begin_word_id < stop < span_end:
            continue
        candidate = " ".join(str(w.lemma) for w in sentence[start:stop])
        matched = False
        for dict_id in dictionaries:
            if candidate in dictionaries[dict_id]:
                matched = True
                yield "KW_IND_[" + dict_id + "]"
                break
        if matched:
            kw_span = Span(begin_word_id=start, length=stop - start)
            yield from _get_min_dep_path_features(
                sentence, span, kw_span, "KW")
    # The mention starts with a capital
    if sentence[span.begin_word_id].word[0].isupper():
        yield "STARTS_WITH_CAPITAL"
    # Length of the mention
    mention_text = " ".join(materialize_span(sentence, span, lambda x: x.word))
    yield "LENGTH_" + str(len(mention_text) // length_bin_size)
Code example #6
0
File: gen_feats.py  Project: threefoldo/deepdive
def get_generic_features_mention(sentence, span, length_bin_size=5):
    """Yield 'generic' features for a mention in a sentence.

    Features yielded (in order): sequence features, window features,
    dictionary-indicator features, keyword dependency-path features, a
    capitalization indicator, and a binned length feature.

    Args:
        sentence: a list of Word objects
        span: a Span namedtuple
        length_bin_size: the size of the bins for the length feature
    """
    # Mention sequence features (words, lemmas, ners, and poses)
    for seq_feat in _get_seq_features(sentence, span):
        yield seq_feat
    # Window (left and right, up to size 3, with combinations) around the
    # mention
    for window_feat in _get_window_features(sentence, span):
        yield window_feat
    # Is (substring of) mention in a dictionary?
    for dict_indicator_feat in _get_dictionary_indicator_features(sentence, span):
        yield dict_indicator_feat
    # Dependency path(s) from mention to keyword(s). Various transformations of
    # the dependency path are done.
    for (i, j) in _get_substring_indices(len(sentence), MAX_KW_LENGTH):
        # Skip candidate keyword substrings overlapping the mention itself.
        if i >= span.begin_word_id and i < span.begin_word_id + span.length:
            continue
        if j > span.begin_word_id and j < span.begin_word_id + span.length:
            continue
        is_in_dictionary = False
        for dict_id in dictionaries:
            if " ".join(map(lambda x: str(x.lemma), sentence[i:j])) in dictionaries[dict_id]:
                is_in_dictionary = True
                yield "KW_IND_[" + dict_id + "]"
                break
        if is_in_dictionary:
            kw_span = Span(begin_word_id=i, length=j - i)
            for dep_path_feature in _get_min_dep_path_features(sentence, span, kw_span, "KW"):
                yield dep_path_feature
    # The mention starts with a capital
    if sentence[span.begin_word_id].word[0].isupper():
        yield "STARTS_WITH_CAPITAL"
    # Length of the mention (in characters), binned by length_bin_size
    length = len(" ".join(materialize_span(sentence, span, lambda x: x.word)))
    bin_id = length // length_bin_size
    length_feat = "LENGTH_" + str(bin_id)
    yield length_feat
Code example #7
0
File: gen_feats.py  Project: threefoldo/deepdive
def get_generic_features_relation(sentence, span1, span2, length_bin_size=5):
    """Yield 'generic' features for a relation in a sentence.

    Args:
        sentence: a list of Word objects
        span1: the first Span of the relation
        span2: the second Span of the relation
        length_bin_size: the size of the bins for the length feature
    """
    # Check whether the order of the spans is inverted. We use this information
    # to add a prefix to *all* the features.
    order = sorted(
        [
            span1.begin_word_id,
            span1.begin_word_id + span1.length,
            span2.begin_word_id,
            span2.begin_word_id + span2.length,
        ]
    )
    begin = order[0]
    betw_begin = order[1]
    betw_end = order[2]
    end = order[3]
    if begin == span2.begin_word_id:
        inverted = "INV_"
        yield "IS_INVERTED"
    else:
        inverted = ""
    # betw_span covers the words strictly between the two mentions;
    # covering_span covers both mentions plus everything between them.
    betw_span = Span(begin_word_id=betw_begin, length=betw_end - betw_begin)
    covering_span = Span(begin_word_id=begin, length=end - begin)
    # Words, Lemmas, Ners, and Poses sequence between the mentions
    for seq_feat in _get_seq_features(sentence, betw_span):
        yield inverted + seq_feat
    # Window feature (left and right, up to size 3, combined)
    for window_feat in _get_window_features(sentence, covering_span, isolated=False):
        yield inverted + window_feat
    # Ngrams of up to size 3 between the mentions
    for ngram_feat in _get_ngram_features(sentence, betw_span):
        yield inverted + ngram_feat
    # Indicator features of whether the mentions are in dictionaries;
    # feat1/feat2 are combined pairwise, with "_[_NONE]" as the placeholder
    # when one side matches no dictionary.
    found1 = False
    for feat1 in _get_dictionary_indicator_features(sentence, span1, prefix=inverted + "IN_DICT"):
        found1 = True
        found2 = False
        for feat2 in _get_dictionary_indicator_features(sentence, span2, prefix=""):
            found2 = True
            yield feat1 + feat2
        if not found2:
            yield feat1 + "_[_NONE]"
    if not found1:
        for feat2 in _get_dictionary_indicator_features(sentence, span2, prefix=""):
            found2 = True
            yield inverted + "IN_DICT_[_NONE]" + feat2
    # Dependency path (and transformations) between the mention
    for betw_dep_path_feature in _get_min_dep_path_features(sentence, span1, span2, inverted + "BETW"):
        yield betw_dep_path_feature
    # Dependency paths (and transformations) between the mentions and keywords
    for (i, j) in _get_substring_indices(len(sentence), MAX_KW_LENGTH):
        # Skip candidate keyword substrings overlapping either mention.
        if (i >= begin and i < betw_begin) or (i >= betw_end and i < end):
            continue
        if (j > begin and j <= betw_begin) or (j > betw_end and j <= end):
            continue
        is_in_dictionary = False
        for dict_id in dictionaries:
            if " ".join(map(lambda x: str(x.lemma), sentence[i:j])) in dictionaries[dict_id]:
                is_in_dictionary = True
                yield inverted + "KW_IND_[" + dict_id + "]"
                break
        if is_in_dictionary:
            kw_span = Span(begin_word_id=i, length=j - i)
            path1 = _get_min_dep_path(sentence, span1, kw_span)
            lemmas1 = []
            labels1 = []
            for edge in path1:
                lemmas1.append(str(edge.word2.lemma))
                labels1.append(edge.label)
            # Interleave labels and lemmas; drop the trailing lemma.
            # NOTE(review): the inner loops below reuse `j`, shadowing the
            # substring index from the enclosing for-loop. Harmless only
            # because tuple unpacking rebinds `j` each outer iteration.
            both1 = []
            for j in range(len(labels1)):
                both1.append(labels1[j])
                both1.append(lemmas1[j])
            both1 = both1[:-1]
            path2 = _get_min_dep_path(sentence, span2, kw_span)
            lemmas2 = []
            labels2 = []
            for edge in path2:
                lemmas2.append(str(edge.word2.lemma))
                labels2.append(edge.label)
            both2 = []
            for j in range(len(labels2)):
                both2.append(labels2[j])
                both2.append(lemmas2[j])
            both2 = both2[:-1]
            yield inverted + "KW_[" + " ".join(both1) + "]_[" + " ".join(both2) + "]"
            yield inverted + "KW_L_[" + " ".join(labels1) + "]_[" + " ".join(labels2) + "]"
            # Generalize lemmas (odd positions) to dictionary ids where found.
            for j in range(1, len(both1), 2):
                for dict_id in dictionaries:
                    if both1[j] in dictionaries[dict_id]:
                        both1[j] = "DICT_" + str(dict_id)
                        break  # Picking up the first dictionary we find
            for j in range(1, len(both2), 2):
                for dict_id in dictionaries:
                    if both2[j] in dictionaries[dict_id]:
                        both2[j] = "DICT_" + str(dict_id)
                        break  # Picking up the first dictionary we find
            yield inverted + "KW_D_[" + " ".join(both1) + "]_[" + " ".join(both2) + "]"
    # The mentions start with a capital letter
    first_capital = sentence[span1.begin_word_id].word[0].isupper()
    second_capital = sentence[span2.begin_word_id].word[0].isupper()
    capital_feat = inverted + "STARTS_WITH_CAPITAL_[" + str(first_capital) + "_" + str(second_capital) + "]"
    yield capital_feat
    # The lengths of the mentions (in characters), binned by length_bin_size
    first_length = len(" ".join(materialize_span(sentence, span1, lambda x: str(x.word))))
    second_length = len(" ".join(materialize_span(sentence, span2, lambda x: str(x.word))))
    first_bin_id = first_length // length_bin_size
    second_bin_id = second_length // length_bin_size
    length_feat = inverted + "LENGTHS_[" + str(first_bin_id) + "_" + str(second_bin_id) + "]"
    yield length_feat
Code example #8
0
def _kw_path_parts(sentence, span, kw_span):
    """Return (labels, interleaved) for the min dependency path span->kw_span.

    `labels` is the list of edge labels; `interleaved` alternates
    label, lemma, label, lemma, ... with the final lemma dropped (matching
    the original feature format).
    """
    path = _get_min_dep_path(sentence, span, kw_span)
    lemmas = [str(edge.word2.lemma) for edge in path]
    labels = [edge.label for edge in path]
    interleaved = []
    for label, lemma in zip(labels, lemmas):
        interleaved.append(label)
        interleaved.append(lemma)
    return labels, interleaved[:-1]


def _generalize_lemmas_to_dicts(interleaved):
    """In place, replace lemmas (odd positions) found in a dictionary with
    DICT_<id>, picking the first matching dictionary."""
    for k in range(1, len(interleaved), 2):
        for dict_id in dictionaries:
            if interleaved[k] in dictionaries[dict_id]:
                interleaved[k] = "DICT_" + str(dict_id)
                break  # Picking up the first dictionary we find


def get_generic_features_relation(sentence, span1, span2, length_bin_size=5):
    """Yield 'generic' features for a relation in a sentence.

    Args:
        sentence: a list of Word objects
        span1: the first Span of the relation
        span2: the second Span of the relation
        length_bin_size: the size of the bins for the length feature
    """
    # Check whether the order of the spans is inverted. We use this information
    # to add a prefix to *all* the features.
    order = sorted([
        span1.begin_word_id, span1.begin_word_id + span1.length,
        span2.begin_word_id, span2.begin_word_id + span2.length
    ])
    begin, betw_begin, betw_end, end = order
    if begin == span2.begin_word_id:
        inverted = "INV_"
        yield "IS_INVERTED"
    else:
        inverted = ""
    # Words strictly between the mentions, and the span covering both.
    betw_span = Span(begin_word_id=betw_begin, length=betw_end - betw_begin)
    covering_span = Span(begin_word_id=begin, length=end - begin)
    # Words, Lemmas, Ners, and Poses sequence between the mentions
    for seq_feat in _get_seq_features(sentence, betw_span):
        yield inverted + seq_feat
    # Window feature (left and right, up to size 3, combined)
    for window_feat in _get_window_features(sentence,
                                            covering_span,
                                            isolated=False):
        yield inverted + window_feat
    # Ngrams of up to size 3 between the mentions
    for ngram_feat in _get_ngram_features(sentence, betw_span):
        yield inverted + ngram_feat
    # Indicator features of whether the mentions are in dictionaries, combined
    # pairwise; "_[_NONE]" stands in for a side with no dictionary match.
    found1 = False
    for feat1 in _get_dictionary_indicator_features(sentence,
                                                    span1,
                                                    prefix=inverted +
                                                    "IN_DICT"):
        found1 = True
        found2 = False
        for feat2 in _get_dictionary_indicator_features(sentence,
                                                        span2,
                                                        prefix=""):
            found2 = True
            yield feat1 + feat2
        if not found2:
            yield feat1 + "_[_NONE]"
    if not found1:
        for feat2 in _get_dictionary_indicator_features(sentence,
                                                        span2,
                                                        prefix=""):
            yield inverted + "IN_DICT_[_NONE]" + feat2
    # Dependency path (and transformations) between the mentions
    for betw_dep_path_feature in _get_min_dep_path_features(
            sentence, span1, span2, inverted + "BETW"):
        yield betw_dep_path_feature
    # Dependency paths (and transformations) between the mentions and keywords
    for (i, j) in _get_substring_indices(len(sentence), MAX_KW_LENGTH):
        # Skip candidate keyword substrings overlapping either mention.
        if (begin <= i < betw_begin) or (betw_end <= i < end):
            continue
        if (begin < j <= betw_begin) or (betw_end < j <= end):
            continue
        is_in_dictionary = False
        for dict_id in dictionaries:
            if " ".join(str(w.lemma) for w in sentence[i:j]) in \
                    dictionaries[dict_id]:
                is_in_dictionary = True
                yield inverted + "KW_IND_[" + dict_id + "]"
                break
        if is_in_dictionary:
            kw_span = Span(begin_word_id=i, length=j - i)
            # Helpers avoid the old duplicated extraction code, whose inner
            # loops reused `j` and shadowed the substring index above.
            labels1, both1 = _kw_path_parts(sentence, span1, kw_span)
            labels2, both2 = _kw_path_parts(sentence, span2, kw_span)
            yield inverted + "KW_[" + " ".join(both1) + "]_[" + \
                " ".join(both2) + "]"
            yield inverted + "KW_L_[" + " ".join(labels1) + "]_[" + \
                " ".join(labels2) + "]"
            _generalize_lemmas_to_dicts(both1)
            _generalize_lemmas_to_dicts(both2)
            yield inverted + "KW_D_[" + " ".join(both1) + "]_[" + \
                " ".join(both2) + "]"
    # The mentions start with a capital letter
    first_capital = sentence[span1.begin_word_id].word[0].isupper()
    second_capital = sentence[span2.begin_word_id].word[0].isupper()
    yield inverted + "STARTS_WITH_CAPITAL_[" + str(first_capital) + \
        "_" + str(second_capital) + "]"
    # The lengths of the mentions (in characters), binned by length_bin_size
    first_length = len(" ".join(
        materialize_span(sentence, span1, lambda x: str(x.word))))
    second_length = len(" ".join(
        materialize_span(sentence, span2, lambda x: str(x.word))))
    yield inverted + "LENGTHS_[" + str(first_length // length_bin_size) + \
        "_" + str(second_length // length_bin_size) + "]"