def _get_seq_features(sentence, span):
    """Yield the sequence features in a Span.

    These are the word, lemma, NER-tag, and POS-tag sequences materialized
    over the span, each wrapped in a distinctive prefix.

    Args:
        sentence: a list of Word objects
        span: the Span

    Yields:
        One feature string per sequence kind, in a fixed order:
        WORD_SEQ, LEMMA_SEQ, NER_SEQ, POS_SEQ.
    """
    # (feature prefix, attribute extractor) pairs; the order of this tuple
    # fixes the order in which features are yielded.
    extractors = (
        ("WORD_SEQ_", lambda x: x.word),
        ("LEMMA_SEQ_", lambda x: str(x.lemma)),
        ("NER_SEQ_", lambda x: str(x.ner)),
        ("POS_SEQ_", lambda x: str(x.pos)),
    )
    for prefix, extract in extractors:
        tokens = materialize_span(sentence, span, extract)
        yield prefix + "[" + " ".join(tokens) + "]"
def get_forbidden_word(sentence, span, span1):
    """Detect forbidden (Vietnamese) marker words near *span* or inside *span1*.

    The three words immediately before and after the start of *span* are
    checked against the full forbidden-word list; the materialized text of
    *span1* is checked against a smaller list of conditional markers.

    Args:
        sentence: a list of Word objects
        span: the Span whose 3-word left/right context is inspected
        span1: the Span whose own text is inspected

    Returns:
        "SENTENCE_HAVE_FORBIDDEN_WORD" if any forbidden word is present,
        "SENTENCE_HAVE_CLEAR" otherwise.
    """
    forbidden_word = [
        'nếu', "đối_với", "trường_hợp", 'phải', 'đó', 'không', 'được',
        'đã', 'đồng_thời', 'cần', 'chỉ', 'cụ_thể', 'coi', 'đây', 'ai'
    ]
    f2 = ["đối_với", "trường_hợp", "nếu"]

    sent_exp = " ".join(materialize_span(sentence, span1, lambda x: x.word))
    sent_exp = toLowerCase(sent_exp)

    sent = ""
    # BUG FIX: the original indexed sentence[span.begin_word_id - i] and relied
    # on IndexError to stop. For spans near the start of the sentence the index
    # goes *negative*, which in Python silently wraps to the END of the
    # sentence, so the wrong words were scanned. Guard the bound explicitly.
    # Also iterate an ordered tuple instead of a set literal for the offsets.
    for offset in (1, 2, 3):
        idx = span.begin_word_id - offset
        if idx < 0:
            break
        sent += toLowerCase(str(sentence[idx].word)) + ' '
    for offset in (1, 2, 3):
        idx = span.begin_word_id + offset
        if idx >= len(sentence):
            break
        sent += toLowerCase(str(sentence[idx].word)) + ' '

    # NOTE(review): these are substring (not whole-token) matches, so short
    # entries like 'ai' can also match inside longer words — preserved from
    # the original behavior.
    for word in forbidden_word:
        if word in sent:
            return "SENTENCE_HAVE_FORBIDDEN_WORD"
    for word in f2:
        if word in sent_exp:
            return "SENTENCE_HAVE_FORBIDDEN_WORD"
    return "SENTENCE_HAVE_CLEAR"
def get_generic_def_mention(sentence, span, length_bin_size=5):
    """Yield dictionary, keyword dependency-path, and length features for a mention.

    Args:
        sentence: a list of Word objects
        span: the mention Span
        length_bin_size: the size of the bins for the length feature
    """
    # Is (a substring of) the mention in a dictionary?
    for feat in _get_dictionary_indicator_features(sentence, span):
        yield feat

    # Dependency path(s) from the mention to dictionary keywords found
    # elsewhere in the sentence, with an indicator per matching dictionary.
    span_end = span.begin_word_id + span.length
    for (start, stop) in _get_substring_indices(len(sentence), MAX_KW_LENGTH):
        # Candidate keyword spans overlapping the mention are skipped.
        if span.begin_word_id <= start < span_end:
            continue
        if span.begin_word_id < stop < span_end:
            continue
        candidate = " ".join([str(w.lemma) for w in sentence[start:stop]])
        matched = False
        for dict_id in dictionaries:
            if candidate in dictionaries[dict_id]:
                matched = True
                yield "KW_IND_[" + dict_id + "]"
                break
        if matched:
            kw_span = Span(begin_word_id=start, length=stop - start)
            for feat in _get_min_dep_path_features(sentence, span, kw_span, "KW"):
                yield feat

    # Binned character length of the mention's surface text.
    mention_text = " ".join(materialize_span(sentence, span, lambda x: x.word))
    yield "LENGTH_" + str(len(mention_text) // length_bin_size)
def _get_seq_features(sentence, span):
    """Yield the sequence features in a Span.

    These include:
        - the words sequence in the span
        - the lemmas sequence in the span
        - the NER tags sequence in the span
        - the POS tags sequence in the span

    Args:
        sentence: a list of Word objects
        span: the Span
    """
    words = materialize_span(sentence, span, lambda x: x.word)
    yield "WORD_SEQ_[" + " ".join(words) + "]"

    lemmas = materialize_span(sentence, span, lambda x: str(x.lemma))
    yield "LEMMA_SEQ_[" + " ".join(lemmas) + "]"

    ner_tags = materialize_span(sentence, span, lambda x: str(x.ner))
    yield "NER_SEQ_[" + " ".join(ner_tags) + "]"

    pos_tags = materialize_span(sentence, span, lambda x: str(x.pos))
    yield "POS_SEQ_[" + " ".join(pos_tags) + "]"
def get_generic_features_mention(sentence, span, length_bin_size=5):
    """Yield 'generic' features for a mention in a sentence.

    Args:
        sentence: a list of Word objects
        span: a Span namedtuple
        length_bin_size: the size of the bins for the length feature
    """
    # Mention sequence features (words, lemmas, NER and POS tags).
    for feature in _get_seq_features(sentence, span):
        yield feature
    # Windows (left and right, up to size 3, with combinations) around the
    # mention.
    for feature in _get_window_features(sentence, span):
        yield feature
    # Is (a substring of) the mention in a dictionary?
    for feature in _get_dictionary_indicator_features(sentence, span):
        yield feature

    # Dependency path(s) from the mention to dictionary keywords elsewhere
    # in the sentence; various transformations of the path are yielded by
    # the helper.
    mention_end = span.begin_word_id + span.length
    for (start, stop) in _get_substring_indices(len(sentence), MAX_KW_LENGTH):
        # Candidate keyword spans must not overlap the mention.
        if span.begin_word_id <= start < mention_end:
            continue
        if span.begin_word_id < stop < mention_end:
            continue
        candidate = " ".join([str(word.lemma) for word in sentence[start:stop]])
        matched = False
        for dict_id in dictionaries:
            if candidate in dictionaries[dict_id]:
                matched = True
                yield "KW_IND_[" + dict_id + "]"
                break
        if matched:
            kw_span = Span(begin_word_id=start, length=stop - start)
            for feature in _get_min_dep_path_features(sentence, span, kw_span, "KW"):
                yield feature

    # The mention starts with a capital letter.
    if sentence[span.begin_word_id].word[0].isupper():
        yield "STARTS_WITH_CAPITAL"

    # Binned character length of the mention text.
    mention_text = " ".join(materialize_span(sentence, span, lambda x: x.word))
    yield "LENGTH_" + str(len(mention_text) // length_bin_size)
def get_generic_features_mention(sentence, span, length_bin_size=5):
    """Yield 'generic' features for a mention in a sentence.

    Args:
        sentence: a list of Word objects
        span: a Span namedtuple
        length_bin_size: the size of the bins for the length feature
    """
    span_begin = span.begin_word_id
    span_end = span_begin + span.length

    # Sequence features over the mention (words, lemmas, NER and POS tags).
    for f in _get_seq_features(sentence, span):
        yield f
    # Left/right windows of size up to 3 around the mention.
    for f in _get_window_features(sentence, span):
        yield f
    # Dictionary membership of the mention (and its substrings).
    for f in _get_dictionary_indicator_features(sentence, span):
        yield f

    # Dependency paths from the mention to each dictionary keyword appearing
    # elsewhere in the sentence, plus an indicator per matching dictionary.
    for (kw_begin, kw_end) in _get_substring_indices(len(sentence), MAX_KW_LENGTH):
        # Skip candidate keyword spans that overlap the mention.
        if span_begin <= kw_begin < span_end or span_begin < kw_end < span_end:
            continue
        lemma_text = " ".join(map(lambda w: str(w.lemma), sentence[kw_begin:kw_end]))
        hit = next((d for d in dictionaries if lemma_text in dictionaries[d]), None)
        if hit is not None:
            yield "KW_IND_[" + hit + "]"
            kw_span = Span(begin_word_id=kw_begin, length=kw_end - kw_begin)
            for f in _get_min_dep_path_features(sentence, span, kw_span, "KW"):
                yield f

    # The mention starts with a capital letter.
    if sentence[span_begin].word[0].isupper():
        yield "STARTS_WITH_CAPITAL"

    # Binned length (in characters) of the mention text.
    text = " ".join(materialize_span(sentence, span, lambda x: x.word))
    yield "LENGTH_" + str(len(text) // length_bin_size)
def get_generic_features_relation(sentence, span1, span2, length_bin_size=5):
    """Yield 'generic' features for a relation in a sentence.

    Args:
        sentence: a list of Word objects
        span1: the first Span of the relation
        span2: the second Span of the relation
        length_bin_size: the size of the bins for the length feature
    """
    # Check whether the order of the spans is inverted. We use this information
    # to add a prefix to *all* the features.
    order = sorted(
        [
            span1.begin_word_id,
            span1.begin_word_id + span1.length,
            span2.begin_word_id,
            span2.begin_word_id + span2.length,
        ]
    )
    begin = order[0]
    betw_begin = order[1]
    betw_end = order[2]
    end = order[3]
    if begin == span2.begin_word_id:
        inverted = "INV_"
        yield "IS_INVERTED"
    else:
        inverted = ""
    # Span between the two mentions, and the span covering both of them.
    betw_span = Span(begin_word_id=betw_begin, length=betw_end - betw_begin)
    covering_span = Span(begin_word_id=begin, length=end - begin)
    # Words, Lemmas, Ners, and Poses sequence between the mentions.
    for seq_feat in _get_seq_features(sentence, betw_span):
        yield inverted + seq_feat
    # Window feature (left and right, up to size 3, combined).
    for window_feat in _get_window_features(sentence, covering_span, isolated=False):
        yield inverted + window_feat
    # Ngrams of up to size 3 between the mentions.
    for ngram_feat in _get_ngram_features(sentence, betw_span):
        yield inverted + ngram_feat
    # Indicator features of whether the mentions are in dictionaries: every
    # (span1 dictionary, span2 dictionary) pair, with "_[_NONE]" standing in
    # when one side has no dictionary hit.
    found1 = False
    for feat1 in _get_dictionary_indicator_features(sentence, span1, prefix=inverted + "IN_DICT"):
        found1 = True
        found2 = False
        for feat2 in _get_dictionary_indicator_features(sentence, span2, prefix=""):
            found2 = True
            yield feat1 + feat2
        if not found2:
            yield feat1 + "_[_NONE]"
    if not found1:
        # NOTE(review): found2 is set here but never read afterwards.
        for feat2 in _get_dictionary_indicator_features(sentence, span2, prefix=""):
            found2 = True
            yield inverted + "IN_DICT_[_NONE]" + feat2
    # Dependency path (and transformations) between the mentions.
    for betw_dep_path_feature in _get_min_dep_path_features(sentence, span1, span2, inverted + "BETW"):
        yield betw_dep_path_feature
    # Dependency paths (and transformations) between the mentions and keywords.
    for (i, j) in _get_substring_indices(len(sentence), MAX_KW_LENGTH):
        # Skip candidate keyword spans that overlap either mention.
        if (i >= begin and i < betw_begin) or (i >= betw_end and i < end):
            continue
        if (j > begin and j <= betw_begin) or (j > betw_end and j <= end):
            continue
        is_in_dictionary = False
        for dict_id in dictionaries:
            if " ".join(map(lambda x: str(x.lemma), sentence[i:j])) in dictionaries[dict_id]:
                is_in_dictionary = True
                yield inverted + "KW_IND_[" + dict_id + "]"
                break
        if is_in_dictionary:
            kw_span = Span(begin_word_id=i, length=j - i)
            # Interleave dependency labels and lemmas along the path from
            # each mention to the keyword; drop the trailing lemma.
            # NOTE(review): the loops below reuse the name `j`, shadowing the
            # outer keyword index — benign, since (i, j) are re-bound at the
            # top of each outer iteration.
            path1 = _get_min_dep_path(sentence, span1, kw_span)
            lemmas1 = []
            labels1 = []
            for edge in path1:
                lemmas1.append(str(edge.word2.lemma))
                labels1.append(edge.label)
            both1 = []
            for j in range(len(labels1)):
                both1.append(labels1[j])
                both1.append(lemmas1[j])
            both1 = both1[:-1]
            path2 = _get_min_dep_path(sentence, span2, kw_span)
            lemmas2 = []
            labels2 = []
            for edge in path2:
                lemmas2.append(str(edge.word2.lemma))
                labels2.append(edge.label)
            both2 = []
            for j in range(len(labels2)):
                both2.append(labels2[j])
                both2.append(lemmas2[j])
            both2 = both2[:-1]
            yield inverted + "KW_[" + " ".join(both1) + "]_[" + " ".join(both2) + "]"
            yield inverted + "KW_L_[" + " ".join(labels1) + "]_[" + " ".join(labels2) + "]"
            # Generalize the lemmas on the paths to dictionary ids.
            for j in range(1, len(both1), 2):
                for dict_id in dictionaries:
                    if both1[j] in dictionaries[dict_id]:
                        both1[j] = "DICT_" + str(dict_id)
                        break  # Picking up the first dictionary we find
            for j in range(1, len(both2), 2):
                for dict_id in dictionaries:
                    if both2[j] in dictionaries[dict_id]:
                        both2[j] = "DICT_" + str(dict_id)
                        break  # Picking up the first dictionary we find
            yield inverted + "KW_D_[" + " ".join(both1) + "]_[" + " ".join(both2) + "]"
    # The mentions start with a capital letter.
    first_capital = sentence[span1.begin_word_id].word[0].isupper()
    second_capital = sentence[span2.begin_word_id].word[0].isupper()
    capital_feat = inverted + "STARTS_WITH_CAPITAL_[" + str(first_capital) + "_" + str(second_capital) + "]"
    yield capital_feat
    # The (binned) lengths of the mentions.
    first_length = len(" ".join(materialize_span(sentence, span1, lambda x: str(x.word))))
    second_length = len(" ".join(materialize_span(sentence, span2, lambda x: str(x.word))))
    first_bin_id = first_length // length_bin_size
    second_bin_id = second_length // length_bin_size
    length_feat = inverted + "LENGTHS_[" + str(first_bin_id) + "_" + str(second_bin_id) + "]"
    yield length_feat
def get_generic_features_relation(sentence, span1, span2, length_bin_size=5):
    """Yield 'generic' features for a relation in a sentence.

    Args:
        sentence: a list of Word objects
        span1: the first Span of the relation
        span2: the second Span of the relation
        length_bin_size: the size of the bins for the length feature
    """
    # Check whether the order of the spans is inverted. We use this information
    # to add a prefix to *all* the features.
    order = sorted([
        span1.begin_word_id, span1.begin_word_id + span1.length,
        span2.begin_word_id, span2.begin_word_id + span2.length
    ])
    begin = order[0]
    betw_begin = order[1]
    betw_end = order[2]
    end = order[3]
    if begin == span2.begin_word_id:
        inverted = "INV_"
        yield "IS_INVERTED"
    else:
        inverted = ""
    # Span between the two mentions, and the span covering both of them.
    betw_span = Span(begin_word_id=betw_begin, length=betw_end - betw_begin)
    covering_span = Span(begin_word_id=begin, length=end - begin)
    # Words, Lemmas, Ners, and Poses sequence between the mentions.
    for seq_feat in _get_seq_features(sentence, betw_span):
        yield inverted + seq_feat
    # Window feature (left and right, up to size 3, combined).
    for window_feat in _get_window_features(sentence, covering_span, isolated=False):
        yield inverted + window_feat
    # Ngrams of up to size 3 between the mentions.
    for ngram_feat in _get_ngram_features(sentence, betw_span):
        yield inverted + ngram_feat
    # Indicator features of whether the mentions are in dictionaries: every
    # (span1 dictionary, span2 dictionary) pair, with "_[_NONE]" standing in
    # when one side has no dictionary hit.
    found1 = False
    for feat1 in _get_dictionary_indicator_features(sentence, span1, prefix=inverted + "IN_DICT"):
        found1 = True
        found2 = False
        for feat2 in _get_dictionary_indicator_features(sentence, span2, prefix=""):
            found2 = True
            yield feat1 + feat2
        if not found2:
            yield feat1 + "_[_NONE]"
    if not found1:
        # NOTE(review): found2 is set here but never read afterwards.
        for feat2 in _get_dictionary_indicator_features(sentence, span2, prefix=""):
            found2 = True
            yield inverted + "IN_DICT_[_NONE]" + feat2
    # Dependency path (and transformations) between the mentions.
    for betw_dep_path_feature in _get_min_dep_path_features(
            sentence, span1, span2, inverted + "BETW"):
        yield betw_dep_path_feature
    # Dependency paths (and transformations) between the mentions and keywords.
    for (i, j) in _get_substring_indices(len(sentence), MAX_KW_LENGTH):
        # Skip candidate keyword spans that overlap either mention.
        if (i >= begin and i < betw_begin) or (i >= betw_end and i < end):
            continue
        if (j > begin and j <= betw_begin) or (j > betw_end and j <= end):
            continue
        is_in_dictionary = False
        for dict_id in dictionaries:
            if " ".join(map(lambda x: str(x.lemma), sentence[i:j])) in \
                    dictionaries[dict_id]:
                is_in_dictionary = True
                yield inverted + "KW_IND_[" + dict_id + "]"
                break
        if is_in_dictionary:
            kw_span = Span(begin_word_id=i, length=j - i)
            # Interleave dependency labels and lemmas along the path from
            # each mention to the keyword; drop the trailing lemma.
            # NOTE(review): the loops below reuse the name `j`, shadowing the
            # outer keyword index — benign, since (i, j) are re-bound at the
            # top of each outer iteration.
            path1 = _get_min_dep_path(sentence, span1, kw_span)
            lemmas1 = []
            labels1 = []
            for edge in path1:
                lemmas1.append(str(edge.word2.lemma))
                labels1.append(edge.label)
            both1 = []
            for j in range(len(labels1)):
                both1.append(labels1[j])
                both1.append(lemmas1[j])
            both1 = both1[:-1]
            path2 = _get_min_dep_path(sentence, span2, kw_span)
            lemmas2 = []
            labels2 = []
            for edge in path2:
                lemmas2.append(str(edge.word2.lemma))
                labels2.append(edge.label)
            both2 = []
            for j in range(len(labels2)):
                both2.append(labels2[j])
                both2.append(lemmas2[j])
            both2 = both2[:-1]
            yield inverted + "KW_[" + " ".join(both1) + "]_[" + \
                " ".join(both2) + "]"
            yield inverted + "KW_L_[" + " ".join(labels1) + "]_[" + \
                " ".join(labels2) + "]"
            # Generalize the lemmas on the paths to dictionary ids.
            for j in range(1, len(both1), 2):
                for dict_id in dictionaries:
                    if both1[j] in dictionaries[dict_id]:
                        both1[j] = "DICT_" + str(dict_id)
                        break  # Picking up the first dictionary we find
            for j in range(1, len(both2), 2):
                for dict_id in dictionaries:
                    if both2[j] in dictionaries[dict_id]:
                        both2[j] = "DICT_" + str(dict_id)
                        break  # Picking up the first dictionary we find
            yield inverted + "KW_D_[" + " ".join(both1) + "]_[" + \
                " ".join(both2) + "]"
    # The mentions start with a capital letter.
    first_capital = sentence[span1.begin_word_id].word[0].isupper()
    second_capital = sentence[span2.begin_word_id].word[0].isupper()
    capital_feat = inverted + "STARTS_WITH_CAPITAL_[" + str(first_capital) + \
        "_" + str(second_capital) + "]"
    yield capital_feat
    # The (binned) lengths of the mentions.
    first_length = len(" ".join(
        materialize_span(sentence, span1, lambda x: str(x.word))))
    second_length = len(" ".join(
        materialize_span(sentence, span2, lambda x: str(x.word))))
    first_bin_id = first_length // length_bin_size
    second_bin_id = second_length // length_bin_size
    length_feat = inverted + "LENGTHS_[" + str(first_bin_id) + "_" + \
        str(second_bin_id) + "]"
    yield length_feat