def lexical_polarity(word):
    """Polarity of the word ('pos', 'neg' or 'neu') against the
    positive/negative word lists of WordBasedExtractors."""
    form = word.word_form.lower()
    if form in WordBasedExtractors.POSITIVE_WORDS:
        return WordBasedResult('pos')
    if form in WordBasedExtractors.NEGATIVE_WORDS:
        return WordBasedResult('neg')
    return WordBasedResult('neu')
def parse_start_or_end_child_in_s_clause(sentence):
    '''Suggested by Marilena Di Bari.
    Typically temporal expressions are at the very end or beginning of a
    well-formed English sentence.
    '''
    parsetree = sentence.parsetree
    results = []
    for leaf_idx in parsetree.treepositions(order='leaves'):
        try:
            subtree = parsetree[leaf_idx[:-1]]
            climbed = 1
            # Some leaves are not direct descendants of an S node (all of
            # them are always under ROOT, though).
            # Don't believe me? Try to parse this sentence:
            # - "And Rosneft benefits from BP's expertise in exploring in
            #   difficult and potentially hazardous conditions."
            while not (subtree.node.startswith('S') or subtree.node == 'ROOT'):
                subtree = subtree.parent()
                climbed += 1
            pos_under_s = leaf_idx[len(leaf_idx) - climbed]
            first_or_last = pos_under_s in (0, len(subtree) - 1)
            results.append(WordBasedResult(first_or_last))
        except Exception:
            results.append(WordBasedResult(False))
    return SentenceBasedResult(tuple(results))
def dependency_incoming_granfather_pos_collapsed(word):
    """POS tag of the grandparent of `word` in the collapsed dependency
    graph; False when a level has no incoming dependency."""
    try:
        father = word.dependencies_in('collapsed')[0][1]
        grandfather = father.dependencies_in('collapsed')[0][1]
        return WordBasedResult(grandfather.part_of_speech)
    except IndexError:
        return WordBasedResult(False)
def dependency_incoming_granfather_relations_collapsed(word):
    """Relation label of the first incoming dependency of `word`'s collapsed
    dependency parent; False when a level has no incoming dependency."""
    try:
        father = word.dependencies_in('collapsed')[0][1]
        relation = father.dependencies_in('collapsed')[0][0]
        return WordBasedResult(relation)
    except IndexError:
        return WordBasedResult(False)
def from_temp_dct(from_obj, to_obj, document):
    """True when `from_obj`'s normalised value equals the document creation
    time; '_' when `from_obj` is not a temporal expression."""
    if not isinstance(from_obj, TemporalExpression):
        return WordBasedResult('_')
    # NOTE(review): the TIMEX value strips '-' while the DCT strips '_' —
    # presumably the two fields use different separators; worth confirming.
    same = from_obj.value.replace('-', '') == document.dct.replace('_', '')
    return WordBasedResult(same)
def parse_3_levels_up_childs(word):
    """'_'-joined labels of the children of the node two levels above
    `word`'s constituency parent; '_^_' when the tree is too shallow."""
    try:
        ancestor = word.constituency_parent.parent().parent()
        labels = [child.node for child in ancestor]
        return WordBasedResult('_'.join(labels))
    except (AttributeError, TypeError):
        return WordBasedResult('_^_')
def word_distance(from_obj, to_obj, document):
    """Token distance between the two objects when they are in the same
    sentence; '_' otherwise."""
    if from_obj.id_sentence() != to_obj.id_sentence():
        return WordBasedResult('_')
    if from_obj.id_first_word() < to_obj.id_first_word():
        distance = abs(to_obj.id_first_word() - from_obj.id_last_word())
    else:
        distance = abs(from_obj.id_first_word() - to_obj.id_last_word())
    return WordBasedResult(distance)
def direction(from_obj, to_obj, document):
    """'>' when `from_obj` precedes `to_obj` in the document, '<' otherwise."""
    sentence_delta = from_obj.id_sentence() - to_obj.id_sentence()
    if sentence_delta < 0:
        return WordBasedResult('>')
    if sentence_delta > 0:
        return WordBasedResult('<')
    # Same sentence: decide on the first-word offsets.
    if from_obj.id_first_word() - to_obj.id_first_word() > 0:
        return WordBasedResult('>')
    return WordBasedResult('<')
def dominant_verb_collapsed(word):
    """POS tag of the closest dominating verb of `word` in the collapsed
    dependency graph (the word's own POS if it is already a verb).

    Walks up at most `max_steps` incoming dependencies; returns
    WordBasedResult(False) when the walk falls off the graph.
    """
    if word.part_of_speech.startswith('V'):
        return WordBasedResult(word.part_of_speech)
    steps = max_steps
    while not word.part_of_speech.startswith('V') and steps:
        try:
            word = word.dependencies_in('collapsed')[0][1]
            steps -= 1
        except Exception:
            # Bug fix: was a bare `except:`, which also swallowed
            # SystemExit/KeyboardInterrupt; narrowed to Exception.
            return WordBasedResult(False)
    return WordBasedResult(word.part_of_speech)
def temporal_temporal_adverbs_relationships_in_time_indefinite(word):
    """True when `word` is an indefinite relationship-in-time adverb."""
    adverbs = frozenset((
        'already', 'before', 'early', 'earlier', 'eventually', 'finally',
        'first', 'formerly', 'just', 'last', 'late', 'later', 'lately',
        'next', 'previously', 'recently', 'since', 'soon', 'still', 'yet',
        'after', 'earliest', 'latest', 'afterwards'))
    return WordBasedResult(word.word_form.lower() in adverbs)
def temporal_temporal_adverbs_frequency_indefinite(word):
    """True when `word` is an indefinite frequency adverb."""
    # Cleanup: 'regularly' was listed twice in the original tuple;
    # de-duplicating it does not change membership semantics.
    adverbs = ('always', 'constantly', 'ever', 'frequently', 'generally',
               'infrequently', 'never', 'normally', 'occasionally', 'often',
               'rarely', 'regularly', 'seldom', 'sometimes', 'usually',
               'continually', 'periodically', 'repeatedly')
    return WordBasedResult(word.word_form.lower() in adverbs)
def temporal_pod(word):
    """True when `word` is a part-of-day expression (optionally pluralised,
    e.g. 'morning'/'mornings', 'a.m.', 'pm')."""
    # Cleanup: 'midday' appeared twice in the original tuple (de-duplicated,
    # no behaviour change); the regex fragments are now raw strings so the
    # backslash escapes are explicit.
    pods = ('morning', 'afternoon', 'evening', 'night', 'noon', 'midnight',
            'midday', 'sunrise', 'dusk', 'sunset', 'dawn', 'overnight',
            'noonday', 'noontide', 'nightfall', 'midafternoon', 'daybreak',
            'gloaming', r'a\.?m\.?', r'p\.?m\.?')
    pattern = r'^({})s?$'.format('|'.join(pods))
    return WordBasedResult(any(re.findall(pattern, word.word_form.lower())))
def parse_2_levels_up_nodes(word):
    """'_'-joined labels of the two ancestors above `word`'s constituency
    parent, emitted top-down (whatever levels exist are kept)."""
    labels = []
    try:
        grandparent = word.constituency_parent.parent()
        labels.append(grandparent.node)
        labels.append(grandparent.parent().node)
    except (AttributeError, TypeError):
        # Tree too shallow: keep the labels collected so far.
        pass
    return WordBasedResult('_'.join(reversed(labels)))
def sentences_linked_by_coref(from_obj, to_obj, document):
    """Checks whether the sentence of from_obj and the one of to_obj are
    linked by a coreferential link.
    """
    from_sentence = document.sentences[from_obj.id_sentence()]
    return WordBasedResult(from_sentence.connected_to(to_obj.id_sentence()))
def to_is_root(from_obj, to_obj, document):
    """It returns True if one of the words in `to_obj` is ROOT according
    to the basic dependency relations.
    """
    deps = document.sentences[to_obj.id_sentence()].basic_dependencies
    is_root = any(deps.is_root(word.id_token) for word in to_obj.words)
    return WordBasedResult(is_root)
def temporal_period(word):
    """True when `word` names a temporal period (century, decade, year, ...,
    fortnight), singular or plural."""
    # Bug fix: the original used 'centur[y|ies]', a character class matching
    # exactly ONE of the characters {y, |, i, e, s}, so "centuries" could
    # never match. A non-capturing alternation matches both forms.
    periods = [
        r'centur(?:y|ies)', r'decades?', r'years?', r'months?', r'days?',
        r'week\-?ends?', r'weeks?', r'hours?', r'minutes?', r'seconds?',
        r'fortnights?',
    ]
    pattern = r'^({pattern})$'.format(pattern='|'.join(periods))
    return WordBasedResult(any(re.findall(pattern, word.word_form.lower())))
def matching_gazetteer(gazetteer, sentence):
    '''It searches for gazetteer elements in the sentence and returns a
    SentenceBasedResult object composed of WordBasedResults 'I' (token
    covered by a gazetteer entry) or 'O' (not covered).

    Example:
       sentence = ['I', 'live', 'in', 'New', 'York', '.']
       gazetteer = { ..., ('New', 'York'), ...}

    returns SentenceBasedResult(W('O'), W('O'), W('O'), W('I'), W('I'), W('O'))
    '''
    forms = [token.word_form for token in sentence.words]
    labels = [WordBasedResult('O') for _ in forms]
    for entry in gazetteer:
        # Every occurrence of the entry marks its covered span with 'I's.
        for start, end in search_subsequence(forms, entry, end=True):
            for position in xrange(start, end + 1):
                labels[position] = WordBasedResult('I')
    return SentenceBasedResult(tuple(labels))
def to_parse_common_ancestor(from_obj, to_obj, document):
    """Label of the lowest parse-tree node covering all the words of
    `to_obj`."""
    start, end = to_obj.id_first_word() + 1, to_obj.id_last_word() + 1
    sentence = document.sentences[to_obj.words[0].id_sentence]
    leaves = list(sentence.parsetree.treepositions(order='leaves'))
    # The shallowest leaf address (shortest position tuple) identifies the
    # dominating node; on ties min() keeps the first one, exactly like the
    # original explicit scan did.
    shallowest = min(leaves[start - 1:end], key=len)
    return WordBasedResult(sentence.parsetree[shallowest[:-1]].node)
def dependency_relation_type(from_obj, to_obj, document):
    """Space-joined labels of the basic dependencies directly linking words
    of the two objects; empty string when they are in different sentences."""
    if from_obj.id_sentence() != to_obj.id_sentence():
        return WordBasedResult('')
    labels = []
    for w_from in from_obj.words:
        for w_to in to_obj.words:
            try:
                downward = w_from.dependencies_in('basic', w_to)
                upward = w_to.dependencies_in('basic', w_from)
                if downward:
                    labels.append(downward[0][0])
                    continue
                if upward:
                    labels.append(upward[0][0])
                    continue
            except KeyError:
                pass
    return WordBasedResult(' '.join(labels))
def temporal_difference(from_obj, to_obj, document):
    """Signed difference in days between the values of two temporal
    expressions; '_' when either value is unavailable or not a date."""
    if not (isinstance(from_obj, TemporalExpression)
            and isinstance(to_obj, TemporalExpression)):
        return WordBasedResult('_')
    iso_date = r'^([0-9]{4})-([0-9]{2})-([0-9]{2})'
    match_from = re.match(iso_date, from_obj.value)
    match_to = re.match(iso_date, to_obj.value)
    if match_from and match_to:
        try:
            date_from = date(*(int(g) for g in match_from.groups()))
            date_to = date(*(int(g) for g in match_to.groups()))
            return WordBasedResult((date_from - date_to).days)
        except ValueError:
            # Pattern matched but the calendar date is invalid (month 13...).
            return WordBasedResult('_')
    return WordBasedResult('_')
def morphological_extended_pattern(word):
    """Shape pattern of the word form: 'X' for uppercase, 'x' for lowercase,
    'd' for digits, ' ' for whitespace; any other character is kept as-is."""
    def shape(char):
        if char.isupper():
            return 'X'
        if char.islower():
            return 'x'
        if char.isdigit():
            return 'd'
        if char.isspace():
            return ' '
        return char

    return WordBasedResult(''.join(shape(c) for c in word.word_form))
def linked_by_dependency_relation(from_obj, to_obj, document):
    '''It returns whether the two elements are connected through one of
    their words: '<' when a from-word depends on a to-word, '>' for the
    opposite direction, False when unlinked or in different sentences.
    '''
    if from_obj.id_sentence() != to_obj.id_sentence():
        return WordBasedResult(False)
    for w_from in from_obj.words:
        for w_to in to_obj.words:
            if w_from.dependencies_in('basic', w_to):
                return WordBasedResult('<')
            if w_to.dependencies_in('basic', w_from):
                return WordBasedResult('>')
    return WordBasedResult(False)
def dependency_incoming_relations_basic(word):
    '''For each word I represent a vector of all incoming relations

        for each word = dr1 dr2 dr2 dr2 dr2 dr2 ... dr2
                      [  F,  F,  T,  F,  F,  F, ...,  T]

    Dependencies relations are taken from:
    http://nlp.stanford.edu/software/dependencies_manual.pdf
    '''
    incoming = word.basic_dependencies_in
    pairs = tuple(
        ('basic_dependency_incoming_' + label,
         WordBasedResult(bool(incoming.get(label, False))))
        for label in dep_labels)
    return WordBasedResults(pairs)
def to_governor_verb_pos(from_obj, to_obj, document):
    """'-'-joined sorted POS tags of the words of `to_obj` after walking up
    the basic dependency graph until a word with a verb governor is found.

    Best-effort: words whose walk raises are skipped.
    """
    def has_verb_governor(word):
        return any(governor.part_of_speech.startswith('V')
                   for _, governor in word.dependencies_in('basic'))

    governors_pos = set()
    for word in to_obj.words:
        try:
            while not has_verb_governor(word):
                # Bug fix: the original asked the *sentence* (not the current
                # word) for incoming dependencies, so `word` could never
                # advance along the walk and termination relied entirely on
                # the (previously bare) except clause.
                parents = word.dependencies_in('basic')
                if not parents:
                    break  # reached the top without finding a verb governor
                word = parents[0][1]
            else:
                governors_pos.add(word.part_of_speech)
        except Exception:
            continue
    return WordBasedResult('-'.join(sorted(governors_pos)))
def lexical_tense(word):
    """Coarse tense class of `word` derived from its POS tag
    (TreeTagger-style V{B,D,H,V}* tags)."""
    postag = word.part_of_speech
    if postag in ('VB', 'VD', 'VH', 'VV'):
        return WordBasedResult('BASE')
    elif postag in ('VBN', 'VDN', 'VHN', 'VVN'):
        return WordBasedResult('PASTPARTICIPLE')
    elif postag in ('VBD', 'VDD', 'VHD', 'VVD'):
        return WordBasedResult('PAST')
    elif postag in ('VBG', 'VDG', 'VHG', 'VVG'):
        return WordBasedResult('GERUND')
    elif postag in ('VBZ', 'VBP', 'VDZ', 'VDP', 'VHZ', 'VHP', 'VVZ', 'VVP'):
        # Bug fix: every other branch covers the VV* (lexical verb) series,
        # but PRESENT omitted VVZ/VVP, so present-tense lexical verbs fell
        # through to NONE.
        return WordBasedResult('PRESENT')
    else:
        return WordBasedResult('NONE')
def parse_distance_from_s_node(sentence):
    '''How far the current node (its POS) is from an S-parent.'''
    parsetree = sentence.parsetree
    distances = []
    for leaf_idx in parsetree.treepositions(order='leaves'):
        node = parsetree[leaf_idx[:-1]]
        climbed = 1
        # Some leaves are not direct descendants of an S node (all of them
        # are always under ROOT, though).
        # Don't believe me? Try to parse this sentence:
        # - "And Rosneft benefits from BP's expertise in exploring in
        #   difficult and potentially hazardous conditions."
        try:
            while not (node.node.startswith('S') or node.node == 'ROOT'):
                node = node.parent()
                climbed += 1
        except AttributeError:
            # Fell off the top of the tree: keep the distance walked so far.
            pass
        distances.append(WordBasedResult(climbed))
    return SentenceBasedResult(tuple(distances))
def same_temp_modality(from_obj, to_obj, document):
    """True when both objects are temporal expressions with the same `mod`
    attribute; False otherwise."""
    both_timexes = (isinstance(from_obj, TemporalExpression)
                    and isinstance(to_obj, TemporalExpression))
    if not both_timexes:
        return WordBasedResult(False)
    return WordBasedResult(from_obj.mod == to_obj.mod)
def to_temp_modality(from_obj, to_obj, document):
    """The `mod` attribute of `to_obj` when it is a temporal expression;
    '_' otherwise."""
    if not isinstance(to_obj, TemporalExpression):
        return WordBasedResult('_')
    return WordBasedResult(to_obj.mod)
def same_temp_type(from_obj, to_obj, document):
    """True when both objects are temporal expressions with the same `ttype`
    attribute; False otherwise."""
    both_timexes = (isinstance(from_obj, TemporalExpression)
                    and isinstance(to_obj, TemporalExpression))
    if not both_timexes:
        return WordBasedResult(False)
    return WordBasedResult(from_obj.ttype == to_obj.ttype)
def to_temp_type(from_obj, to_obj, document):
    """The `ttype` attribute of `to_obj` when it is a temporal expression;
    '_' otherwise."""
    if not isinstance(to_obj, TemporalExpression):
        return WordBasedResult('_')
    return WordBasedResult(to_obj.ttype)