def select(arguments, premise, verbose=True):
    """Select a token or a full phrase as the new premise.

    Replaces the premise's tokens with the selected span, reparses the
    premise, and returns it together with the projectivity associated
    with the selected token's polarity.

    Args:
        arguments: pair ``(select_token, select_type)`` where
            ``select_type`` carries a ``.data`` attribute (e.g. 'full').
        premise: premise object exposing ``tokens``, ``words``,
            ``update`` — assumed mutable in place; TODO confirm contract.
        verbose: if True, print the resulting premise.

    Returns:
        Tuple ``(premise, projectivity)``.
    """
    # Get token to be selected and selection type
    select_token, select_type = arguments
    select_type = select_type.data
    # If the full phrase should be selected, include all dependents
    if select_type == 'full':
        select_tokens = easy_parse.get_dependent_tokens(premise.tokens,
                                                        select_token)
        select_tokens.append(select_token)
    else:
        # Fallback: select only the anchor token itself. Without this
        # branch, any select_type other than 'full' raised a NameError
        # because select_tokens was never bound.
        select_tokens = [select_token]
    # Sort selected tokens by ID so they keep sentence order
    select_tokens_sorted = sorted(select_tokens, key=lambda token: token.id)
    # Get new premise tokens
    premise.tokens = list(select_tokens_sorted)
    # Reparse and update premise
    premise = reparse(premise)
    premise.update(premise)
    if verbose:
        print('New premise:', " ".join(premise.words))
    # Get projectivity
    projectivity = rel_pol.projectivity_dict[select_token.polarity]
    # Return new premise and projectivity
    return premise, projectivity
def get_entities_with_roles(parsed_text):
    """Get entities with syntactic role S, O, X, P or G.

    For every sentence in *parsed_text*, builds Token/Sentence instances,
    tags entity tokens in place with their role (S=subject, O=object,
    X=other/merged, P=possessive pronoun, G=genitive modifier), and — if
    ``settings.reduce_weights`` is on — reduces tag weights for entities
    embedded in prepositional phrases, relative clauses, subjunctional
    clauses, or participle constructions.

    Returns:
        A list (one entry per sentence) of lists of tagged entity tokens.
    """
    # Initializing
    entities_with_roles = []
    passive_counter = 0  # counted but not returned; kept for side-effect parity
    for sentence in parsed_text:
        # transform tokens into Token class instances
        tokens = [parse_info.Token(k) for k in sentence]
        # resolve and count passive constructions if necessary
        if settings.passive_on:
            passives = parse_info.adjust_passive(tokens)
            if passives == True:
                passive_counter += 1
        # transform tokens into Sentence class instance
        sent = parse_info.Sentence(tokens)
        # get all subjects and objects of Sentence
        # keep subject/object only if it is a noun or marked as coreferent entity
        subjs = sent.subj()
        subjs_lemma = [
            t for t in subjs if (t.sim_pos == 'N' or t.coref != '_')
        ]
        objs = sent.obj()
        objs_lemma = [t for t in objs if (t.sim_pos == 'N' or t.coref != '_')]
        # get all words from full subj and obj noun phrases
        # (used later to decide where merged P/G entities belong)
        full_subjs = [
            t for t in list(
                chain.from_iterable([
                    parse_info.get_full_phrase(tokens, subj) for subj in subjs
                ])) if (t.sim_pos == 'N' or t.coref != '_')
        ]
        full_objs = [
            t for t in list(
                chain.from_iterable(
                    [parse_info.get_full_phrase(tokens, obj) for obj in objs]))
            if (t.sim_pos == 'N' or t.coref != '_')
        ]
        # get all possessive pronouns (category 'P')
        poss_pronouns = [
            t for t in tokens if (t.coref != '_' and (t.full_pos == 'PPOSAT'))
        ]
        # get all genitive modifiers (category 'G')
        genitive_mods = [
            t for t in tokens
            if ((t.coref != '_' or t.sim_pos == 'N') and t.function == 'gmod')
        ]
        # get all nouns that are not contained in the subj or obj noun phrase
        others = [
            t for t in tokens
            if ((t.sim_pos == 'N') and t not in subjs_lemma + objs_lemma)
        ]
        # get prepositions
        preps = [t for t in tokens if t.function == 'pp']
        # if genitive cat is on, remove genitives from 'others'
        # NOTE(review): the collapsed original leaves it ambiguous whether the
        # G-assignment loop below is guarded by cat_g_on; nested here to mirror
        # the cat_p_on block and the final cat_g_on gathering — verify.
        if settings.cat_g_on:
            others = [t for t in others if t.function != 'gmod']
            # assign cat G to genitive modifiers; or merge with category P into X
            for g in genitive_mods:
                if not settings.merge_p_and_g:
                    g.tag = 'G'
                # if category P and G are merged into one (X), attach the
                # genitive to the phrase it belongs to and tag it X
                else:
                    if g in full_subjs:
                        subjs_lemma.append(g)
                    elif g in full_objs:
                        objs_lemma.append(g)
                    else:
                        others.append(g)
                    g.tag = 'X'
        # Assign tag X to "other" category tokens
        for x in others:
            x.tag = 'X'
        # if possessive category is on,
        if settings.cat_p_on:
            # assign cat P to possessive pronouns, or merge with category G
            for p in poss_pronouns:
                if not settings.merge_p_and_g:
                    p.tag = 'P'
                # if category P and G are merged into one (X)
                else:
                    if p in full_subjs:
                        subjs_lemma.append(p)
                    elif p in full_objs:
                        objs_lemma.append(p)
                    else:
                        others.append(p)
                    p.tag = 'X'
        # Assign tag O to objects
        for o in objs_lemma:
            o.tag = 'O'
        # Assign tag S to subjects (after O, so shared tokens end up S)
        for s in subjs_lemma:
            s.tag = 'S'
        # get entity tokens inside prepositional phrases
        # (p_ent depends on a preposition that itself depends on another 'pn')
        prep_phrase = [
            (p_ent)
            for (p_ent, prep, ent) in itertools.product(tokens, preps, tokens)
            if p_ent.function == 'pn'
            and p_ent in subjs_lemma + objs_lemma + others + poss_pronouns +
            genitive_mods + full_subjs + full_objs
            and p_ent.dependency == prep.position
            and prep.dependency == ent.position and (ent.function == 'pn')
        ]
        # get relative pronouns
        rel_prons = [t for t in tokens if t.full_pos == 'PRELS']
        # get relative clauses (pronoun + finite predicate to its right)
        rel_clauses = [
            (k, j) for (k, j) in itertools.product(rel_prons, tokens)
            if j.function in ['rel', 'cj', 'objc']
            and j.full_pos.endswith('FIN') and j.position > k.position
        ]
        # mark relative clause tokens
        for (rel_pron, rel_pred) in rel_clauses:
            for token in tokens:
                if token.position >= rel_pron.position and token.position <= rel_pred.position:
                    token.rel = True
        # get conjunction candidates
        conjunctions = [
            t for t in tokens if t.full_pos == 'KOUS' and t.function == 'konj'
        ]
        # get conjunctions and their finite predicates
        conj_pred = [
            (k, j) for (k, j) in itertools.product(conjunctions, tokens)
            if j.full_pos.startswith('V') and j.full_pos.endswith('FIN')
            and j.function in ['root', 'neb'] and j.position == k.dependency
        ]
        # Mark all tokens within subjunctional clause
        for k, j in conj_pred:
            for t in tokens:
                if t.position >= k.position and t.position <= j.position:
                    t.subj = True
        # get present and past participles
        part_pres = [
            t for t in tokens if t.full_pos == 'ADJD'
            and t.morph.part == '<PPRES' and t.function in ['root', 'pn']
        ]
        part_praet = [
            t for t in tokens if t.full_pos == 'VVPP' and t.function == 'neb'
        ]
        # for each participle
        for part in part_pres + part_praet:
            # get full participle construction
            part_con = parse_info.get_dependent_tokens(tokens, part) + [part]
            part_con = parse_info.get_all_tokens(part_con, tokens)
            # set initial comma positions
            first_comma_position = None
            sec_comma_position = None
            # find the commas closest to the participle on either side
            for comma in [t for t in part_con if t.lemma == ',']:
                if comma.position < part.position:
                    first_comma_position = comma.position
                if comma.position > part.position:
                    sec_comma_position = comma.position
            # cut participle construction at commas (only in-between)
            part_con = [
                k for k in part_con
                if (first_comma_position == None
                    or first_comma_position < k.position)
                and (sec_comma_position == None
                     or sec_comma_position > k.position)
            ]
            # mark tokens in participle construction
            for token in part_con:
                token.part = True
        # Reduce weights for tokens in prepositional phrases, relative and
        # subjunctional clauses, and participle constructions
        if settings.reduce_weights:
            for p in prep_phrase:
                if p.tag != '':
                    p.reduce_tag()
            for t in tokens:
                if t.rel and t.tag != '':
                    t.reduce_tag()
                if t.part and t.tag != '':
                    t.reduce_tag()
                if t.subj and t.tag != '':
                    t.reduce_tag()
        # list of all entities
        all_entities = subjs_lemma + objs_lemma + others
        if not settings.merge_p_and_g:
            # append cat P and G entities (only if their categories are on)
            if settings.cat_p_on:
                all_entities = all_entities + poss_pronouns
            if settings.cat_g_on:
                all_entities = all_entities + genitive_mods
        entities_with_roles.append(all_entities)
    return entities_with_roles
def set_polarity_scope(self):
    """Set the polarity scope of quantifiers and negation.

    Every token starts with upward polarity and no specific
    projectivity. Tokens within the scope of a quantifier determiner get
    their polarity (and, where defined, a quantifier-specific
    projectivity) updated; tokens in the VP of a negated sentence get
    downward polarity and the negation projectivity.

    Returns:
        True on success; False if the root or subject of a negated
        sentence cannot be found (a warning is emitted).
    """
    # Set all token polarities to "up" by default
    for token in self.tokens:
        token.polarity = 'up'
        # Bug fix: the attribute was misspelled 'specific_projectiviy'
        # here, so the real attribute read elsewhere was never reset.
        token.specific_projectivity = None
    # For each token, get polarity
    for token in self.tokens:
        # If token is a quantifier determiner
        if token.lemma in rel_pol.monotonicity_dict \
                and token.deprel == 'det':
            # Get first argument (noun phrase the quantifier refers to)
            arg_1 = next(t for t in self.tokens if token.head == t.id)
            arg_1 = [t for t in easy_parse.get_dependent_tokens(
                self.tokens, arg_1)] + [arg_1]
            # Get second argument (VP): everything under the root that
            # is not part of the first argument
            arg_2 = next(t for t in self.tokens if t.deprel == 'ROOT')
            arg_2 = [t for t in easy_parse.get_dependent_tokens(
                self.tokens, arg_2) if t not in arg_1]
            # Set polarity (up/down/non) for tokens in scope
            for t in arg_1:
                t.polarity = get_new_polarity(
                    t.polarity, rel_pol.monotonicity_dict[token.lemma][0])
            for t in arg_2:
                t.polarity = get_new_polarity(
                    t.polarity, rel_pol.monotonicity_dict[token.lemma][1])
            # If there is a specific projection for this quantifier, set it
            if token.lemma in rel_pol.quantifier_projection_dict:
                # First argument of operator
                for t in arg_1:
                    t.specific_projectivity = \
                        rel_pol.quantifier_projection_dict[token.lemma][0]
                # Second argument of operator
                for t in arg_2:
                    t.specific_projectivity = \
                        rel_pol.quantifier_projection_dict[token.lemma][1]
        # If negation
        if token.lemma == 'not':
            # Get root the negation attaches to
            try:
                root = next(t for t in self.tokens
                            if t.deprel in ['ROOT'] and t.id == token.head)
            except StopIteration:
                warnings.warn('Root not found')
                return False
            # Get subject of that root
            try:
                subj = next(t for t in self.tokens
                            if t.deprel in ['nsubj', 'nsubjpass']
                            and t.head == root.id)
            except StopIteration:
                # Bug fix: this failure concerns the subject, but the
                # original warned 'Root not found'.
                warnings.warn('Subject not found')
                return False
            # Get full subject and VP
            full_subj = [t for t in easy_parse.get_dependent_tokens(
                self.tokens, subj)]
            VP = [t for t in easy_parse.get_dependent_tokens(
                self.tokens, root) if t not in full_subj]
            # Set downward polarity for tokens in the VP
            for t in VP:
                t.specific_projectivity = rel_pol.negation_projectivity_dict
                t.polarity = 'down'
    return True
def same_phrase(args, premise, verbose=True):
    """Check whether two phrases are equal, or equal but negated.

    Args:
        args: ``(current_anchor, other_anchor)`` or
            ``(current_anchor, other_anchor, is_negated)``; ``is_negated``
            carries a ``.data`` attribute, where 'neg' marks a negated pair.
        premise: premise object; its ``other_premise`` attribute holds the
            premise to compare against.
        verbose: unused here; kept for interface consistency with the
            other premise operations.

    Returns:
        True if the two lemmatized, punctuation-free phrases contain the
        same lemmas (order-independent), after normalizing negation when
        requested.
    """
    negated = False
    # Get other premise
    other_premise = premise.other_premise
    # Get anchor tokens, and is_negated value
    if len(args) == 2:
        current_prem_anchor_token, other_prem_anchor_token = args
    if len(args) == 3:
        current_prem_anchor_token, other_prem_anchor_token, is_negated = args
        # Only a 3-argument call supplies is_negated; testing it outside
        # this branch raised a NameError for 2-argument calls.
        if is_negated.data.strip() == 'neg':
            negated = True

    def _phrase_lemmas(tokens, anchor):
        """Lemmas of anchor + dependents, minus adverbial clauses under ROOT,
        markers and punctuation; deduplicated."""
        dep_tokens = easy_parse.get_dependent_tokens(tokens, anchor)
        # If the anchor is ROOT, remove all adverbial-clause dependents
        if anchor.deprel == 'ROOT':
            advcl_dep_tokens = []
            for adv in [t for t in dep_tokens if t.deprel == 'advcl']:
                advcl_dep_tokens += easy_parse.get_dependent_tokens(tokens, adv)
                advcl_dep_tokens.append(adv)
            excluded = set(advcl_dep_tokens)
            dep_tokens = [t for t in dep_tokens if t not in excluded]
        # Include the anchor token itself
        dep_tokens.append(anchor)
        # Drop markers/punctuation; deduplicate lemmas.
        # (The original sorted by token id before deduplicating, but the
        # set() immediately discarded that order and the result is sorted
        # again before comparison — the intermediate sort was dead work.)
        return list(set(t.lemma for t in dep_tokens
                        if t.deprel not in ['mark', 'punct']))

    sorted_current_tokens = _phrase_lemmas(premise.tokens,
                                           current_prem_anchor_token)
    sorted_other_tokens = _phrase_lemmas(other_premise.tokens,
                                         other_prem_anchor_token)
    # If negated version, add a negation to the non-negated sentence
    # (for easy comparison)
    if negated:
        # Add "not" or "do not" to the side that lacks a negation
        if 'not' not in sorted_current_tokens:
            if current_prem_anchor_token.lemma in ['be', 'can', 'must']:
                sorted_current_tokens.append('not')
            else:
                sorted_current_tokens += ['do', 'not']
        if 'not' not in sorted_other_tokens:
            if other_prem_anchor_token.lemma in ['be', 'can', 'must']:
                sorted_other_tokens.append('not')
            else:
                sorted_other_tokens += ['do', 'not']
    # Return whether the two lemma multisets are identical
    return sorted(sorted_current_tokens) == sorted(sorted_other_tokens)
def delete(arguments, premise, verbose=True):
    """Delete a token (or its full phrase) from the premise sentence.

    Removes the target token — plus its dependents when the delete type
    is 'full', plus surrounding commas for relative clauses and a
    trailing comma/colon/semicolon — then renumbers the remaining
    tokens, reparses the premise, and returns it with the projectivity
    for the deleted token's polarity.
    """
    # Get token to be deleted, and (if available) delete type
    if len(arguments) == 2:
        del_token, del_type = arguments
        del_type = del_type.data
    else:
        del_token = arguments[0]
        del_type = None
    # Resolve the deletion target: by lemma match for a string argument,
    # otherwise by token identity via same_token
    if isinstance(del_token, str):
        for i, tok in enumerate(premise.tokens):
            if tok.lemma == del_token:
                del_token = tok
                break
    else:
        for i, tok in enumerate(premise.tokens):
            if tok.same_token(del_token):
                del_token = tok
    # Printing
    if verbose:
        print('\nDELETE', del_token.lemma)
        print('Old premise:', " ".join(premise.words))
    # Get all tokens that have to be deleted
    del_tokens = [del_token]
    # If full phrase should be deleted, include all dependents
    if del_type == 'full':
        del_phrase = easy_parse.get_dependent_tokens(premise.tokens, del_token)
        del_tokens += del_phrase
    # Sort tokens to be deleted by ID
    del_tokens_sorted = sorted(del_tokens, key=lambda token: token.id)
    # All positions of tokens to be deleted
    del_tokens_positions = []
    # Save positions (indices into premise.tokens) of tokens to be deleted
    for i, k in enumerate(premise.tokens):
        if k in del_tokens_sorted:
            del_tokens_positions.append(i)
    # Get first and last token
    last_del_token = max(del_tokens_positions)
    first_del_token = min(del_tokens_positions)
    # If relative clause, widen the span by one on each side to also
    # delete the delimiting commas
    if 'relcl' in [t.deprel for t in del_tokens_sorted]:
        first_del_token -= 1
        last_del_token += 1
        del_tokens_positions.append(first_del_token)
        del_tokens_positions.append(last_del_token)
    # If necessary, increase position of last token to be deleted
    # (e.g. trailing punctuation)
    if len(premise.tokens) > last_del_token+1 and premise.tokens[last_del_token+1].lemma in [',',':',';']:
        del_tokens_positions.append(last_del_token+1)
    # Number of deleted tokens
    n_del_tokens = len(del_tokens_positions)
    # Get all tokens that are not supposed to be deleted
    new_premise_tokens = [t for i, t in enumerate(premise.tokens)
                          if i not in del_tokens_positions]
    # Shift id/head values of tokens after the deleted span
    # NOTE(review): i indexes the *filtered* list while last_del_token is a
    # position in the original list, so the shift condition looks off by the
    # number of earlier deletions — verify; reparse() below may recompute
    # ids/heads anyway.
    for i, token in enumerate(new_premise_tokens):
        if i >= last_del_token:
            token.id = i - n_del_tokens
        if token.head >= last_del_token:
            token.head = token.head - n_del_tokens
    # Save new premise tokens
    premise.tokens = new_premise_tokens
    # Reparse and update premise
    premise = reparse(premise)
    premise.update(premise)
    if verbose:
        print('New premise:', " ".join(premise.words))
    # Get projectivity
    projectivity = rel_pol.projectivity_dict[del_token.polarity]
    # Return new premise and projectivity
    return premise, projectivity