def check_regexes(ambs_found, lexicon, req, sentence, sentence_start_index, _):
    # Go over all regular expressions in lexicon
    for _, amb_obj in lexicon.items():
        # Create Python regular expression object
        regexp = regex.compile(amb_obj['regexp'], flags=regex.I | regex.X)
        # Search for all regexps in requirement
        for match in regex.finditer(regexp, sentence):
            ambs_found[req.id].append(
                create_ambiguity_object(
                    amb_obj,
                    text=match[0],
                    index_start=sentence_start_index + match.start(),
                    index_end=sentence_start_index + match.end()))
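
# Illustrative sketch, not part of the checker pipeline: the lexicon entry,
# sentence, and offset below are made-up examples. It only shows the core of
# check_regexes, i.e. compiling a lexicon pattern with the case-insensitive
# and verbose flags and shifting match offsets by sentence_start_index.
def _demo_check_regexes():
    import regex
    amb_obj = {'regexp': r'\b (?: as \s+ appropriate | if \s+ needed ) \b'}  # assumed entry
    sentence = 'The system shall log errors as appropriate.'
    sentence_start_index = 120  # assumed offset of the sentence within the requirement text
    regexp = regex.compile(amb_obj['regexp'], flags=regex.I | regex.X)
    for match in regex.finditer(regexp, sentence):
        print(match[0],
              sentence_start_index + match.start(),
              sentence_start_index + match.end())
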
def find_ambiguities_through_lexicon(amb_obj, ambs_found, req, sentence, sentence_start_index):
    for word_phrase in amb_obj['lexicon']:
        # Search for all occurrences of the word phrase in the sentence
        for match in regex.finditer(whole_phrase_regexp(word_phrase), sentence):
            # Save each matched phrase as a found ambiguity
            ambs_found[req.id].append(
                create_ambiguity_object(
                    amb_obj,
                    text=match[0],
                    index_start=sentence_start_index + match.start(),
                    index_end=sentence_start_index + match.end()))
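
# Illustrative sketch: whole_phrase_regexp is defined elsewhere in the project,
# so _whole_phrase below is only a hypothetical stand-in for it (match the
# phrase as whole words). The lexicon entry and sentence are made-up examples;
# the loop mirrors how find_ambiguities_through_lexicon reports offsets.
def _demo_lexicon_lookup():
    import regex

    def _whole_phrase(word_phrase):  # hypothetical stand-in for whole_phrase_regexp
        return r'\b' + regex.escape(word_phrase) + r'\b'

    amb_obj = {'lexicon': ['user friendly', 'fast']}  # assumed lexicon entry
    sentence = 'The interface shall be user friendly and fast.'
    for word_phrase in amb_obj['lexicon']:
        for match in regex.finditer(_whole_phrase(word_phrase), sentence):
            print(word_phrase, match.start(), match.end())
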
def check_compounds_nouns(ambs_found, lexicon, req, sentence, sentence_start_index, doc):
    # Go over all phrases in lexicon
    for _, amb_obj in lexicon.items():
        for chunk in doc.noun_chunks:
            compound_list = [
                token for token in chunk if contains_noun_tokens(token, chunk)
            ]
            if len(compound_list) > 2:
                new_text, new_indexes = get_text_and_indexes(compound_list)
                ambs_found[req.id].append(
                    create_ambiguity_object(
                        amb_obj,
                        text=new_text,
                        index_start=sentence_start_index + new_indexes[0],
                        index_end=sentence_start_index + new_indexes[1]))
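
# Illustrative sketch of the noun-chunk pass in check_compounds_nouns. The
# pos_ check stands in for contains_noun_tokens, whose exact rules live
# elsewhere in the project, and the sentence is a made-up example. Assumes
# spaCy and the en_core_web_sm model are installed.
def _demo_compound_nouns():
    import spacy
    nlp = spacy.load('en_core_web_sm')
    doc = nlp('The system shall store the customer account data backup file.')
    for chunk in doc.noun_chunks:
        compound_list = [token for token in chunk if token.pos_ in ('NOUN', 'PROPN')]
        # More than two nouns in one chunk suggests an ambiguous compound
        if len(compound_list) > 2:
            print(chunk.text, '->', [t.text for t in compound_list])
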
def check_nominals(ambs_found, lexicon, req, sentence, sentence_start_index, doc):
    # Go over all phrases in lexicon
    for _, amb_obj in lexicon.items():
        # Generate a list of gerund nominalizations that have POS tag VBG
        nominalizations = [
            [t for t in token.subtree] for token in doc
            if (token.text[-3:] in amb_obj['gerund']
                or token.text[-4:] in amb_obj['gerund_plural'])
            and token.tag_ == 'VBG'
            and token.dep_ not in ('ROOT', 'aux', 'advmod', 'compound', 'acl')
            and doc[token.i - 1].dep_ != 'aux'
            and token.text.lower() not in amb_obj['rule_exceptions']
        ]
        # Generate a list of nominalizations with POS tag NN based on suffixes
        nouns = [
            token for token in doc
            if (token.lemma_[-4:] in amb_obj['suffixes_len4']
                or token.lemma_[-3:] in amb_obj['suffixes_len3']
                or token.lemma_[-2:] in amb_obj['suffixes_len2'])
            and token.tag_ in ('NN', 'NNS')
            and wn.synsets(token.text)
        ]
        # Filter the list of nouns based on the WordNet semantic hierarchy
        for token in nouns:
            # Generate and flatten the list of hypernyms for each noun
            hypernyms = list(
                map(lambda x: x.name().split('.')[0],
                    sum(wn.synsets(token.text)[0].hypernym_paths(), [])))
            # Only consider nouns that express an event or a process
            if [l for l in hypernyms if l in ['event', 'process', 'act']] \
                    and token.text.lower() not in amb_obj['rule_exceptions']:
                nominalizations.append([t for t in token.subtree])
        # Save all ambiguous nominalization sequences found
        for token_seq in nominalizations:
            if token_seq:
                new_text, new_indexes = get_text_and_indexes(token_seq)
                ambs_found[req.id].append(
                    create_ambiguity_object(
                        amb_obj,
                        text=new_text,
                        index_start=sentence_start_index + new_indexes[0],
                        index_end=sentence_start_index + new_indexes[1]))
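
# Illustrative sketch of the WordNet step in check_nominals: flatten the
# hypernym paths of a noun's first synset and test whether it falls under
# 'event', 'process', or 'act'. The word is an example value; assumes the
# NLTK WordNet corpus has been downloaded (nltk.download('wordnet')).
def _demo_hypernym_filter():
    from nltk.corpus import wordnet as wn
    word = 'verification'  # example noun with a nominalization-style suffix
    hypernyms = list(
        map(lambda x: x.name().split('.')[0],
            sum(wn.synsets(word)[0].hypernym_paths(), [])))
    print(hypernyms)
    print(bool([l for l in hypernyms if l in ['event', 'process', 'act']]))
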
def check_pos_regexes(ambs_found, lexicon, req, sentence, sentence_start_index, doc):
    # Get the original indexes, before the truple design shifted them
    def get_original_indexes(req_original_string, req_tokenized_string,
                             req_truple_string, match):
        # Add up the extra letters (indexes) introduced by the truple design
        def count_extra_indexes(up_to_index):
            # Count the extra letters in a given truple
            def count_extra_letters(req_truple):
                try:
                    split = req_truple.split('°')
                    return len(split[1]) + len(split[2]) + 2
                except IndexError:
                    return 0

            # Calculate the space added by the tokenization process
            def count_tokenize_space(req_original_string, req_tokenized_string):
                orig_i = 0
                tokn_i = 0
                while tokn_i < len(req_tokenized_string):
                    if req_original_string[orig_i] != req_tokenized_string[tokn_i]:
                        tokn_i += 1
                        continue
                    orig_i += 1
                    tokn_i += 1
                return tokn_i - orig_i

            # Remove the string after the index
            words_pre_index = req_truple_string[:up_to_index].split()
            # Calculate the extra indexes added by the truple system
            extra_truple_indexes = sum([count_extra_letters(req_truple)
                                        for req_truple in words_pre_index])
            # Update 'up_to_index' to discount the extra truple indexes found so far
            up_to_index = up_to_index - extra_truple_indexes
            # Calculate the extra indexes added by the tokenizing process
            extra_tokenize_space = count_tokenize_space(
                req_original_string[:up_to_index],
                req_tokenized_string[:up_to_index])
            return extra_truple_indexes + extra_tokenize_space

        return (match.start() - count_extra_indexes(match.start()),
                match.end() - count_extra_indexes(match.end()))

    # Create a list of truple strings (word, POS tag, lemma) with a degree symbol between each part
    truple_list = ['{0}°{1}°{2}'.format(token.text, token.tag_, token.lemma_)
                   for token in doc]
    # Create variables for easier and more readable use later
    req_original_string = sentence
    req_tokenized_string = ' '.join([token.text for token in doc])
    # Convert into a single string so regexes can be run over it
    req_truple_string = ' '.join(truple_list)

    # Check against each regular expression in the lexicon
    for _, amb_obj in lexicon.items():
        # Create Python regular expression object
        regexp = regex.compile(amb_obj['regexp'], flags=regex.I | regex.X)
        # Search for all regexps in requirement
        for match in regex.finditer(regexp, req_truple_string):
            # Get the original indexes, since the truple string design shifts them
            orig_indexes = get_original_indexes(
                req_original_string, req_tokenized_string, req_truple_string, match)
            orig_text = ' '.join([req_truple.split('°')[0]
                                  for req_truple in match[0].split()])
            # Save this found ambiguity
            ambs_found[req.id].append(create_ambiguity_object(
                amb_obj,
                text=orig_text,
                index_start=sentence_start_index + orig_indexes[0],
                index_end=sentence_start_index + orig_indexes[1]))
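
# Illustrative sketch of the truple encoding that check_pos_regexes matches
# against: each token is rendered as word°TAG°lemma, so a lexicon pattern can
# refer to POS tags and lemmas directly. The pattern below is an example, not
# an entry from the real lexicon; assumes the en_core_web_sm model is installed.
def _demo_truple_matching():
    import regex
    import spacy
    nlp = spacy.load('en_core_web_sm')
    doc = nlp('The system should quickly process the request.')
    req_truple_string = ' '.join(
        '{0}°{1}°{2}'.format(token.text, token.tag_, token.lemma_) for token in doc)
    print(req_truple_string)
    # Example POS-aware pattern: an adverb (RB) followed by a verb (VB, VBZ, ...)
    regexp = regex.compile(r'\S+°RB°\S+ \s \S+°VB\w*°\S+', flags=regex.I | regex.X)
    for match in regex.finditer(regexp, req_truple_string):
        print(match[0], match.start(), match.end())
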