def _check_prepositions(words, real_sentence, sent_decontracted,
                        tagged_words) -> List[SingleResult]:
    """Check prepositions, postpositions and coordinating conjunctions."""
    prepositions_errors = []
    for i, (word, tag, _) in enumerate(tagged_words):
        # ADP covers prepositions and postpositions; CCONJ covers
        # coordinating conjunctions.
        if tag in ('ADP', 'CCONJ'):
            results = _predict_words(words, i, num_results=5)
            results_dict = {
                result['word']: result['softmax']
                for result in results
            }
            softmax_sum = sum(results_dict.values())
            if word not in results_dict and softmax_sum > 0.8:
                logging.info(f"CANDIDATES for {word}: {results_dict}")
                sr = SingleResult(
                    category="HINTS",
                    context=_get_mistake_context(number_of_neighbors=4,
                                                 words=words,
                                                 current_word_id=i),
                    errorLength=len(word),
                    matchedText=word,
                    message="Perhaps incorrect preposition usage.",
                    offset=sent_decontracted.find(word),
                    offsetInContext=None,
                    replacements=list(results_dict.keys())[:3],
                    ruleId='INCORRECT_WORD_USAGE',
                    ruleIssueType='Hint',
                    sentence=real_sentence)
                sr.offsetInContext = sr.context.find(word)
                prepositions_errors.append(sr)
    return prepositions_errors
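

# The checkers in this module call a module-level _predict_words helper that
# is not part of this excerpt.  Below is a minimal sketch of its assumed
# contract, built on the HuggingFace fill-mask pipeline; the model name, the
# lazy pipeline construction, and the whitespace-join masking are
# illustrative assumptions, not the project's actual implementation.
from functools import lru_cache
from typing import Dict, List, Optional


@lru_cache(maxsize=1)
def _fill_mask_pipeline():
    from transformers import pipeline  # assumed dependency for this sketch
    return pipeline("fill-mask", model="bert-base-uncased")


def _predict_words_sketch(words: List[str],
                          position: int,
                          num_results: int = 5,
                          options: Optional[List[str]] = None) -> List[Dict]:
    """Mask the word at `position` and return candidate replacements
    shaped like {'word': ..., 'softmax': ...}, as the checkers expect."""
    fill_mask = _fill_mask_pipeline()
    masked = list(words)
    masked[position] = fill_mask.tokenizer.mask_token
    predictions = fill_mask(" ".join(masked),
                            top_k=num_results,
                            targets=list(options) if options else None)
    return [{"word": p["token_str"].strip(), "softmax": p["score"]}
            for p in predictions]

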
def _check_artickes(words, sentence, sent_decontracted, tagged_words):
    """Check article usage before country names (GPE entities)."""
    doc = spacy_tokenizer(sentence)
    errs = []
    for ent in doc.ents:
        if ent.label_ == 'GPE':
            NPs = [
                np.text.lower().split() for np in doc.noun_chunks
                if (np.text.find(ent.text) > -1
                    or ent.text.find(np.text) > -1) and np.text != 'i'
            ]
            NPs_list = list(itertools.chain.from_iterable(NPs))
            chunk = ' '.join(NPs_list)
            GPE_subphrase = set(ent.text.lower().split()).union(set(NPs_list))
            logging.info(f"GPE noun chunk: {chunk}")
            logging.info(f"GPE set of tokens = {GPE_subphrase}")
            try:
                if chunk.find('the') > -1:
                    # The phrase carries an article: flag countries that
                    # must be used without one.
                    for country in NO_COUNTRIES:
                        diff = GPE_subphrase.difference(set(country.split()))
                        if diff == {'the'} or diff == {'a'}:
                            raise ArticleMistake(
                                position_token=list(diff).pop(),
                                replacement='',
                                message="There should not be an article in the position: {}",
                                position=sentence.find(ent.text))
                else:
                    # No article in the phrase: flag countries that require 'the'.
                    for country in THE_COUNTRIES:
                        if set(country.split()).difference(GPE_subphrase) == {'the'}:
                            raise ArticleMistake(
                                position_token=ent.text,
                                replacement='the',
                                message="Article 'the' is needed in the position: {}",
                                position=sentence.find(ent.text))
            except ArticleMistake as e:
                logging.info(e.message)
                sr = SingleResult(category="HINTS",
                                  context='',
                                  errorLength=len(e.position_token),
                                  matchedText=e.position_token,
                                  message=e.message,
                                  offset=sentence.find(e.position_token),
                                  replacements=[e.replacement],
                                  ruleId="ARTICLES_BEFORE_COUNTRIES",
                                  ruleIssueType='Hint',
                                  sentence=sentence)
                sr.offsetInContext = 0
                errs.append(sr)
    return errs
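

# The article checker above raises ArticleMistake as a control-flow signal;
# the class itself is defined elsewhere in the project.  A minimal sketch of
# the fields the except-handler reads (position_token, replacement, message,
# position).  The constructor below is an assumption, and the class is named
# *Sketch so it does not shadow the real one:
class ArticleMistakeSketch(Exception):
    """Details of a suspected article error before a country name."""

    def __init__(self, position_token: str, replacement: str,
                 message: str, position: int) -> None:
        # The handler uses e.message verbatim, so the position is assumed
        # to be substituted into the message template here.
        super().__init__(message.format(position))
        self.position_token = position_token
        self.replacement = replacement
        self.message = message.format(position)
        self.position = position

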
def _static_rules(sentence: str) -> List[SingleResult]:
    """
    :param sentence: sentence from the user's text
    :return: list of errors found with heuristic methods
    """
    errors_caught_with_static_rules = []
    words = word_tokenize(sentence)
    if "peoples" in words:
        context = _get_mistake_context(2, words, words.index("peoples"))
        errors_caught_with_static_rules.append(
            SingleResult(
                category="HINTS",
                context=context,
                errorLength=len("peoples"),
                matchedText='peoples',
                message="The word 'peoples' may be used in the context of "
                        "different nations. In other cases 'people' should be used.",
                offset=sentence.find('peoples'),
                offsetInContext=context.find("peoples"),
                replacements=['people'],
                ruleId="PEOPLE_PEOPLES",
                ruleIssueType='Hint',
                sentence=sentence))
    return errors_caught_with_static_rules
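

# _static_rules and the checkers above rely on a module-level
# _get_mistake_context helper that is outside this excerpt.  Its behaviour is
# assumed to mirror the nested get_mistake_context in _create_single_error
# below; a minimal sketch:
def _get_mistake_context_sketch(number_of_neighbors: int, words: List[str],
                                current_word_id: int) -> str:
    """Return the flagged word together with up to `number_of_neighbors`
    words on each side of it."""
    left = max(current_word_id - number_of_neighbors, 0)
    right = min(current_word_id + number_of_neighbors, len(words) - 1)
    return " ".join(words[left:right + 1])

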
def _create_single_error(
        word: str,
        words: List[str],
        current_word_position: int,
        sent_decontracted: str,
        sentence: str,
        category: str,
        message: str,
        replacements: List[str],
        rule_id: str,
        rule_issue_type: str,
        matched_text: Optional[str] = None,
) -> SingleResult:
    """Create a SingleResult error description object."""

    @logged(logger)
    def get_mistake_context(number_of_neighbors, words, current_word_id):
        last_index = len(words)
        left_border = max(current_word_id - number_of_neighbors, 0)
        right_border = min(current_word_id + number_of_neighbors, last_index)
        return " ".join(words[left_border:right_border + 1])

    context = get_mistake_context(number_of_neighbors=2,
                                  words=words,
                                  current_word_id=current_word_position)
    error_length = len(word)
    matched_text = matched_text if matched_text else word
    err_sr = SingleResult(category=category,
                          context=context,
                          errorLength=error_length,
                          matchedText=matched_text,
                          message=message,
                          offset=sent_decontracted.find(matched_text),
                          offsetInContext=None,
                          replacements=replacements,
                          ruleId=rule_id,
                          ruleIssueType=rule_issue_type,
                          sentence=sentence)
    err_sr.offsetInContext = err_sr.context.find(matched_text)
    return err_sr
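

# A hypothetical call illustrating the argument shape _create_single_error
# expects; the sentence, rule id, and replacement are invented for the
# example and do not correspond to rules that exist in this module:
def _example_create_single_error() -> SingleResult:
    words = ["these", "informations", "are", "useful"]
    return _create_single_error(
        word="informations",
        words=words,
        current_word_position=1,
        sent_decontracted="these informations are useful",
        sentence="these informations are useful",
        category="HINTS",
        message="Perhaps an uncountable noun is used in the plural.",
        replacements=["information"],
        rule_id="UNCOUNTABLE_NOUN",
        rule_issue_type="Hint",
    )

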
def _check_auxiliary_verbs(
        words: List[str], sentence: str, sent_decontracted: str,
        tagged_words: List[Tuple[str, str, str]]
) -> Tuple[List[SingleResult], Optional[str]]:
    '''
    :param words: list of words in the sentence
    :param sentence: real sentence as written by the user, not corrected in any way
    :param sent_decontracted: sentence that has been checked and corrected
        with LT, with contractions expanded
    :param tagged_words: [(word, universal_pos, detailed_pos), ...]
    :return: list of found mistakes and a fixed sentence. The fixed sentence
        is returned only when there is a high probability that the predicted
        verb belongs to another tense group
    '''
    possible_errors = []
    fixed_sentence: Optional[str] = None
    for i, (word, pos, _) in enumerate(tagged_words):
        if pos == 'AUX':
            word = word.lower()
            err, to_be_corrected = False, False
            results = _predict_words(words,
                                     i,
                                     options=AUX_FULL_FORMS,
                                     num_results=len(AUX_FULL_FORMS))
            first, second, third, fourth = results[0], results[1], results[2], results[3]
            logging.info(
                f"NEW: {word} -- {pos} -- {[first, second, third, fourth]}")
            if first['softmax'] > 0.79 and not _is_in_same_contracted_group(
                    word, first['word']):
                err, to_be_corrected = True, True
            elif first['softmax'] < 0.1 and \
                    not _is_in_same_contracted_group(word, first['word']) and \
                    not _is_in_same_contracted_group(word, second['word']) and \
                    not _is_in_same_contracted_group(word, third['word']) and \
                    not _is_in_same_contracted_group(word, fourth['word']):
                err = True
            elif not _is_in_same_contracted_group(word, first['word']) and \
                    not _is_in_same_contracted_group(word, second['word']) and \
                    not _is_in_same_contracted_group(word, third['word']):
                err = True
            replacements = [first['word'], second['word']]
            if err:
                if to_be_corrected:
                    fixed_sentence = _replace_token_in_sentence(
                        words, i, replacements[0])
                err_sr = SingleResult(
                    category="HINTS",
                    context=_get_mistake_context(number_of_neighbors=2,
                                                 words=words,
                                                 current_word_id=i),
                    errorLength=len(word),
                    matchedText=word,
                    message="Perhaps the wrong form of the auxiliary verb",
                    offset=sent_decontracted.find(word),
                    offsetInContext=None,
                    replacements=replacements,
                    ruleId='INCORRECT_AUXILIARY_VERB',
                    ruleIssueType='Hint',
                    sentence=sentence)
                err_sr.offsetInContext = err_sr.context.find(word)
                possible_errors.append(err_sr)
    return possible_errors, fixed_sentence
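

# _check_auxiliary_verbs treats an auxiliary and its contraction (or the
# other members of its form group) as a single choice via
# _is_in_same_contracted_group, which is defined elsewhere.  A plausible
# sketch assuming a hand-written table of form groups -- the table below is
# illustrative, not the project's actual data:
_CONTRACTED_GROUPS_SKETCH = [
    {"am", "'m"},
    {"is", "'s"},
    {"are", "'re"},
    {"have", "'ve"},
    {"has", "'s"},
    {"had", "'d"},
    {"will", "'ll"},
    {"would", "'d"},
]


def _is_in_same_contracted_group_sketch(word: str, candidate: str) -> bool:
    """Return True when `word` and `candidate` are the same auxiliary or
    belong to the same contraction group."""
    word, candidate = word.lower(), candidate.lower()
    if word == candidate:
        return True
    return any(word in group and candidate in group
               for group in _CONTRACTED_GROUPS_SKETCH)

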
def _check_word_usage(words, sentence, sent_decontracted, tagged_words):
    """Check for one-letter typos, to/too confusion and wrong adjective forms."""
    possible_errors_usages = []
    for i, (word, tag, _) in enumerate(tagged_words):
        # Check for one-letter words that are likely typos.
        if len(word) == 1 and word.isalpha() and word not in ONE_LETTER_WORDS and \
                not (word == 'e' and i + 1 < len(words) and words[i + 1] == '-'):
            sr = SingleResult(category="HINTS",
                              context=_get_mistake_context(
                                  number_of_neighbors=4,
                                  words=words,
                                  current_word_id=i),
                              errorLength=len(word),
                              matchedText=word,
                              message="Perhaps one letter is a typo.",
                              offset=sent_decontracted.find(word),
                              offsetInContext=None,
                              replacements=[],
                              ruleId='ONE_LETTER_WORD',
                              ruleIssueType='Hint',
                              sentence=sentence)
            sr.offsetInContext = sr.context.find(word)
            possible_errors_usages.append(sr)
        word = word.lower()
        to_too_correct = _check_to_too_is_correct(
            word, i, words) if word in ('to', 'too') else True
        if not to_too_correct:
            sr = SingleResult(category="HINTS",
                              context=_get_mistake_context(
                                  number_of_neighbors=4,
                                  words=words,
                                  current_word_id=i),
                              errorLength=len(word),
                              matchedText=word,
                              message="Incorrect to/too usage",
                              offset=sent_decontracted.find(word),
                              offsetInContext=None,
                              replacements=['to' if word == 'too' else 'too'],
                              ruleId='TO_TOO',
                              ruleIssueType='Hint',
                              sentence=sentence)
            sr.offsetInContext = sr.context.find(word)
            possible_errors_usages.append(sr)
        word_pos_tag = pos_tag([word])[0]
        # Adjectives (JJ, JJR, JJS): check whether another form of the
        # word fits the context better.
        if word_pos_tag[1] in ("JJ", "JJR", "JJS"):
            results = _predict_words(words, i, num_results=10)
            predicted_words = [r['word'] for r in results]
            if word not in predicted_words:
                word_forms = _get_possible_forms_of_verb(word, pos='a')
                common = word_forms.intersection(set(predicted_words))
                if common != set():
                    sr = SingleResult(
                        category="HINTS",
                        context=_get_mistake_context(number_of_neighbors=4,
                                                     words=words,
                                                     current_word_id=i),
                        errorLength=len(word),
                        matchedText=word,
                        message="Perhaps incorrect form of the word.",
                        offset=sent_decontracted.find(word),
                        offsetInContext=None,
                        replacements=list(common),
                        ruleId='INCORRECT_WORD_USAGE',
                        ruleIssueType='Hint',
                        sentence=sentence)
                    sr.offsetInContext = sr.context.find(word)
                    possible_errors_usages.append(sr)
    return possible_errors_usages
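

# _check_word_usage (with pos='a') and _check_verbs call
# _get_possible_forms_of_verb to enumerate the other inflections of a word;
# the helper is not part of this excerpt.  A sketch built on the lemminflect
# package -- an assumption about the approach, not necessarily what the
# project uses:
def _get_possible_forms_of_verb_sketch(word: str, pos: str = 'v') -> set:
    """Return all inflected forms of `word` except the word itself."""
    from lemminflect import getAllInflections, getLemma  # assumed dependency
    upos = {'v': 'VERB', 'a': 'ADJ'}.get(pos, 'VERB')
    lemmas = getLemma(word, upos=upos) or (word,)
    forms = set()
    for lemma in lemmas:
        for inflections in getAllInflections(lemma, upos=upos).values():
            forms.update(inflections)
    forms.discard(word)
    return forms

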
def _check_verbs(words, real_sentence, sent_decontracted, tagged_words):
    '''
    :param words: list of words in the sentence
    :param real_sentence: sentence written by the user
    :param sent_decontracted: sentence corrected with LT and without contractions
    :param tagged_words: [(word, universal_pos, detailed_pos), ...]
    :return: verb-form errors and verb-usage hints; both are reported with
        issue type 'Hint'
    '''
    possible_errors, usage_hints = [], []
    for i, (word, pos, _) in enumerate(tagged_words):
        if pos == 'VERB':
            # word = word.lower()
            results = _predict_words(words, i, num_results=10)
            predicted_words = [r['word'] for r in results if r['word'] != '']
            real_verb_nltk_pos_tagged = pos_tag([word])[0]
            results_nltk_pos_tagged = pos_tag(predicted_words)
            most_common_tag = _get_most_common_tag(results_nltk_pos_tagged)
            results_tags = [
                res_tagged[1] for res_tagged in results_nltk_pos_tagged
            ]
            if not _first_level_verb_check(word, predicted_words):
                if not _at_least_one_verb_in_results(results_tags) or \
                        (results[0]['softmax'] > HIGH_USAGE_THRESHOLD
                         and not _is_in_different_pos_group(
                             real_verb_nltk_pos_tagged[1], most_common_tag)):
                    err = SingleResult(category="HINTS",
                                       context=_get_mistake_context(
                                           number_of_neighbors=4,
                                           words=words,
                                           current_word_id=i),
                                       errorLength=len(word),
                                       matchedText=word,
                                       message="Perhaps incorrect verb usage.",
                                       offset=sent_decontracted.find(word),
                                       offsetInContext=None,
                                       replacements=predicted_words[:3],
                                       ruleId='INCORRECT_WORD_USAGE',
                                       ruleIssueType='Hint',
                                       sentence=real_sentence)
                    err.offsetInContext = err.context.find(word)
                    usage_hints.append(err)
                    continue
                verb_forms = _get_possible_forms_of_verb(word)
                common = verb_forms.intersection(set(predicted_words))
                verb_form_predictions = _predict_words(
                    words, i,
                    num_results=len(verb_forms),
                    options=list(verb_forms))
                predicted_verbs = [
                    res['word'] for res in verb_form_predictions
                ]
                if common != set() \
                        or (most_common_tag in VERB_TAGS
                            and _is_in_different_pos_group(
                                real_verb_nltk_pos_tagged[1], most_common_tag)) \
                        or word not in predicted_verbs[:3]:
                    sr = SingleResult(
                        category="HINTS",
                        context=_get_mistake_context(number_of_neighbors=4,
                                                     words=words,
                                                     current_word_id=i),
                        errorLength=len(word),
                        matchedText=word,
                        message="Perhaps incorrect form of the verb.",
                        offset=sent_decontracted.find(word),
                        offsetInContext=None,
                        replacements=list(common) if common != set()
                        else predicted_words[:3],
                        ruleId='INCORRECT_VERB',
                        ruleIssueType='Hint',
                        sentence=real_sentence)
                    sr.offsetInContext = sr.context.find(word)
                    possible_errors.append(sr)
    return possible_errors, usage_hints
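

# _check_verbs summarises the POS tags of the model's candidates through
# _get_most_common_tag, defined elsewhere in the module.  A minimal sketch of
# the assumed behaviour:
from collections import Counter


def _get_most_common_tag_sketch(tagged_words: List[Tuple[str, str]]) -> str:
    """Return the most frequent NLTK tag among (word, tag) pairs."""
    return Counter(tag for _, tag in tagged_words).most_common(1)[0][0]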