def _check_prepositions(words, real_sentence, sent_decontracted,
                        tagged_words) -> List[SingleResult]:
    prepositions_errors = []
    for i, (word, tag, _) in enumerate(tagged_words):
        if tag in ['ADP', 'CCONJ']:
            # Check prepositions and postpositions (ADP), as well as
            # coordinating conjunctions (CCONJ).
            results = _predict_words(words, i, num_results=5)
            results_dict = {
                result['word']: result['softmax']
                for result in results
            }
            softmax_sum = sum(results_dict.values())
            if word not in results_dict and softmax_sum > 0.8:
                logging.info(f"CANDIDATES for {word}: {results_dict}")
                sr = SingleResult(
                    category="HINTS",
                    context=_get_mistake_context(number_of_neighbors=4,
                                                 words=words,
                                                 current_word_id=i),
                    errorLength=len(word),
                    matchedText=word,
                    message="Perhaps incorrect preposition usage.",
                    offset=sent_decontracted.find(word),
                    offsetInContext=None,
                    replacements=list(results_dict.keys())[:3],
                    ruleId='INCORRECT_WORD_USAGE',
                    ruleIssueType='Hint',
                    sentence=real_sentence)
                sr.offsetInContext = sr.context.find(word)
                prepositions_errors.append(sr)

    return prepositions_errors
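

# Illustrative sketch (comments only, not executed): how `_check_prepositions`
# is typically driven. `tagged_words` follows the
# [(word, universal_pos, detailed_pos), ...] convention used in this module;
# the sentence and tags below are made up for the example.
#
#   words = word_tokenize("She is good in math")
#   tagged = [("She", "PRON", "PRP"), ("is", "AUX", "VBZ"),
#             ("good", "ADJ", "JJ"), ("in", "ADP", "IN"),
#             ("math", "NOUN", "NN")]
#   hints = _check_prepositions(words, "She is good in math",
#                               "She is good in math", tagged)
#   # If the model's top predictions at position 3 sum to softmax > 0.8 and
#   # do not include "in", one HINTS result is returned suggesting e.g. "at".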


def _check_artickes(words, sentence, sent_decontracted, tagged_words):
    doc = spacy_tokenizer(sentence)
    errs = []
    for ent in doc.ents:
        if ent.label_ == 'GPE':
            NPs = [
                np.text.lower().split() for np in doc.noun_chunks
                if (np.text.find(ent.text) > -1 or ent.text.find(np.text) > -1)
                and np.text != 'i'
            ]
            NPs_list = list(itertools.chain.from_iterable(NPs))
            chunk = ' '.join(NPs_list)
            logging.info(f"Noun-phrase chunk for '{ent.text}': {chunk}")

            GPE_subphrase = set(ent.text.lower().split()).union(set(NPs_list))
            logging.info(f"GPE set of tokens = {GPE_subphrase}")

            try:
                if chunk.find('the') > -1:
                    for country in NO_COUNTRIES:
                        diff = GPE_subphrase.difference(set(country.split()))
                        if diff in ({'the'}, {'a'}):
                            position = sentence.find(ent.text)
                            raise ArticleMistake(
                                position_token=list(diff).pop(),
                                replacement='',
                                message=
                                f"There should not be an article at position {position}",
                                position=position)
                else:
                    for country in THE_COUNTRIES:
                        if set(country.split()).difference(GPE_subphrase) == {
                                'the'
                        }:
                            position = sentence.find(ent.text)
                            raise ArticleMistake(
                                position_token=ent.text,
                                replacement='the',
                                message=
                                f"Article 'the' is needed at position {position}",
                                position=position)
            except ArticleMistake as e:
                logging.info(e.message)
                sr = SingleResult(category="HINTS",
                                  context='',
                                  errorLength=len(e.position_token),
                                  matchedText=e.position_token,
                                  message=e.message,
                                  offset=sentence.find(e.position_token),
                                  replacements=[e.replacement],
                                  ruleId="ARTICLES_BEFORE_COUNTRIES",
                                  ruleIssueType='Hint',
                                  sentence=sentence)
                sr.offsetInContext = 0
                errs.append(sr)

    return errs
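

# Illustrative sketch (comments only): the article check walks GPE entities
# and compares their noun-phrase tokens against two country lists assumed to
# be defined elsewhere in this module: NO_COUNTRIES (take no article) and
# THE_COUNTRIES (require "the").
#
#   errs = _check_artickes(words, "I live in United States",
#                          "I live in United States", tagged)
#   # "united states" differs from its THE_COUNTRIES entry only by "the",
#   # so a HINT is produced suggesting that "the" be inserted.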


def _static_rules(sentence: str) -> List[SingleResult]:
    """
    :param sentence: sentence from the user's text
    :return: list of errors found with heuristic (static) rules
    """
    errors_caught_with_static_rules = []
    words = word_tokenize(sentence)
    if "peoples" in words:
        context = _get_mistake_context(2, words, words.index("peoples"))
        errors_caught_with_static_rules.append(
            SingleResult(
                category="HINTS",
                context=context,
                errorLength=len("peoples"),
                matchedText='peoples',
                message=
                "The word 'peoples' refers to several distinct nations or ethnic groups. In other cases 'people' should be used.",
                offset=sentence.find('peoples'),
                offsetInContext=context.find("peoples"),
                replacements=['people'],
                ruleId="PEOPLE_PEOPLES",
                ruleIssueType='Hint',
                sentence=sentence))

    return errors_caught_with_static_rules
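

# Illustrative sketch (comments only): static rules are plain token lookups.
#
#   errs = _static_rules("Many peoples visited the museum")
#   # -> one HINT with ruleId "PEOPLE_PEOPLES" suggesting "people", with two
#   #    neighbouring words of context on each side of the match.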


def _create_single_error(
    word: str,
    words: List[str],
    current_word_position: int,
    sent_decontracted: str,
    sentence: str,
    category: str,
    message: str,
    replacements: List[str],
    rule_id: str,
    rule_issue_type: str,
    matched_text: Optional[str] = None,
) -> SingleResult:
    """create Single Error description object."""
    @logged(logger)
    def get_mistake_context(number_of_neighbors, words, current_word_id):
        last_index = len(words)
        left_border = 0 if (current_word_id - number_of_neighbors) < 0 else (
            current_word_id - number_of_neighbors)
        right_border = last_index if (current_word_id +
                                      number_of_neighbors) > last_index else (
                                          current_word_id +
                                          number_of_neighbors)

        return " ".join(words[left_border:right_border + 1])

    context = get_mistake_context(number_of_neighbors=2,
                                  words=words,
                                  current_word_id=current_word_position)
    error_length = len(word)
    matched_text = matched_text or word
    err_sr = SingleResult(category=category,
                          context=context,
                          errorLength=error_length,
                          matchedText=matched_text,
                          message=message,
                          offset=sent_decontracted.find(matched_text),
                          offsetInContext=None,
                          replacements=replacements,
                          ruleId=rule_id,
                          ruleIssueType=rule_issue_type,
                          sentence=sentence)
    err_sr.offsetInContext = err_sr.context.find(matched_text)

    return err_sr
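

# Illustrative sketch (comments only): this helper centralises SingleResult
# construction; the arguments below are made up.
#
#   err = _create_single_error(
#       word="an", words=words, current_word_position=3,
#       sent_decontracted=sent, sentence=sent, category="HINTS",
#       message="Perhaps incorrect article.", replacements=["a"],
#       rule_id="ARTICLES", rule_issue_type="Hint")
#   # err.context holds the word plus two neighbours on each side, and
#   # err.offsetInContext points at the word inside that context.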


def _check_auxiliary_verbs(
        words: List[str], sentence: str, sent_decontracted: str,
        tagged_words: List[Tuple[str, str, str]]
) -> Tuple[List[SingleResult], Optional[str]]:
    '''
    :param words: list of words in the sentence
    :param sentence: real sentence as written by the user, not corrected in any way
    :param sent_decontracted: sentence checked and corrected with LT, with contractions expanded
    :param tagged_words: [(word, universal_pos, detailed_pos), ...]
    :return: list of found mistakes and a fixed sentence.
     The fixed sentence is returned only when there is a high probability that the predicted verb belongs to another tense group.
    '''
    possible_errors = []
    fixed_sentence: Optional[str] = None
    for i, (word, pos, _) in enumerate(tagged_words):
        if pos == 'AUX':
            word = word.lower()
            err, to_be_corrected = False, False

            results = _predict_words(words,
                                     i,
                                     options=AUX_FULL_FORMS,
                                     num_results=len(AUX_FULL_FORMS))

            first, second, third, fourth = results[:4]
            logging.info(
                f"NEW: {word} -- {pos} -- {[first, second, third, fourth]}")
            if first['softmax'] > 0.79 and not _is_in_same_contracted_group(
                    word, first['word']):
                err, to_be_corrected = True, True
            elif first['softmax'] < 0.1 and not _is_in_same_contracted_group(word, first['word']) and \
                    not _is_in_same_contracted_group(word, second['word']) and \
                    not _is_in_same_contracted_group(word, third['word']) and \
                    not _is_in_same_contracted_group(word, fourth['word']):
                err = True
            elif not _is_in_same_contracted_group(word, first['word']) and \
                    not _is_in_same_contracted_group(word, second['word']) and \
                    not _is_in_same_contracted_group(word, third['word']):
                err = True

            replacements = [first['word'], second['word']]

            if err:
                if to_be_corrected:
                    fixed_sentence = _replace_token_in_sentence(
                        words, i, replacements[0])
                err_sr = SingleResult(
                    category="HINTS",
                    context=_get_mistake_context(number_of_neighbors=2,
                                                 words=words,
                                                 current_word_id=i),
                    errorLength=len(word),
                    matchedText=word,
                    message="Perhaps the wrong form of the auxiliary verb",
                    offset=sent_decontracted.find(word),
                    offsetInContext=None,
                    replacements=replacements,
                    ruleId='INCORRECT_AUXILIARY_VERB',
                    ruleIssueType='Hint',
                    sentence=sentence)
                err_sr.offsetInContext = err_sr.context.find(word)
                possible_errors.append(err_sr)

    return possible_errors, fixed_sentence
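

# Illustrative sketch (comments only): AUX tokens are re-scored against the
# AUX_FULL_FORMS candidate set defined elsewhere in this module.
#
#   errs, fixed = _check_auxiliary_verbs(
#       word_tokenize("He have been there"), "He have been there",
#       "He have been there", tagged)
#   # If "has" wins with softmax > 0.79 and is not in the same contracted
#   # group as "have", a hint is emitted and the auto-corrected sentence is
#   # returned as `fixed`.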


def _check_word_usage(words, sentence, sent_decontracted, tagged_words):
    possible_errors_usages = []
    for i, (word, tag, _) in enumerate(tagged_words):

        # Check for single-letter words that are likely typos
        if len(word) == 1 and word.isalpha() and word not in ONE_LETTER_WORDS and \
                not (word == 'e' and i + 1 < len(words) and words[i + 1] == '-'):
            sr = SingleResult(category="HINTS",
                              context=_get_mistake_context(
                                  number_of_neighbors=4,
                                  words=words,
                                  current_word_id=i),
                              errorLength=len(word),
                              matchedText=word,
                              message="Perhaps one letter is the typo.",
                              offset=sent_decontracted.find(word),
                              offsetInContext=None,
                              replacements=[],
                              ruleId='ONE_LETTER_WORD',
                              ruleIssueType='Hint',
                              sentence=sentence)
            sr.offsetInContext = sr.context.find(word)
            possible_errors_usages.append(sr)
        word = word.lower()
        to_too_correct = _check_to_too_is_correct(
            word, i, words) if word in ['to', 'too'] else True
        if not to_too_correct:
            sr = SingleResult(category="HINTS",
                              context=_get_mistake_context(
                                  number_of_neighbors=4,
                                  words=words,
                                  current_word_id=i),
                              errorLength=len(word),
                              matchedText=word,
                              message="Incorrect to/too usage",
                              offset=sent_decontracted.find(word),
                              offsetInContext=None,
                              replacements=['to' if word == 'too' else 'too'],
                              ruleId='TO_TOO',
                              ruleIssueType='Hint',
                              sentence=sentence)
            sr.offsetInContext = sr.context.find(word)
            possible_errors_usages.append(sr)

        word_pos_tag = pos_tag([word])[0]

        # adverbs and adjectives
        if word_pos_tag[1] in ("JJ", "JJR", "JJS"):
            results = _predict_words(words, i, num_results=10)
            predicted_words = [r['word'] for r in results]

            if word not in predicted_words:
                word_forms = _get_possible_forms_of_verb(word, pos='a')
                common = word_forms.intersection(set(predicted_words))

                if common:
                    sr = SingleResult(
                        category="HINTS",
                        context=_get_mistake_context(number_of_neighbors=4,
                                                     words=words,
                                                     current_word_id=i),
                        errorLength=len(word),
                        matchedText=word,
                        message="Perhaps incorrect form of the word.",
                        offset=sent_decontracted.find(word),
                        offsetInContext=None,
                        replacements=list(common),
                        ruleId='INCORRECT_WORD_USAGE',
                        ruleIssueType='Hint',
                        sentence=sentence)
                    sr.offsetInContext = sr.context.find(word)
                    possible_errors_usages.append(sr)

    return possible_errors_usages
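

# Illustrative sketch (comments only): word-usage checks mix static tests
# (single-letter tokens, to/too confusion) with masked-LM re-scoring of
# adjectives.
#
#   hints = _check_word_usage(word_tokenize("It is to late"),
#                             "It is to late", "It is to late", tagged)
#   # -> a TO_TOO hint replacing "to" with "too".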


def _check_verbs(words, real_sentence, sent_decontracted, tagged_words):
    '''
    :param words: list of words in the sentence
    :param real_sentence: sentence as written by the user
    :param sent_decontracted: sentence corrected with LT and with contractions expanded
    :param tagged_words: [(word, universal_pos, detailed_pos), ...]
    :return: possible verb-form errors and verb-usage hints (both reported with issue type 'Hint')
    '''
    possible_errors, usage_hints = [], []
    for i, (word, pos, _) in enumerate(tagged_words):
        if pos == 'VERB':
            results = _predict_words(words, i, num_results=10)

            predicted_words = [r['word'] for r in results if r['word'] != '']
            real_verb_nltk_pos_tagged = pos_tag([word])[0]
            results_nltk_pos_tagged = pos_tag(predicted_words)
            most_common_tag = _get_most_common_tag(results_nltk_pos_tagged)
            results_tags = [
                res_tagged[1] for res_tagged in results_nltk_pos_tagged
            ]
            if not _first_level_verb_check(word, predicted_words):

                if not _at_least_one_verb_in_results(results_tags) or \
                        (results[0]['softmax'] > HIGH_USAGE_THRESHOLD
                         and not _is_in_different_pos_group(
                             real_verb_nltk_pos_tagged[1], most_common_tag)):
                    err = SingleResult(category="HINTS",
                                       context=_get_mistake_context(
                                           number_of_neighbors=4,
                                           words=words,
                                           current_word_id=i),
                                       errorLength=len(word),
                                       matchedText=word,
                                       message="Perhaps incorrect verb usage.",
                                       offset=sent_decontracted.find(word),
                                       offsetInContext=None,
                                       replacements=predicted_words[:3],
                                       ruleId='INCORRECT_WORD_USAGE',
                                       ruleIssueType='Hint',
                                       sentence=real_sentence)
                    err.offsetInContext = err.context.find(word)
                    usage_hints.append(err)
                    continue

                verb_forms = _get_possible_forms_of_verb(word)
                common = verb_forms.intersection(set(predicted_words))
                verb_form_predictions = _predict_words(
                    words,
                    i,
                    num_results=len(verb_forms),
                    options=list(verb_forms))
                predicted_verbs = [
                    res['word'] for res in verb_form_predictions
                ]

                if common \
                        or (most_common_tag in VERB_TAGS and _is_in_different_pos_group(
                            real_verb_nltk_pos_tagged[1], most_common_tag)) \
                        or word not in predicted_verbs[:3]:
                    sr = SingleResult(
                        category="HINTS",
                        context=_get_mistake_context(number_of_neighbors=4,
                                                     words=words,
                                                     current_word_id=i),
                        errorLength=len(word),
                        matchedText=word,
                        message="Perhaps incorrect form of the verb.",
                        offset=sent_decontracted.find(word),
                        offsetInContext=None,
                        replacements=list(common)
                        if common else predicted_words[:3],
                        ruleId='INCORRECT_VERB',
                        ruleIssueType='Hint',
                        sentence=real_sentence)
                    sr.offsetInContext = sr.context.find(word)
                    possible_errors.append(sr)

    return possible_errors, usage_hints
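

# Illustrative sketch (comments only): verbs pass through two stages, a usage
# check (is a verb expected here at all?) and a form check (is it the right
# inflection of the expected verb?).
#
#   errors, hints = _check_verbs(word_tokenize("He go to school"),
#                                "He go to school", "He go to school", tagged)
#   # "goes" is among _get_possible_forms_of_verb("go"), so its overlap with
#   # the model's predictions yields an INCORRECT_VERB hint with replacements
#   # such as ["goes"].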