def analyze(text, **opts):
    """Scan *text* for readability problems and return them sorted.

    Two kinds of findings are collected:
      * proximity repeats — an uncommon word reappearing within the last
        ``proximity`` uncommon words (flagged with PROXIMITY_FLAG);
      * over-long sentences — by word count (``word_thresh`` / WTHRESH_FLAG)
        or by character count (``char_thresh`` / CTHRESH_FLAG).

    Options (all keyword-only via **opts):
      proximity   -- window size for repeat detection; 0 disables it.
      word_thresh -- flag sentences with at least this many tokens (default 17).
      char_thresh -- flag sentences whose summed token length exceeds this
                     (default 95).
    Remaining options are forwarded to ``_resources.common_words``.

    Returns a sorted list of (FLAG, line_num, detail, phrase) tuples.
    """
    proximity = opts.get('proximity', 0)
    word_limit = opts.get('word_thresh', 17)
    char_limit = opts.get('char_thresh', 95)
    common = _resources.common_words(**opts)

    findings = []
    # Sliding windows: normalized uncommon words seen recently, and the raw
    # tokens (one extra slot so the repeated word's context is included).
    recent_normals = collections.deque([], proximity)
    recent_raw = collections.deque([], proximity + 1)
    sentence = []

    for line_num, token in _split_text(text):
        normal = token.strip(_resources.PUNCTUATION).lower()

        if proximity:
            recent_raw.append(token)
            if normal not in common:
                if normal in recent_normals:
                    findings.append(
                        (PROXIMITY_FLAG, line_num, normal,
                         ' '.join(recent_raw)))
                recent_normals.append(normal)

        sentence.append(token)

        # A token containing a terminator (and no non-terminator, e.g. an
        # abbreviation dot) closes the current sentence.
        ends_sentence = (
            any(t in token for t in _resources.TERMINATORS)
            and not any(t in token for t in _resources.NON_TERMINATORS))
        if ends_sentence:
            if word_limit and len(sentence) >= word_limit:
                findings.append(
                    (WTHRESH_FLAG, line_num, len(sentence),
                     ' '.join(sentence)))
            if char_limit:
                char_count = sum(len(w) for w in sentence)
                if char_count > char_limit:
                    findings.append(
                        (CTHRESH_FLAG, line_num, char_count,
                         ' '.join(sentence)))
            # Start accumulating the next sentence.
            sentence = []

    findings.sort()
    return findings
def test_common_words(self):
    """common_words honors its mode flags: empty set when tracking all
    words, the base list by default, and base plus extension when the
    extended list is requested."""
    # Tracking every word means no word is "common".
    self.assertEqual(set(), _resources.common_words(track_all_words=True))
    # Default call returns exactly the base common-word set.
    self.assertEqual(_resources.COMMON_WORDS, _resources.common_words())
    # Extended mode returns the union of base and extension lists.
    expected = _resources.COMMON_WORDS | _resources.COMMON_WORDS_EXTENSION
    self.assertCountEqual(
        expected, _resources.common_words(extended_list=True))