Ejemplo n.º 1
0
def analyze(text, **opts):
    """Scan *text* for closely repeated words and overlong sentences.

    Options are read from ``opts`` and the whole ``opts`` mapping is also
    forwarded to ``_resources.common_words``:

    * ``proximity`` (default 0): when non-zero, a word that is not in the
      common-word set and that also occurs among the previous ``proximity``
      words is reported.
    * ``word_thresh`` (default 17): when non-zero, a sentence with at least
      this many tokens is reported.
    * ``char_thresh`` (default 95): when non-zero, a sentence whose tokens
      add up to more than this many characters is reported (the separators
      between tokens are not counted).

    Words are compared after stripping ``_resources.PUNCTUATION`` from both
    ends and lower-casing.  A sentence ends at a token that contains any of
    ``_resources.TERMINATORS`` and none of ``_resources.NON_TERMINATORS``.

    Returns a sorted list of 4-tuples:

    * ``(PROXIMITY_FLAG, line_num, word, context)`` where *context* is the
      current token preceded by up to ``proximity`` earlier tokens;
    * ``(WTHRESH_FLAG, line_num, token_count, sentence)``;
    * ``(CTHRESH_FLAG, line_num, char_count, sentence)``.

    For sentence reports ``line_num`` is the one yielded together with the
    terminating token.
    """
    proximity = opts.get('proximity', 0)
    wthresh = opts.get('word_thresh', 17)
    cthresh = opts.get('char_thresh', 95)
    _common_word_set = _resources.common_words(**opts)

    problem_phrases = []
    # Sliding window of the previous `proximity` normalised words.
    last_n_simple_tokens = collections.deque([], proximity)
    # One slot longer so the context also includes the current token.
    last_n_tokens = collections.deque([], proximity+1)
    current_sentence = []

    for line_num, token in _split_text(text):
        simpletoken = token.strip(_resources.PUNCTUATION).lower()
        if proximity:
            last_n_tokens.append(token)
            # A punctuation-only token normalises to '' and must never be
            # reported as a repeated word.
            if (simpletoken
                    and simpletoken not in _common_word_set
                    and simpletoken in last_n_simple_tokens):
                problem_phrases.append((PROXIMITY_FLAG, line_num, simpletoken,
                                        ' '.join(last_n_tokens)))
            # Still recorded (even when empty) so every token keeps occupying
            # one position in the proximity window.
            last_n_simple_tokens.append(simpletoken)
        current_sentence.append(token)
        if any(t in token for t in _resources.TERMINATORS):
            if not any(t in token for t in _resources.NON_TERMINATORS):
                # End of sentence; check for problems.
                if wthresh and len(current_sentence) >= wthresh:
                    problem_phrases.append((WTHRESH_FLAG, line_num,
                                            len(current_sentence),
                                            ' '.join(current_sentence)))
                if cthresh:
                    # Character count of the tokens only, without separators.
                    lsen = sum(len(w) for w in current_sentence)
                    if lsen > cthresh:
                        problem_phrases.append((CTHRESH_FLAG, line_num, lsen,
                                                ' '.join(current_sentence)))
                # Reset current sentence
                current_sentence = []
    # NOTE(review): tokens left in current_sentence when the text ends without
    # a terminator are never checked against the thresholds -- confirm that
    # this is intended.
    problem_phrases.sort()
    return problem_phrases
Ejemplo n.º 2
0
 def test_common_words(self):
     """Check ``_resources.common_words`` for each supported option.

     * ``track_all_words=True`` must give an empty set.
     * No options must give ``_resources.COMMON_WORDS``.
     * ``extended_list=True`` must give the same elements as the union of
       ``COMMON_WORDS`` and ``COMMON_WORDS_EXTENSION``.
     """
     nothing_common = set()
     tracked_all = _resources.common_words(track_all_words=True)
     self.assertEqual(nothing_common, tracked_all)

     default_words = _resources.common_words()
     self.assertEqual(_resources.COMMON_WORDS, default_words)

     base_plus_extension = _resources.COMMON_WORDS.union(
         _resources.COMMON_WORDS_EXTENSION)
     extended_words = _resources.common_words(extended_list=True)
     self.assertCountEqual(base_plus_extension, extended_words)