def test_normalize_str_param_none_return_none():
    """Check that string_utils.normalize(None) gives back None."""
    # set up: the input is None, and None is also the expected outcome
    missing_input = None

    # execute
    outcome = string_utils.normalize(missing_input)

    # asserts (expected value first, actual value second)
    nt.eq_(None, outcome)
def test_normalize():
    """Check that string_utils.normalize('AbCD!.') gives back 'abcd'."""
    # set up
    raw_text = 'AbCD!.'
    wanted = 'abcd'

    # execute
    produced = string_utils.normalize(raw_text)

    # asserts (expected value first, actual value second)
    nt.eq_(wanted, produced)
def _k_shingles_generator(doc_generator, size=DEFAULT_SHINGLE_SIZE):
    """
    Generator that walks over documents and yields their k-shingles.

    :param doc_generator: iterable producing documents as strings
    :param size: size of k-shingles, forwarded to _get_list_of_shingles
    :return: for every truthy document, yields a 2-tuple of
             (result of _get_list_of_shingles, original document);
             for every falsy document (e.g. None or an empty string),
             yields a bare None instead of a tuple, so consumers must
             check for None before unpacking
    """
    for doc in doc_generator:
        # Nothing to shingle: emit the None placeholder so the output
        # stays aligned one-to-one with the input documents.
        if not doc:
            yield None
            continue

        # step 1: remove all white space from the string
        without_spaces = strings_utils.remove_all_whitespace(doc)

        # step 2: normalize (per the original notes: remove punctuation
        # and make everything lower case)
        normalized = strings_utils.normalize(without_spaces)

        # step 3: build the shingles and pair them with the untouched
        # original document
        shingles = _get_list_of_shingles(normalized, size)
        yield shingles, doc