def test_tokenize_use_forward_slash_delimiter():
    """Tokenizing 'a/b/c/d' with delimiter='/' yields the four letters."""
    # set up
    sample = 'a/b/c/d'
    expected = ['a', 'b', 'c', 'd']
    # execute
    actual = string_utils.tokenize(sample, delimiter='/')
    # asserts
    nt.eq_(expected, actual)
def test_tokenize_use_default_delimiter_empty_space():
    """Tokenizing 'a b c d' with the default (space) delimiter yields the four letters."""
    # set up
    sample = 'a b c d'
    expected = ['a', 'b', 'c', 'd']
    # execute
    actual = string_utils.tokenize(sample)
    # asserts
    nt.eq_(expected, actual)
# NOTE(review): this function name duplicates an earlier definition in this
# file, so only the later one is collected by the test runner — confirm and
# remove one copy.
def test_tokenize_use_default_delimiter_empty_space():
    """Tokenizing 'a b c d' with the default (space) delimiter yields the four letters."""
    # set up
    text_under_test = 'a b c d'
    want = ['a', 'b', 'c', 'd']
    # execute
    got = string_utils.tokenize(text_under_test)
    # asserts
    nt.eq_(want, got)
# NOTE(review): this function name duplicates an earlier definition in this
# file, so only the later one is collected by the test runner — confirm and
# remove one copy.
def test_tokenize_use_forward_slash_delimiter():
    """Tokenizing 'a/b/c/d' with delimiter='/' yields the four letters."""
    # set up
    text_under_test = 'a/b/c/d'
    want = ['a', 'b', 'c', 'd']
    # execute
    got = string_utils.tokenize(text_under_test, delimiter='/')
    # asserts
    nt.eq_(want, got)
def _w_shingles_generator(doc_generator, size=DEFAULT_SHINGLE_SIZE):
    """Generator that yields sets of w-shingles.

    :param doc_generator: generator that returns documents as strings
    :param size: size of the w-shingles
    :return: yields (list of shingles, original document) pairs, where each
        shingle is a tuple of word tokens; yields None for a document that
        is empty/falsy (no shingles can be generated)
    """
    for document in doc_generator:
        if not document:
            # Nothing to shingle — signal the caller with None.
            yield None
            continue
        # Pipeline: tokenize -> normalize (strip punctuation, lower-case)
        # -> stem. NOTE: stemming is currently a pass-through (TODO:
        # implement a real stemming function upstream).
        raw_tokens = strings_utils.tokenize(document)
        normalized = (strings_utils.normalize(t) for t in raw_tokens)
        stemmed = tuple(strings_utils.get_stem(t) for t in normalized)
        # Build the shingle tuples and hand them back with the source doc.
        yield _get_list_of_shingles(stemmed, size), document