コード例 #1
0
def test_tokenize_use_forward_slash_delimiter():
    # input whose tokens are separated by '/'
    slash_separated = 'a/b/c/d'

    # tokenize with the delimiter passed explicitly
    tokens = string_utils.tokenize(slash_separated, delimiter='/')

    # every '/'-separated piece is expected as its own token, in order
    nt.eq_(['a', 'b', 'c', 'd'], tokens)
コード例 #2
0
def test_tokenize_use_default_delimiter_empty_space():
    # input whose tokens are separated by single spaces
    space_separated = 'a b c d'

    # no delimiter argument: rely on tokenize's default
    tokens = string_utils.tokenize(space_separated)

    # every space-separated piece is expected as its own token, in order
    nt.eq_(['a', 'b', 'c', 'd'], tokens)
コード例 #3
0
def test_tokenize_use_default_delimiter_empty_space():
    # arrange: a space-delimited string and the tokens it should produce
    wanted = ['a', 'b', 'c', 'd']
    sample = 'a b c d'

    # act + assert: tokenize is called without a delimiter argument
    nt.eq_(wanted, string_utils.tokenize(sample))
コード例 #4
0
def test_tokenize_use_forward_slash_delimiter():
    # arrange: a '/'-delimited string and the tokens it should produce
    wanted = ['a', 'b', 'c', 'd']
    sample = 'a/b/c/d'

    # act + assert: tokenize is called with '/' as the explicit delimiter
    nt.eq_(wanted, string_utils.tokenize(sample, delimiter='/'))
コード例 #5
0
def _w_shingles_generator(doc_generator, size=DEFAULT_SHINGLE_SIZE):
    """
        Lazily turn each document into its list of w-shingles.

        :param doc_generator: iterable of documents (the original note says strings); each truthy
                document is passed to strings_utils.tokenize
        :param size: shingle size, forwarded unchanged to _get_list_of_shingles
        :return: for every truthy document yields a (shingles, document) pair, where shingles is
                whatever _get_list_of_shingles returns; for every falsy document yields None
    """
    for document in doc_generator:
        if not document:
            # nothing to shingle (e.g. empty string / None): emit the None placeholder
            yield None
            continue

        # stage 1: split the document into tokens
        raw_tokens = tuple(strings_utils.tokenize(document))

        # stage 2: normalize each token (per the original note: strips punctuation and
        # lower-cases -- TODO confirm against strings_utils.normalize)
        normalized_tokens = tuple(strings_utils.normalize(tok) for tok in raw_tokens)

        # stage 3: stem each token (the original note says get_stem is still a pass-through
        # placeholder -- TODO confirm / implement stemming)
        stemmed_tokens = tuple(strings_utils.get_stem(tok) for tok in normalized_tokens)

        # stage 4: build the shingles and hand them back together with the source document
        yield _get_list_of_shingles(stemmed_tokens, size), document