Exemple #1
0
def test_normalize_str_param_none_return_none():
    # A None input is expected to come back from normalize() as None.
    given = None

    # act
    got = string_utils.normalize(given)

    # verify: expected value first, actual value second
    nt.eq_(None, got)
Exemple #2
0
def test_normalize():
    # Mixed-case input with trailing punctuation is expected to come back
    # lower-cased and without the punctuation characters.
    raw_text = 'AbCD!.'
    wanted = 'abcd'

    # act
    produced = string_utils.normalize(raw_text)

    # verify: expected value first, actual value second
    nt.eq_(wanted, produced)
def test_normalize():
    # arrange: (input, expected output) pair for normalize()
    sample, expected = 'AbCD!.', 'abcd'

    # act
    outcome = string_utils.normalize(sample)

    # assert: the normalized text must equal the expected lower-case,
    # punctuation-free string
    nt.eq_(expected, outcome)
def test_normalize_str_param_none_return_none():
    # arrange: both the argument and the expected result are None
    argument = expected = None

    # act
    outcome = string_utils.normalize(argument)

    # assert: normalize() must hand back None for a None argument
    nt.eq_(expected, outcome)
Exemple #5
0
def _k_shingles_generator(doc_generator, size=DEFAULT_SHINGLE_SIZE):
    """
        Lazily produce the k-shingles of every document in a stream.

        :param doc_generator: iterable yielding the documents to shingle (strings)
        :param size: shingle size, forwarded unchanged to _get_list_of_shingles
        :return: one item per input document: a ``(shingles, doc)`` tuple where
                 ``shingles`` is whatever _get_list_of_shingles returns for the
                 cleaned text and ``doc`` is the untouched original document;
                 a bare ``None`` (not a tuple) is yielded for any falsy document
    """

    for document in doc_generator:
        # Guard: a falsy document (e.g. None or an empty string) cannot be
        # shingled, so emit the bare None placeholder and move on.
        if not document:
            yield None
            continue

        # Clean-up happens in two passes, in this order: first all whitespace
        # is removed, then the result is normalized (punctuation removal and
        # lower-casing, per the helper's intent).
        without_whitespace = strings_utils.remove_all_whitespace(document)
        normalized_text = strings_utils.normalize(without_whitespace)

        # Pair the shingles with the original (uncleaned) document.
        yield _get_list_of_shingles(normalized_text, size), document