Ejemplo n.º 1
0
 def setUp(self):
     """Create a new StringIndexer and store it on the test instance as ``string_indexer``."""
     indexer = StringIndexer()
     self.string_indexer = indexer
Ejemplo n.º 2
0
 def setUp(self):
     """Instantiate StringIndexer and keep it on ``self.string_indexer``."""
     fresh_indexer = StringIndexer()
     self.string_indexer = fresh_indexer
Ejemplo n.º 3
0
class StringIndexerTest(unittest.TestCase):
    """Tests for StringIndexer string cleaning and substring generation.

    Every test follows a Given / When / Then layout and runs against the
    StringIndexer instance that ``setUp`` assigns to ``self.string_indexer``.
    """

    # Raw sample strings with mixed case and punctuation. None of the tests
    # in this class reference them.
    test_strings = ['<1921___.bg three cats!Left__home(early)-In.Two.CARS', 'One man left Home early!!', 'two Woman left homE Late?', 'one Car_turned Left at Our HOME']
    # Class-level default for the attribute that setUp() assigns.
    # (Previously misspelled ``string_indxer``, so it never matched.)
    string_indexer = None

    def setUp(self):
        """Create the StringIndexer used by the test about to run."""
        self.string_indexer = StringIndexer()

    def test_should_split_simple_string_into_one_indexable_word(self):
        """strip_and_lower('sea.') is expected to return 'sea'."""

        # Given
        test_string = 'sea.'

        # When
        result = self.string_indexer.strip_and_lower(test_string)

        # Then
        expected_result = 'sea'
        self.assertEqual(expected_result, result)

    def test_should_split_complex_string_into_indexable_words(self):
        """strip_and_lower is expected to yield lower-case, space-separated words."""

        # Given
        test_string = u'<1921___.bg three cäts!Left__hôme(early)-In.Two.CARS really?'

        # When
        result = self.string_indexer.strip_and_lower(test_string)

        # Then
        # The expected value spells the accented letters as their UTF-8 byte
        # sequences (\xc3\xa4, \xc3\xb4).
        # NOTE(review): presumably strip_and_lower returns a UTF-8 encoded
        # byte string under Python 2 -- confirm against StringIndexer.
        expected_result = '1921 bg three c\xc3\xa4ts left h\xc3\xb4me early in two cars really'
        self.assertEqual(expected_result, result)

    def test_should_return_indexable_substrings_given_depth_1(self):
        """With depth 1 only the individual words are expected."""

        # Given
        test_string = 'hello indexed words'

        # When
        result = self.string_indexer._build_substrings(test_string, 1)

        # Then
        expected_result = {'hello', 'indexed', 'words'}
        self.assertEqual(expected_result, result)

    def test_should_return_indexable_substrings_given_depth_2(self):
        """With depth 2 single words and adjacent word pairs are expected."""

        # Given
        test_string = 'hello indexed words'

        # When
        result = self.string_indexer._build_substrings(test_string, 2)

        # Then
        expected_result = {'hello', 'indexed', 'words', 'hello indexed', 'indexed words'}
        self.assertEqual(expected_result, result)

    def test_should_return_indexable_substrings_for_3_word_string_given_depth_5(self):
        """A depth larger than the word count is expected to stop at the full string."""

        # Given
        test_string = 'hello indexed words'

        # When
        result = self.string_indexer._build_substrings(test_string, 5)

        # Then
        expected_result = {'hello', 'indexed', 'words', 'hello indexed', 'indexed words', 'hello indexed words'}
        self.assertEqual(expected_result, result)

    def test_should_return_indexable_substrings_for_5_word_string_given_depth_3(self):
        """With depth 3 every run of one to three adjacent words is expected."""

        # Given
        test_string = 'hello indexed words of yore'

        # When
        result = self.string_indexer._build_substrings(test_string, 3)

        # Then
        expected_result = {'hello', 'indexed', 'words', 'of', 'yore', 'hello indexed', 'indexed words', 'words of', 'of yore', 'hello indexed words', 'indexed words of', 'words of yore'}
        self.assertEqual(expected_result, result)

    def test_should_only_print_nr_of_words_in_result_to_show_how_many_indexes_are_needed_for_a_large_string_given_a_depth(self):
        """Print how many substrings a long text produces at depth 2.

        Nothing is asserted; the test only fails if one of the calls raises.
        """
        # Given
        test_string = "Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum."
        cleaned_test_string = self.string_indexer.strip_and_lower(test_string)

        # When
        depth = 2
        result = self.string_indexer._build_substrings(cleaned_test_string, depth)

        # Then
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(u'A %s word scentence yielded %s indexes given depth %s' % (len(cleaned_test_string.split()), len(result), depth))

    def test_should_return_build_actual_indexes_from_string_dto(self):
        """Build indexes from a TimestampedDataDTO and print each one.

        NOTE(review): nothing is asserted here; the test only fails if the
        call or the iteration over its result raises.
        """

        # Given
        test_string = 'hello indexed words of yore'
        test_row_key = 'magic_key_123'
        dto = TimestampedDataDTO('the_kids', datetime.utcnow(), 'log_text', test_string)

        # When
        result = self.string_indexer.build_indexes_from_timstamped_dto(dto, test_row_key)

        # Then
        for index_dto in result:
            print(u'Index: %s' % index_dto)
Ejemplo n.º 4
0
class StringIndexerTest(unittest.TestCase):
    """Tests for StringIndexer string cleaning and substring generation.

    Every test follows a Given / When / Then layout and runs against the
    StringIndexer instance that ``setUp`` assigns to ``self.string_indexer``.
    """

    # Raw sample strings with mixed case and punctuation. None of the tests
    # in this class reference them.
    test_strings = [
        '<1921___.bg three cats!Left__home(early)-In.Two.CARS',
        'One man left Home early!!', 'two Woman left homE Late?',
        'one Car_turned Left at Our HOME'
    ]
    # Class-level default for the attribute that setUp() assigns.
    # (Previously misspelled ``string_indxer``, so it never matched.)
    string_indexer = None

    def setUp(self):
        """Create the StringIndexer used by the test about to run."""
        self.string_indexer = StringIndexer()

    def test_should_split_simple_string_into_one_indexable_word(self):
        """strip_and_lower('sea.') is expected to return 'sea'."""

        # Given
        test_string = 'sea.'

        # When
        result = self.string_indexer.strip_and_lower(test_string)

        # Then
        expected_result = 'sea'
        self.assertEqual(expected_result, result)

    def test_should_split_complex_string_into_indexable_words(self):
        """strip_and_lower is expected to yield lower-case, space-separated words."""

        # Given
        test_string = u'<1921___.bg three cäts!Left__hôme(early)-In.Two.CARS really?'

        # When
        result = self.string_indexer.strip_and_lower(test_string)

        # Then
        # The expected value spells the accented letters as their UTF-8 byte
        # sequences (\xc3\xa4, \xc3\xb4).
        # NOTE(review): presumably strip_and_lower returns a UTF-8 encoded
        # byte string under Python 2 -- confirm against StringIndexer.
        expected_result = '1921 bg three c\xc3\xa4ts left h\xc3\xb4me early in two cars really'
        self.assertEqual(expected_result, result)

    def test_should_return_indexable_substrings_given_depth_1(self):
        """With depth 1 only the individual words are expected."""

        # Given
        test_string = 'hello indexed words'

        # When
        result = self.string_indexer._build_substrings(test_string, 1)

        # Then
        expected_result = {'hello', 'indexed', 'words'}
        self.assertEqual(expected_result, result)

    def test_should_return_indexable_substrings_given_depth_2(self):
        """With depth 2 single words and adjacent word pairs are expected."""

        # Given
        test_string = 'hello indexed words'

        # When
        result = self.string_indexer._build_substrings(test_string, 2)

        # Then
        expected_result = {
            'hello', 'indexed', 'words', 'hello indexed', 'indexed words'
        }
        self.assertEqual(expected_result, result)

    def test_should_return_indexable_substrings_for_3_word_string_given_depth_5(
            self):
        """A depth larger than the word count is expected to stop at the full string."""

        # Given
        test_string = 'hello indexed words'

        # When
        result = self.string_indexer._build_substrings(test_string, 5)

        # Then
        expected_result = {
            'hello', 'indexed', 'words', 'hello indexed', 'indexed words',
            'hello indexed words'
        }
        self.assertEqual(expected_result, result)

    def test_should_return_indexable_substrings_for_5_word_string_given_depth_3(
            self):
        """With depth 3 every run of one to three adjacent words is expected."""

        # Given
        test_string = 'hello indexed words of yore'

        # When
        result = self.string_indexer._build_substrings(test_string, 3)

        # Then
        expected_result = {
            'hello', 'indexed', 'words', 'of', 'yore', 'hello indexed',
            'indexed words', 'words of', 'of yore', 'hello indexed words',
            'indexed words of', 'words of yore'
        }
        self.assertEqual(expected_result, result)

    def test_should_only_print_nr_of_words_in_result_to_show_how_many_indexes_are_needed_for_a_large_string_given_a_depth(
            self):
        """Print how many substrings a long text produces at depth 2.

        Nothing is asserted; the test only fails if one of the calls raises.
        """
        # Given
        test_string = "Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum."
        cleaned_test_string = self.string_indexer.strip_and_lower(test_string)

        # When
        depth = 2
        result = self.string_indexer._build_substrings(cleaned_test_string,
                                                       depth)

        # Then
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(u'A %s word scentence yielded %s indexes given depth %s' % (len(
            cleaned_test_string.split()), len(result), depth))

    def test_should_return_build_actual_indexes_from_string_dto(self):
        """Build indexes from a TimestampedDataDTO and print each one.

        NOTE(review): nothing is asserted here; the test only fails if the
        call or the iteration over its result raises.
        """

        # Given
        test_string = 'hello indexed words of yore'
        test_row_key = 'magic_key_123'
        dto = TimestampedDataDTO('the_kids', datetime.utcnow(), 'log_text',
                                 test_string)

        # When
        result = self.string_indexer.build_indexes_from_timstamped_dto(
            dto, test_row_key)

        # Then
        for index_dto in result:
            print(u'Index: %s' % index_dto)