Example #1
0
    def test_options_most_common_words_count(self):
        """Check the profile's ``word_count`` for several ``top_k_words`` values.

        The same four-row sample is profiled with ``top_k_words`` set to
        ``None``, 3, 2 and 10. The test expects all five distinct words for
        ``None`` and for 10 (more than the number of distinct words), and
        only the 3 / 2 most frequent words otherwise.
        """
        sentences = [
            "this is test,",
            " this is a test sentence",
            "this is",
            "this",
        ]
        # Counts of every distinct word in ``sentences``.
        all_counts = {
            'this': 4,
            'is': 3,
            'test': 2,
            'a': 1,
            'sentence': 1
        }
        opts = TextProfilerOptions()

        def reported_word_count(data):
            """Profile ``data`` with the current ``opts``; return word counts."""
            profiler = TextProfiler("Name", options=opts)
            profiler.update(data)
            return profiler.profile["word_count"]

        # None value for number of common words
        opts.top_k_words = None
        opts.stop_words = []  # empty stop_words list for easy inspection
        data = pd.Series(sentences)
        self.assertDictEqual(all_counts, reported_word_count(data))

        # set number of common words to 3
        opts.top_k_words = 3
        opts.stop_words = []  # empty stop_words list for easy inspection
        data = pd.Series(sentences)
        self.assertDictEqual({'this': 4, 'is': 3, 'test': 2},
                             reported_word_count(data))

        # The remaining limits reuse the same options object and series:
        # a smaller limit, then one greater than the number of distinct words.
        remaining = (
            (2, {'this': 4, 'is': 3}),
            (10, all_counts),
        )
        for limit, expected in remaining:
            opts.top_k_words = limit
            self.assertDictEqual(expected, reported_word_count(data))
Example #2
0
    def test_options_stop_words(self):
        """Check ``word_count`` and ``vocab_count`` for custom ``stop_words``.

        Only ``stop_words`` is changed; other options keep their defaults.
        Two settings are exercised: a non-empty stop-word list and an empty
        one. For each, the same text is profiled once as a single row and
        once split over two rows, and the expected counts are identical for
        both layouts. The expected character counts are the same in every
        scenario.
        """
        # The same text, first as one row and then split over two rows.
        layouts = (
            ["This is test, a Test sentence.!!!"],
            ["This is test,", " a Test sentence.!!!"],
        )
        # Expected per-character counts, shared by all scenarios.
        vocab = {
            's': 5,
            ' ': 5,
            'e': 5,
            't': 4,
            '!': 3,
            'T': 2,
            'i': 2,
            'n': 2,
            'h': 1,
            ',': 1,
            'a': 1,
            'c': 1,
            '.': 1
        }
        # (stop_words, expected word_count) pairs.
        scenarios = (
            # with a list of stopwords
            (['hello', 'sentence', 'is', 'a'],
             {'This': 1, 'Test': 1, 'test': 1}),
            # with an empty list
            ([], {
                'This': 1,
                'is': 1,
                'test': 1,
                'a': 1,
                'Test': 1,
                'sentence': 1
            }),
        )

        for stop_words, words in scenarios:
            options = TextProfilerOptions()
            options.stop_words = stop_words
            for rows in layouts:
                # A fresh profiler per layout, sharing the scenario's options.
                text_profile = TextProfiler("Name", options=options)
                text_profile.update(pd.Series(rows))
                self.assertDictEqual(words, text_profile.word_count)
                self.assertDictEqual(vocab, text_profile.vocab_count)
Example #3
0
    def test_options_stop_words(self):
        """Verify word and character counts when only ``stop_words`` changes.

        The text is profiled as a single row and as two rows, first with a
        non-empty stop-word list and then with an empty one. The test expects
        the listed stop words to be absent from ``word_count`` while
        ``vocab_count`` stays the same in all four runs.
        """
        one_row = ["This is test, a Test sentence.!!!"]
        two_rows = ["This is test,", " a Test sentence.!!!"]
        # Expected per-character counts for every run below.
        char_counts = {
            "s": 5,
            " ": 5,
            "e": 5,
            "t": 4,
            "!": 3,
            "T": 2,
            "i": 2,
            "n": 2,
            "h": 1,
            ",": 1,
            "a": 1,
            "c": 1,
            ".": 1,
        }

        def assert_counts(opts, rows, word_counts):
            """Profile ``rows`` with ``opts`` and compare both count dicts."""
            profiler = TextProfiler("Name", options=opts)
            profiler.update(pd.Series(rows))
            self.assertDictEqual(word_counts, profiler.word_count)
            self.assertDictEqual(char_counts, profiler.vocab_count)

        # with a list of stopwords
        filtered = TextProfilerOptions()
        filtered.stop_words = ["hello", "sentence", "is", "a"]
        kept_words = {"This": 1, "Test": 1, "test": 1}
        assert_counts(filtered, one_row, kept_words)
        assert_counts(filtered, two_rows, kept_words)

        # with an empty list
        unfiltered = TextProfilerOptions()
        unfiltered.stop_words = []
        every_word = {
            "This": 1,
            "is": 1,
            "test": 1,
            "a": 1,
            "Test": 1,
            "sentence": 1,
        }
        assert_counts(unfiltered, one_row, every_word)
        assert_counts(unfiltered, two_rows, every_word)