Ejemplo n.º 1
0
 def test_extract_words_quoted_words(self):
     """
         Verify that extract_words drops single quotes wrapped around a word but keeps apostrophes inside a word.
     """
     cases = (
         ("String with a 'quoted' word",
          ["String", "with", "a", "quoted", "word"]),
         ("String with a 'quoted' word and don't",
          ["String", "with", "a", "quoted", "word", "and", "don't"]),
         ("String with 'quoted' words 'don't'",
          ["String", "with", "quoted", "words", "don't"]),
     )
     # Cases run in the original order; the first mismatch fails the test.
     for text, expected_words in cases:
         self.assertEqual(ParseTools.extract_words(text), expected_words)
Ejemplo n.º 2
0
 def test_extract_words_regular_words(self):
     """
         Verify that extract_words splits ordinary English text into words, discarding the period, colon and dash used in these cases.
     """
     cases = (
         ("This string has words.", ["This", "string", "has", "words"]),
         ("Nonsense word: asd", ["Nonsense", "word", "asd"]),
         ("Dashed-phrase", ["Dashed", "phrase"]),
     )
     # Cases run in the original order; the first mismatch fails the test.
     for text, expected_words in cases:
         self.assertEqual(ParseTools.extract_words(text), expected_words)
    def mimic(examples, punctuation_odds=.05):
        """Generate random texts that statistically resemble ``examples``.

        Every example is split into words with ``ParseTools.extract_words``.
        The pooled words are turned into a distribution by
        ``element_distribution`` and one synthetic text is produced per input
        example by drawing words from it with ``sample``.

        NOTE(review): ``element_distribution`` and ``sample`` are defined
        outside this block; the keyword arguments passed to ``sample`` suggest
        the distribution is a table with 'elements' and 'probabilities'
        columns -- confirm against their definitions.

        Args:
            examples: Iterable of texts to imitate. It is materialised once,
                so one-shot iterators are accepted.
            punctuation_odds: Probability, tested independently for each of
                '.', '?' and '!', that the mark is appended to a sampled word.
                A single word can therefore receive more than one mark.

        Returns:
            A list with one generated string per input example; an empty list
            when ``examples`` is empty.
        """
        # The input is walked twice (once to gather statistics, once to size
        # the output), so a generator would otherwise be exhausted by the
        # first pass and silently yield no output.
        examples = list(examples)
        if not examples:
            # Nothing to imitate; also avoids NaN statistics from np.mean([]).
            return []

        all_words = []
        example_word_counts = []
        for example in examples:
            example_words = ParseTools.extract_words(example)
            all_words.extend(example_words)
            example_word_counts.append(len(example_words))

        example_word_count_mean = np.mean(example_word_counts)
        example_word_count_std = np.std(example_word_counts)

        word_distribution = element_distribution(all_words)

        # A tuple, not a set: set iteration order for strings changes between
        # interpreter runs (hash randomisation), which made the output
        # non-reproducible even with a seeded NumPy RNG.
        punctuation_marks = ('.', '?', '!')

        mimiced = []
        for _ in examples:
            # Length follows a normal fit of the observed word counts,
            # truncated towards zero and floored at one word.
            sampled_length = max(1, int(np.random.normal(example_word_count_mean, example_word_count_std)))

            decorated_words = []
            for word in sample(word_distribution, element_column_name='elements', probability_column_name='probabilities', n=sampled_length):
                for punctuation in punctuation_marks:
                    if np.random.uniform() <= punctuation_odds:
                        word += punctuation
                decorated_words.append(word)

            mimiced.append(" ".join(decorated_words))

        return mimiced
Ejemplo n.º 4
0
 def test_extract_words_twitter_words(self):
     """
         Verify that extract_words keeps twitter tokens (usernames, links, hashtags) intact as single words.
     """
     # Every input below starts with the same five plain words.
     common = ["This", "string", "contains", "a", "twitter"]
     handle = "@realDonaldTrump"
     link = "http://t.co/0DlGChTBIx"

     cases = (
         ("This string contains a twitter handle: @realDonaldTrump",
          common + ["handle", handle]),
         ("This string contains a twitter link: http://t.co/0DlGChTBIx",
          common + ["link", link]),
         ("This string contains a twitter link and handle: http://t.co/0DlGChTBIx @realDonaldTrump",
          common + ["link", "and", "handle", link, handle]),
         ("This string contains a twitter hashtag: #MakeAmericaGreatAgain",
          common + ["hashtag", "#MakeAmericaGreatAgain"]),
         ("This string contains a twitter username followed by a colon @realDonaldTrump:",
          common + ["username", "followed", "by", "a", "colon", handle]),
         ("This string contains a twitter username with a trailing period .@realDonaldTrump",
          common + ["username", "with", "a", "trailing", "period", handle]),
     )
     # Cases run in the original order; the first mismatch fails the test.
     for text, expected_words in cases:
         self.assertEqual(ParseTools.extract_words(text), expected_words)
Ejemplo n.º 5
0
 def test_extract_words_null_string(self):
     """
         Verify that extract_words returns an empty list when given the empty string.
     """
     extracted = ParseTools.extract_words("")
     self.assertEqual(extracted, [])
Ejemplo n.º 6
0
 def test_extract_words_quoted_phrases(self):
     """
         Verify that extract_words splits a single-quoted multi-word phrase into its words without the quotes.
     """
     extracted = ParseTools.extract_words("String with a 'quoted phrase'")
     expected_words = ["String", "with", "a", "quoted", "phrase"]
     self.assertEqual(extracted, expected_words)
Ejemplo n.º 7
0
 def test_extract_words_apostrophes(self):
     """
         Verify that extract_words keeps apostrophes inside a word and drops one that trails the word.
     """
     cases = (
         ("doesn't", ["doesn't"]),
         ("Trump's", ["Trump's"]),
         ("it's", ["it's"]),
         ("its'", ["its"]),
     )
     # Cases run in the original order; the first mismatch fails the test.
     for text, expected_words in cases:
         self.assertEqual(ParseTools.extract_words(text), expected_words)