def test_extract_words_quoted_words(self):
    """
    Verify that extract_words drops the single quotes wrapped around a
    quoted word while keeping an apostrophe that sits inside a word.
    """
    cases = [
        (
            "String with a 'quoted' word",
            ["String", "with", "a", "quoted", "word"],
        ),
        (
            "String with a 'quoted' word and don't",
            ["String", "with", "a", "quoted", "word", "and", "don't"],
        ),
        (
            # A contraction that is itself wrapped in quotes.
            "String with 'quoted' words 'don't'",
            ["String", "with", "quoted", "words", "don't"],
        ),
    ]
    for text, expected_words in cases:
        self.assertEqual(ParseTools.extract_words(text), expected_words)
def test_extract_words_regular_words(self):
    """
    Verify that extract_words splits ordinary English text into words:
    sentence punctuation (a final period, a colon) is not part of any
    word, and a dash-joined phrase is broken into its parts.
    """
    # Trailing period is stripped from the last word.
    sentence_words = ParseTools.extract_words("This string has words.")
    self.assertEqual(sentence_words, ["This", "string", "has", "words"])

    # A colon after a word is dropped; a non-dictionary token is still a word.
    nonsense_words = ParseTools.extract_words("Nonsense word: asd")
    self.assertEqual(nonsense_words, ["Nonsense", "word", "asd"])

    # The dash acts as a separator between two words.
    dashed_words = ParseTools.extract_words("Dashed-phrase")
    self.assertEqual(dashed_words, ["Dashed", "phrase"])
def mimic(examples, punctuation_odds=.05):
    """
    Generate random strings that imitate the given example strings.

    Each example is split into words with ``ParseTools.extract_words``. The
    pooled words are handed to ``element_distribution`` and one new string
    is produced per example by drawing words from that distribution with
    ``sample``. The number of words in each generated string is drawn from
    a normal distribution fitted to the examples' word counts (mean and
    standard deviation), truncated to an integer and floored at 1.

    Args:
        examples: Iterable of strings to imitate. It is materialised into a
            list up front, so single-pass iterables such as generators are
            handled correctly.
        punctuation_odds: Chance, tested independently for each of '.',
            '?' and '!', that the mark is appended to a sampled word. More
            than one mark may therefore be appended to the same word.

    Returns:
        A list with one generated string per example, words separated by a
        single space. An empty list when ``examples`` is empty.
    """
    # The examples are walked twice (once to collect statistics, once to
    # generate), so a one-shot iterable must be captured first.
    examples = list(examples)
    if not examples:
        # Nothing to imitate; also avoids np.mean/np.std of an empty list,
        # which yields NaN together with a RuntimeWarning.
        return []

    all_words = []
    word_counts = []
    for example in examples:
        example_words = ParseTools.extract_words(example)
        all_words.extend(example_words)
        word_counts.append(len(example_words))

    word_count_mean = np.mean(word_counts)
    word_count_std = np.std(word_counts)
    word_distribution = element_distribution(all_words)

    # A tuple (not a set) keeps the order in which marks are tried fixed, so
    # output is reproducible under a seeded RNG regardless of string hash
    # randomisation.
    punctuation_marks = ('.', '?', '!')

    mimicked = []
    for _ in examples:
        # int() truncates the normal draw; the clip guarantees at least one
        # word per generated string.
        sampled_length = np.clip(
            int(np.random.normal(word_count_mean, word_count_std)),
            a_min=1,
            a_max=None)

        # NOTE(review): presumably `sample` draws `n` words weighted by the
        # 'probabilities' column - confirm against its definition.
        decorated_words = []
        for word in sample(word_distribution,
                           element_column_name='elements',
                           probability_column_name='probabilities',
                           n=sampled_length):
            for punctuation in punctuation_marks:
                if np.random.uniform() <= punctuation_odds:
                    word += punctuation
            decorated_words.append(word)

        mimicked.append(" ".join(decorated_words))
    return mimicked
def test_extract_words_twitter_words(self):
    """
    Verify that extract_words keeps twitter-specific tokens (handles,
    links and hashtags) intact as single words, without the punctuation
    that immediately surrounds them.
    """
    shared_prefix = ["This", "string", "contains", "a", "twitter"]
    cases = [
        (
            "This string contains a twitter handle: @realDonaldTrump",
            shared_prefix + ["handle", "@realDonaldTrump"],
        ),
        (
            "This string contains a twitter link: http://t.co/0DlGChTBIx",
            shared_prefix + ["link", "http://t.co/0DlGChTBIx"],
        ),
        (
            "This string contains a twitter link and handle: http://t.co/0DlGChTBIx @realDonaldTrump",
            shared_prefix + [
                "link", "and", "handle",
                "http://t.co/0DlGChTBIx", "@realDonaldTrump",
            ],
        ),
        (
            "This string contains a twitter hashtag: #MakeAmericaGreatAgain",
            shared_prefix + ["hashtag", "#MakeAmericaGreatAgain"],
        ),
        (
            # The colon directly after the handle must not be kept.
            "This string contains a twitter username followed by a colon @realDonaldTrump:",
            shared_prefix + [
                "username", "followed", "by", "a", "colon",
                "@realDonaldTrump",
            ],
        ),
        (
            # The sentence says "trailing", but the period actually sits
            # directly before the handle; it must not be kept either.
            "This string contains a twitter username with a trailing period .@realDonaldTrump",
            shared_prefix + [
                "username", "with", "a", "trailing", "period",
                "@realDonaldTrump",
            ],
        ),
    ]
    for tweet, expected_words in cases:
        self.assertEqual(ParseTools.extract_words(tweet), expected_words)
def test_extract_words_null_string(self):
    """
    Verify that extract_words returns an empty list when it is given the
    empty string.
    """
    extracted = ParseTools.extract_words("")
    self.assertEqual(extracted, [])
def test_extract_words_quoted_phrases(self):
    """
    Verify that extract_words returns each word of a multi-word phrase
    wrapped in single quotes, without the surrounding quotes.
    """
    text = "String with a 'quoted phrase'"
    expected_words = ["String", "with", "a", "quoted", "phrase"]
    self.assertEqual(ParseTools.extract_words(text), expected_words)
def test_extract_words_apostrophes(self):
    """
    Verify that extract_words keeps an apostrophe that is inside a word
    (contractions, possessives) and drops one that trails the word.
    """
    cases = (
        ("doesn't", ["doesn't"]),
        ("Trump's", ["Trump's"]),
        ("it's", ["it's"]),
        # Trailing apostrophe is not part of the word.
        ("its'", ["its"]),
    )
    for text, expected_words in cases:
        self.assertEqual(ParseTools.extract_words(text), expected_words)