def test_remove_dots(self):
    """
    Verify that remove_dots drops a trailing "…" character while leaving
    the rest of the string (including a space before the ellipsis) intact.
    """
    cases = (
        ("String with three dot pattern…", "String with three dot pattern"),
        ("String with three dot pattern …", "String with three dot pattern "),
    )
    for raw, expected in cases:
        self.assertEqual(ParseTools.remove_dots(raw), expected)
def test_extract_sentences_regular(self):
    """
    Verify that extract_sentences splits strings whose only punctuation
    is the terminator at the end of each sentence.
    """
    # A string holding exactly one sentence should come back as a
    # single-element list containing that same sentence.
    single_sentences = (
        "Here is a declaritive sentence.",
        "Is this an interrogative sentence?",
        "This is an excalamtory sentence!",
    )
    for sentence in single_sentences:
        self.assertEqual(ParseTools.extract_sentences(sentence), [sentence])

    # Strings holding several sentences; the first is grammatically
    # correct, the second is not.
    multi_sentence_cases = (
        (
            "This string has two sentences. Both of the sentences are grammatically correct.",
            [
                "This string has two sentences.",
                "Both of the sentences are grammatically correct.",
            ],
        ),
        (
            "Sentences have this does two. Grammar not is.",
            ["Sentences have this does two.", "Grammar not is."],
        ),
    )
    for text, expected in multi_sentence_cases:
        self.assertEqual(ParseTools.extract_sentences(text), expected)
def test_extract_apostrophe_words_string_without_apostrophe_words(self):
    """
    Verify that extract_apostrophe_words returns an empty list for the
    empty string, for text without apostrophes, and for words whose only
    apostrophe is leading or trailing.
    """
    inputs = (
        "",
        "This sentence has no words with an apostrophe",
        "and'",
        "'and",
    )
    for text in inputs:
        self.assertEqual(ParseTools.extract_apostrophe_words(text), [])
def test_get_random_string_lengths(self):
    """
    Verify that get_random_string returns a string with exactly the
    requested number of characters.
    """
    for requested_length in (0, 1, 100):
        generated = ParseTools.get_random_string(requested_length)
        self.assertEqual(len(generated), requested_length)
def test_is_proper_sentence_propers(self):
    """
    Verify that is_proper_sentence accepts sentences that start with a
    capital letter and end with punctuation (".", "...", "?" or "!").
    """
    proper_sentences = (
        "Regular sentence.",
        "Regular sentence...",
        "Regular sentence?",
        "Regular sentence!",
    )
    for sentence in proper_sentences:
        self.assertTrue(ParseTools.is_proper_sentence(sentence))
def test_extract_apostrophe_words_string_with_apostrophe_words(self):
    """
    Verify that extract_apostrophe_words returns words containing an
    inner apostrophe, without any surrounding quote characters.
    """
    cases = (
        ("This sentence's got a word with an apostrophe", ["sentence's"]),
        ("doesn't", ["doesn't"]),
        ("'don't'", ["don't"]),
    )
    for text, expected in cases:
        self.assertEqual(ParseTools.extract_apostrophe_words(text), expected)
def test_replace_all_replaced(self):
    """
    Verify that replace_all substitutes the replacement for characters
    that are not among the ones to keep: emoji strings, which contain
    nothing from string.printable, must come back as " ".
    """
    replacement = " "
    for emoji in ("⚾️", "🇺🇸"):
        replaced = ParseTools.replace_all(string.printable, replacement, emoji)
        self.assertEqual(replaced, " ")
def test_replace_all_not_replaced(self):
    """
    Verify that replace_all returns the string unchanged when every one
    of its characters is among the characters to keep.
    """
    for text in (string.printable, "a"):
        result = ParseTools.replace_all(string.printable, " ", text)
        self.assertEqual(result, text)
def test_extract_words_regular_words(self):
    """
    Verify that extract_words picks out the words of ordinary English
    text, dropping punctuation and splitting on dashes.
    """
    cases = (
        ("This string has words.", ["This", "string", "has", "words"]),
        ("Nonsense word: asd", ["Nonsense", "word", "asd"]),
        ("Dashed-phrase", ["Dashed", "phrase"]),
    )
    for text, expected_words in cases:
        self.assertEqual(ParseTools.extract_words(text), expected_words)
def test_fix_ats(self):
    """
    Verify that fix_ats rewrites each twitter username in a tweet to a
    username from the known set, and leaves tweets without usernames
    (including the empty string) unchanged.
    """
    test_at_set = {'@realDonaldTrump', '@joe'}
    cases = (
        ('Tweet with @realDOOOONALDTRUMP', 'Tweet with @realDonaldTrump'),
        ('Tweet with @jump', 'Tweet with @joe'),
        ('Tweet with no ats', 'Tweet with no ats'),
        ('', ''),
    )
    for tweet, expected in cases:
        self.assertEqual(ParseTools.fix_ats(tweet, test_at_set), expected)
def test_extract_hts_strings_with_hts(self):
    """
    Verify that extract_hts returns the hashtags of a string in the
    order in which they appear.
    """
    cases = (
        ("String with a few #Hash #Tags", ["#Hash", "#Tags"]),
        ("String with a #Hashtags in weird #Places", ["#Hashtags", "#Places"]),
    )
    for text, expected_hashtags in cases:
        self.assertEqual(ParseTools.extract_hts(text), expected_hashtags)
def test_extract_words_quoted_words(self):
    """
    Verify that extract_words strips single quotes wrapped around a word
    while keeping apostrophes that sit inside a word.
    """
    cases = (
        (
            "String with a 'quoted' word",
            ["String", "with", "a", "quoted", "word"],
        ),
        (
            "String with a 'quoted' word and don't",
            ["String", "with", "a", "quoted", "word", "and", "don't"],
        ),
        (
            "String with 'quoted' words 'don't'",
            ["String", "with", "quoted", "words", "don't"],
        ),
    )
    for text, expected_words in cases:
        self.assertEqual(ParseTools.extract_words(text), expected_words)
def test_extract_ats_strings_with_ats(self):
    """
    Verify that extract_ats returns the twitter usernames of a string in
    the order in which they appear.
    """
    cases = (
        ("String with a few @twitter @names", ["@twitter", "@names"]),
        (
            "@twitter String with names @cool in wide varitey of places @names",
            ["@twitter", "@cool", "@names"],
        ),
    )
    for text, expected_names in cases:
        self.assertEqual(ParseTools.extract_ats(text), expected_names)
def test_find_nearest_string_on_ats(self):
    """
    Verify that find_nearest_string picks the closest twitter username
    out of a set of candidates.
    """
    test_candidates = {"@realDonalTrump", "@FES", "@jack"}
    cases = (
        ("@trump", "@realDonalTrump"),
        ("@j", "@jack"),
    )
    for query, expected_match in cases:
        nearest = ParseTools.find_nearest_string(query, test_candidates)
        self.assertEqual(nearest, expected_match)
def test_extract_sentences_twitter_words(self):
    """
    Verify that extract_sentences keeps a leading twitter username as
    part of the sentence it precedes.
    """
    tweets = (
        "@TrumpPeeLannin I am there and and you can see the proof?",
        "@TheBearthen Thanks!",
    )
    # Each tweet is a single sentence, so it should be returned whole.
    for tweet in tweets:
        self.assertEqual(ParseTools.extract_sentences(tweet), [tweet])
def test_remove_outer_quotes_quoted_string(self):
    """
    Verify that remove_outer_quotes strips a matching pair of double or
    single quotes that surrounds a string.
    """
    cases = (
        ('"String with double outer quotes"', 'String with double outer quotes'),
        ("'String with single outer quotes'", "String with single outer quotes"),
    )
    for quoted, expected in cases:
        self.assertEqual(ParseTools.remove_outer_quotes(quoted), expected)
def test_extract_ats_at_with_symbols(self):
    """
    Verify that extract_ats finds a twitter username that is directly
    followed by a colon or directly preceded by a period.
    """
    texts = (
        "String @realDonaldTrump: that has twitter name-colon combo",
        ".@realDonaldTrump String that has twitter name-period combo",
    )
    for text in texts:
        self.assertEqual(ParseTools.extract_ats(text), ["@realDonaldTrump"])
def test_split_join_extra_space_remove(self):
    """
    Verify that split_join removes spaces that are not separating two
    words, such as leading and trailing spaces.
    """
    cases = (
        ("string with extra space at end ", "string with extra space at end"),
        (" word ", "word"),
        ("string with extra spaces", "string with extra spaces"),
    )
    for raw, expected in cases:
        self.assertEqual(ParseTools.split_join(raw), expected)
def test_remove_outer_quotes_invalid_cases(self):
    """
    Verify that remove_outer_quotes returns the string untouched when it
    is not wrapped in a pair of quotes (no quotes, or a quote on one
    side only).
    """
    untouched = (
        'String without outer quotes',
        'String with one outer quote"',
        '"String with one outer quote',
    )
    for text in untouched:
        self.assertEqual(ParseTools.remove_outer_quotes(text), text)
def test_is_quoted_tweet_on_regular_strings(self):
    """
    Verify that is_quoted_tweet rejects strings that are merely wrapped
    in quotes and strings that start with "@name:" but have no
    surrounding quotes.
    """
    regular_strings = (
        "\"String surrounded by double quotes\"",
        "\'String surrounded single quotes\'",
        "@realDonaldTrump: Missing surrounding quotes",
        "@thequote: Missing surrounding quotes",
    )
    for text in regular_strings:
        self.assertFalse(ParseTools.is_quoted_tweet(text))
def test_extract_ats_strings_without_ats(self):
    """
    Verify that extract_ats returns an empty list when a string holds
    no twitter usernames (no "@", a name without "@", a bare "@", or an
    "@" followed only by digits).
    """
    texts = (
        "String with no twitter names",
        "String with a twitter name without the at: realDonaldTrump",
        "String with just the @ symbol",
        "String with name like phrase @000",
    )
    for text in texts:
        self.assertEqual(ParseTools.extract_ats(text), [])
def test_remove_at_prefixes_non_prefixed_strings(self):
    """
    Verify that remove_at_prefixes leaves a string unchanged when it
    does not begin with a twitter username.
    """
    unchanged = (
        "Tweet without an at prefix",
        "Tweet with an at @realDonaldTrump but no at prefix",
        "a @realDonalTrump @FES",
        "@ a",
    )
    for tweet in unchanged:
        self.assertEqual(ParseTools.remove_at_prefixes(tweet), tweet)
def test_split_join_alt_chars(self):
    """
    Verify that split_join also removes space-like characters such as
    newlines and tabs, both between words and at the end of the string.
    """
    cases = (
        ("string with carriage \n return", "string with carriage return"),
        ("string with \t tab", "string with tab"),
        ("string with carriage return\n", "string with carriage return"),
        ("string with tab\t", "string with tab"),
    )
    for raw, expected in cases:
        self.assertEqual(ParseTools.split_join(raw), expected)
def test_find_nearest_string_string_in_candidates(self):
    """
    Verify that find_nearest_string matches a string that is itself one
    of the candidates to that very candidate.
    """
    test_candidates = {"Oranges", "Tomatoes", "Grapes"}
    # Query in a fixed order rather than iterating the set itself.
    for fruit in ("Oranges", "Tomatoes", "Grapes"):
        nearest = ParseTools.find_nearest_string(fruit, test_candidates)
        self.assertEqual(nearest, fruit)
def mimic(examples, punctuation_odds=.05):
    """
    Produce one pseudo-example per input example by sampling words from
    the pooled word distribution of ``examples``.

    The length (in words) of each pseudo-example is drawn from a normal
    distribution fitted to the word counts of the inputs and clipped to
    be at least 1. After each sampled word, every punctuation mark in
    (".", "?", "!") is appended independently with probability
    ``punctuation_odds``, so a word may receive more than one mark.

    Args:
        examples: Iterable of strings to imitate.
        punctuation_odds: Chance, per word and per mark, of appending
            that punctuation mark to the word.

    Returns:
        A list of space-joined pseudo-examples, one per input example.
        An input without any examples yields an empty list.
    """
    all_words = []
    example_word_counts = []
    for example in examples:
        example_words = ParseTools.extract_words(example)
        all_words.extend(example_words)
        example_word_counts.append(len(example_words))

    # Nothing to imitate: bail out before np.mean/np.std see an empty
    # list (which would give nan and emit a RuntimeWarning).
    if not example_word_counts:
        return []

    example_word_count_mean = np.mean(example_word_counts)
    example_word_count_std = np.std(example_word_counts)
    word_distribution = element_distribution(all_words)

    # A tuple (rather than a set) fixes the order in which the marks
    # consume random draws, so results are reproducible under a seeded
    # NumPy RNG regardless of string hash randomization.
    punctuation_marks = ('.', '?', '!')

    mimiced = []
    for _ in examples:
        sampled_length = np.clip(
            int(np.random.normal(example_word_count_mean,
                                 example_word_count_std)),
            a_min=1,
            a_max=None)
        mimiced_words = []
        for word in sample(word_distribution,
                           element_column_name='elements',
                           probability_column_name='probabilities',
                           n=sampled_length):
            for punctuation in punctuation_marks:
                if np.random.uniform() <= punctuation_odds:
                    word += punctuation
            mimiced_words.append(word)
        mimiced.append(" ".join(mimiced_words))
    return mimiced
def generate(model, gen_count, temperature, weight_adjust=None):
    """
    Run ``custom_generate`` ``gen_count`` times and collect the usable
    generations.

    A generation is kept only if it ends with the stop token "<s>"
    (i.e. it finished on its own rather than being cut off) and the
    text before the stop token contains letters (is not just a series
    of symbols). The stop token is removed from kept generations.

    Args:
        model: Model handed through to ``custom_generate``.
        gen_count: Number of generation attempts to make.
        temperature: Sampling temperature handed to ``custom_generate``.
        weight_adjust: Mapping passed to ``custom_generate`` as
            ``weight_adjustments``. Defaults to a weight of 2 for each
            of ".", "?", "!" and ",".

    Returns:
        List of kept generations with the stop token removed; it may be
        shorter than ``gen_count``.
    """
    # Build the default per call so a mutation made downstream cannot
    # leak into later calls through a shared default dict.
    if weight_adjust is None:
        weight_adjust = {'.': 2, '?': 2, '!': 2, ',': 2}

    stop_token = '<s>'
    generations = []
    for _ in range(gen_count):
        generation = custom_generate(model,
                                     temperature=temperature,
                                     weight_adjustments=weight_adjust,
                                     include_stop_token=True)
        # Skip generations that never reached the stop token.
        if not generation.endswith(stop_token):
            continue
        # Clip the stop token; keep the text only if it has letters.
        text = generation[:-len(stop_token)]
        if ParseTools.contains_letters(text):
            generations.append(text)
    return generations
def test_remove_hts_strings_with_hts(self):
    """
    Verify that remove_hts strips the hashtags out of a string.
    """
    text_with_hashtags = "String with a few #Hash #Tags"
    stripped = ParseTools.remove_hts(text_with_hashtags)
    self.assertEqual(stripped, "String with a few ")
def test_remove_ats_strings_with_ats(self):
    """
    Verify that remove_ats strips the twitter usernames out of a string.
    """
    text_with_names = "String with a few @twitter @names"
    stripped = ParseTools.remove_ats(text_with_names)
    self.assertEqual(stripped, "String with a few ")
def test_remove_http_links_strings_without_http_links(self):
    """
    Verify that remove_http_links returns a string that holds no http
    links exactly as it was given.
    """
    plain_text = "String with no http links"
    result = ParseTools.remove_http_links(plain_text)
    self.assertEqual(result, "String with no http links")
def test_remove_at_prefixes_prefixed_strings(self):
    """
    Verify that remove_at_prefixes strips the twitter usernames that
    open a string, returning "" when nothing else remains.
    """
    cases = (
        (
            "@realDonalTrump a tweet with an initial at prefix",
            "a tweet with an initial at prefix",
        ),
        (
            "@realDonalTrump @FES a tweet with initial at prefixes",
            "a tweet with initial at prefixes",
        ),
        ("@realDonalTrump @FES", ""),
        (" @realDonalTrump @FES", ""),
    )
    for tweet, expected in cases:
        self.assertEqual(ParseTools.remove_at_prefixes(tweet), expected)