def test_update_fdist(self):
    """update_fdist with an empty word list must be a no-op; updating
    with words from a fresh geolocated search must grow the distribution.

    NOTE(review): this test performs live Twitter searches via self.g
    and assumes self.sr holds results from a prior search (set up
    elsewhere in the test class) -- confirm fixture ordering.
    """
    filtered_words = utils.tokenize_and_filter(self.sr)
    fdist = utils.get_freq_dist(filtered_words)
    # take distribution and send it empty list
    fdist2 = update_fdist(fdist, [])
    self.assertEqual(fdist, fdist2)
    # wait before hitting the search API again (rate limiting)
    time.sleep(5)
    # re-point the search at New York City with a larger result count
    self.g.latitude = 40.734073
    self.g.longitude = -73.990663
    self.g.count = 100
    self.sr = self.g.search()
    filtered_words = utils.tokenize_and_filter(self.sr)
    # updating with entirely new word set -> should be longer
    old_len_fdist = len(fdist)
    fdist = update_fdist(fdist, filtered_words)
    self.assertTrue(len(fdist) > old_len_fdist)
def main(): parser = get_parser() args = parser.parse_args() if args.doc: print __doc__ sys.exit() g = geosearchclass.GeoSearchClass() if args.filename: print 'Using parameters from ' + str(args.filename) # turn parameter file into dictionary g.set_params_from_file(args.filename) else: if args.default: print 'Using default search terms' else: print 'Using parameters from params.txt' g.set_params_from_file('params.txt') g.search() # print formatted results with extra info to terminal if args.verbose: g.print_search_results() if args.output: g.write_search_results(args.output) else: g.write_search_results() if args.json: g.json_search_results(args.json) if args.visualize: import utils filtered_words = utils.tokenize_and_filter(g.search_results) utils.visualize(filtered_words)
def create_poem(words, g=None):
    """Interactively build a poem by suggesting words from `words`.

    A user can accept the suggested word ('y'), decline it ('n'), pull
    more vocabulary from geolocated tweets ('s'), type their own text,
    or finish ('f').

    words -- list of candidate words to suggest from (mutated in place
             when the user searches for more)
    g     -- geosearchclass instance; None by default, created lazily
             on first 's' search.

    Returns the accumulated poem as a single string.
    """
    formatted_poem = ''''''  # poem text accumulated so far (starts empty)
    # for no, yes and finish (print poem)
    options = ['y', 'n', 's', 'f']
    keep_adding = True
    print "And using these words: "
    print words
    print "\n\n\n"
    print """
    This robot poet will present a series of suggestions. You can
    either choose to use these suggestions by typing 'y' (for yes), or
    'n' (for no) or by typing your own input then hitting enter. You
    may also type 's' for search, to add more search terms from
    geolocated tweets to your word corpus. The words you choose or add
    will be succeessively added to a poem, which will be printed and
    saved to an output file. To add a new line, type '\n'. To finish
    writing type f (for finish).

    y: yes use this word or phrase
    n: no, skip this and give me a new phrase
    s: add more geolocated search terms
    \n: carriage return (new line)
    f: finish writing
    """
    while keep_adding:
        # suggest a random word from the corpus each pass
        chosen = random.choice(words)
        print chosen,
        response = raw_input(" [y, n, s, \\n, f or your own words] : ")
        # include the chosen word:
        if response == "y":
            # append the suggestion on its own line
            formatted_poem = formatted_poem + '''
''' + chosen
            print
            print formatted_poem
            continue
        elif response == "n":
            continue
        elif response == "s":
            print "Searching geo-located tweets to add to vocab"
            print "This can only be used once every 5 seconds"
            if g is None:
                # lazily create the search client on first use
                g = geosearchclass.GeoSearchClass()
            search_results = g.search()
            filtered_words = utils.tokenize_and_filter(search_results)
            print "\n\n\nAdding these Twitter words: "
            print filtered_words
            print "\n"
            words.extend(filtered_words)
            continue
        elif response not in options:
            # free-form input: translate a typed "\n" into a real newline.
            # NOTE: 'f' is in options, so it skips this branch and is
            # still reachable below -- the elif ordering is load-bearing.
            # if response == "\\n":
            # response = '\n'
            response = response.replace('\\n', '\n')
            formatted_poem = formatted_poem + '''
''' + response
            print
            print formatted_poem
            continue
        elif response == "f":
            print
            print formatted_poem
            keep_adding = False
    return formatted_poem
def updating_plot(geosearchclass, number_of_words, grow=True):
    """Poll the search API in a loop and keep a word-frequency plot updated.

    geosearchclass  -- a search client instance (shadows the module name;
                       only .search(), .result_type and the coordinate
                       attributes are used here)
    number_of_words -- how many top words to seed/refresh the plot with
    grow            -- when True, newly seen words are appended to the
                       y-axis and the figure is redrawn

    Runs for 100 polling iterations, 5 seconds apart.
    """
    search_results = geosearchclass.search()
    filtered_words = utils.tokenize_and_filter(search_results)
    fdist = utils.get_freq_dist(filtered_words)
    # set up plot: top words on the y-axis, their counts on the x-axis
    samples = [item for item, _ in fdist.most_common(number_of_words)]
    freqs = [fdist[sample] for sample in samples]
    plt.grid(True, color="silver")
    plt.plot(freqs, range(len(freqs)))
    plt.yticks(range(len(samples)), [s for s in samples])
    plt.ylabel("Samples")
    plt.xlabel("Counts")
    plt.title("Top Words Frequency Distribution")
    plt.ion()  # interactive mode so plt.pause() redraws without blocking
    plt.show()
    # set up loop: remember which tweet ids we have already counted
    old_ids = set([s.id for s in search_results])
    for i in xrange(100):
        plt.pause(5)  # doubles as the API rate-limit delay
        # use mixed above, change to recent here
        geosearchclass.result_type = "recent"
        # perturbation study
        # if i%2: # for testing purposes
        # # #change location every odd time to nyc
        # # geosearchclass.latitude =40.734073
        # # geosearchclass.longitude =-73.990663
        # # perturb latitude
        # geosearchclass.latitude =geosearchclass.latitude + .001
        # else:
        # #now back to sf
        # # geosearchclass.latitude = 37.7821
        # # geosearchclass.longitude = -122.4093
        # geosearchclass.longitude =geosearchclass.longitude + .001
        search_results = geosearchclass.search()
        new_search_results = utils.new_tweets(search_results, old_ids)
        if new_search_results:
            # fold only the unseen tweets into the distribution
            filtered_words = utils.tokenize_and_filter(new_search_results)
            fdist = update_fdist(fdist, filtered_words)
            if grow:
                newsamples = [
                    item for item, _ in fdist.most_common(number_of_words)
                ]
                s1 = set(newsamples)
                s2 = set(samples)
                s1.difference_update(s2)  # words not already on the axis
                if s1:
                    print "New words: " + str(list(s1))
                    newsamples = list(s1)
                    samples.extend(newsamples)
                    plt.yticks(range(len(samples)), [s for s in samples])
            freqs = [fdist[sample] for sample in samples]
            plt.plot(freqs, range(len(freqs)))
            if grow:
                plt.draw()
            print '%d new tweet(s)' % len(new_search_results)
            old_ids.update(set([s.id for s in new_search_results]))
        else:
            print "no updates"
def updating_stream_plot(q, number_of_words=30): """This plot uses the streaming API to get real time twitter information from a given region, determined by a geo-coordinate bounding box. The upper left and lower right determine the bounding box. q is a queue instance, which holds tweets number_of_words determines the average number of words in the plot. Once the plot reaches 2 x number_of_words, it is shrunk down to the new set of words and starts growing again To exit the program early, hit CTRL + Z to stop the python script and then CTRL + D twice to kill the terminal process and close the window. """ setup = False fdist = None samples = None draw_time = 0.1 samples = [] plt.ion() plt.grid(True, color="silver") for i in range(100000): status = q.get() search_results = [status] while not q.empty(): print "getting another tweet" status = q.get() search_results.append(status) if not setup: print "Gathering enough data to begin plotting" while len(samples) < 1: status = q.get() search_results.append(status) filtered_words = utils.tokenize_and_filter(search_results) if fdist is None: fdist = utils.get_freq_dist(filtered_words) else: fdist = update_fdist(fdist, filtered_words) n_words = min(10, len(fdist)) samples = [item for item, _ in fdist.most_common(n_words)] # print "len(samples) = {}".format(len(samples)) samples = remove_infrequent_words(samples, fdist) freqs = [fdist[sample] for sample in samples] plt.plot(freqs, range(len(freqs))) plt.yticks(range(len(samples)), [s for s in samples]) plt.ylabel("Samples") plt.xlabel("Counts") plt.title("Top Words Frequency Distribution") plt.show() plt.pause(draw_time) setup = True else: filtered_words = utils.tokenize_and_filter(search_results) fdist = update_fdist(fdist, filtered_words) newsamples = [ item for item, _ in fdist.most_common(number_of_words) ] newsamples = remove_infrequent_words(newsamples, fdist) s1 = set(newsamples) s2 = set(samples) s1.difference_update(s2) if s1: print "New words: " + str(list(s1)) 
newsamples = list(s1) samples.extend(newsamples) if len(samples) > 2 * number_of_words: samples = newsamples plt.close() plt.yticks(range(len(samples)), [s for s in samples]) freqs = [fdist[sample] for sample in samples] plt.plot(freqs, range(len(freqs))) plt.draw() plt.pause(draw_time) kill_plot() return
def create_poem(g=None, default_words=None, ngram=None):
    """Interactively build a poem, optionally seeded by a markov model.

    Suggests words from a corpus the user grows at runtime. A user can
    accept ('y'), decline ('n'), search twitter for more vocabulary
    ('s'), add the default word list ('d'), force a random word ('r'),
    edit the poem so far ('e'), type their own text, or finish ('f').

    g             -- geosearchclass instance; None by default, created
                     lazily on first 's' search.
    default_words -- list of words that can be enabled with 'd'.
    ngram         -- mapping of token tuples to follower-word lists; when
                     given, suggestions are drawn from the markov chain.

    Returns the accumulated poem as a single string.
    """
    words = []
    formatted_poem = ''''''  # poem text accumulated so far (starts empty)
    # for no, yes and finish (print poem)
    options = ['y', 'n', 's', 'd', 'r', 'e', 'f', '\n']
    keep_adding = True
    added_default = False
    use_phrases = False  # NOTE(review): never read; appears to be vestigial
    random_word = False  # set by 'r' to skip the ngram suggestion once
    print "\n\n\n"
    print """
    This robot poet will present a series of suggestions. You can
    either use these suggestions, edit them, or type your own input.
    You may also add more words from geolocated tweets to your word
    corpus. The words you choose or add will be succeessively added to
    a poem, which will be printed and saved to an output file. To add a
    new line, type '\\n'. To finish writing type f (for finish).

    y: yes use this word
    n: no, skip this and give me a new phrase
    s: search: add more geolocated terms from twitter
    d: default words added to corpus
    r: get random word, when running markov model
    e: edit the text
    \\n: enter line
    f: finish
    """
    if ngram:
        print "Populating seed words from markov chain ngram"
        # flatten all follower-word lists into the corpus
        values = sum(ngram.values(), [])
        words.extend(values)
    chosen = ""
    while keep_adding:
        if len(words) == 0:
            print "Nothing in corpus. Type d for default words or s to search\
 twitter"
        if ngram and formatted_poem and not random_word:
            # pick the next suggestion from the markov chain, conditioned
            # on the tail of the poem written so far
            tokens = utils.tokenize_normal_words(formatted_poem)
            num = random.random()
            potential_word = ""
            if len(tokens) > 0:
                # This is for trigrams
                if num > 0.66 and len(tokens) > 1:
                    # 50% of time get trigram
                    potential_word = tokens_to_word(tokens, ngram, 2)
                    if potential_word:
                        chosen = potential_word
                    else:
                        # fall back to a bigram, then to a random word
                        potential_word = tokens_to_word(tokens, ngram, 1)
                        if potential_word:
                            chosen = potential_word
                        else:
                            chosen = random.choice(words)
                elif num > 0.33:
                    # 30% of time get bigram
                    potential_word = tokens_to_word(tokens, ngram, 1)
                    if potential_word:
                        chosen = potential_word
                    else:
                        chosen = random.choice(words)
                else:
                    # 20% of time get random word
                    chosen = random.choice(words)
            else:
                chosen = random.choice(words)
        elif words:
            chosen = random.choice(words)
            random_word = False  # 'r' only forces one random pick
        else:
            pass  # corpus empty: keep the previous suggestion (may be "")
        if chosen:
            print chosen,
        response_string = " " + str(options) + " or your own :"
        response = raw_input(response_string)
        # include the chosen word:
        if response == "y":
            if len(words) == 0:
                # nothing was suggested; ignore the accept
                continue
            formatted_poem = formatted_poem + '''
''' + chosen
            print
            print formatted_poem
            continue
        elif response == "n":
            continue
        elif response == "r":
            random_word = True
        elif response == "s":
            print "Searching geo-located tweets to add to vocab"
            print "This can only be used once every 5 seconds"
            if g is None:
                g = geosearchclass.GeoSearchClass()
            search_results = g.search()
            phrase_response = ""
            while phrase_response not in ["y", "n"]:
                phrase_response = raw_input("\nWould you like to use phrases (\
(otherwise, just words)? [y/n]: ")
            if phrase_response == "y":
                list_of_info_dicts = write.parse_tweets(search_results)
                filtered_words = []
                if len(list_of_info_dicts) < 1:
                    # no phrases could be parsed; fall back to plain words
                    filtered_words = utils.tokenize_and_filter(
                        search_results)
                else:
                    for d in list_of_info_dicts:
                        filtered_words.append(d['phrase'])
            elif phrase_response == "n":
                filtered_words = utils.tokenize_and_filter(search_results)
            else:
                continue  # unreachable: the while above guarantees y/n
            print "\n\n\nAdding these Twitter words: "
            print filtered_words
            print "\n"
            words.extend(filtered_words)
            continue
        elif response == "d":
            if not added_default:
                print "\nadding in these words to corpus:"
                print default_words
                print "\n\n\n"
                words.extend(default_words)
                options.remove('d')  # 'd' is one-shot; drop it from the menu
                added_default = True
        elif response == "e":
            # hand the poem to the external editor and take back the result
            formatted_poem = editor.create_editor(formatted_poem)
            print formatted_poem
        elif response not in options:
            # free-form input: translate a typed "\n" into a real newline.
            # NOTE: 'f' is in options, so it skips this branch and is
            # still reachable below -- the elif ordering is load-bearing.
            response = response.replace('\\n', '\n')
            formatted_poem = formatted_poem + '''
''' + response
            print
            print formatted_poem
            continue
        elif response == "f":
            print
            print formatted_poem
            keep_adding = False
    return formatted_poem
def updating_plot(geosearchclass, number_of_words, grow=True): search_results = geosearchclass.search() filtered_words = utils.tokenize_and_filter(search_results) fdist = utils.get_freq_dist(filtered_words) # set up plot samples = [item for item, _ in fdist.most_common(number_of_words)] freqs = [fdist[sample] for sample in samples] plt.grid(True, color="silver") plt.plot(freqs, range(len(freqs))) plt.yticks(range(len(samples)), [s for s in samples]) plt.ylabel("Samples") plt.xlabel("Counts") plt.title("Top Words Frequency Distribution") plt.ion() plt.show() # set up loop old_ids = set([s.id for s in search_results]) for i in xrange(100): plt.pause(5) # use mixed above, change to recent here geosearchclass.result_type = "recent" # perturbation study # if i%2: # for testing purposes # # #change location every odd time to nyc # # geosearchclass.latitude =40.734073 # # geosearchclass.longitude =-73.990663 # # perturb latitude # geosearchclass.latitude =geosearchclass.latitude + .001 # else: # #now back to sf # # geosearchclass.latitude = 37.7821 # # geosearchclass.longitude = -122.4093 # geosearchclass.longitude =geosearchclass.longitude + .001 search_results = geosearchclass.search() new_search_results = utils.new_tweets(search_results, old_ids) if new_search_results: filtered_words = utils.tokenize_and_filter(new_search_results) fdist = update_fdist(fdist, filtered_words) if grow: newsamples = [item for item, _ in fdist.most_common(number_of_words) ] s1 = set(newsamples) s2 = set(samples) s1.difference_update(s2) if s1: print "New words: " + str(list(s1)) newsamples = list(s1) samples.extend(newsamples) plt.yticks(range(len(samples)), [s for s in samples]) freqs = [fdist[sample] for sample in samples] plt.plot(freqs, range(len(freqs))) if grow: plt.draw() print '%d new tweet(s)' % len(new_search_results) old_ids.update(set([s.id for s in new_search_results])) else: print "no updates"
def updating_stream_plot(q, number_of_words=30): """This plot uses the streaming API to get real time twitter information from a given region, determined by a geo-coordinate bounding box. The upper left and lower right determine the bounding box. q is a queue instance, which holds tweets number_of_words determines the average number of words in the plot. Once the plot reaches 2 x number_of_words, it is shrunk down to the new set of words and starts growing again To exit the program early, hit CTRL + Z to stop the python script and then CTRL + D twice to kill the terminal process and close the window. """ setup = False fdist = None samples = None draw_time = 0.1 samples = [] plt.ion() plt.grid(True, color="silver") for i in range(100000): status = q.get() search_results = [status] while not q.empty(): print "getting another tweet" status = q.get() search_results.append(status) if not setup: print "Gathering enough data to begin plotting" while len(samples) < 1: status = q.get() search_results.append(status) filtered_words = utils.tokenize_and_filter(search_results) if fdist is None: fdist = utils.get_freq_dist(filtered_words) else: fdist = update_fdist(fdist, filtered_words) n_words = min(10, len(fdist)) samples = [item for item, _ in fdist.most_common(n_words)] # print "len(samples) = {}".format(len(samples)) samples = remove_infrequent_words(samples, fdist) freqs = [fdist[sample] for sample in samples] plt.plot(freqs, range(len(freqs))) plt.yticks(range(len(samples)), [s for s in samples]) plt.ylabel("Samples") plt.xlabel("Counts") plt.title("Top Words Frequency Distribution") plt.show() plt.pause(draw_time) setup = True else: filtered_words = utils.tokenize_and_filter(search_results) fdist = update_fdist(fdist, filtered_words) newsamples = [item for item, _ in fdist.most_common(number_of_words)] newsamples = remove_infrequent_words(newsamples, fdist) s1 = set(newsamples) s2 = set(samples) s1.difference_update(s2) if s1: print "New words: " + str(list(s1)) 
newsamples = list(s1) samples.extend(newsamples) if len(samples) > 2*number_of_words: samples = newsamples plt.close() plt.yticks(range(len(samples)), [s for s in samples]) freqs = [fdist[sample] for sample in samples] plt.plot(freqs, range(len(freqs))) plt.draw() plt.pause(draw_time) kill_plot() return