def test_update_fdist(self):
    filtered_words = utils.tokenize_and_filter(self.sr)
    fdist = utils.get_freq_dist(filtered_words)
    # take the distribution and send it an empty list:
    # updating with nothing should leave it unchanged
    fdist2 = update_fdist(fdist, [])
    self.assertEqual(fdist, fdist2)
    # pause briefly before issuing a second search
    time.sleep(5)
    self.g.latitude = 40.734073
    self.g.longitude = -73.990663
    self.g.count = 100
    self.sr = self.g.search()
    filtered_words = utils.tokenize_and_filter(self.sr)
    # updating with an entirely new word set -> the distribution should grow
    old_len_fdist = len(fdist)
    fdist = update_fdist(fdist, filtered_words)
    self.assertTrue(len(fdist) > old_len_fdist)
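# A minimal sketch of the update_fdist contract the test above relies on,
# assuming a dict-like frequency distribution (e.g. nltk.FreqDist). This
# is illustrative only; the real helper is defined elsewhere in the repo,
# hence the _sketch suffix to avoid shadowing it.
def _update_fdist_sketch(fdist, filtered_words):
    # fold each new word into the existing distribution
    for word in filtered_words:
        fdist[word] = fdist.get(word, 0) + 1
    return fdist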
def updating_plot(geosearchclass, number_of_words, grow=True):
    """Draw a live word-frequency plot fed by repeated geo searches.

    geosearchclass is re-queried every 5 seconds and new tweets are
    folded into the distribution. With grow=True, newly seen words are
    appended to the y-axis as they appear.
    """
    search_results = geosearchclass.search()
    filtered_words = utils.tokenize_and_filter(search_results)
    fdist = utils.get_freq_dist(filtered_words)

    # set up plot
    samples = [item for item, _ in fdist.most_common(number_of_words)]
    freqs = [fdist[sample] for sample in samples]
    plt.grid(True, color="silver")
    plt.plot(freqs, range(len(freqs)))
    plt.yticks(range(len(samples)), samples)
    plt.ylabel("Samples")
    plt.xlabel("Counts")
    plt.title("Top Words Frequency Distribution")
    plt.ion()
    plt.show()

    # set up loop
    old_ids = set(s.id for s in search_results)
    for i in xrange(100):
        plt.pause(5)
        # use mixed above, change to recent here
        geosearchclass.result_type = "recent"
        # perturbation study
        # if i % 2:  # for testing purposes
        #     # change location every odd iteration to NYC
        #     # geosearchclass.latitude = 40.734073
        #     # geosearchclass.longitude = -73.990663
        #     # perturb latitude
        #     geosearchclass.latitude = geosearchclass.latitude + .001
        # else:
        #     # now back to SF
        #     # geosearchclass.latitude = 37.7821
        #     # geosearchclass.longitude = -122.4093
        #     geosearchclass.longitude = geosearchclass.longitude + .001
        search_results = geosearchclass.search()
        new_search_results = utils.new_tweets(search_results, old_ids)
        if new_search_results:
            filtered_words = utils.tokenize_and_filter(new_search_results)
            fdist = update_fdist(fdist, filtered_words)
            if grow:
                newsamples = [
                    item for item, _ in fdist.most_common(number_of_words)
                ]
                s1 = set(newsamples)
                s2 = set(samples)
                s1.difference_update(s2)
                if s1:
                    print "New words: " + str(list(s1))
                    newsamples = list(s1)
                    samples.extend(newsamples)
                    plt.yticks(range(len(samples)), samples)
            freqs = [fdist[sample] for sample in samples]
            plt.plot(freqs, range(len(freqs)))
            if grow:
                plt.draw()
            print '%d new tweet(s)' % len(new_search_results)
            old_ids.update(s.id for s in new_search_results)
        else:
            print "no updates"
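# utils.new_tweets is used above but defined in utils.py. Judging only by
# its call site, it filters a result list down to statuses whose ids are
# not already in old_ids. A hedged stand-in with that behavior:
def _new_tweets_sketch(search_results, old_ids):
    # keep only statuses we have not plotted yet
    return [s for s in search_results if s.id not in old_ids]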
def updating_stream_plot(q, number_of_words=30):
    """This plot uses the streaming API to get real-time Twitter
    information from a given region, determined by a geo-coordinate
    bounding box; the upper-left and lower-right corners determine the
    box.

    q is a queue instance which holds incoming tweets.

    number_of_words determines the average number of words in the plot.
    Once the plot grows past 2 x number_of_words, it is shrunk down to
    the newest set of words and starts growing again.

    To exit the program early, hit CTRL + Z to stop the python script,
    then CTRL + D twice to kill the terminal process and close the
    window.
    """
    setup = False
    fdist = None
    samples = []
    draw_time = 0.1
    plt.ion()
    plt.grid(True, color="silver")
    for i in range(100000):
        status = q.get()
        search_results = [status]
        # drain everything currently waiting in the queue
        while not q.empty():
            print "getting another tweet"
            status = q.get()
            search_results.append(status)
        if not setup:
            print "Gathering enough data to begin plotting"
            # keep pulling tweets until at least one word survives filtering
            while len(samples) < 1:
                status = q.get()
                search_results.append(status)
                filtered_words = utils.tokenize_and_filter(search_results)
                if fdist is None:
                    fdist = utils.get_freq_dist(filtered_words)
                else:
                    fdist = update_fdist(fdist, filtered_words)
                n_words = min(10, len(fdist))
                samples = [item for item, _ in fdist.most_common(n_words)]
                samples = remove_infrequent_words(samples, fdist)
            freqs = [fdist[sample] for sample in samples]
            plt.plot(freqs, range(len(freqs)))
            plt.yticks(range(len(samples)), samples)
            plt.ylabel("Samples")
            plt.xlabel("Counts")
            plt.title("Top Words Frequency Distribution")
            plt.show()
            plt.pause(draw_time)
            setup = True
        else:
            filtered_words = utils.tokenize_and_filter(search_results)
            fdist = update_fdist(fdist, filtered_words)
            newsamples = [
                item for item, _ in fdist.most_common(number_of_words)
            ]
            newsamples = remove_infrequent_words(newsamples, fdist)
            s1 = set(newsamples)
            s2 = set(samples)
            s1.difference_update(s2)
            if s1:
                print "New words: " + str(list(s1))
                newsamples = list(s1)
                samples.extend(newsamples)
                # shrink back down once the axis is twice the target size
                if len(samples) > 2 * number_of_words:
                    samples = newsamples
                    plt.close()
            plt.yticks(range(len(samples)), samples)
            freqs = [fdist[sample] for sample in samples]
            plt.plot(freqs, range(len(freqs)))
            plt.draw()
            plt.pause(draw_time)
    kill_plot()
    return
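# remove_infrequent_words and kill_plot are called above but defined
# elsewhere in this module. From their call sites, plausible minimal
# versions look like the sketches below; the real cutoff and teardown
# may differ.
def _remove_infrequent_words_sketch(samples, fdist, min_count=2):
    # min_count is a guess -- drop words seen fewer than min_count times
    return [s for s in samples if fdist[s] >= min_count]


def _kill_plot_sketch():
    # leave interactive mode and close the figure window
    plt.ioff()
    plt.close('all')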
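# A hedged example of wiring updating_stream_plot to live data. The
# QueueListener class and start_stream_plot driver are illustrative, not
# this repo's code; they assume the pre-4.0 tweepy API
# (tweepy.StreamListener, tweepy.Stream(auth, listener),
# stream.filter(locations=...)).
import Queue
import threading

import tweepy


class QueueListener(tweepy.StreamListener):
    """Push every incoming status onto the queue the plot consumes."""

    def __init__(self, q):
        super(QueueListener, self).__init__()
        self.q = q

    def on_status(self, status):
        self.q.put(status)
        return True


def start_stream_plot(auth, bounding_box, number_of_words=30):
    # bounding_box is [lon_sw, lat_sw, lon_ne, lat_ne], per stream.filter
    q = Queue.Queue()
    stream = tweepy.Stream(auth, QueueListener(q))
    # run the stream in a daemon thread so the plot owns the main thread
    t = threading.Thread(target=stream.filter,
                         kwargs={'locations': bounding_box})
    t.daemon = True
    t.start()
    updating_stream_plot(q, number_of_words)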