Example #1
    def test_update_fdist(self):
        filtered_words = utils.tokenize_and_filter(self.sr)
        fdist = utils.get_freq_dist(filtered_words)
        # updating with an empty list should leave the distribution unchanged
        fdist2 = update_fdist(fdist, [])
        self.assertEqual(fdist, fdist2)

        time.sleep(5)
        self.g.latitude = 40.734073
        self.g.longitude = -73.990663
        self.g.count = 100
        self.sr = self.g.search()
        filtered_words = utils.tokenize_and_filter(self.sr)
        # updating with entirely new word set -> should be longer
        old_len_fdist = len(fdist)
        fdist = update_fdist(fdist, filtered_words)
        self.assertTrue(len(fdist) > old_len_fdist)
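
For context, update_fdist itself is not part of this example. The following is a hypothetical sketch consistent with the test above, assuming fdist behaves like collections.Counter (which nltk.FreqDist subclasses in NLTK 3):

def update_fdist(fdist, filtered_words):
    # Hypothetical sketch, not the project's actual implementation.
    # Adds one count per occurrence; an empty list is a no-op, which is
    # why the assertEqual above holds.
    fdist.update(filtered_words)
    return fdist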
Example #2
def main():

    parser = get_parser()
    args = parser.parse_args()

    if args.doc:
        print __doc__
        sys.exit()

    g = geosearchclass.GeoSearchClass()

    if args.filename:
        print 'Using parameters from ' + str(args.filename)
        # turn parameter file into dictionary
        g.set_params_from_file(args.filename)
    elif args.default:
        print 'Using default search terms'
    else:
        print 'Using parameters from params.txt'
        g.set_params_from_file('params.txt')

    g.search()
    # print formatted results with extra info to terminal
    if args.verbose:
        g.print_search_results()

    if args.output:
        g.write_search_results(args.output)
    else:
        g.write_search_results()

    if args.json:
        g.json_search_results(args.json)

    if args.visualize:
        import utils
        filtered_words = utils.tokenize_and_filter(g.search_results)
        utils.visualize(filtered_words)
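
get_parser is not included in this example. Below is a hypothetical sketch consistent with the attributes main() reads; the project's actual flag spellings and help strings may differ:

import argparse

def get_parser():
    # Hypothetical parser; only the attribute names are taken from main().
    parser = argparse.ArgumentParser(description='geolocated Twitter search')
    parser.add_argument('--doc', action='store_true',
                        help='print module documentation and exit')
    parser.add_argument('-f', '--filename', help='parameter file to use')
    parser.add_argument('--default', action='store_true',
                        help='use default search terms')
    parser.add_argument('-v', '--verbose', action='store_true',
                        help='print formatted results to the terminal')
    parser.add_argument('-o', '--output', help='output file for results')
    parser.add_argument('--json', help='file for JSON search results')
    parser.add_argument('--visualize', action='store_true',
                        help='plot a word frequency distribution')
    return parser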
Example #3
def create_poem(words, g=None):
    """ This creates a poem with user input by suggesting from the words supplied.

    A user can use the word, decline the word, or add their own input.
    g is for geosearchclass. It is none by default.
    """

    formatted_poem = ''
    # valid single-letter responses: yes, no, search, finish
    options = ['y', 'n', 's', 'f']
    keep_adding = True
    print "And using these words: "
    print words
    print "\n\n\n"
    print """

        This robot poet will present a series of suggestions. You can
        either choose to use these suggestions by typing 'y' (for
        yes), or 'n' (for no) or by typing your own input then hitting
        enter. You may also type 's' for search, to add more search
        terms from geolocated tweets to your word corpus. The words
        you choose or add will be successively added to a poem, which
        will be printed and saved to an output file. To add a new
        line, type '\\n'. To finish writing, type f (for finish).

        y: yes use this word or phrase
        n: no, skip this and give me a new phrase
        s: add more geolocated search terms
        \\n: carriage return (new line)
        f: finish writing
    """
    while keep_adding:
        chosen = random.choice(words)
        print chosen,
        response = raw_input("      [y, n, s, \\n, f or your own words] :  ")
        # include the chosen word:
        if response == "y":
            formatted_poem = formatted_poem + ' ' + chosen
            print
            print formatted_poem
            continue
        elif response == "n":
            continue
        elif response == "s":
            print "Searching geo-located tweets to add to vocab"
            print "This can only be used once every 5 seconds"
            if g is None:
                g = geosearchclass.GeoSearchClass()
            search_results = g.search()
            filtered_words = utils.tokenize_and_filter(search_results)
            print "\n\n\nAdding these Twitter words: "
            print filtered_words
            print "\n"
            words.extend(filtered_words)
            continue
        elif response not in options:
            # if response == "\\n":
            #     response = '\n'
            response = response.replace('\\n', '\n')
            formatted_poem = formatted_poem + ' ' + response
            print
            print formatted_poem
            continue
        elif response == "f":
            print
            print formatted_poem
            keep_adding = False
    return formatted_poem
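
A plausible call site for this function, assuming the module-level geosearchclass and utils imports the snippet already relies on:

# Hypothetical usage: seed the poem with words from a geolocated search.
g = geosearchclass.GeoSearchClass()
search_results = g.search()
seed_words = utils.tokenize_and_filter(search_results)
poem = create_poem(seed_words, g)
print poem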
Example #4
def updating_plot(geosearchclass, number_of_words, grow=True):
    search_results = geosearchclass.search()
    filtered_words = utils.tokenize_and_filter(search_results)
    fdist = utils.get_freq_dist(filtered_words)
    # set up plot
    samples = [item for item, _ in fdist.most_common(number_of_words)]
    freqs = [fdist[sample] for sample in samples]
    plt.grid(True, color="silver")
    plt.plot(freqs, range(len(freqs)))
    plt.yticks(range(len(samples)), samples)
    plt.ylabel("Samples")
    plt.xlabel("Counts")
    plt.title("Top Words Frequency Distribution")
    plt.ion()
    plt.show()

    # set up loop
    old_ids = set([s.id for s in search_results])
    for i in xrange(100):
        plt.pause(5)
        # use mixed above, change to recent here
        geosearchclass.result_type = "recent"
        # perturbation study
        # if i%2:  # for testing purposes
        #     # #change location every odd time to nyc
        #     # geosearchclass.latitude =40.734073
        #     # geosearchclass.longitude =-73.990663
        #     # perturb latitude
        #     geosearchclass.latitude =geosearchclass.latitude + .001

        # else:
        #     #now back to sf
        #     # geosearchclass.latitude = 37.7821
        #     # geosearchclass.longitude =  -122.4093
        #     geosearchclass.longitude =geosearchclass.longitude + .001

        search_results = geosearchclass.search()
        new_search_results = utils.new_tweets(search_results, old_ids)
        if new_search_results:
            filtered_words = utils.tokenize_and_filter(new_search_results)
            fdist = update_fdist(fdist, filtered_words)
            if grow:
                newsamples = [
                    item for item, _ in fdist.most_common(number_of_words)
                ]
                s1 = set(newsamples)
                s2 = set(samples)
                s1.difference_update(s2)
                if s1:
                    print "New words: " + str(list(s1))
                    newsamples = list(s1)
                    samples.extend(newsamples)
                    plt.yticks(range(len(samples)), samples)
            freqs = [fdist[sample] for sample in samples]
            plt.plot(freqs, range(len(freqs)))
            if grow:
                plt.draw()
            print '%d new tweet(s)' % len(new_search_results)
            old_ids.update(set([s.id for s in new_search_results]))
        else:
            print "no updates"
Example #5
def updating_stream_plot(q, number_of_words=30):
    """This plot uses the streaming API to get real time twitter
    information from a given region, determined by a geo-coordinate
    bounding box. The upper left and lower right determine the
    bounding box.

    q is a queue instance, which holds tweets

    number_of_words determines the average number of words in the
    plot. Once the plot reaches 2 x number_of_words, it is shrunk down
    to the new set of words and starts growing again.

    To exit the program early, hit CTRL + Z to stop the python script
    and then CTRL + D twice to kill the terminal process and close the
    window.

    """
    setup = False
    fdist = None
    samples = []
    draw_time = 0.1
    plt.ion()
    plt.grid(True, color="silver")

    for i in range(100000):
        status = q.get()
        search_results = [status]
        while not q.empty():
            print "getting another tweet"
            status = q.get()
            search_results.append(status)

        if not setup:
            print "Gathering enough data to begin plotting"
            while len(samples) < 1:
                status = q.get()
                search_results.append(status)
                filtered_words = utils.tokenize_and_filter(search_results)
                if fdist is None:
                    fdist = utils.get_freq_dist(filtered_words)
                else:
                    fdist = update_fdist(fdist, filtered_words)
                n_words = min(10, len(fdist))
                samples = [item for item, _ in fdist.most_common(n_words)]
                # print "len(samples) = {}".format(len(samples))
                samples = remove_infrequent_words(samples, fdist)
            freqs = [fdist[sample] for sample in samples]
            plt.plot(freqs, range(len(freqs)))
            plt.yticks(range(len(samples)), samples)
            plt.ylabel("Samples")
            plt.xlabel("Counts")
            plt.title("Top Words Frequency Distribution")
            plt.show()
            plt.pause(draw_time)
            setup = True

        else:
            filtered_words = utils.tokenize_and_filter(search_results)
            fdist = update_fdist(fdist, filtered_words)
            newsamples = [
                item for item, _ in fdist.most_common(number_of_words)
            ]
            newsamples = remove_infrequent_words(newsamples, fdist)
            s1 = set(newsamples)
            s2 = set(samples)
            s1.difference_update(s2)
            if s1:
                print "New words: " + str(list(s1))
                newsamples = list(s1)
                samples.extend(newsamples)
                if len(samples) > 2 * number_of_words:
                    samples = newsamples
                    plt.close()
                plt.yticks(range(len(samples)), samples)
            freqs = [fdist[sample] for sample in samples]
            plt.plot(freqs, range(len(freqs)))
            plt.draw()
            plt.pause(draw_time)
    kill_plot()
    return
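
The queue q has to be filled by a separate producer. Below is a minimal sketch of one, using tweepy's pre-4.0 streaming interface; the credentials, the bounding box, and the choice of tweepy itself are assumptions, since the producer is not part of this example:

import Queue  # Python 2 standard-library queue
import threading

import tweepy

class QueueListener(tweepy.StreamListener):
    # Hypothetical listener that hands every incoming tweet to the plot.
    def __init__(self, q):
        super(QueueListener, self).__init__()
        self.q = q

    def on_status(self, status):
        self.q.put(status)

q = Queue.Queue()
auth = tweepy.OAuthHandler('CONSUMER_KEY', 'CONSUMER_SECRET')
auth.set_access_token('ACCESS_TOKEN', 'ACCESS_SECRET')
stream = tweepy.Stream(auth, QueueListener(q))
# tweepy expects [sw_lon, sw_lat, ne_lon, ne_lat]; this box covers SF.
producer = threading.Thread(
    target=stream.filter,
    kwargs={'locations': [-122.52, 37.70, -122.35, 37.83]})
producer.daemon = True
producer.start()
updating_stream_plot(q, number_of_words=30)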
Example #6
def create_poem(g=None, default_words=None, ngram=None):
    """ This creates a poem with user input by suggesting from the words supplied.

    A user can use the word, decline the word, or add their own input.
    g is for geosearchclass. It is none by default.
    default_words is a list of words that can be enabled by default.
    """
    words = []
    formatted_poem = ''
    # valid single-character responses (see the menu printed below)
    options = ['y', 'n', 's', 'd', 'r', 'e', 'f', '\n']
    keep_adding = True
    added_default = False
    use_phrases = False
    random_word = False
    print "\n\n\n"
    print """

        This robot poet will present a series of suggestions. You can
        either use these suggestions, edit them, or type your own
        input.  You may also add more words from geolocated tweets to
        your word corpus. The words you choose or add will be
        successively added to a poem, which will be printed and saved
        to an output file. To add a new line, type '\\n'. To finish
        writing type f (for finish).

        y: yes use this word
        n: no, skip this and give me a new phrase
        s: search: add more geolocated terms from twitter
        d: default words added to corpus
        r: get random word, when running markov model
        e: edit the text
        \\n: enter line
        f: finish

    """

    if ngram:
        print "Populating seed words from markov chain ngram"
        values = sum(ngram.values(), [])
        words.extend(values)
    chosen = ""
    while keep_adding:
        if len(words) == 0:
            print "Nothing in corpus. Type d for default words or s to search\
twitter"
        if ngram and formatted_poem and not random_word:
            tokens = utils.tokenize_normal_words(formatted_poem)
            num = random.random()
            potential_word = ""
            if len(tokens) > 0:
                #  This is for trigrams
                if num > 0.66 and len(tokens) > 1:  # ~1/3 of the time: try a trigram
                    potential_word = tokens_to_word(tokens, ngram, 2)
                    if potential_word:
                        chosen = potential_word
                    else:
                        potential_word = tokens_to_word(tokens, ngram, 1)
                        if potential_word:
                            chosen = potential_word
                        else:
                            chosen = random.choice(words)
                elif num > 0.33:  # ~1/3 of the time: try a bigram
                    potential_word = tokens_to_word(tokens, ngram, 1)
                    if potential_word:
                        chosen = potential_word
                    else:
                        chosen = random.choice(words)
                else:  # remaining ~1/3: random word
                    chosen = random.choice(words)
            else:
                chosen = random.choice(words)
        elif words:
            chosen = random.choice(words)
            random_word = False
        else:
            pass
        if chosen:
            print chosen,
        response_string = "     " + str(options) + " or your own :"
        response = raw_input(response_string)
        # include the chosen word:
        if response == "y":
            if len(words) == 0:
                continue
            formatted_poem = formatted_poem + ' ' + chosen
            print
            print formatted_poem
            continue
        elif response == "n":
            continue
        elif response == "r":
            random_word = True
        elif response == "s":
            print "Searching geo-located tweets to add to vocab"
            print "This can only be used once every 5 seconds"
            if g is None:
                g = geosearchclass.GeoSearchClass()
            search_results = g.search()

            phrase_response = ""
            while phrase_response not in ["y", "n"]:
                phrase_response = raw_input(
                    "\nWould you like to use phrases (otherwise, just "
                    "words)? [y/n]: ")
                if phrase_response == "y":
                    list_of_info_dicts = write.parse_tweets(search_results)
                    filtered_words = []
                    if len(list_of_info_dicts) < 1:
                        filtered_words = utils.tokenize_and_filter(
                            search_results)
                    else:
                        for d in list_of_info_dicts:
                            filtered_words.append(d['phrase'])
                elif phrase_response == "n":
                    filtered_words = utils.tokenize_and_filter(search_results)
                else:
                    continue
            print "\n\n\nAdding these Twitter words: "
            print filtered_words
            print "\n"
            words.extend(filtered_words)
            continue
        elif response == "d":
            if not added_default:
                print "\nadding in these words to corpus:"
                print default_words
                print "\n\n\n"
                words.extend(default_words)
                options.remove('d')
                added_default = True
        elif response == "e":
            formatted_poem = editor.create_editor(formatted_poem)
            print formatted_poem
        elif response not in options:
            response = response.replace('\\n', '\n')
            formatted_poem = formatted_poem + ' ' + response
            print
            print formatted_poem
            continue
        elif response == "f":
            print
            print formatted_poem
            keep_adding = False
    return formatted_poem
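
Neither the ngram mapping nor tokens_to_word is defined in this example. The sketch below is consistent with how they are used above: sum(ngram.values(), []) implies a dict whose values are lists of words, and the calls with n=1 and n=2 imply contexts of both lengths. The tuple-key layout is an assumption:

def build_ngram(tokens, max_n=2):
    # Hypothetical builder: map 1- and 2-token contexts to the words
    # observed after them, so tokens_to_word works with n=1 and n=2.
    ngram = {}
    for n in range(1, max_n + 1):
        for i in range(len(tokens) - n):
            key = tuple(tokens[i:i + n])
            ngram.setdefault(key, []).append(tokens[i + n])
    return ngram

def tokens_to_word(tokens, ngram, n):
    # Hypothetical lookup: use the last n tokens of the poem as context.
    # A falsy return makes the caller fall back to a shorter context or
    # a random word, matching the branches above.
    choices = ngram.get(tuple(tokens[-n:]))
    if choices:
        return random.choice(choices)
    return ""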