Example #1
def sampletoscreen_demo(limit=20):
    """
    Sample from the Streaming API and send output to terminal.
    """
    oauth = credsfromfile()
    client = Streamer(**oauth)
    client.register(TweetViewer(limit=limit))
    client.sample()
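# Usage sketch (not part of the original example): credsfromfile() is assumed to
# find a valid Twitter credentials file (consumer key/secret and access tokens),
# so that must be set up before running the demo.
if __name__ == '__main__':
    sampletoscreen_demo(limit=5)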
Example #2
def tracktoscreen_demo(track="taylor swift", limit=10):
    """
    Track keywords from the public Streaming API and send output to terminal.
    """
    oauth = credsfromfile()
    client = Streamer(**oauth)
    client.register(TweetViewer(limit=limit))
    client.filter(track=track)
Example #3
def followtoscreen_demo(limit=10):
    """
    Using the Streaming API, select just the tweets from a specified list of
    userIDs.

    This will only give results in a reasonable time if the users in
    question produce a high volume of tweets, and even then there may be some delay.
    """
    oauth = credsfromfile()
    client = Streamer(**oauth)
    client.register(TweetViewer(limit=limit))
    client.statuses.filter(follow=USERIDS)
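# Note (assumption): USERIDS is not defined in this listing; it is expected to be
# a module-level list of Twitter user ID strings, e.g. (placeholder values only):
# USERIDS = ['123456', '234567', '345678']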
Example #4
def senti_score_daily(keyword,
                      client,
                      classifier,
                      twt_num,
                      start_time,
                      days,
                      verb=0):
    score_all = np.zeros((twt_num, days))
    year = start_time[0]
    month = start_time[1]
    day = start_time[2]
    origin = datetime.date(year, month, day)

    for i in range(days):
        start_t = origin + datetime.timedelta(days=i)
        end_t = origin + datetime.timedelta(days=i + 1)
        date1 = start_t.timetuple()[:6]
        date2 = end_t.timetuple()[:6]

        # if the pickle does not exist, fetch tweets from the server and save them; otherwise just load them
        filename = keyword + '_' + str(start_t) + '_' + str(end_t) + '.pickle'
        path = os.path.join(os.path.dirname(__file__), os.pardir, 'data',
                            'tweets')
        if os.path.isfile(os.path.join(path, filename)):
            with open(os.path.join(path, filename), 'rb') as f:
                tweets = pickle.load(f)
        else:
            client.register(
                TweetViewer(limit=twt_num,
                            lower_date_limit=date1,
                            upper_date_limit=date2))
            tweets_gen = client.search_tweets(keywords=keyword,
                                              limit=twt_num,
                                              lang='en')
            tweets = []
            for t in tweets_gen:
                tweets.append(t['text'])
            with open(os.path.join(path, filename), 'wb') as f:
                pickle.dump(tweets, f)

        score = classifier.test(tweets)

        if len(score[:, 0]) <= twt_num:
            score_all[0:len(score[:, 0]), i] = score[:, 0]
        else:
            score_all[0:twt_num, i] = score[:twt_num, 0]
        if verb == 1:
            print(keyword + ' : ' + str(start_t) + ' to ' + str(end_t) +
                  ' score=' + str(np.mean(score_all[:, i], axis=0)))

    return score_all
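# Usage sketch (hypothetical names; the classifier interface is an assumption:
# classifier.test() is expected to return an array whose first column holds
# per-tweet sentiment scores):
# daily_scores = senti_score_daily('hearthstone', Query(**credsfromfile()),
#                                  NBClassifier(), twt_num=100,
#                                  start_time=(2016, 3, 1), days=7, verb=1)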
Example #5
def limit_by_time_demo(keywords="nltk"):
    """
    Query the REST API for Tweets about NLTK since yesterday and send
    the output to terminal.

    This example makes the assumption that there are sufficient Tweets since
    yesterday for the date to be an effective cut-off.
    """
    date = yesterday()
    dt_date = datetime.datetime(*date)
    oauth = credsfromfile()
    client = Query(**oauth)
    client.register(TweetViewer(limit=100, lower_date_limit=date))

    print(f"Cutoff date: {dt_date}\n")

    for tweet in client.search_tweets(keywords=keywords):
        print("{} ".format(tweet["created_at"]), end="")
        client.handler.handle(tweet)
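# The yesterday() helper used above is not shown in this listing; a minimal
# sketch of what it is assumed to return, i.e. yesterday's local time as a
# (year, month, day, hour, min, sec) tuple usable as datetime.datetime(*date):
def yesterday():
    date = datetime.datetime.now() - datetime.timedelta(days=1)
    return date.timetuple()[:6]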
Example #6
def senti_score_time(keyword,
                     client,
                     classifier,
                     twt_num,
                     start_time,
                     step,
                     step_num,
                     verb=0):
    score_all = np.zeros((twt_num, step_num))
    year = start_time[0]
    month = start_time[1]
    day = start_time[2]
    hour = start_time[3]
    minute = start_time[4]
    origin = datetime.datetime(year, month, day, hour, minute)

    for i in range(step_num):
        start_t = origin + datetime.timedelta(minutes=step * i)
        end_t = origin + datetime.timedelta(minutes=step * (i + 1))
        date1 = start_t.timetuple()[:6]
        date2 = end_t.timetuple()[:6]

        # don't store files in this case
        client.register(
            TweetViewer(limit=twt_num,
                        lower_date_limit=date1,
                        upper_date_limit=date2))
        tweets_gen = client.search_tweets(keywords=keyword,
                                          limit=twt_num,
                                          lang='en')
        tweets = []
        for t in tweets_gen:
            tweets.append(t['text'])

        score = classifier.test(tweets)

        score_all[0:len(score[:, 0]), i] = score[:, 0]
        if verb == 1:
            print(keyword + ' : ' + str(start_t) + ' to ' + str(end_t) +
                  ' score=' + str(np.mean(score_all[:, i], axis=0)))

    return score_all
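# Usage sketch (hypothetical names, same classifier assumption as for the daily
# version): score tweets in 30-minute windows across four hours:
# minute_scores = senti_score_time('hearthstone', Query(**credsfromfile()),
#                                  NBClassifier(), twt_num=50,
#                                  start_time=(2016, 3, 1, 12, 0),
#                                  step=30, step_num=8, verb=1)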
Example #7
from nltk.twitter import Query, Streamer, Twitter, credsfromfile, TweetViewer, TweetWriter
from nltk.twitter.common import json2csv
from io import StringIO
import pandas as pd
import process_twt
from NBClassifier import NBClassifier
from SCClassifier import SCClassifier
from BGClassifier import BGClassifier
from nltk.corpus import twitter_samples, TwitterCorpusReader
import os
import pickle
import matplotlib.pyplot as plt
import numpy as np

# settings
oauth = credsfromfile()
client = Query(**oauth)
twtNum = 10
client.register(TweetViewer(limit=twtNum))
tweets_gen = client.search_tweets(keywords='hearthstone', lang='en')
tweets = []
slangdict = process_twt.get_slang_dict()
twt_list = []
for t in tweets_gen:
    twt_list.append(process_twt.preprocess(t['text'], slangdict=slangdict))
twt_list = list(set(twt_list))

for t in twt_list[:twtNum]:
    print(t)

fileIds = twitter_samples.fileids()
root = twitter_samples.root

# read tweet data from corpus
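# A minimal sketch of what the comment above suggests, using the twitter_samples
# corpus imported at the top of this example (standard nltk corpus API):
for fid in fileIds[:1]:
    for text in twitter_samples.strings(fid)[:5]:
        print(text)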
class TwitterAPI():
    # note: this class-level code runs a live streaming sample when the class
    # body is first executed (i.e. at import time)
    oauth = credsfromfile()
    client = Streamer(**oauth)
    client.register(TweetViewer(limit=10))
    client.sample()
    path = BASE_DIR  # BASE_DIR is assumed to be defined elsewhere in the project

    def __init__(self, to_screen=True, follow=None, keywords=None, limit=10):
        self.follow = follow
        self.keywords = keywords
        tw = Twitter().tweets(to_screen=to_screen,
                              follow=follow,
                              keywords=keywords,
                              limit=limit)

    def get_twitter(self, keywords, limit=10):
        client = Query(**oauth)
        tweets = client.search_tweets(keywords, limit)
        tweet = next(tweets)
        return tweet

    def get_users(self, *args):  # look up users by user id
        client = Query(**oauth)
        user_info = client.user_info_from_id(list(args))
        users = []
        for user in user_info:
            name = user['name']
            followers = user['followers_count']
            following = user['friends_count']
            users.append(user)
            print(f'{name} {followers} {following}\n')
        return users

    def save_tweets_file(self):
        client = Streamer(**oauth)
        client.register(TweetWriter(limit=100, subdir='twitter_samples_files'))
        client.statuses.sample()

    def get_tweet_JsonFiles(self, json_file2=None):
        if json_file2 is None:
            all_tweets_samples = twitter_samples.fileids()
            json_file = all_tweets_samples[2]  # json file
            tweet_string = twitter_samples.strings(json_file)
            return tweet_string
        tweet_string = json_file2
        return tweet_string

    def tokenize_tweets(self, string_tweet):
        toked_tweet = twitter_samples.tokenized(string_tweet)
        return toked_tweet

    def convert_csv_tweet_file(self,
                               input_file,
                               args=[
                                   'created_at', 'favorite_count', 'id',
                                   'in_reply_to_status_id',
                                   'in_reply_to_user_id', 'retweet_count',
                                   'text', 'truncated', 'user.id'
                               ]):
        with open(input_file) as infile:
            json2csv(infile, self.path + 'tweets_text.csv', args)
        with open(self.path + 'tweets_text.csv', 'r') as outfile:
            return outfile.readlines()

    def read_csv_tweets(self, filepath, *args):
        # header='tweets' and dtype=32 are not valid pandas arguments; use the
        # first row as the header and read all columns as strings instead
        tw = pd.read_csv(filepath,
                         header=0,
                         index_col=1,
                         encoding='utf-8',
                         dtype=str).head()
        return tw

    def get_tweet_by_id(self, filepath, tw_id):
        ids = StringIO(str(tw_id))
        client = Query(**oauth)
        hydrated = client.expand_tweetids(ids)
        tw = self.read_csv_tweets(filepath)
        for i in hydrated:
            yield tw.loc[tw['user.id'] == i]['text']
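# Usage sketch (assumptions: Twitter credentials are configured, BASE_DIR is
# defined, and these calls hit the live Twitter API, so they are left commented out):
# api = TwitterAPI(to_screen=True, follow=None, keywords='nltk', limit=10)
# print(api.get_tweet_JsonFiles()[:3])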
from nltk.twitter import Query, credsfromfile, TweetViewer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
import sys

if len(sys.argv) < 4:
    print('Usage:', sys.argv[0],
          'twitter_username max_tweets_to_search max_top_words_to_print lemmatize(optional)')
    quit()

# capture the output of TweetViewer to a file for processing
sys.stdout = open('tweets.txt', 'w')

oauth = credsfromfile()
client = Query(**oauth)
client.register(TweetViewer(limit=int(sys.argv[2])))
client.user_tweets(sys.argv[1], int(sys.argv[2]))


# give control back to stdout
sys.stdout = sys.__stdout__
lemmatizer = WordNetLemmatizer()

if len(sys.argv) > 4 and sys.argv[4].lower() == 'lemmatize':
    lemmatize = True
else:
    lemmatize = False


def text_cleaner(documents):
    text_cleaned = []
    for document in documents: