def expand_tweetids_demo():
    """
    Given a file object containing a list of Tweet IDs, fetch the
    corresponding full Tweets.
    """
    ids_f = StringIO("""\
588665495492124672
588665495487909888
588665495508766721
588665495513006080
588665495517200384
588665495487811584
588665495525588992
588665495487844352
588665495492014081
588665495512948737""")
    oauth = credsfromfile()
    client = Query(**oauth)
    hydrated = client.expand_tweetids(ids_f)
    for tweet in hydrated:
        try:
            id_str = tweet['id_str']
            print('id: {}\ntext: {}\n'.format(id_str, tweet['text']))
        except (IndexError, KeyError):
            # a missing field on a hydrated Tweet raises KeyError, not IndexError
            pass
def find_matching_tweets(num_tweets=100, fname="matching_tweets.csv", shownum=50):
    """Given the number of tweets to retrieve, queries that number of tweets
    with the keyword "Trump" and saves the tweet id and text to the csv file
    "fname". Prints the first shownum tweets using pandas. Does not remove
    retweets."""
    oauth = credsfromfile()
    # create and register a streamer
    client = Streamer(**oauth)
    writer = TweetWriter(limit=num_tweets)
    client.register(writer)
    # get the name of the newly-created json file
    input_file = writer.timestamped_file()
    # case-insensitive; streams until the writer's limit is reached
    client.filter(track="trump")
    with open(input_file) as fp:
        # these two fields for now
        json2csv(fp, fname, ['id', 'text'])
    # pretty print using pandas
    tweets = pd.read_csv(fname, encoding="utf8")
    return tweets.head(shownum)
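# A minimal usage sketch (hypothetical), assuming credsfromfile() can find valid
# Twitter credentials: stream 20 tweets matching "trump" into matching_tweets.csv
# and show the first 10 rows as a pandas DataFrame.
#
#     preview = find_matching_tweets(num_tweets=20, shownum=10)
#     print(preview)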
def expand_tweetids_demo():
    """
    Given a file object containing a list of Tweet IDs, fetch the
    corresponding full Tweets, if available.
    """
    ids_f = StringIO(
        """\
588665495492124672
588665495487909888
588665495508766721
588665495513006080
588665495517200384
588665495487811584
588665495525588992
588665495487844352
588665495492014081
588665495512948737"""
    )
    oauth = credsfromfile()
    client = Query(**oauth)
    hydrated = client.expand_tweetids(ids_f)

    for tweet in hydrated:
        id_str = tweet['id_str']
        print('id: {}'.format(id_str))
        text = tweet['text']
        if text.startswith('@null'):
            text = "[Tweet not available]"
        print(text + '\n')
def expand_tweetids_demo():
    """
    Given a file object containing a list of Tweet IDs, fetch the
    corresponding full Tweets.
    """
    ids_f = io.StringIO("""\
588665495492124672
588665495487909888
588665495508766721
588665495513006080
588665495517200384
588665495487811584
588665495525588992
588665495487844352
588665495492014081
588665495512948737""")
    oauth = credsfromfile()
    client = Query(**oauth)
    hydrated = client.expand_tweetids(ids_f)
    for tweet in hydrated:
        try:
            id_str = tweet['id_str']
            print('id: {}\ntext: {}\n'.format(id_str, tweet['text']))
        except (IndexError, KeyError):
            # a missing field on a hydrated Tweet raises KeyError, not IndexError
            pass
def expand_tweetids_demo():
    """
    Given a file object containing a list of Tweet IDs, fetch the
    corresponding full Tweets, if available.
    """
    ids_f = StringIO("""\
588665495492124672
588665495487909888
588665495508766721
588665495513006080
588665495517200384
588665495487811584
588665495525588992
588665495487844352
588665495492014081
588665495512948737""")
    oauth = credsfromfile()
    client = Query(**oauth)
    hydrated = client.expand_tweetids(ids_f)

    for tweet in hydrated:
        id_str = tweet["id_str"]
        print(f"id: {id_str}")
        text = tweet["text"]
        if text.startswith("@null"):
            text = "[Tweet not available]"
        print(text + "\n")
def scrape_twitter(google_client):
    tw = Twitter()
    # tweets = tw.tweets(keywords='JetBlue', stream=False, limit=10)  # sample from the public stream
    # print(tweets)
    oauth = credsfromfile()
    client = Query(**oauth)
    tweets = client.search_tweets(
        keywords='JetBlue OR #JetBlue -filter:retweets', limit=10000)
    topics_dict = {
        "tweet_texts": [],
        "ent_score": [],
        "ent_magn": [],
        "overall_score": [],
        "overall_magn": []
    }
    for tweet in tqdm(tweets):
        topics_dict["tweet_texts"].append(tweet['text'])
        ent_score, ent_magnitude, doc_score, doc_magnitude = analyze_text(
            google_client, text=tweet['text'])
        topics_dict["ent_score"].append(ent_score)
        topics_dict["ent_magn"].append(ent_magnitude)
        topics_dict["overall_score"].append(doc_score)
        topics_dict["overall_magn"].append(doc_magnitude)
        # pprint(tweet, depth=1)
        # print('\n\n')
    print('Total Count:', len(topics_dict["tweet_texts"]))
    metrics = ["ent_score", "ent_magn", "overall_score", "overall_magn"]
    for metric in metrics:
        metric_score = np.asarray(topics_dict[metric])
        print(metric, "Mean:", np.mean(metric_score),
              "St Dev:", np.std(metric_score))
    with open('./csvs/twitter-jetblue-sentiment.json', 'w') as fp:
        json.dump(topics_dict, fp)
def collect_tweets(my_keyword, json_writer, stop_num):
    my_keyword = my_keyword.strip()
    print('finding tweets with {} keyword'.format(my_keyword))
    oauth = credsfromfile()
    client = Query(**oauth)
    tweets = client.search_tweets(keywords=my_keyword, limit=stop_num)
    dump_tweets(tweets, json_writer)
def streamtofile_demo(limit=20):
    """
    Write 20 tweets sampled from the public Streaming API to a file.
    """
    oauth = credsfromfile()
    client = Streamer(**oauth)
    client.register(TweetWriter(limit=limit, repeat=False))
    client.statuses.sample()
def limit_by_time_demo(limit=20):
    """
    Sample from the Streaming API and write the output to a file until
    the date limit is reached.
    """
    oauth = credsfromfile()
    client = Streamer(**oauth)
    client.register(TweetWriter(limit=limit, date_limit=DATE))
    client.sample()
def search_demo(keywords="nltk"): """ Use the REST API to search for past tweets containing a given keyword. """ oauth = credsfromfile() client = Query(**oauth) for tweet in client.search_tweets(keywords=keywords, limit=10): print(tweet["text"])
def sampletoscreen_demo(limit=20):
    """
    Sample from the Streaming API and send output to terminal.
    """
    oauth = credsfromfile()
    client = Streamer(**oauth)
    client.register(TweetViewer(limit=limit))
    client.sample()
def tracktoscreen_demo(track="taylor swift", limit=10): """ Track keywords from the public Streaming API and send output to terminal. """ oauth = credsfromfile() client = Streamer(**oauth) client.register(TweetViewer(limit=limit)) client.filter(track=track)
def tweets_by_user_demo(user='******', count=200):
    """
    Use the REST API to search for past tweets by a given user.
    """
    oauth = credsfromfile()
    client = Query(**oauth)
    client.register(TweetWriter())
    client.user_tweets(user, count)
def search_demo(keywords='nltk'):
    """
    Use the REST API to search for past tweets containing a given keyword.
    """
    oauth = credsfromfile()
    client = Query(**oauth)
    for tweet in client.search_tweets(keywords=keywords, limit=10):
        print(tweet['text'])
def tweets_by_user_demo(user="******", count=200): """ Use the REST API to search for past tweets by a given user. """ oauth = credsfromfile() client = Query(**oauth) client.register(TweetWriter()) client.user_tweets(user, count)
def lookup_by_userid_demo():
    """
    Use the REST API to convert a userID to a screen name.
    """
    oauth = credsfromfile()
    client = Query(**oauth)
    user_info = client.user_info_from_id(USERIDS)
    for info in user_info:
        name = info["screen_name"]
        followers = info["followers_count"]
        following = info["friends_count"]
        print(f"{name}, followers: {followers}, following: {following}")
def lookup_by_userid_demo():
    """
    Use the REST API to convert a userID to a screen name.
    """
    oauth = credsfromfile()
    client = Query(**oauth)
    user_info = client.user_info_from_id(USERIDS)
    for info in user_info:
        name = info['screen_name']
        followers = info['followers_count']
        following = info['friends_count']
        print("{0}, followers: {1}, following: {2}".format(name, followers, following))
def followtoscreen_demo(limit=10):
    """
    Using the Streaming API, select just the tweets from a specified list
    of userIDs.

    This will only give results in a reasonable time if the users in
    question produce a high volume of tweets, and may show some delay
    even so.
    """
    oauth = credsfromfile()
    client = Streamer(**oauth)
    client.register(TweetViewer(limit=limit))
    client.statuses.filter(follow=USERIDS)
def obtener_Twits(listaPalabras, DicPalabras):
    listaPalabrasConsulta = []
    # This could be improved
    # size = len(listaPalabras) / 2
    for x in list(DicPalabras)[0:4]:
        listaPalabrasConsulta.append(x)

    print("Word list for the query: ", listaPalabrasConsulta)

    # Query Twitter with an AND of the most important words
    # (a space is a logical AND, a comma is a logical OR)
    txt = ' '.join(listaPalabrasConsulta)

    oauth = credsfromfile()
    client = Query(**oauth)
    tweets = client.search_tweets(keywords=txt, limit=10)

    arrTweets = []
    for tweet in tweets:
        arrTweets.append(Standardizer.standardize(tweet['text']))

    return arrTweets
def limit_by_time_demo(keywords="nltk"): """ Query the REST API for Tweets about NLTK since yesterday and send the output to terminal. This example makes the assumption that there are sufficient Tweets since yesterday for the date to be an effective cut-off. """ date = yesterday() dt_date = datetime.datetime(*date) oauth = credsfromfile() client = Query(**oauth) client.register(TweetViewer(limit=100, lower_date_limit=date)) print("Cutoff date: {}\n".format(dt_date)) for tweet in client.search_tweets(keywords=keywords): print("{} ".format(tweet['created_at']), end='') client.handler.handle(tweet)
def __init__(self, config_dir=DEFAULT_CONFIG_DIR, data_dir=DEFAULT_DATA_DIR):
    """
    Initializes the data set with what data has already been downloaded.

    :param config_dir: directory containing the Twitter configuration.
    :param data_dir: the directory to which tweets are saved.
    """
    self.oauth = credsfromfile(CREDS_FILE, config_dir)
    self.config_dir = config_dir
    self.data_dir = data_dir
    self.users = {}
    self.tweets = {}

    # Load the previously downloaded tweets
    self.load()
def limit_by_time_demo(keywords="nltk"): """ Query the REST API for Tweets about NLTK since yesterday and send the output to terminal. This example makes the assumption that there are sufficient Tweets since yesterday for the date to be an effective cut-off. """ date = yesterday() dt_date = datetime.datetime(*date) oauth = credsfromfile() client = Query(**oauth) client.register(TweetViewer(limit=100, lower_date_limit=date)) print(f"Cutoff date: {dt_date}\n") for tweet in client.search_tweets(keywords=keywords): print("{} ".format(tweet["created_at"]), end="") client.handler.handle(tweet)
def main():
    oauth = credsfromfile()
    print('Welcome to our Twitter Sentiment Analyzer!')
    keyword = ""
    invalid = 1
    while invalid:
        entry = input("Enter 1 to search a topic: \n" +
                      "Enter 2 to analyze trends: \n")
        if entry == '1':
            while keyword == "" or keyword == " ":
                keyword = input("Enter a keyword or hashtag to search: ")
            num_of_tweets = Number_of_Tweets()
            getOpinionsOfTopic(keyword, oauth, num_of_tweets)
            invalid = 0
        elif entry == '2':
            auth = tweepy.OAuthHandler(oauth.get('app_key'),
                                       oauth.get('app_secret'))
            auth.set_access_token(oauth.get('oauth_token'),
                                  oauth.get('oauth_token_secret'))
            api = tweepy.API(auth)
            options = []
            for trend in api.trends_available():
                if (trend['countryCode'] == 'US'
                        and trend['name'] != 'United States'):
                    options.append(trend['name'])
            totalTrends, location = get_user_input(options)
            trends = getTopTrends(totalTrends, location, api)
            num_of_tweets = Number_of_Tweets()
            for trend in trends:
                getOpinionsOfTopic(trend, oauth, num_of_tweets)
            invalid = 0
        else:
            print("Invalid Selection. Try again.")
def search():
    oauth = credsfromfile()
    client = Query(**oauth)
    df = pd.read_sql('SELECT URL FROM twitter_users',
                     db.executable.raw_connection())
    users = set([u.replace('https://twitter.com/', '')
                 for u in df['URL'].values])
    terms = set(['@' + u for u in users])

    with open('terms.pkl', 'rb') as f:
        terms = terms.union(pickle.load(f))

    searches = 0
    li_html = 'name={0} created={1} favorited={2} retweeted={3} \
        {4} query={5}'

    for term in terms:
        searches += 1
        row = twitter_searches.find_one(query=term)
        if row is not None:
            if hours_from_now(row['search_date']) < 24:
                continue

        tweets = client.search_tweets(keywords=term + ' python http -RT',
                                      lang='en')
        for t in tweets:
            if int(t['favorite_count']) == 0:
                log.debug('No favorites')
                continue

            text = t['text']
            dt = datetime.strptime(t['created_at'],
                                   '%a %b %d %H:%M:%S %z %Y')
            if hours_from_now(dt) > 24:
                continue

            if core.not_english(text):
                log.debug('Not english: {}'.format(text))
                continue

            log.debug('Searching for {}'.format(term))
            uname = t['user']['screen_name']
            uname_html = '<a href="https://twitter.com/{0}">{0}</a>'
            if uname not in set(users):
                db['twitter_users'].insert(
                    dict(Flag='Recommended',
                         Date=datetime.now(),
                         URL='https://twitter.com/' + uname))

            html = li_html.format(uname_html.format(uname),
                                  t['created_at'],
                                  t['favorite_count'],
                                  t['retweet_count'],
                                  hrefs_from_text(text),
                                  term)

            twitter_searches.upsert(dict(query=term,
                                         search_date=datetime.now(),
                                         html=html),
                                    ['query', 'html'])

        if searches == 150:
            break
class TwitterAPI():
    # Streams a 10-tweet sample to the terminal when the class is defined
    oauth = credsfromfile()
    client = Streamer(**oauth)
    client.register(TweetViewer(limit=10))
    client.sample()
    path = BASE_DIR

    def __init__(self, keywords, to_screen=True, follow=None, limit=10):
        self.follow = follow
        self.keywords = keywords
        self.limit = limit
        tw = Twitter()
        tw.tweets(to_screen=to_screen, follow=follow,
                  keywords=keywords, limit=limit)

    def get_twiter(self, keywords):
        client = Query(**self.oauth)
        tweets = client.search_tweets(keywords=keywords, limit=self.limit)
        tweet = next(tweets)
        return tweet

    def get_users(self, *args):  # by user id
        client = Query(**self.oauth)
        user_info = client.user_info_from_id(list(args))
        users = []
        for user in user_info:
            name = user['screen_name']
            followers = user['followers_count']
            following = user['friends_count']
            users.append(user)
            print(f'{name} {followers} {following}\n')
        return users

    def save_tweetes_file(self):
        client = Streamer(**self.oauth)
        client.register(TweetWriter(limit=100, subdir='twitter_samples_files'))
        client.statuses.sample()

    def get_tweet_JsonFiles(self, json_file2=None):
        if json_file2 is None:
            all_tweets_samples = twitter_samples.fileids()
            json_file = all_tweets_samples[2]  # json file
            tweet_string = twitter_samples.strings(json_file)
            return tweet_string
        tweet_string = json_file2
        return tweet_string

    def tokenize_tweets(self, string_tweet):
        toked_tweet = twitter_samples.tokenized(string_tweet)
        return toked_tweet

    def convert_csv_tweet_file(self, input_file,
                               args=['created_at', 'favorite_count', 'id',
                                     'in_reply_to_status_id',
                                     'in_reply_to_user_id', 'retweet_count',
                                     'text', 'truncated', 'user.id']):
        with open(input_file) as file:
            json2csv(file, self.path + 'tweets_text.csv', args)
        return open(self.path + 'tweets_text.csv', 'r').readlines()

    def read_csv_tweets(self, filepath, *args):
        tw = pd.read_csv(filepath, index_col=1, encoding='utf-8').head()
        return tw

    def get_tweet_by_id(self, filepath, tw_id):
        ids = StringIO(str(tw_id))
        client = Query(**self.oauth)
        hydrated = client.expand_tweetids(ids)
        tw = self.read_csv_tweets(filepath)
        for i in hydrated:
            # match rows on the user id of each hydrated Tweet
            yield tw.loc[tw['user.id'] == i['user']['id']]['text']
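# A minimal usage sketch (hypothetical call sites), assuming valid credentials
# for credsfromfile() and a USERIDS list like the one used in the demos above;
# note that importing the class already streams a 10-tweet sample:
#
#     api = TwitterAPI(keywords="nltk", to_screen=True, follow=None, limit=10)
#     print(api.get_twiter("nltk"))
#     api.get_users(*USERIDS)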
# export TWITTER="twitter.txt"
from nltk.twitter import Twitter, Query, Streamer, credsfromfile
import pickle
import json
from pprint import pprint

__author__ = 'kongaloosh'

with open('data/investments.json') as data_file:
    # with open('data.json') as data_file:
    oauth = credsfromfile()
    data = json.load(data_file)
    tw = Twitter()
    client = Query(**oauth)
    for i in range(len(data['investments'])):
        if isinstance(data['investments'][i], dict):
            tweets = client.search_tweets(
                keywords=data['investments'][i]['name'], limit=100)
            tweets = list(tweets)
            data['investments'][i]['tweets'] = tweets

# pickle requires binary mode for writing and reading
with open('data_pickle.pkl', 'wb') as outfile:
    pickle.dump(data, outfile)

f = pickle.load(open('data_pickle.pkl', 'rb'))
print(f)
def search():
    oauth = credsfromfile()
    client = Query(**oauth)
    df = pd.read_csv('twitter_users.csv')
    df = df[df['Flag'] == 'Use']
    terms = set(['@' + u.replace('https://twitter.com/', '')
                 for u in df['URL'].values])

    with open('terms.pkl', 'rb') as f:
        terms = terms.union(pickle.load(f))

    searches = 0
    li_html = '<li>name={0} created={1} favorited={2} retweeted={3} \
        {4} query={5}</li>'

    for term in terms:
        searches += 1
        row = twitter_searches.find_one(query=term)
        if row is not None:
            if hours_from_now(row['search_date']) < 24:
                continue

        tweets = client.search_tweets(keywords=term + ' python http -RT',
                                      lang='en', limit=5)
        for t in tweets:
            if int(t['favorite_count']) == 0:
                log.debug('No favorites')
                continue

            text = t['text']
            dt = datetime.strptime(t['created_at'],
                                   '%a %b %d %H:%M:%S %z %Y')
            if hours_from_now(dt) > 24:
                continue

            if core.not_english(text):
                log.debug('Not english: {}'.format(text))
                continue

            log.debug('Searching for {}'.format(term))
            uname = t['user']['screen_name']
            uname_html = '<a href="https://twitter.com/{0}">{0}</a>'

            users = [v.replace('https://twitter.com/', '')
                     for v in pd.read_csv('twitter_users.csv')['URL'].values]
            with open('twitter_users.csv', 'a') as users_csv:
                if uname not in set(users):
                    users_csv.write('{0},{1},Recommended\n'.format(
                        datetime.now(), 'https://twitter.com/' + uname))

            html = li_html.format(uname_html.format(uname),
                                  t['created_at'],
                                  t['favorite_count'],
                                  t['retweet_count'],
                                  hrefs_from_text(text),
                                  term)

            twitter_searches.upsert(dict(query=term,
                                         search_date=datetime.now(),
                                         html=html),
                                    ['query', 'html'])

        if searches == 150:
            break