def collect_tweets():
    '''
    Collects tweets from Twitter matching the module-level `keywords`.
    Requests pages of 100 tweets, the maximum page size allowed by the API.
    '''
    tweets = []  # defined before the try block so the return below always works
    try:
        tso = TwitterSearch.TwitterSearchOrder()
        tso.set_keywords(keywords)  # `keywords` and `configs` are module-level names
        tso.set_language('en')
        tso.set_include_entities(False)
        tso.set_count(100)  # the Search API caps a single page at 100 tweets

        ts = TwitterSearch.TwitterSearch(
            consumer_key=configs['consumer_key'],
            consumer_secret=configs['consumer_secret'],
            access_token=configs['access_token'],
            access_token_secret=configs['access_token_secret'])

        for tweet in ts.search_tweets_iterable(tso):
            tweets.append((tweet['created_at'], tweet['text']))
    except TwitterSearch.TwitterSearchException as e:
        print(e)  # report API errors instead of the old silent `None` statement

    return tweets
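A minimal usage sketch for collect_tweets(), assuming the module-level names it relies on; the keyword list and the configs.json file below are hypothetical:

import json

import TwitterSearch

# Hypothetical setup: collect_tweets() reads these module-level names.
keywords = ['python', 'twitter']
with open('configs.json') as f:  # hypothetical file holding the four OAuth tokens
    configs = json.load(f)

for created_at, text in collect_tweets():
    print(created_at, text)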
def searchTwitter(keywords):
    myReturnString = ""
    try:
        tso = tw.TwitterSearchOrder()
        tso.set_keywords(keywords)
        tso.set_language('en')
        tso.set_include_entities(False)

        # Twitter credentials (placeholders -- supply your own application tokens)
        ts = tw.TwitterSearch(
            consumer_key='YOUR_CONSUMER_KEY',
            consumer_secret='YOUR_CONSUMER_SECRET',
            access_token='YOUR_ACCESS_TOKEN',
            access_token_secret='YOUR_ACCESS_TOKEN_SECRET')

        count = 0
        for tweet in ts.search_tweets_iterable(tso):
            #time.sleep(3)
            if count >= 1:  # stop after the first matching tweet
                break
            myString = '@%s tweeted: %s' % (tweet['user']['screen_name'],
                                            tweet['text'])
            print(myString)
            count += 1
            myReturnString += myString
    except tw.TwitterSearchException as e:
        print(e)
    return myReturnString
def main(argv=None):
    parser = create_parser()
    args = parser.parse_args(argv)

    api_search = None
    if args.consumer_key and args.consumer_secret and args.access_token and args.access_token_secret:
        api_search = ts.TwitterSearch(
            consumer_key=args.consumer_key,
            consumer_secret=args.consumer_secret,
            access_token=args.access_token,
            access_token_secret=args.access_token_secret)
    else:
        api_search = ts.TwitterSearch()

    tso = ts.TwitterSearchOrder()
    if args.language:
        tso.set_language(args.language)
    if args.count:
        tso.set_count(args.count)
    if args.enable_entities:
        tso.set_include_entities(args.enable_entities)
    if args.latitude and args.longitude:
        tso.set_geocode(float(args.latitude), float(args.longitude), 30,
                        imperial_metric=False)
        #print("LON:{0}, LAT:{1}".format(float(args.latitude), float(args.longitude)), file=sys.stderr)
    if args.keywords is None:
        tso.set_keywords(['*'])
    else:
        tso.set_keywords(args.keywords)

    num_tweets = 0
    for tweet in api_search.search_tweets_iterable(tso,
                                                   callback=my_callback_closure):
        num_tweets += 1
        fecha_parsed = datetime.strptime(tweet['created_at'],
                                         '%a %b %d %H:%M:%S %z %Y')
        fecha = fecha_parsed.strftime("%Y%m%d_%H%M%S")  # formatted tweet['created_at']
        text = tweet['text'].replace("\n", " ")
        #print('%s;%s;%s' % (tweet['user']['screen_name'], fecha, text))
        print(text)
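main() assumes a create_parser() helper that is not shown; this is a hypothetical reconstruction inferred from the attributes main() reads, so every flag name and default here is an assumption:

import argparse

def create_parser():
    parser = argparse.ArgumentParser(
        description='Search tweets via the TwitterSearch library')
    # Credentials are optional; main() falls back to TwitterSearch() without them.
    parser.add_argument('--consumer-key', dest='consumer_key')
    parser.add_argument('--consumer-secret', dest='consumer_secret')
    parser.add_argument('--access-token', dest='access_token')
    parser.add_argument('--access-token-secret', dest='access_token_secret')
    parser.add_argument('--language')
    parser.add_argument('--count', type=int)
    parser.add_argument('--enable-entities', dest='enable_entities',
                        action='store_true')
    parser.add_argument('--latitude')
    parser.add_argument('--longitude')
    # With no keywords given, main() falls back to the wildcard search ['*'].
    parser.add_argument('keywords', nargs='*', default=None)
    return parser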
def count_response(self, con_list):
    get = gd.get_data()
    mod = md.modify_data()
    tso = ts.TwitterSearchOrder()
    tso.arguments.update({'tweet_mode': 'extended'})
    api = get.api()
    coordinates = get.coordinates()
    con_count = 0
    respo_list = []
    respo_loc = []
    for con in con_list:
        print('\tCounting ' + con + '...')
        tso.set_keywords([con])
        for coordinate in coordinates:
            tso.set_geocode(coordinate['lat'], coordinate['long'], 5, False)
            for tweet in api.search_tweets_iterable(tso,
                                                    callback=self.avoid_rate_limit):
                # prefer the retweeted status' full text when the tweet is a retweet
                try:
                    tweet_text = tweet['retweeted_status']['full_text']
                except KeyError:
                    tweet_text = tweet['full_text']
                cleaned_tweet = mod.clean_tweet(tweet_text)
                temp_res = cleaned_tweet + ' --- ' + tweet['id_str']
                if temp_res not in respo_list:
                    respo_list.append(temp_res)
                    respo_loc.append(coordinate['city'])
                    con_count += 1
    with open('raw/response.txt', 'a') as res:
        print('Total: ' + str(con_count))
        res.write(con_list[0] + ': ' + str(con_count) + '\n')
        for i in range(con_count):
            response = respo_list[i] + ' (' + respo_loc[i] + ')'
            res.write(response + '\n')
        res.write('\n')
    return con_count
def search():
    try:
        tso = TwitterSearch.TwitterSearchOrder()  # create a TwitterSearchOrder object
        tso.set_keywords(['', 'Doktorarbeit'])  # let's define all words we would like to have a look for
        tso.set_language('de')  # we want to see German tweets only
        tso.set_include_entities(False)  # and don't give us all those entity information

        # it's about time to create a TwitterSearch object with our secret tokens
        ts = TwitterSearch.TwitterSearch(
            consumer_key='aaabbb',
            consumer_secret='cccddd',
            access_token='111222',
            access_token_secret='333444')

        # this is where the fun actually starts :)
        for tweet in ts.search_tweets_iterable(tso):
            print('@%s tweeted: %s' % (tweet['user']['screen_name'],
                                       tweet['text']))
    except TwitterSearch.TwitterSearchException as e:
        # take care of all those ugly errors if there are some
        print(e)
def tweet_search(keywords, tweet_lang):
    # load yaml file with secrets into a dictionary
    credentials = yaml.safe_load(open("./credentials.yml"))

    try:
        tso = TwitterSearch.TwitterSearchOrder()  # create a TwitterSearchOrder object
        tso.set_keywords(keywords)  # defines all words that we like to search for in a tweet
        tso.set_language(tweet_lang)  # set the language of tweets we are searching for
        tso.set_include_entities(False)  # no entity information

        # create a TwitterSearch object with our secret tokens
        ts = TwitterSearch.TwitterSearch(
            consumer_key=credentials['database']['consumer_key'],
            consumer_secret=credentials['database']['consumer_secret'],
            access_token=credentials['database']['access_token'],
            access_token_secret=credentials['database']['access_token_secret'])

        # Save all tweets in a nested dict:
        # twitty{"id"}
        #   |- {date} -> tweet creation date
        #   |- {text} -> tweet text
        twitty = {}
        for tweet in ts.search_tweets_iterable(tso):
            # dict keyed on tweet ID, assign a new dict as value
            twitty[tweet["id"]] = {}
            # key is date and value is "created_at"
            twitty[tweet["id"]]["date"] = tweet["created_at"]
            # key is text and value is the tweet
            twitty[tweet["id"]]["text"] = tweet["text"]
        return twitty
    except TwitterSearch.TwitterSearchException as e:
        print(e)
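A hedged usage sketch for tweet_search(); the credentials.yml layout in the comment is inferred from the keys the function reads, and the search terms are hypothetical:

# Expected ./credentials.yml layout (inferred from the lookups above):
#
# database:
#   consumer_key: "..."
#   consumer_secret: "..."
#   access_token: "..."
#   access_token_secret: "..."

results = tweet_search(['python', 'pandas'], 'en')  # hypothetical search terms
if results:
    for tweet_id, entry in results.items():
        print(tweet_id, entry['date'], entry['text'])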
def twitter_tag_search(ck, cs, at, ats, tag, count, lang, proxy=None):
    """ function for twitter search on hashtags and keywords """
    tso = TwitterSearch.TwitterSearchOrder()
    tso.set_keywords(tag)
    if lang in ('en', 'nl'):
        tso.set_language(lang)
    tso.set_result_type('recent')
    if proxy:
        ts = TwitterSearch.TwitterSearch(ck, cs, at, ats, proxy=proxy)
    else:
        ts = TwitterSearch.TwitterSearch(ck, cs, at, ats)
    tweetcount = 0
    for tweet in ts.search_tweets_iterable(tso):
        if tweetcount < count:
            print(f"@{tweet['user']['screen_name']} - {tweet['created_at']}")
            print(f"{tweet['text']}")
            print()
            tweetcount += 1
        else:
            break
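A minimal invocation sketch for twitter_tag_search(); every value below is a placeholder, not a real credential or required search term:

twitter_tag_search(
    ck='YOUR_CONSUMER_KEY',
    cs='YOUR_CONSUMER_SECRET',
    at='YOUR_ACCESS_TOKEN',
    ats='YOUR_ACCESS_TOKEN_SECRET',
    tag=['#python'],  # set_keywords() expects a list of terms
    count=5,          # print at most five tweets
    lang='en')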
def get_sources(self, meme, number):
    """Fetches a list of Sources from Twitter that match the given meme"""
    #stdout available through Heroku logs. TODO: syslog
    print(" ".join([meme.get_body(), meme.get_exceptions()]))
    sources = []
    try:
        tso = TwitterSearch.TwitterSearchOrder()  # create a TwitterSearchOrder object
        tso.set_search_url(self._format_query(meme))
        tso.set_locale('en')
        tso.set_count(number)  # smallest request that might work
        tso.set_include_entities(False)

        twitter_search = TwitterSearch.TwitterSearch(
            consumer_key=Secrets.consumer_key,
            consumer_secret=Secrets.consumer_secret,
            access_token=Secrets.access_token,
            access_token_secret=Secrets.access_token_secret)

        twitter_search.search_tweets(tso)
        retries = 0
        while len(sources) < number and retries < 5:
            for tweet in twitter_search.get_tweets()['statuses']:
                sources.append(
                    Source(tweet['user']['name'], tweet['text'],
                           tweet['id_str']))
                #print(tweet['text'])  # there's a lot of strange characters coming in here
            # raises TwitterSearchException once no further result pages exist,
            # dropping into the handler below with whatever was gathered so far
            twitter_search.search_next_results()
            retries += 1
    except TwitterSearch.TwitterSearchException as exception:
        print(exception)  #TODO: syslog
    return sources
def generate_tso(keywords, db_file):
    '''
    Generate tsos combining the least frequent keywords.
    '''
    # Get info from db on keywords
    # (keyword, count, max_id)
    with sqlite3.connect(db_file) as conn:
        c = conn.cursor()
        c.execute('SELECT keyword, count, max_id FROM exp_averages')
        latest = c.fetchall()
        c.close()

    # merge keywords data with sql data
    latest_df = pd.DataFrame(latest, columns=['keyword', 'count', 'max_id'])
    df = pd.DataFrame(keywords, columns=['keyword'])
    df = df[df['keyword'] != '$OR']  # twitter keywords... not allowed
    df = df[df['keyword'] != 'OR']  # same
    df = pd.merge(df, latest_df, how='left', on='keyword')

    thresholds = [
        {'count': 3, 'combine': 50},
        {'count': 10, 'combine': 10},
        {'count': 20, 'combine': 4},
        {'count': 40, 'combine': 2},
        {'count': None, 'combine': 1},
    ]

    for threshold in thresholds:
        # Select a section from the df,
        # truncating the df as it goes through the thresholds.
        # New tweets will have None as count and thus will not
        # meet any threshold and will be processed one at a time.
        if threshold['count']:
            section = df[df['count'] < threshold['count']]
        else:
            section = df[:]
        df.drop(section.index, inplace=True)

        # Generate tsos
        while len(section) > 0:
            # determine the right number of keywords to combine
            try_n = threshold['combine']
            too_long = True
            while too_long:
                subsection = section[:try_n]
                combine = list(subsection.keyword)
                # use the smallest of the max_id because in the time
                # from min(max_id) to max(max_id) there might have been
                # tweets for keywords other than the one of max(max_id)
                max_id = subsection['max_id'].min()
                tso = TwitterSearch.TwitterSearchOrder()
                tso.set_include_entities(True)
                tso.set_result_type('recent')
                tso.set_keywords(combine, or_operator=True)
                if not pd.isnull(max_id):
                    tso.set_since_id(int(max_id))
                url = tso.create_search_url()
                if (len(url) < 450) | (try_n == 1):  # exit clause
                    too_long = False
                    logging.debug('Number of tickers combined {}'.format(
                        len(combine)))
                    logging.debug(combine)
                    logging.debug(tso.create_search_url())
                else:
                    try_n -= 1
            yield tso
            section = section.iloc[try_n:]
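A consumption sketch for the generate_tso generator, assuming a database that already contains the exp_averages table queried above; the ticker list and db path are hypothetical:

keywords = ['AAPL', 'MSFT', 'GOOG']  # hypothetical tickers
for tso in generate_tso(keywords, 'tweets.db'):  # hypothetical db path
    # each yielded TwitterSearchOrder keeps its search URL under the
    # 450-character budget enforced inside the generator
    print(tso.create_search_url())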
def __init__(self):
    print('Gathering tweets with political context...')
    get = gd.get_data()
    mod = md.modify_data()
    api = get.api()
    tso = ts.TwitterSearchOrder()
    tso.arguments.update({'tweet_mode': 'extended'})
    res_list = []
    res_dict = {}
    json_data = {}
    senators = get.senators()
    concerns = get.concerns()
    coordinates = get.coordinates()
    for senator in senators:
        json_data[senator] = {}
        print('Gathering tweets mentioning ' + senator + '...')
        for concern in concerns:
            json_data[senator][concern] = []
            # split the concern into its comma-separated language variants;
            # the third variant (con_cb) may be absent
            con_en = concern.split(',')[0]
            try:
                con_tl = concern.split(', ')[1]
                con_cb = concern.split(', ')[2]
                con_list = [con_en, con_tl, con_cb]
            except IndexError:
                con_tl = concern.split(', ')[1]
                con_cb = None
                con_list = [con_en, con_tl]
            print('\t' + concern + '...')
            for con_item in con_list:
                tso.set_keywords([senator, con_item])
                for coordinate in coordinates:
                    tso.set_geocode(coordinate['lat'], coordinate['long'], 5,
                                    False)
                    for tweet in api.search_tweets_iterable(
                            tso, callback=self.avoid_rate_limit):
                        # prefer the retweeted status' full text for retweets
                        try:
                            tweet_text = tweet['retweeted_status']['full_text']
                            is_retweet = True
                        except KeyError:
                            tweet_text = tweet['full_text']
                            is_retweet = False
                        res_text = tweet['id_str'] + ': ' + tweet_text
                        if res_text not in res_list:
                            res_list.append(res_text)
                            if tweet['is_quote_status']:
                                if is_retweet:
                                    quote_text = tweet['retweeted_status']['quoted_status']['full_text']
                                else:
                                    quote_text = tweet['quoted_status']['full_text']
                            else:
                                quote_text = None
                            tweet_text2 = mod.clean_tweet(tweet_text)
                            tweet_text2 = mod.translate(tweet_text2)
                            if tweet_text2 is None:
                                continue
                            if quote_text is not None:
                                quote_text2 = mod.clean_tweet(quote_text)
                                quote_text2 = mod.translate(quote_text2)
                            else:
                                quote_text2 = None
                            json_data[senator][concern].append({
                                'tweet_text': tweet_text,
                                'tweet_text2': tweet_text2,
                                'is_retweet': is_retweet,
                                'quote_text': quote_text,
                                'quote_text2': quote_text2,
                                'tweet_id': tweet['id'],
                                'rt_count': tweet['retweet_count'],
                                'tweet_created': tweet['created_at'],
                                'tweet_loc': coordinate['city'],
                                'user_id': tweet['user']['id'],
                                'user_created': tweet['user']['created_at'],
                                'user_verified': tweet['user']['verified'],
                                'user_follower': tweet['user']['followers_count'],
                                'user_total_tweet': tweet['user']['statuses_count'],
                                'user_loc': tweet['user']['location']
                            })
                            res_tweet = mod.remove_stopwords(tweet_text2)
                            if quote_text2 is not None:
                                res_dict = self.initialize_triangulation(
                                    res_dict,
                                    res_tweet + ' ' + quote_text2 + ' ' + coordinate['city'])
                            else:
                                res_dict = self.initialize_triangulation(
                                    res_dict,
                                    res_tweet + ' ' + coordinate['city'])
    print('Saving collected tweets into "gathered_tweets.json" file...')
    self.save_tweet(json_data)
    self.save_cleaned_tweet(res_dict)
    print('Finished gathering tweets with political context...')
def twitter_search(db_file, output_dir, keywords_file):
    ts = TwitterSearch.TwitterSearch(
        consumer_key=twitter_keys.consumer_key,
        consumer_secret=twitter_keys.consumer_secret,
        access_token=twitter_keys.access_token,
        access_token_secret=twitter_keys.access_token_secret)

    start = time.time()
    window_count = 1

    conn = sqlite3.connect(db_file)
    c = conn.cursor()

    if keywords_file:
        keywords = helpers.get_keywords_file(keywords_file)
    else:
        keywords = helpers.get_keywords_sql(db_file)

    pbar = tqdm(keywords)
    for keyword in pbar:
        logging.debug('Getting: ' + keyword)
        # keyword = keyword.replace('/','_')
        pbar.set_description("Processing {:10}".format(keyword))
        pbar.refresh()

        tso = TwitterSearch.TwitterSearchOrder()
        tso.set_include_entities(True)
        tso.set_result_type('recent')
        tso.set_keywords([keyword])

        # only look for tweets since the last search
        c.execute('SELECT max_id FROM latest_search WHERE keyword=?', [keyword])
        fetched = c.fetchone()
        since_id = fetched[0] if fetched is not None else None
        if since_id:
            tso.set_since_id(since_id)

        ts.search_tweets(tso)

        max_id = []
        max_date = []
        min_date = []
        count = []
        try_next = True
        while try_next:
            # parse response
            meta = ts.get_metadata()
            remaining_limit = int(meta.get('x-rate-limit-remaining', 0))
            num_tweets = ts.get_amount_of_tweets()
            tweets = ts.get_tweets().get('statuses', [])
            helpers.write_tweets(tweets, output_dir)

            if num_tweets != 0:
                max_id.append(max([tweet['id'] for tweet in tweets]))
                max_date.append(
                    max([pd.to_datetime(tweet['created_at'], utc=True)
                         for tweet in tweets]))
                min_date.append(
                    min([pd.to_datetime(tweet['created_at'], utc=True)
                         for tweet in tweets]))
                count.append(num_tweets)

            if remaining_limit == 0:
                try:
                    limit_reset = int(
                        meta.get('x-rate-limit-reset', time.time() + 15 * 60)
                    ) + 10  # extra sec to be on the safe side
                    # convert to correct datetime
                    limit_reset_dt = pd.to_datetime(limit_reset, unit='s',
                                                    utc=True)
                    limit_reset_dt = limit_reset_dt.tz_convert('Europe/London')
                    pbar.set_description(
                        'Sleeping until {:%H:%M:%S}'.format(limit_reset_dt))
                    pbar.refresh()
                    pause.until(limit_reset)
                    pbar.set_description("Processing %s" % keyword)
                    pbar.refresh()
                    window_count += 1
                except Exception as e:
                    logging.warning('limit_reset ERROR: ' + keyword)
                    logging.warning(str(e))
                    logging.warning('Sleep for 15min...')
                    # wait the maximum time until the next window...
                    pbar.set_description("Sleeping for 15 min.")
                    pbar.refresh()
                    pause.minutes(15)
                    pbar.set_description("Processing {:10}".format(keyword))
                    pbar.refresh()
                    window_count += 1

            # check if there is a next page for this search
            try:
                try_next = ts.search_next_results()
            except TwitterSearch.TwitterSearchException:
                try_next = False

        # stats and logging for the current keyword
        max_id = max(max_id) if len(max_id) != 0 else since_id
        max_date = max(max_date) if len(max_date) != 0 else None
        min_date = min(min_date) if len(min_date) != 0 else None
        count = sum(count)

        search_stats = {
            'keyword': keyword,
            'count': count,
            'min_date': min_date.strftime('%Y-%m-%d %H:%M:%S')
                        if min_date is not None else None,
            'max_date': max_date.strftime('%Y-%m-%d %H:%M:%S')
                        if max_date is not None else None,
            'max_id': max_id,
            'search_date': pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S'),
        }
        helpers.dict_to_sqlite(search_stats, 'latest_search', db_file)

    # stats and logging for the iteration
    end = time.time()
    total_time = round((end - start) / 60)
    iteration_stats = {
        'start_time': pd.to_datetime(start, unit='s').strftime('%Y-%m-%d %H:%M:%S'),
        'duration_min': total_time,
        'keywords': len(keywords),
        'tweets_got': ts.get_statistics()[1],
        'queries_submitted': ts.get_statistics()[0],
        'windows_used': window_count,
    }
    helpers.dict_to_sqlite(iteration_stats, 'iterations', db_file)
    logging.info('Total number of windows: ' + str(window_count))
    logging.info('Total time (min): ' + str(total_time))
    logging.info('Total tweets got: ' + str(ts.get_statistics()[1]))

    # close db file
    c.close()
    conn.close()
import TwitterSearch as ts

try:
    tso = ts.TwitterSearchOrder()  # create a TwitterSearchOrder object
    tso.set_keywords(['Trump'])  # let's define all words we would like to have a look for
    tso.set_language('en')  # we want to see English tweets only
    tso.set_include_entities(False)  # and don't give us all those entity information

    # it's about time to create a TwitterSearch object with our secret tokens;
    # bind it to a distinct name so the module alias `ts` is not shadowed
    # (credentials are placeholders -- supply your own application tokens)
    search = ts.TwitterSearch(
        consumer_key='YOUR_CONSUMER_KEY',
        consumer_secret='YOUR_CONSUMER_SECRET',
        access_token='YOUR_ACCESS_TOKEN',
        access_token_secret='YOUR_ACCESS_TOKEN_SECRET')

    # this is where the fun actually starts :)
    for tweet in search.search_tweets_iterable(tso):
        # print('@%s tweeted: %s' % (
        #     tweet['user']['screen_name'],
        #     tweet['text']))
        print(tweet)
except ts.TwitterSearchException as e:
    # take care of all those ugly errors if there are some
    print(e)