def get_tweets(query_set, twitter_args, query_filter=None):
    tweets_list = list()
    params = c.TWITTER_PARAMS
    for query in query_set:
        curr_month = "{}-{}".format(dt.now().year, format_date_str(dt.now().month))
        _, curr_usage = get_twitter_api_usage(curr_month)
        if curr_usage >= 24999:
            print("Twitter API limit is about to be exceeded! Returning now ...\n")
            break
        if query_filter:
            q = '("{}") {}'.format(query, query_filter)
        else:
            q = "{}".format(query)
        print("Query (with optional filter): {}".format(q))
        print("Collecting for {}".format(q))
        try:
            rule = gen_rule_payload(q, results_per_call=params["RESULTS_PER_CALL"])
            tweets = collect_results(rule,
                                     max_results=params["MAX_RESULTS"],
                                     result_stream_args=twitter_args)
            print("number of tweets: {}".format(len(tweets)))
            update_twitter_api_usage(curr_month, len(tweets))
            tweets_list.append(tweets)
        except Exception as e:
            print("Exception occurred while fetching tweets: {}".format(e))
            break
    return tweets_list
def search_lima(search, premium_search_args):
    # rule = gen_rule_payload(search + " point_radius:[-12.089282 -77.020041 10mi]", results_per_call=100)
    # A leading space is needed so the place operator is not glued to the search term.
    rule = gen_rule_payload(search + " place:Peru", results_per_call=100)
    data = collect_results(rule, max_results=100,
                           result_stream_args=premium_search_args)
    return data
def fetch_tweets():
    # rule, premium_search_args and out_file are expected to be defined at module level.
    tweets = collect_results(rule, max_results=10,
                             result_stream_args=premium_search_args)
    with open(out_file, 'w') as out:
        csv = writer(out)
        csv.writerow(("user", "city/state", "country", "text", "created_at"))
        for tweet in tweets:
            place_name = ""
            country_code = ""
            place = tweet.get("place")
            if place is not None:
                place_name = place["full_name"]
                country_code = place.get("country_code", "")
            print(tweet)
            row = (
                tweet.get("screen_name"),
                place_name,
                country_code,
                tweet.all_text,
                tweet.created_at_string
                # tweet.created_at_datetime.strftime("%Y-%m-%d|%H:%M:%S")
            )
            values = [(value.encode('utf-8') if hasattr(value, 'encode') else value)
                      for value in row]
            csv.writerow(values)
def searchtweets_query(file_name='../../Twitter_cred/full_arch_2007-2020.txt'):
    premium_search_args = load_credentials(filename="~/.twitter_keys.yaml",
                                           yaml_key="search_tweets_fullarchive_dev",
                                           env_overwrite=False)
    print(premium_search_args)
    print()
    #query = "(nat OR natte OR water OR wateroverlast OR regen OR storm OR blank OR bui OR overstroming OR hoosbui OR schade OR noodweer OR wolkbreuk OR waterschade) has:geo place_country:NL"
    query = "(wateroverlast OR overstroming OR waterschade) has:geo place_country:NL"
    do_query = False
    if do_query:
        from_date = "2007-01-01"
        to_date = "2020-01-01"
        rule = gen_rule_payload(query,
                                results_per_call=500,
                                from_date=from_date,
                                to_date=to_date)
        tweets = collect_results(rule,
                                 max_results=500 * 50,
                                 result_stream_args=premium_search_args)  # change this if you need to
        with open(file_name, 'a') as fp:
            for tweet in tweets:
                fp.write(json.dumps(tweet) + '\n')
    else:
        print("No query was done. To perform a Twitter query, "
              "set do_query to True in Twitter/searchtweets_query.py")
def load():
    config = twitter_conifg()
    base_date = datetime.datetime.today()
    date_list = [base_date - datetime.timedelta(days=x) for x in range(5)]
    date_list.reverse()
    all_tweets = []
    for idx, date in enumerate(date_list):
        if idx != 4:
            final_date = date + datetime.timedelta(days=1)
        search_args = load_credentials(filename="./configs/twitter_api.yaml",
                                       yaml_key="search_tweets_v2",
                                       env_overwrite=False)
        query = gen_request_parameters(config['query'],
                                       results_per_call=100,
                                       place_fields='country',
                                       start_time=date.strftime('%Y-%m-%d'),
                                       end_time=final_date.strftime('%Y-%m-%d'))
        tweets = collect_results(query,
                                 max_tweets=1000,
                                 result_stream_args=search_args)

        def add_date(x):
            x['fecha'] = date.strftime('%Y-%m-%d')
            return x

        tweets = list(map(add_date, tweets))
        all_tweets.append(tweets)
    all_tweets = reduce(lambda x, y: x + y, all_tweets)
    return all_tweets
def counts(queries, nameList):
    # premium_search_args = load_credentials(filename="twitter_keys.yaml", yaml_key="search_tweets_api", env_overwrite=False)
    # queries = ['"$LTC" OR "Litecoin"', '"$ETH" OR "Ethereum"', '"$BTC" OR "Bitcoin"', 'Holochain', '"$NPXS" OR "Pundi X"']
    counts = []
    for query in queries:
        count_rule = gen_rule_payload(query, count_bucket="day")
        temp = collect_results(count_rule, result_stream_args=premium_search_args)
        print(temp)
        print("\n")
        counts.append(temp[1]['count'])
    print('\n', counts)

    # CryptoCompare
    from cryptocompy import price
    avgPrices = []
    toCurr = 'USD'
    yesterday = date.today() - timedelta(1)
    datestr = str(yesterday) + ' 00:00:00'
    for elem in nameList:
        # avgtemp = price.get_day_average_price(elem[0], toCurr)[elem[0]]['USD']
        # avgPrices.append(avgtemp)
        eodtemp = price.get_historical_eod_price(elem[0], toCurr, datestr, try_conversion=True)
        eodtemp = eodtemp[elem[0]][toCurr]
        avgPrices.append(eodtemp)
    plot(counts, avgPrices, nameList)
def count_tweets(query,
                 from_date,
                 to_date,
                 credentials_path,
                 yaml_key,
                 count_bucket="day",
                 results_per_call=500,
                 verbose=False,
                 **kwargs):
    """
    Returns the number of existing Tweets for a given query and time frame.

    Since this function doesn't pull tweets, it is a safe way to check the
    effectiveness of your filters without exhausting the API's capacity.

    Parameters
    ----------
    query : str
        Query passed to the Twitter API to fetch Tweets.
    from_date : str or None
        Date format as specified by `convert_utc_time` for the starting time
        of your search.
    to_date : str or None
        Date format as specified by `convert_utc_time` for the end time of
        your search.
    credentials_path : str
        Path for the yaml file with the Twitter API credentials.
    yaml_key : str
        Key within the yaml file containing the Twitter API credentials to be
        used.
    count_bucket : str or None, default="day"
        If using the counts API endpoint, defines the count bucket in which
        tweets are aggregated.
    results_per_call : int, default=500
        Number of Tweets returned per call.
    verbose : int or bool, default=False
        Controls the verbosity when pulling the tweet count.

    Returns
    -------
    counts : dict
        Number of existing tweets for each bucket.
    """
    logger = logging.getLogger(__name__)
    logger.propagate = verbose
    logger.info('Counting Tweets')
    search_args = load_credentials(credentials_path, yaml_key=yaml_key)
    count_rule = gen_rule_payload(query,
                                  from_date=from_date,
                                  to_date=to_date,
                                  count_bucket=count_bucket,
                                  results_per_call=results_per_call)
    counts = collect_results(count_rule, result_stream_args=search_args)
    return counts
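# A minimal usage sketch for count_tweets. The credentials path, yaml key and
# query below are illustrative assumptions, not values taken from this project.
def example_count_tweets():
    daily_counts = count_tweets(query="(flood OR flooding) lang:en",
                                from_date="2019-01-01",
                                to_date="2019-02-01",
                                credentials_path="~/.twitter_keys.yaml",
                                yaml_key="search_tweets_fullarchive_dev",
                                count_bucket="day")
    # Each bucket is a dict with the time period and the matching tweet count.
    for bucket in daily_counts:
        print(bucket)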
def get_rule_count(self):
    """ Before calling the production API, get a count of the tweets that match the rule. """
    rule_count = gen_rule_payload(self.raw_rule,
                                  from_date=self.from_date,
                                  to_date=self.to_date,
                                  results_per_call=500,
                                  count_bucket='day')
    counts_list = collect_results(rule_count,
                                  max_results=500,
                                  result_stream_args=self.premium_search_args)
    for count in counts_list:
        print(count)
def use_premium(search, filename, from_date, to_date, endpoint='full'):
    ''' Collect historical tweets. '''
    if endpoint == '30day':
        endpoint_key = 'search_premium_30day_api'
        #endpoint_key = 'search_lynxx_30day_api'
    else:
        endpoint_key = 'search_premium_full_api'
        #endpoint_key = 'search_lynxx_full_api'

    try:
        tweet_df = pd.read_csv(filename, dtype=str, encoding='ISO-8859-1')
    except FileNotFoundError:
        tweet_df = pd.DataFrame()

    # Extract the credentials for the endpoint.
    search_stream = load_credentials(filename='./credentials.yaml',
                                     yaml_key=endpoint_key,
                                     env_overwrite=False)

    # Collect tweets while we are permitted.
    # TODO: still don't know how to catch the retry-limit error.
    while to_date > from_date:
        rule = gen_rule_payload(search,
                                from_date=from_date,
                                to_date=to_date,
                                results_per_call=100)
        try:
            tweets = collect_results(rule,
                                     max_results=2000,
                                     result_stream_args=search_stream)
        except Exception:
            break
        for idx, tweet in enumerate(tweets):
            tweet_df = tweet_df.append([json_normalize(tweet)],
                                       ignore_index=True,
                                       sort=False)
            if idx % 1000 == 0:
                print(f'{tweet["created_at"]}: {tweet["text"]}')
        tweet_df.to_csv(filename, index=False)
        tweet_df['created_at'] = pd.to_datetime(tweet_df['created_at'], utc=True)
        mindate = min(tweet_df['created_at']).date() - timedelta(hours=1)
        to_date = mindate.strftime('%Y-%m-%d %H:%M')

    tweet_df['created_at'] = pd.to_datetime(tweet_df['created_at'])
    tweet_df.drop_duplicates(subset=['created_at', 'user.screen_name'],
                             keep='first',
                             inplace=True)
    tweet_df.sort_values(by='created_at', inplace=True)
    tweet_df.to_csv(filename, index=False)
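# Hedged usage sketch for use_premium. The query, output file and date strings
# are illustrative assumptions; a credentials.yaml containing the endpoint keys
# referenced above must already exist.
def example_use_premium():
    use_premium(search="flood lang:en",
                filename="flood_tweets.csv",
                from_date="2019-01-01",
                to_date="2019-02-01",
                endpoint='full')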
def tweetCollector(self, rule):
    try:
        tweets = st.collect_results(rule,
                                    max_results=100,
                                    result_stream_args=self.thirtyAuth)
    except HTTPError as e:
        print(e)
        tweets = []  # return an empty result set instead of raising UnboundLocalError
    self.collectionCounter = self.collectionCounter + 1
    return tweets
def __init__(self, search_query):
    print(self.__class__.__name__)
    self.premium_search_args = searchtweets.load_credentials()
    self.rule = searchtweets.gen_rule_payload(
        search_query.query,
        to_date=(datetime.now() - timedelta(days=7)).strftime('%Y-%m-%d'))
    try:
        self.iter = iter(
            searchtweets.collect_results(
                self.rule, result_stream_args=self.premium_search_args))
    except Exception:
        self.iter = iter([])
def fetch_and_parse_tweets():
    search_args = searchtweets.load_credentials()
    tweets = searchtweets.collect_results({"query": "from:hypertextadrien"},
                                          max_results=100,
                                          result_stream_args=search_args)
    parsed_tweets = [parse_tweet(status.all_text) for status in tweets]
    parsed_tweets = [t for t in parsed_tweets if t is not None]
    print("%s tweets were fetched." % len(parsed_tweets))
    with open(CACHE_TWEETS_FILE_PATH, "wb") as f:
        pickle.dump(parsed_tweets, f)
    return parsed_tweets
def get_emoji_tweets(self, emoji_list):
    emoji_query = ' OR '.join(emoji_list)
    print(emoji_query)
    max_tweets = 100
    rule = searchtweets.gen_rule_payload(
        emoji_query,
        # from_date="2017-01-01",  # UTC 2017-09-01 00:00
        # to_date="2019-02-12",    # UTC 2017-10-30 00:00
        results_per_call=max_tweets)
    print(rule)
    tweets = searchtweets.collect_results(
        rule,
        max_results=500,
        result_stream_args=self.premium_search_args)
    return tweets
def read_tweets(term):
    """
    @return: string output split into messages of at most 2000 characters.
    """
    rule = gen_rule_payload(term, results_per_call=100)  # testing with a sandbox account
    print(rule)
    tweets = collect_results(rule, 100, premium_search_args())
    print(tweets[:10])
    output = '\n\n'.join(
        [f'@{t.screen_name}: {t.all_text}' for t in tweets[:10]])
    output = split_2000(output)
    return output
def get_premium_tweets(candidate, handle, topic):
    rule = gen_rule_payload(topic + " to:" + handle, results_per_call=100)
    tweets = collect_results(rule,
                             max_results=100,
                             result_stream_args=premium_search_args)
    data = TweetHelpers.get_tweet_objects(candidate, topic)
    if os.path.exists(candidate + "/" + topic + "/raw/tweets.pkl"):
        os.remove(candidate + "/" + topic + "/raw/tweets.pkl")
    if data is None:
        data = []
    data += tweets
    # Persist the combined set: previously stored tweets plus the newly collected ones.
    TweetHelpers.pickle_data(candidate + "/" + topic + "/raw/tweets.pkl", data)
def get_premium_all_tweets(self):
    rule_str = "from:" + self.twitter_user.screen_name
    print("get_all_twitter_user_tweets: rule_str: " + rule_str)
    rule = gen_rule_payload(rule_str)
    tweets_parsed = []
    try:
        tweets = collect_results(rule,
                                 max_results=100,
                                 result_stream_args=search_args)
        print("tweets len:" + str(len(tweets)))
        for t in tweets:
            tweets_parsed.append(twitter.Status().NewFromJsonDict(t))
        print("tweets_parsed len:" + str(len(tweets_parsed)))
        self.place.add_tweet_list(tweets_parsed,
                                  self.user,
                                  self.region,
                                  old_user=True)
    except Exception as exc:
        print("In get_all_twitter_user_tweets, problem loading tweets")
        print(exc)
    return tweets_parsed
def premium_download_save_tweets(self, file_name, max_results=100):
    """ Downloads all Tweets since from_date for a query and appends them to a text file. """
    tweets = collect_results(self.rule,
                             max_results=max_results,
                             result_stream_args=self.premium_search_args)
    # Save all tweets into the specified file_name.
    with open(file_name, 'a+') as f:
        for i, tweet in enumerate(tweets):
            if i % 100 == 0:
                print('write tweet %s to %s' % (i, file_name))
            tw = self.get_tweet_attributes(tweet)
            f.write(jsonpickle.encode(tw, unpicklable=False) + '\n')
def collect_and_write_tweets(self,
                             query: str,
                             results_per_call: int = 100,
                             num_tweets: int = 100,
                             from_date: datetime.date = None,
                             to_date: datetime.date = None):
    """
    :param query: search query passed to the Twitter API
    :param results_per_call: number of results requested per API call
    :param num_tweets: maximum number of tweets to collect
    :param from_date: start date of the search window
    :param to_date: end date of the search window
    :return: None; the collected tweets are written to a csv file
    """
    if results_per_call > 100:
        print("Sandbox API limited to 100 results per request, "
              "cannot retrieve {} results".format(results_per_call))
    rule = gen_rule_payload(query,
                            results_per_call=results_per_call,
                            from_date=from_date.isoformat(),
                            to_date=to_date.isoformat())
    tweets = collect_results(rule,
                             max_results=num_tweets,
                             result_stream_args=self.premium_search_args)
    # Cast tweet objects to dict and create a pandas data frame.
    tweets_dict_list = [dict(tweet) for tweet in tweets]
    tweets_df = pd.DataFrame(tweets_dict_list)
    tweets_df.index = tweets_df.id
    try:
        # Write the new data set to a .csv file without duplicates.
        self.tweets_df = pd.concat([self.tweets_df, tweets_df],
                                   axis=0,
                                   join='outer')
        self.tweets_df = self.tweets_df[~self.tweets_df.index.duplicated()]
        self.tweets_df.to_csv("{}_tweets.csv".format(self.topic))
    except Exception:
        # Save a backup of the collected tweets.
        tweets_df.to_csv("{}_{}_{}_backup_tweets.csv".format(
            self.topic,
            datetime.datetime.now().date(),
            datetime.datetime.now().time()))
def collect_tweets(query, from_date, to_date, results_per_call, max_results,
                   premium_search_args):
    # query: rule used to query the Twitter API, e.g. query='bitcoin' to collect tweets related to bitcoin.
    # results_per_call is capped at 100 for a sandbox account; although there is a "next" mechanism for
    # paging through more results, max_results=500 appears to be accepted without any extra work.
    # Date format: 'YYYY-mm-DD HH:MM' strings, converted automatically by convert_utc_time,
    # e.g. '2019-09-09' -> '201909090000'.
    # from_date is inclusive, to_date is non-inclusive. Results arrive newest first, so collection
    # starts near to_date and works backwards towards from_date.
    collect_rule = gen_rule_payload(pt_rule=query,
                                    results_per_call=results_per_call,
                                    from_date=from_date,
                                    to_date=to_date)
    print(collect_rule)
    collected_tweets = collect_results(collect_rule,
                                       max_results=max_results,
                                       result_stream_args=premium_search_args)
    return collected_tweets
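# Hedged usage sketch for collect_tweets. The yaml path, yaml key and query
# below are illustrative assumptions, not values taken from this project.
def example_collect_tweets():
    premium_search_args = load_credentials(filename="~/.twitter_keys.yaml",
                                           yaml_key="search_tweets_api",
                                           env_overwrite=False)
    tweets = collect_tweets(query="bitcoin lang:en",
                            from_date="2019-09-01",
                            to_date="2019-09-09",
                            results_per_call=100,
                            max_results=500,
                            premium_search_args=premium_search_args)
    print("collected {} tweets".format(len(tweets)))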
def auth(dates):
    premium_args = load_credentials(filename="credentials.yaml",
                                    yaml_key='search_tweets_api_dev',
                                    env_overwrite=False)
    # Change the string below to the candidate you're looking for info on. Don't remove
    # the lang:en, otherwise you'll get results in any language.
    queryString = 'Donald Trump lang:en'
    rule = gen_rule_payload(queryString,
                            results_per_call=100,
                            from_date=dates[0],
                            to_date=dates[1])
    print(rule)
    tweets = collect_results(rule,
                             max_results=100,
                             result_stream_args=premium_args)
    for tweet in tweets:
        print(tweet.all_text)
    return tweets, queryString
def fullarchivetweetsearch(event, context):
    data = json.loads(event['body'])
    screen_name = data['screenname']
    hash_tag = data['hashtag']
    from_past_number_of_days = data['numberofDays']

    # Call the method to get the access token.
    access_token = app_only_oauth_access_token(os.environ['CONSUMER_KEY'],
                                               os.environ['CONSUMER_SECRET'])
    from_to_dates = get_tweet_time_window(from_past_number_of_days)

    # Generate the rule criteria to filter the tweets.
    rule = gen_rule_payload("from:" + screen_name + " lang:en " + hash_tag,
                            from_date=str(from_to_dates['from_date']),
                            to_date=str(from_to_dates['to_date']),
                            results_per_call=100)
    print("rule:", rule)
    search_args = {
        "bearer_token": access_token,
        "endpoint": os.environ['FULLARCHIVE_TWEETSEARCH_ENDPOINT']
    }

    # Call the Twitter API.
    tweets_list = collect_results(rule,
                                  max_results=100,
                                  result_stream_args=search_args)
    appended_tweets = []
    # Iterate over the Twitter search response.
    for tweet in tweets_list:
        appended_tweets.append(str(tweet.created_at_datetime) + " " + tweet.text)
    json_response = {
        "Given Hashtag": hash_tag,
        "Given TwitterAccount": screen_name,
        "Tweet count": str(len(tweets_list)),
        "Tweet Text": appended_tweets
    }
    output = {'statusCode': 200, 'body': json.dumps(json_response)}
    return output
def get_tweets(keyword,
               limit='100',
               begin_date=None,
               end_date=None,
               lang='id'):
    # Resolve the date defaults at call time; a datetime.now() default argument
    # would be evaluated only once, when the function is defined.
    if begin_date is None:
        begin_date = datetime.now().strftime('%Y-%m-%d')
    if end_date is None:
        end_date = datetime.now().strftime('%Y-%m-%d')
    query = keyword + ' lang:' + lang
    rule = gen_rule_payload(query,
                            from_date=begin_date,
                            to_date=end_date,
                            results_per_call=500)
    tweets = collect_results(rule,
                             max_results=500,
                             result_stream_args=search_args)
    return [tweet.all_text for tweet in tweets]
def collate_tweets(keywords,
                   max_tweets_per_request=100,
                   locations=states_list):
    tweets_by_location = {}
    for location in locations:
        query = '{} place:"{}"'.format(keywords, '{}, USA'.format(location))
        tweets = collect_results({"query": query},
                                 max_results=max_tweets_per_request,
                                 result_stream_args=search_args)
        tweets_by_location[location] = tweets
    return tweets_by_location
def createTestData(search_string):
    try:
        print('Start Fetching')
        #print(date, nextdate)
        rule = gen_rule_payload(search_string,
                                from_date="2019-05-18",
                                to_date="2019-05-20",
                                results_per_call=500)
        alltweets = collect_results(rule,
                                    max_results=500,
                                    result_stream_args=premium_search_args)
        print("data fetched")
        return alltweets
    except Exception as exc:
        print("error:", exc)
def main(mytimer: func.TimerRequest, fetchedTweetsQue: func.Out[func.QueueMessage]) -> None:
    time = datetime.utcnow().replace(tzinfo=timezone.utc)

    hashtags = get_hashtags()
    credentials = load_twitter_credentials()
    start_time = time - timedelta(minutes=5)
    tweet_fields = ['id', 'text', 'created_at', 'lang']

    for hashtag in hashtags:
        query = hashtag
        logging.info(f'Fetching tweets with query: {query}')
        request_params = gen_request_parameters(
            query,
            start_time=start_time.strftime("%Y-%m-%d %H:%M"),
            tweet_fields=','.join(tweet_fields),
            # since_id=  # TODO: Use the last fetched tweet id in the request
        )
        response = collect_results(request_params,
                                   max_tweets=100,
                                   result_stream_args=credentials)
        if response:
            tweets = response[:-1]
            response_metadata = response[-1]
            # TODO: Store 'newest_id'
            # TODO: Support pagination
            logging.info(f'Unfiltered tweets count: {len(tweets)}')
            messages = []
            for t in filter_tweets(tweets):
                t['hashtag'] = hashtag
                messages.append(dumps(t))
            logging.info(f'Filtered tweets count: {len(messages)}')
            logging.info(messages)
            fetchedTweetsQue.set(messages)

    logging.info('Python timer trigger function ran at %s', time.isoformat())
def read_user_timeline(name='',
                       from_date=pd.to_datetime('2020-1-1'),
                       to_date=pd.to_datetime('2020-9-1'),
                       method='tweepy'):
    if method == 'fullsearch':
        premium_search_args = load_credentials(".twitter_keys.yaml",
                                               account_type="premium",
                                               env_overwrite=False)
        rule = gen_rule_payload(
            "from:" + name,
            from_date=str(from_date.strftime('%Y-%m-%d')),  # UTC 2017-09-01 00:00
            to_date=str(to_date.strftime('%Y-%m-%d')),
            results_per_call=100)
        tweets = collect_results(rule,
                                 max_results=100,
                                 result_stream_args=premium_search_args)  # change this if you need to
    elif method == 'tweepy':
        creds = json.load(open("twitter_credentials.json", "r"))
        api = Twitter(auth=OAuth(creds['ACCESS_TOKEN'], creds['ACCESS_SECRET'],
                                 creds['CONSUMER_KEY'], creds['CONSUMER_SECRET']))
        n = 10
        for i in range(n):
            if i == 0:
                tweets = api.statuses.user_timeline(screen_name="@" + name,
                                                    count=400)
                last_id = tweets[-1]['id']
            else:
                t = api.statuses.user_timeline(screen_name="@" + name,
                                               count=400,
                                               max_id=last_id)
                last_id = t[-1]['id']
                tweets.extend(t)
    with open(name + 'tweets.sav', 'wb') as fp:
        pickle.dump(tweets, fp)
    return tweets
def get_querydata(query, write_file):
    count = 0
    errorCount = 0
    msg = ""
    filename = write_file
    searchquery = query
    '''
    rule = gen_rule_payload(searchquery,
                            from_date="2018-01-01",  # UTC 2018-10-01 00:00
                            to_date="2018-10-31",    # UTC 2018-10-30 00:00
                            results_per_call=30)
    '''
    rule = gen_rule_payload(searchquery, results_per_call=30)
    print(rule)
    tweets = collect_results(rule,
                             max_results=30,
                             result_stream_args=premium_search_args)
    tweets = iter(tweets)
    with open(filename, 'a', newline='') as alltweets:
        writer = csv.writer(alltweets)
        while True:
            try:
                tweet = next(tweets)
                count += 1
                # Use a count-based break during development to avoid Twitter restrictions.
                if count > 30:
                    break
            # TODO: handle other exceptions thrown by the twitterdev API wrapper.
            except StopIteration:
                break
            try:
                print("Writing to CSV tweet number:" + str(count))
                writer.writerow([tweet.created_at_datetime,
                                 tweet.text,
                                 tweet.generator.get("name")])
            except UnicodeEncodeError:
                errorCount += 1
                print("UnicodeEncodeError, errorCount =" + str(errorCount))
    print("completed, errorCount =" + str(errorCount) + " total tweets=" + str(count))
def full_archive(request, which, where, fr, until):
    call = ''
    strn = which
    length = len(strn.split(',')) - 1
    for i, ref in enumerate(strn.split(',')):
        if i == length:
            call += '#' + ref
        else:
            call += '#' + ref + ' OR '
    print(call)
    rule = gen_rule_payload(f"({call}) has:images point_radius:[{where}]",
                            results_per_call=100,
                            from_date=f"{fr}",
                            to_date=f"{until}")
    print(rule)
    tweets = collect_results(rule,
                             max_results=100,
                             result_stream_args=premium_search_full_args)
    for tweet in tweets[0:10]:
        print(tweet.all_text)
    return JsonResponse(tweets, safe=False)
def searchTweetsAndWriteToFile(search_term, file_name, lang):
    if not isinstance(lang, LanguageEnum):
        raise TypeError("lang must be a LanguageEnum instance")
    if not search_term:
        return ""
    premium_search_args = load_credentials("~/.twitter_keys.yaml",
                                           yaml_key="search_tweets_30_day_dev",
                                           env_overwrite=False)
    # Testing with a sandbox account.
    rule = gen_rule_payload(search_term + " lang:" + lang.value,
                            results_per_call=100)
    print(rule)
    tweets = collect_results(rule,
                             max_results=200,
                             result_stream_args=premium_search_args)
    with open(file_name, "w") as fp:
        for tweet in tweets:
            json.dump(tweet, fp)
            fp.write("\n")
def thirty_days(request, which, where):
    call = ''
    strn = which
    length = len(strn.split(',')) - 1
    for i, ref in enumerate(strn.split(',')):
        if i == length:
            call += '#' + ref
        else:
            call += '#' + ref + ' OR '
    print(call)
    rule = gen_rule_payload(f"({call}) has:images point_radius:[{where}]",
                            results_per_call=100)  # testing with a sandbox account
    print(rule)
    tweets = collect_results(rule,
                             max_results=100,
                             result_stream_args=premium_search_args)  # change this if you need to
    for tweet in tweets[0:20]:
        print(tweet.all_text + '-' + tweet['created_at'], end='\n\n')
    return JsonResponse(tweets, safe=False)
def get_tweets(query_string, days_offsets, tweet_fields, max_nb_tweets_per_day,
               total_nb_tweets, search_tweets_args):
    tweets = []
    remaining_number_of_tweets = 0
    # Generate a query and request tweets for each day offset.
    for i, day_offset in enumerate(days_offsets):
        max_tweets = max_nb_tweets_per_day + remaining_number_of_tweets
        if i == len(days_offsets) - 1:
            max_tweets = total_nb_tweets - len(tweets)
        query = get_query(query_string,
                          day_offset,
                          day_offset + 1,
                          tweet_fields,
                          nb_tweets=10)
        # Drop the trailing element, which holds the request metadata rather than a tweet.
        collected_tweets = collect_results(
            query,
            max_tweets=max_tweets,
            result_stream_args=search_tweets_args)[:-1]
        tweets.extend(collected_tweets)
        # Carry any unused budget over to the next day offset.
        remaining_number_of_tweets = max_tweets - len(collected_tweets)
    return tweets