def create_search_payload(self):
    if self.do_sandbox:
        self.rule = gen_rule_payload(self.raw_rule,
                                     results_per_call=100,
                                     from_date=self.from_date,
                                     to_date=self.to_date)
    else:
        self.rule = gen_rule_payload(self.raw_rule,
                                     results_per_call=500,
                                     from_date=self.from_date,
                                     to_date=self.to_date)
def get_file(aname, cak, cask, etype, hashtag, keywords,
             fdate='00-00-0000', tdate='00-00-0000',
             ftime='00:00', ttime='00:00'):
    if etype == 'efa':
        # Full archive scraping (refer to limits on README)
        endp = 'https://api.twitter.com/1.1/tweets/search/fullarchive/' + aname + '.json'
    elif etype == 'tdays':
        # 30 days scraping (refer to limits on README)
        endp = 'https://api.twitter.com/1.1/tweets/search/30day/' + aname + '.json'
    else:
        endp = 'ERROR'

    # Creating a yaml credentials file
    config = dict(search_tweets_api=dict(account_type='premium',
                                         endpoint=endp,
                                         consumer_key=cak,
                                         consumer_secret=cask))
    with open('C:\\Users\\Samuktha\\Documents\\USC\\twitter\\proj\\cred.yaml', 'w') as config_file:
        yaml.dump(config, config_file, default_flow_style=False)

    # Loading credentials
    premium_search_args = load_credentials(
        'C:\\Users\\Samuktha\\Documents\\USC\\twitter\\proj\\cred.yaml',
        yaml_key='search_tweets_api',
        env_overwrite=True)
    print(premium_search_args)

    if etype == 'efa':
        rule = gen_rule_payload(
            results_per_call=100,
            from_date=fdate + ' ' + ftime,  # "2019-07-06 01:00"
            to_date=tdate + ' ' + ttime,    # "2019-07-06 02:15"
            pt_rule=keywords,
        )
    else:
        rule = gen_rule_payload(results_per_call=100, pt_rule=keywords)

    # Result stream
    rs = ResultStream(rule_payload=rule, max_results=50, **premium_search_args)
    return rs
def searchtweets_query(file_name='../../Twitter_cred/full_arch_2007-2020.txt'):
    premium_search_args = load_credentials(filename="~/.twitter_keys.yaml",
                                           yaml_key="search_tweets_fullarchive_dev",
                                           env_overwrite=False)
    print(premium_search_args)
    print()

    # query = "(nat OR natte OR water OR wateroverlast OR regen OR storm OR blank OR bui OR overstroming OR hoosbui OR schade OR noodweer OR wolkbreuk OR waterschade) has:geo place_country:NL"
    query = "(wateroverlast OR overstroming OR waterschade) has:geo place_country:NL"

    do_query = False
    if do_query:
        from_date = "2007-01-01"
        to_date = "2020-01-01"
        rule = gen_rule_payload(query,
                                results_per_call=500,
                                from_date=from_date,
                                to_date=to_date)
        tweets = collect_results(rule,
                                 max_results=500 * 50,  # change this if you need to
                                 result_stream_args=premium_search_args)
        with open(file_name, 'a') as fp:
            for tweet in tweets:
                fp.write(json.dumps(tweet) + '\n')
    else:
        print("No query was performed. To run a Twitter query, "
              "set do_query to True in Twitter/searchtweets_query.py")
def get_tweets(query_set, twitter_args, query_filter=None):
    tweets_list = list()
    params = c.TWITTER_PARAMS
    for query in query_set:
        curr_month = "{}-{}".format(dt.now().year, format_date_str(dt.now().month))
        _, curr_usage = get_twitter_api_usage(curr_month)
        if curr_usage >= 24999:
            print("Twitter API limit is about to be exceeded! Returning now ...\n")
            break
        if query_filter:
            q = '("{}") {}'.format(query, query_filter)
        else:
            q = "{}".format(query)
        print("Query (with filter if provided): {}".format(q))
        print("Collecting for {}".format(q))
        try:
            rule = gen_rule_payload(q, results_per_call=params["RESULTS_PER_CALL"])
            tweets = collect_results(rule,
                                     max_results=params["MAX_RESULTS"],
                                     result_stream_args=twitter_args)
            print("number of tweets: {}".format(len(tweets)))
            update_twitter_api_usage(curr_month, len(tweets))
            tweets_list.append(tweets)
        except Exception as e:
            print("Exception occurred while fetching tweets: {}".format(e))
            break
    return tweets_list
def premium_set_search_params(self, search_query, from_date, to_date,
                              no_retweets=True, results_per_call=500):
    """
    Sets the search query and the rule parameters (time frame, results per
    call) so that only the Tweets of interest are retrieved, saving quota.
    """
    # Set a static language filter for English Tweets
    lang_filter = ' lang:en'
    if no_retweets:
        # Adds an ignore-retweets operator to the (altcoin) query
        rt_filter = ' -is:retweet'
        self.query = search_query + lang_filter + rt_filter
    else:
        # This query includes all Tweets, also retweets
        self.query = search_query + lang_filter

    # Sets the rule for the query to be executed (time frame & # of results)
    self.rule = gen_rule_payload(self.query,
                                 results_per_call=results_per_call,
                                 from_date=from_date,
                                 to_date=to_date)
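# A minimal sketch (not from the original code) of the kind of rule that
# premium_set_search_params composes, built with searchtweets directly; the
# query text and dates are placeholder assumptions used only for illustration.
from searchtweets import gen_rule_payload

example_rule = gen_rule_payload("bitcoin OR $BTC" + " lang:en" + " -is:retweet",
                                results_per_call=500,
                                from_date="2019-01-01",
                                to_date="2019-02-01")
print(example_rule)  # JSON payload with "query", "maxResults", "fromDate", "toDate"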
def counts(queries, nameList):
    # premium_search_args is expected to be loaded at module level, e.g.:
    # premium_search_args = load_credentials(filename="twitter_keys.yaml", yaml_key="search_tweets_api", env_overwrite=False)
    # queries = ['"$LTC" OR "Litecoin"', '"$ETH" OR "Ethereum"', '"$BTC" OR "Bitcoin"', 'Holochain', '"$NPXS" OR "Pundi X"']
    counts = []
    for i in range(0, len(queries)):
        count_rule = gen_rule_payload(queries[i], count_bucket="day")
        temp = collect_results(count_rule, result_stream_args=premium_search_args)
        print(temp)
        print("\n")
        counts.append(temp[1]['count'])
    print('\n', counts)

    # CryptoCompare
    from cryptocompy import price
    avgPrices = []
    toCurr = 'USD'
    yesterday = date.today() - timedelta(1)
    datestr = str(yesterday) + ' 00:00:00'
    for elem in nameList:
        # avgtemp = price.get_day_average_price(elem[0], toCurr)[elem[0]]['USD']
        # avgPrices.append(avgtemp)
        eodtemp = price.get_historical_eod_price(elem[0], toCurr, datestr, try_conversion=True)
        eodtemp = eodtemp[elem[0]][toCurr]
        avgPrices.append(eodtemp)

    plot(counts, avgPrices, nameList)
def search_lima(search, premium_search_args):
    # rule = gen_rule_payload(search + " point_radius:[-12.089282 -77.020041 10mi]", results_per_call=100)
    # Note the leading space: without it the operator would be glued to the search term.
    rule = gen_rule_payload(search + " place:Peru", results_per_call=100)
    data = collect_results(rule,
                           max_results=100,
                           result_stream_args=premium_search_args)
    return data
def arquive_search(self, query, start, end, dev_env, max_size=2500, max_call=100):
    self.settings['search_tweets_api']['endpoint'] = \
        f"https://api.twitter.com/1.1/tweets/search/fullarchive/{dev_env}.json"

    # Write the updated settings before loading them, so the new endpoint is picked up.
    with open('archive_keys.yaml', 'w') as config_file:
        yaml.dump(self.settings, config_file, default_flow_style=False)

    credentials = load_credentials("archive_keys.yaml",
                                   yaml_key="search_tweets_api",
                                   env_overwrite=False)

    q_rule = gen_rule_payload(query,
                              results_per_call=max_call,
                              from_date=start,
                              to_date=end)
    rs = ResultStream(rule_payload=q_rule, max_results=max_size, **credentials)

    with open('tweet_data_archive.csv', 'a', encoding='utf-8') as file:
        n = 0
        for tweet in rs.stream():
            n += 1
            if n % (max_size // 10) == 0:
                print('{0}: {1}'.format(str(n), tweet['created_at']))
            json.dump(tweet, file)
            file.write('\n')
def count_tweets(query, from_date, to_date, credentials_path, yaml_key,
                 count_bucket="day", results_per_call=500, verbose=False,
                 **kwargs):
    """
    Returns the number of existing Tweets for a given query and time frame.

    Since this function doesn't pull tweets, this is a safe option to check
    the effectiveness of your filters without exhausting the API's capacity.

    Parameters
    ----------
    query : str
        Query passed to the Twitter API to fetch Tweets.
    from_date : str or None
        Date format as specified by `convert_utc_time` for the starting time
        of your search.
    to_date : str or None
        Date format as specified by `convert_utc_time` for the end time of
        your search.
    credentials_path : str
        Path for the yaml file with the Twitter API credentials.
    yaml_key : str
        Key within the yaml file containing the Twitter API credentials to be
        used.
    count_bucket : str or None, default="day"
        If using the counts api endpoint, will define the count bucket for
        which tweets are aggregated.
    results_per_call : int, default=500
        Number of Tweets returned per call.
    verbose : int or bool, default=False
        Controls the verbosity when pulling the tweet count.

    Returns
    -------
    counts : list of dict
        Number of existing tweets for each bucket.
    """
    logger = logging.getLogger(__name__)
    logger.propagate = verbose
    logger.info('Counting Tweets')

    search_args = load_credentials(credentials_path, yaml_key=yaml_key)
    count_rule = gen_rule_payload(query,
                                  from_date=from_date,
                                  to_date=to_date,
                                  count_bucket=count_bucket,
                                  results_per_call=results_per_call)
    counts = collect_results(count_rule, result_stream_args=search_args)
    return counts
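# Usage sketch for count_tweets (not from the original code). The credentials
# path, yaml key, query, and dates below are placeholder assumptions.
daily_counts = count_tweets(query="(flood OR storm) lang:en",
                            from_date="2019-01-01",
                            to_date="2019-02-01",
                            credentials_path="~/.twitter_keys.yaml",
                            yaml_key="search_tweets_api",
                            count_bucket="day")
# Each bucket is a dict like {"timePeriod": "201901010000", "count": 42}
total = sum(bucket["count"] for bucket in daily_counts)
print(total)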
def get_rule_count(self):
    """Before calling the production API, get a count of the tweets that match the rule."""
    rule_count = gen_rule_payload(self.raw_rule,
                                  from_date=self.from_date,
                                  to_date=self.to_date,
                                  results_per_call=500,
                                  count_bucket='day')
    counts_list = collect_results(rule_count,
                                  max_results=500,
                                  result_stream_args=self.premium_search_args)
    for count in counts_list:
        print(count)
def use_premium(search, filename, from_date, to_date, endpoint='full'):
    '''
    Collect historical tweets
    '''
    if endpoint == '30day':
        endpoint_key = 'search_premium_30day_api'
        # endpoint_key = 'search_lynxx_30day_api'
    else:
        endpoint_key = 'search_premium_full_api'
        # endpoint_key = 'search_lynxx_full_api'

    try:
        tweet_df = pd.read_csv(filename, dtype=str, encoding='ISO-8859-1')
    except FileNotFoundError:
        tweet_df = pd.DataFrame()

    # Extract the credentials for the endpoint.
    search_stream = load_credentials(filename='./credentials.yaml',
                                     yaml_key=endpoint_key,
                                     env_overwrite=False)

    # Collect tweets while we are permitted.
    # TODO: still don't know how to catch the retry-limit error.
    while to_date > from_date:
        rule = gen_rule_payload(search,
                                from_date=from_date,
                                to_date=to_date,
                                results_per_call=100)
        try:
            tweets = collect_results(rule,
                                     max_results=2000,
                                     result_stream_args=search_stream)
        except Exception:
            break

        for idx, tweet in enumerate(tweets):
            tweet_df = tweet_df.append([json_normalize(tweet)],
                                       ignore_index=True,
                                       sort=False)
            if idx % 1000 == 0:
                print(f'{tweet["created_at"]}: {tweet["text"]}')

        tweet_df.to_csv(filename, index=False)

        # Move the collection window back to just before the oldest tweet seen so far.
        tweet_df['created_at'] = pd.to_datetime(tweet_df['created_at'], utc=True)
        mindate = min(tweet_df['created_at']).date() - timedelta(hours=1)
        to_date = mindate.strftime('%Y-%m-%d %H:%M')

    tweet_df['created_at'] = pd.to_datetime(tweet_df['created_at'])
    tweet_df.drop_duplicates(subset=['created_at', 'user.screen_name'],
                             keep='first', inplace=True)
    tweet_df.sort_values(by='created_at', inplace=True)
    tweet_df.to_csv(filename, index=False)
def tw_get_premium_search(self, keyword: str):
    with open(f'datasets/tw_{keyword.lower()}_searches_premium.json', 'w') as f:
        try:
            f.write('{"statuses": [')
            # Alternative rule (overridden by the assignment below): summer 2018 tweets near New York.
            rule = gen_rule_payload(
                pt_rule="near:\"New York, NY\" within:50mi",
                results_per_call=100,
                from_date="2018-07-01",
                to_date="2018-10-01")
            rule = gen_rule_payload(
                pt_rule="place:\"New York, NY\"",
                results_per_call=100,
                from_date=(datetime.date.today() - datetime.timedelta(31)).isoformat(),
                to_date=datetime.date.today().isoformat())
            next_token = None
            while True:
                results = ResultStream(rule_payload=rule, **self.twitter_premium_api)
                results.next_token = next_token
                tweets = []
                try:
                    tweets = list(results.stream())
                except Exception as ex:
                    print(str(ex))
                for tweet in tweets:
                    f.write("%s," % json.dumps(tweet))
                if results.next_token is None:
                    break
                else:
                    next_token = results.next_token
            if next_token is not None:
                # Drop the trailing comma before closing the JSON array.
                f.seek(f.tell() - 1, os.SEEK_SET)
            f.write("]}")
        except Exception as ex:
            print("Error:\n" + str(ex))
def _download_tweets(trend, enterprise_search_args):
    powertrack_rule = '(has:geo OR has:profile_geo) lang:en -is:retweet %s' % trend
    rule = gen_rule_payload(powertrack_rule, results_per_call=500)
    rs = ResultStream(rule_payload=rule,
                      max_requests=2,
                      **enterprise_search_args)
    for tweet in rs.stream():
        print(tweet)
        _store_tweet(tweet)
def get_tweets(trend, date):
    enddate = date + datetime.timedelta(days=1)
    username = "******"
    password = "******"
    endpoint = "https://gnip-api.twitter.com/search/fullarchive/accounts/greg-students/prod.json"
    bearer_token = ""
    rule = gen_rule_payload(trend + " lang:en",
                            from_date=date.isoformat(),
                            to_date=enddate.isoformat(),
                            results_per_call=500)  # testing with a sandbox account
    rs = ResultStream(rule_payload=rule,
                      max_results=10000,
                      max_pages=10,
                      username=username,
                      endpoint=endpoint,
                      password=password)
    # tweets = collect_results(rule, result_stream_args=args, max_results=20000)
    return rs
def __init__(self, search_query):
    print(self.__class__.__name__)
    self.premium_search_args = searchtweets.load_credentials()
    self.rule = searchtweets.gen_rule_payload(
        search_query.query,
        to_date=(datetime.now() - timedelta(days=7)).strftime('%Y-%m-%d'))
    try:
        self.iter = iter(
            searchtweets.collect_results(
                self.rule, result_stream_args=self.premium_search_args))
    except Exception:
        self.iter = iter([])
def read_tweets(term):
    """
    @return: string output split into 2000-character chunks.
    """
    rule = gen_rule_payload(term, results_per_call=100)  # testing with a sandbox account
    print(rule)
    tweets = collect_results(rule, 100, premium_search_args())
    print(tweets[:10])
    output = '\n\n'.join(
        [f'@{t.screen_name}: {t.all_text}' for t in tweets[:10]])
    output = split_2000(output)
    return output
def get_premium_tweets(candidate, handle, topic):
    rule = gen_rule_payload(topic + " to:" + handle, results_per_call=100)
    tweets = collect_results(rule,
                             max_results=100,
                             result_stream_args=premium_search_args)
    data = TweetHelpers.get_tweet_objects(candidate, topic)
    if os.path.exists(candidate + "/" + topic + "/raw/tweets.pkl"):
        os.remove(candidate + "/" + topic + "/raw/tweets.pkl")
    if data is None:
        data = []
    data += tweets
    # Pickle the combined set: previously stored tweets plus the newly collected ones.
    TweetHelpers.pickle_data(candidate + "/" + topic + "/raw/tweets.pkl", data)
def make_rule(handle, to_date, from_date, results_per_call):
    """
    Inputs:
    - handle (should be changed to id)
    - to_date / from_date: bounds of the search window
    - results_per_call: 100 for sandbox, 500 for premium
    """
    # print('Using', results_per_call, 'results per call. Should be 100 for sandbox, 500 for premium')
    _rule_a = "from:" + handle
    rule = gen_rule_payload(_rule_a,
                            from_date=from_date,
                            to_date=to_date,
                            results_per_call=results_per_call)
    return rule
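# Usage sketch for make_rule (not from the original code); the handle and
# dates are placeholder assumptions.
rule = make_rule("jack",
                 to_date="2020-01-01",
                 from_date="2019-12-01",
                 results_per_call=100)  # sandbox limit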
def _download_tweets(trend):
    powertrack_rule = '%s (has:geo OR has:profile_geo) lang:en -is:retweet' % trend
    rule = gen_rule_payload(powertrack_rule,
                            results_per_call=500,
                            to_date=None,
                            from_date='201207220000')
    logging.info("PowerTrack rule: %s" % rule)
    rs = ResultStream(rule_payload=rule,
                      max_results=500,
                      max_requests=1,
                      **enterprise_search_args)
    for tweet in rs.stream():
        _push_tweet(tweet, trend)
def get_emoji_tweets(self, emoji_list):
    emoji_list = ' OR '.join(emoji_list)
    print(emoji_list)
    max_tweets = 100
    rule = searchtweets.gen_rule_payload(
        emoji_list,
        # from_date="2017-01-01",  # UTC 2017-09-01 00:00
        # to_date="2019-02-12",    # UTC 2017-10-30 00:00
        results_per_call=max_tweets)
    print(rule)
    tweets = searchtweets.collect_results(
        rule,
        max_results=500,
        result_stream_args=self.premium_search_args)
    return tweets
def get_premium_all_tweets(self):
    rule_str = "from:" + self.twitter_user.screen_name
    print("get_all_twitter_user_tweets: rule_str: " + rule_str)
    rule = gen_rule_payload(rule_str)
    tweets_parsed = []
    try:
        tweets = collect_results(rule,
                                 max_results=100,
                                 result_stream_args=search_args)
        print("tweets len:" + str(len(tweets)))
        for t in tweets:
            tweets_parsed.append(twitter.Status().NewFromJsonDict(t))
        print("tweets_parsed len:" + str(len(tweets_parsed)))
        self.place.add_tweet_list(tweets_parsed, self.user, self.region, old_user=True)
    except Exception as exc:
        print("In get_all_twitter_user_tweets, problem loading tweets")
        print(exc)
    return tweets_parsed
def collect_and_write_tweets(self, query: str, results_per_call: int = 100,
                             num_tweets: int = 100,
                             from_date: datetime.date = None,
                             to_date: datetime.date = None):
    """
    :param query: search query passed to the Twitter premium API
    :param results_per_call: tweets returned per request (max 100 on sandbox)
    :param num_tweets: maximum number of tweets to collect in total
    :param from_date: start of the search window
    :param to_date: end of the search window
    :return: None; results are written to the topic's .csv file
    """
    if results_per_call > 100:
        print("Sandbox API limited to 100 results per request, "
              "cannot retrieve {} results".format(results_per_call))
    rule = gen_rule_payload(query,
                            results_per_call=results_per_call,
                            from_date=from_date.isoformat(),
                            to_date=to_date.isoformat())
    tweets = collect_results(rule,
                             max_results=num_tweets,
                             result_stream_args=self.premium_search_args)

    # Cast tweet objects to dict and create a pandas data frame
    tweets_dict_list = [dict(tweet) for tweet in tweets]
    tweets_df = pd.DataFrame(tweets_dict_list)
    tweets_df.index = tweets_df.id
    try:
        # Write the combined data set to a .csv file without duplicates
        self.tweets_df = pd.concat([self.tweets_df, tweets_df], axis=0, join='outer')
        self.tweets_df = self.tweets_df[~self.tweets_df.index.duplicated()]
        self.tweets_df.to_csv("{}_tweets.csv".format(self.topic))
    except Exception:
        # Save a backup of the newly collected tweets
        tweets_df.to_csv("{}_{}_{}_backup_tweets.csv".format(
            self.topic,
            datetime.datetime.now().date(),
            datetime.datetime.now().time()))
def fullarchivetweetsearch(event, context):
    data = json.loads(event['body'])
    screen_name = data['screenname']
    hash_tag = data['hashtag']
    from_past_number_of_days = data['numberofDays']

    # Call the method to get the access token
    access_token = app_only_oauth_access_token(os.environ['CONSUMER_KEY'],
                                               os.environ['CONSUMER_SECRET'])
    from_to_dates = get_tweet_time_window(from_past_number_of_days)

    # Generate the rule criteria to filter the tweets
    rule = gen_rule_payload("from:" + screen_name + " lang:en " + hash_tag,
                            from_date=str(from_to_dates['from_date']),
                            to_date=str(from_to_dates['to_date']),
                            results_per_call=100)
    print("rule:", rule)

    search_args = {
        "bearer_token": access_token,
        "endpoint": os.environ['FULLARCHIVE_TWEETSEARCH_ENDPOINT']
    }

    # Call the Twitter API
    tweets_list = collect_results(rule,
                                  max_results=100,
                                  result_stream_args=search_args)

    # Iterate over the Twitter search response
    appended_tweets = []
    for tweet in tweets_list:
        appended_tweets.append(str(tweet.created_at_datetime) + " " + tweet.text)

    json_response = {
        "Given Hashtag": hash_tag,
        "Given TwitterAccount": screen_name,
        "Tweet count": str(len(tweets_list)),
        "Tweet Text": appended_tweets
    }
    output = {'statusCode': 200, 'body': json.dumps(json_response)}
    return output
def get_tweets(keyword, limit='100',
               begin_date=datetime.now().strftime('%Y-%m-%d'),
               end_date=datetime.now().strftime('%Y-%m-%d'),
               lang='id'):
    # Note: the default begin_date/end_date are evaluated once, when the
    # function is defined, not on each call.
    query = keyword + ' lang:' + lang
    rule = gen_rule_payload(query,
                            from_date=begin_date,
                            to_date=end_date,
                            results_per_call=500)
    tweets = collect_results(rule,
                             max_results=500,
                             result_stream_args=search_args)
    return [tweet.all_text for tweet in tweets]
def auth(dates):
    premium_args = load_credentials(filename="credentials.yaml",
                                    yaml_key='search_tweets_api_dev',
                                    env_overwrite=False)
    # Change the below string to the candidate you're looking for info on.
    # Don't remove the lang:en, otherwise you'll get results in any language.
    queryString = 'Donald Trump lang:en'
    rule = gen_rule_payload(queryString,
                            results_per_call=100,
                            from_date=dates[0],
                            to_date=dates[1])
    print(rule)
    tweets = collect_results(rule,
                             max_results=100,
                             result_stream_args=premium_args)
    for tweet in tweets:
        print(tweet.all_text)
    return tweets, queryString
def collect_tweets(query, from_date, to_date, results_per_call, max_results, premium_search_args):
    # query: rule to query the Twitter API, e.g. query='bitcoin' to collect tweets related to bitcoin.
    # results_per_call is capped at 100 for sandbox accounts; max_results can still be larger,
    #   since collect_results pages through the results.
    # Date format: 'YYYY-mm-DD HH:MM' strings, converted automatically by convert_utc_time,
    #   e.g. '2019-09-09' -> '201909090000'.
    # from_date is inclusive, to_date is non-inclusive. Results come back newest first,
    #   so collection starts near to_date and works backwards towards from_date.
    collect_rule = gen_rule_payload(pt_rule=query,
                                    results_per_call=results_per_call,
                                    from_date=from_date,
                                    to_date=to_date)
    print(collect_rule)
    collected_tweets = collect_results(collect_rule,
                                       max_results=max_results,
                                       result_stream_args=premium_search_args)
    return collected_tweets
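# Usage sketch for collect_tweets (not from the original code). The credentials
# file, yaml key, query, and dates are placeholder assumptions.
from searchtweets import load_credentials

premium_search_args = load_credentials(filename="~/.twitter_keys.yaml",
                                       yaml_key="search_tweets_api",
                                       env_overwrite=False)
tweets = collect_tweets(query="bitcoin lang:en",
                        from_date="2019-09-01",
                        to_date="2019-09-08",
                        results_per_call=100,  # sandbox limit
                        max_results=500,
                        premium_search_args=premium_search_args)
print(len(tweets))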
def get_data(search_query, api_key, secret_key, to_date, from_date, filename):
    """
    Get Twitter data through the Twitter API (full-archive search sandbox) and
    write every tweet to a JSONL file, based on the search term, the geographic
    location of interest, the time period of interest, and personal Twitter
    account credentials.

    Reference: https://github.com/geduldig/TwitterAPI/tree/master/TwitterAPI
    Reference: https://developer.twitter.com/en/docs/tweets/search/overview
    """
    print_after_x = 1000
    config = dict(
        search_tweets_api=dict(
            account_type='premium',
            endpoint=f"https://api.twitter.com/1.1/tweets/search/{'fullarchive'}/{'mangroveConservation'}.json",
            consumer_key=api_key,
            consumer_secret=secret_key
        )
    )
    with open('twitter_keys.yaml', 'w') as config_file:
        yaml.dump(config, config_file, default_flow_style=False)

    from searchtweets import load_credentials, gen_rule_payload, ResultStream
    premium_search_args = load_credentials("twitter_keys.yaml",
                                           yaml_key="search_tweets_api",
                                           env_overwrite=False)
    rule = gen_rule_payload(search_query,
                            results_per_call=100,
                            from_date=from_date,
                            to_date=to_date)
    temp = ResultStream(rule_payload=rule,
                        max_results=100000,
                        **premium_search_args)

    with open(filename, 'a', encoding='utf-8') as temp_file:
        num = 0
        for tweet in temp.stream():
            num += 1
            if num % print_after_x == 0:
                print('{0}: {1}'.format(str(num), tweet['created_at']))
            json.dump(tweet, temp_file)
            temp_file.write('\n')
    print('done')
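# Usage sketch for get_data (not from the original project). The query, key
# values, dates, and output filename are placeholder assumptions.
get_data(search_query="mangrove conservation lang:en",
         api_key="YOUR_CONSUMER_KEY",
         secret_key="YOUR_CONSUMER_SECRET",
         to_date="2019-12-31",
         from_date="2019-01-01",
         filename="mangrove_tweets.jsonl")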
def createTestData(search_string):
    try:
        print('Start Fetching')
        # print(date, nextdate)
        rule = gen_rule_payload(search_string,
                                from_date="2019-05-18",
                                to_date="2019-05-20",
                                results_per_call=500)
        alltweets = collect_results(rule,
                                    max_results=500,
                                    result_stream_args=premium_search_args)
        print("data fetched")
        return alltweets
    except Exception as err:
        print("error:", err)
def save_old_tweets():
    from searchtweets import load_credentials, gen_rule_payload, ResultStream
    import json

    premium_search_args = load_credentials("twitter_keys_fullarchive.yaml",
                                           yaml_key="search_tweets_api",
                                           env_overwrite=False)
    query = "from:NTOO_Org"
    rule = gen_rule_payload(query, results_per_call=100)
    rs = ResultStream(rule_payload=rule,
                      max_results=1000,
                      **premium_search_args)
    with open('fullTweetsData.json', 'a', encoding='utf-8') as f:
        for tweet in rs.stream():
            json.dump(tweet, f)
            f.write('\n')
def read_stream(apiscope, label):
    API_KEY = api_key
    API_SECRET_KEY = api_secret_key
    DEV_ENVIRONMENT_LABEL = label
    API_SCOPE = apiscope  # 'fullarchive' for full archive, '30day' for last 31 days

    SEARCH_QUERY = 'delays, @WestMidRailway OR @NetworkRailBHM OR @networkrail'
    RESULTS_PER_CALL = 100  # 100 for sandbox, 500 for paid tiers
    TO_DATE = '2021-01-30'  # format YYYY-MM-DD HH:MM (hour and minutes optional)
    FROM_DATE = '2021-01-01'  # format YYYY-MM-DD HH:MM (hour and minutes optional)
    MAX_RESULTS = 10000  # Number of Tweets you want to collect

    # --------------------------- STOP -------------------------------#
    # Don't edit anything below, if you don't know what you are doing.
    # --------------------------- STOP -------------------------------#

    config = dict(search_tweets_api=dict(
        account_type='premium',
        endpoint=f"https://api.twitter.com/1.1/tweets/search/{API_SCOPE}/{DEV_ENVIRONMENT_LABEL}.json",
        consumer_key=API_KEY,
        consumer_secret=API_SECRET_KEY))
    with open('twitter_keys.yaml', 'w') as config_file:
        yaml.dump(config, config_file, default_flow_style=False)

    premium_search_args = load_credentials("twitter_keys.yaml",
                                           yaml_key="search_tweets_api",
                                           env_overwrite=False)
    rule = gen_rule_payload(SEARCH_QUERY,
                            results_per_call=RESULTS_PER_CALL,
                            from_date=FROM_DATE,
                            to_date=TO_DATE)
    rs = ResultStream(rule_payload=rule,
                      max_results=MAX_RESULTS,
                      **premium_search_args)
    return rs
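# Usage sketch for read_stream (not from the original code); the scope and dev
# environment label are placeholder assumptions. The returned ResultStream is
# lazy, so requests are only made while iterating rs.stream().
rs = read_stream(apiscope='fullarchive', label='myDevEnvLabel')
for tweet in rs.stream():
    print(tweet['created_at'])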