def activate(args):
    # Interactive tweet collector: choose the 30-day or full-archive premium
    # environment based on how old args['from_date'] is, preview the plan,
    # ask for confirmation on stdin, then collect tweets window-by-window and
    # save the result as a CSV.
    #
    # args: dict with keys 'from_date', 'to_date', 'query', 'frequency',
    # 'max_results', 'results_per_call', 'filename' (presumably produced by an
    # argparse front-end -- TODO confirm against the caller).
    if (datetime.datetime.fromtimestamp(time.time()) - datetime.datetime.strptime(args['from_date'], '%Y-%m-%d')).days < 30:
        print("will use 30-day dev environment")
        premium_search_args = load_credentials(
            "~/.twitter_keys.yaml",
            yaml_key="search_tweets_premium_30day",
            env_overwrite=False)
    else:
        print("will use full-archive dev environment")
        premium_search_args = load_credentials(
            "~/.twitter_keys.yaml",
            yaml_key="search_tweets_premium_fullarchive",
            env_overwrite=False)
    # Echo the collection plan before asking for confirmation.
    print("query: %s" % (args['query']))
    print("start_date: %s end_date: %s" % (args['from_date'], args['to_date']))
    print("frequency: %d max_results: %d" % (args['frequency'], args['max_results']))
    print("file_name from args:", args['filename'])
    test_dates = days_to_collect(args['from_date'], args['to_date'], args['frequency'])
    print("test dates\n", test_dates)
    user_input = input(
        "press enter to proceed and any other button to cancel: ")
    if user_input != '':
        print("aborting")
        exit(0)
    tweets = []
    for i in range(0, len(test_dates[:-1])):
        # test_dates reversed. Eg. 2018-10-31 -> 2018-10-30
        # collect_tweets requires forward collection: collect_tweets(from, to, max_results=100)
        tweets = np.append(
            tweets,
            collect_tweets(args['query'], test_dates[i], test_dates[i + 1],
                           args['results_per_call'], args['max_results'],
                           premium_search_args))
        # Requests are limited to 30 per minute for sandbox, 60 for subscriptions
        # Requests are limited to 10 per second
        num_calls = (i + 1) * args['max_results'] // args['results_per_call']
        # Periodic pause to stay under the rate limit; the %5 / %20 schedule
        # looks empirically tuned -- TODO confirm against the account tier.
        if num_calls % 5 == 0 and num_calls % 20 != 0:
            print("waiting 10 seconds")
            time.sleep(10)
    # flip tweets back so that the rows are in increasing days
    tweets = list(reversed(tweets))
    S2 = to_df(tweets)
    print("collected tweets\n", S2)
    # save file to csv
    S2.to_csv(args['filename'], index=False)
    print('saved file', args['filename'])
def authenticate(self):
    """Load Twitter search credentials from the yaml config file.

    Picks the sandbox key when ``self.do_sandbox`` is truthy, otherwise the
    premium key, and stores the result on ``self.premium_search_args``.
    """
    yaml_key = 'full_tweets_api_sandbox' if self.do_sandbox else 'search_tweets_api'
    self.premium_search_args = load_credentials(filename=self.cred_file,
                                                yaml_key=yaml_key,
                                                env_overwrite=False)
def usersTweetsByIds():
    """Hydrate each user's stored tweet ids via the v2 /2/tweets endpoint.

    Reads enUsers_Tweets.json, attaches a 'tweets' list to every entry in
    obj['includes'], and writes the enriched object to Random_WithTweets.json.

    Fixes vs. the original:
    - files are opened with context managers (the originals were never closed),
    - every streamed page is accumulated (the original overwrote `tweets` per
      page, keeping only the last response),
    - a missing/None 'tweetids' key no longer crashes `list(set(...))`.
    """
    search_args1 = load_credentials(".twitter_keys.yaml",
                                    yaml_key="search_tweets_v2_id",
                                    env_overwrite=False)
    search_args2 = load_credentials(".twitter_keys.yaml",
                                    yaml_key="search_tweets_v2_user",
                                    env_overwrite=False)
    with open('C:\\Users\\Josh\\Documents\\GitHub\\search-tweets-python\\enUsers_Tweets.json',
              'r', encoding='utf-8') as f:
        obj = json.load(f)
    endTweet = 'https://api.twitter.com/2/tweets'
    for u in obj['includes']:
        # Guard against a missing 'tweetids' key; dedupe ids.
        idList = list(set(u.get('tweetids') or []))
        if len(idList) == 0:
            u['tweets'] = []
            continue
        # The /2/tweets endpoint caps ids per request; original kept the
        # first 99 when more were present.
        if len(idList) > 99:
            ids = ','.join(idList[0:99])
        else:
            ids = ','.join(idList)
        query = {"ids": ids, "tweet.fields": "author_id,public_metrics,text"}
        rs = ResultStream(request_parameters=query,
                          endpoint=endTweet,
                          bearer_token=bt)
        tweets = []
        for r in rs.stream():
            # Accumulate all pages instead of keeping only the last one.
            tweets.extend(r.get('data') or [])
        u['tweets'] = tweets
    with open('Random_WithTweets.json', 'w', encoding='utf-8') as fo:
        json.dump(obj, fo)
def main():
    """CLI entry point: merge command-line args, config file and credentials,
    validate that all required keys are present, then stream tweets
    (optionally writing them to prefixed result files)."""
    args_dict = vars(parse_cmd_args().parse_args())
    if args_dict.get("debug") is True:
        logger.setLevel(logging.DEBUG)
    logger.debug("command line args dict:")
    logger.debug(json.dumps(args_dict, indent=4))
    if args_dict.get("config_filename") is not None:
        configfile_dict = read_config(args_dict["config_filename"])
    else:
        configfile_dict = {}
    # extra_headers arrives as a JSON string; decode it into a dict the
    # downstream request machinery expects.
    extra_headers_str = args_dict.get("extra_headers")
    if extra_headers_str is not None:
        args_dict['extra_headers_dict'] = json.loads(extra_headers_str)
        del args_dict['extra_headers']
    logger.debug("config file ({}) arguments sans sensitive args:".format(
        args_dict["config_filename"]))
    logger.debug(json.dumps(_filter_sensitive_args(configfile_dict), indent=4))
    creds_dict = load_credentials(filename=args_dict["credential_file"],
                                  account_type=args_dict["account_type"],
                                  yaml_key=args_dict["credential_yaml_key"],
                                  env_overwrite=args_dict["env_overwrite"])
    dict_filter = lambda x: {k: v for k, v in x.items() if v is not None}
    # Later dicts win: CLI args override credentials, which override config.
    config_dict = merge_dicts(dict_filter(configfile_dict),
                              dict_filter(creds_dict),
                              dict_filter(args_dict))
    # NOTE(review): the following lines were garbled in the source
    # ('…sans password:"******"ERROR…'); reconstructed from the sibling
    # main() which validates REQUIRED_KEYS the same way -- confirm.
    logger.debug("combined dict (cli, config, creds) sans password:")
    logger.debug(json.dumps(_filter_sensitive_args(config_dict), indent=4))
    if len(dict_filter(config_dict).keys() & REQUIRED_KEYS) < len(REQUIRED_KEYS):
        print(REQUIRED_KEYS - dict_filter(config_dict).keys())
        logger.error("ERROR: not enough arguments for the program to work")
        sys.exit(1)
    stream_params = gen_params_from_config(config_dict)
    logger.debug(
        "full arguments passed to the ResultStream object sans password")
    logger.debug(json.dumps(_filter_sensitive_args(stream_params), indent=4))
    rs = ResultStream(tweetify=False, **stream_params)
    logger.debug(str(rs))
    if config_dict.get("filename_prefix") is not None:
        stream = write_result_stream(
            rs,
            filename_prefix=config_dict.get("filename_prefix"),
            results_per_file=config_dict.get("results_per_file"))
    else:
        stream = rs.stream()
    for tweet in stream:
        if config_dict["print_stream"] is True:
            print(json.dumps(tweet))
def load():
    """Collect up to 1000 tweets per day for the last five days for the
    configured query and return them as one flat list, each tweet tagged
    with its collection date under the key 'fecha'."""
    config = twitter_conifg()  # NOTE(review): external helper; name looks like a typo of 'twitter_config' -- confirm
    base_date = datetime.datetime.today()
    # Last five days, oldest first.
    date_list = [base_date - datetime.timedelta(days=x) for x in range(5)]
    date_list.reverse()
    all_tweets = []
    for idx, date in enumerate(date_list):
        if idx != 4:
            final_date = date + datetime.timedelta(days=1)
        # NOTE(review): for idx == 4 (today) final_date keeps the previous
        # iteration's value, so end_time == start_time for the last window --
        # confirm this handling of the current (partial) day is intended.
        search_args = load_credentials(
            filename="./configs/twitter_api.yaml",
            yaml_key="search_tweets_v2",
            env_overwrite=False)
        query = gen_request_parameters(
            config['query'],
            results_per_call=100,
            place_fields='country',
            start_time=date.strftime('%Y-%m-%d'),
            end_time=final_date.strftime('%Y-%m-%d'))
        tweets = collect_results(query,
                                 max_tweets=1000,
                                 result_stream_args=search_args)
        def add_date(x):
            # Tag each tweet dict with the day it was collected for.
            x['fecha'] = date.strftime('%Y-%m-%d')
            return x
        tweets = list(map(add_date, tweets))
        all_tweets.append(tweets)
    # Flatten the per-day lists into a single list.
    all_tweets = reduce(lambda x, y: x + y, all_tweets)
    return all_tweets
def twitter_auth():
    """Load the premium 30-day search credentials stored under
    Credentials/twitter_creds.yaml in the current working directory."""
    creds_path = os.path.join(os.getcwd(), "Credentials", "twitter_creds.yaml")
    return load_credentials(filename=creds_path,
                            yaml_key='search_tweets_30_day_dev',
                            env_overwrite=False)
def searchtweets_query(file_name='../../Twitter_cred/full_arch_2007-2020.txt'):
    """Optionally run a full-archive flood-related Dutch-tweet query and
    append each result as a JSON line to *file_name*.

    The query only runs when the local flag ``do_query`` is set to True;
    otherwise a hint message is printed.

    Fix vs. the original: the output file was re-opened once per tweet
    inside the loop; it is now opened a single time.
    """
    premium_search_args = load_credentials(
        filename="~/.twitter_keys.yaml",
        yaml_key="search_tweets_fullarchive_dev",
        env_overwrite=False)
    print(premium_search_args)
    print()
    #query = "(nat OR natte OR water OR wateroverlast OR regen OR storm OR blank OR bui OR overstroming OR hoosbui OR schade OR noodweer OR wolkbreuk OR waterschade) has:geo place_country:NL"
    query = "(wateroverlast OR overstroming OR waterschade) has:geo place_country:NL"
    do_query = False
    if do_query:
        from_date = "2007-01-01"
        to_date = "2020-01-01"
        rule = gen_rule_payload(query,
                                results_per_call=500,
                                from_date=from_date,
                                to_date=to_date)
        tweets = collect_results(rule,
                                 max_results=500 * 50,
                                 result_stream_args=premium_search_args
                                 )  # change this if you need to
        # Open the output file once instead of per tweet.
        with open(file_name, 'a') as fp:
            for tweet in tweets:
                fp.write(json.dumps(tweet) + '\n')
    else:
        print(
            "No query was done, in order to perform a Twitter query, set do_query to True in Twitter/searchtweets_query.py"
        )
def search(queryString, outputpath, api_key_yaml, startTime="2016-01-01", endTime="2021-03-15", lang="en"):
    """Stream v2 search results for *queryString* (minus retweets, quotes,
    verified accounts and nullcast ads, restricted to *lang*) into a CSV at
    *outputpath*; user expansions are dumped to numbered side files."""
    search_args = load_credentials(api_key_yaml,
                                   yaml_key="search_tweets_v2",
                                   env_overwrite=False)
    filter_suffix = " -is:nullcast -is:retweet -is:verified -is:quote " + "lang:" + lang
    print("Should be 1024, but it:")
    print(len(queryString + filter_suffix))
    #,created_at,geo,in_reply_to_user_id,lang,author_id,conversation_id,public_metrics,entities,context_annotations
    query = gen_request_parameters(
        query=queryString.strip() + filter_suffix,
        media_fields="media_key,type",
        user_fields="id,description,location,name,entities,url,username,public_metrics,verified,withheld,protected",
        tweet_fields="id,text,created_at,geo,in_reply_to_user_id,lang,author_id,conversation_id,public_metrics,entities,context_annotations,attachments",
        start_time=startTime,
        end_time=endTime,
        stringify=False,
        expansions="author_id,attachments.media_keys",
        results_per_call=500)
    rs = ResultStream(request_parameters=query,
                      max_tweets=sys.maxsize,
                      max_requests=sys.maxsize,
                      **search_args)
    users_file_index = 0
    with open(outputpath, 'w') as outputcsv:
        writer = csv.writer(outputcsv)
        writer.writerow(headers)
        for tweet in rs.stream():
            if "id" in tweet:
                writer.writerow(createRow(headers, tweet))
            if "users" in tweet:
                print("parsing users")
                dump_users_info(tweet, outputpath.replace(".csv", str(users_file_index) + "-users.csv"))
                users_file_index += 1
def arquive_search(self, query, start, end, dev_env, max_size=2500, max_call=100):
    """Run a premium full-archive search for *query* between *start* and *end*
    and append each tweet as a JSON line to tweet_data_archive.csv, printing a
    progress line every max_size/10 tweets.

    Fix vs. the original: the updated endpoint (with *dev_env*) is now written
    to archive_keys.yaml BEFORE credentials are loaded; previously the stale
    file was loaded first, so a new dev_env only took effect on the next call.
    """
    self.settings['search_tweets_api']['endpoint'] = \
        f"https://api.twitter.com/1.1/tweets/search/fullarchive/{dev_env}.json"
    with open('archive_keys.yaml', 'w') as config_file:
        yaml.dump(self.settings, config_file, default_flow_style=False)
    credentials = load_credentials("archive_keys.yaml",
                                   yaml_key="search_tweets_api",
                                   env_overwrite=False)
    q_rule = gen_rule_payload(query,
                              results_per_call=max_call,
                              from_date=start,
                              to_date=end)
    rs = ResultStream(rule_payload=q_rule, max_results=max_size, **credentials)
    with open('tweet_data_archive.csv', 'a', encoding='utf-8') as file:
        n = 0
        for tweet in rs.stream():
            n += 1
            if n % (max_size / 10) == 0:
                print('{0}: {1}'.format(str(n), tweet['created_at']))
            json.dump(tweet, file)
            file.write('\n')
def count_tweets(query, from_date, to_date, credentials_path, yaml_key, count_bucket="day", results_per_call=500, verbose=False, **kwargs):
    """
    Return the number of existing Tweets for a given query and time frame.

    Since this function doesn't pull tweets, it is a safe way to check the
    effectiveness of your filters without exhausting the API's capacity.

    Parameters
    ----------
    query : str
        Query passed to the Twitter API to fetch Tweets.
    from_date : str or None
        Date format as specified by `convert_utc_time` for the starting time
        of your search.
    to_date : str or None
        Date format as specified by `convert_utc_time` for the end time of
        your search.
    credentials_path : str
        Path for the yaml file with the Twitter API credentials.
    yaml_key : str
        Key within the yaml file containing the Twitter API credentials to be
        used.
    count_bucket : str or None, default="day"
        If using the counts api endpoint, defines the bucket over which
        tweets are aggregated.
    results_per_call : int, default=500
        Number of Tweets returned per call.
    verbose : int or bool, default=False
        Controls the verbosity when pulling the tweet count.

    Returns
    -------
    counts : dict
        Number of existing tweets for each bucket.
    """
    logger = logging.getLogger(__name__)
    logger.propagate = verbose
    logger.info('Counting Tweets')
    search_args = load_credentials(credentials_path, yaml_key=yaml_key)
    count_rule = gen_rule_payload(query,
                                  from_date=from_date,
                                  to_date=to_date,
                                  count_bucket=count_bucket,
                                  results_per_call=results_per_call)
    return collect_results(count_rule, result_stream_args=search_args)
def __init__(self, cred_file, yaml_key):
    """Load premium search credentials from *cred_file* (under *yaml_key*)
    and keep them on the instance for later searches."""
    creds = load_credentials(cred_file,
                             yaml_key=yaml_key,
                             env_overwrite=False)
    self.premium_search_args = creds
def use_premium(search, filename, from_date, to_date, enpoint='full'):
    '''Collect historical tweets for *search* between *from_date* and *to_date*
    using the premium API, appending results to the CSV at *filename* and
    walking backwards in time until the range is exhausted.

    Fixes vs. the original:
    - the body read an undefined name ``endpoint`` (the parameter is spelled
      ``enpoint``), raising NameError on every call; the public parameter name
      is kept for backward compatibility and aliased below,
    - the bare ``except`` is narrowed to ``except Exception``,
    - a dead ``min(...)`` expression was removed.
    '''
    endpoint = enpoint  # alias the (mis-spelled) public parameter
    if endpoint == '30day':
        endpoint_key = 'search_premium_30day_api'
        #endpoint_key = 'search_lynxx_30day_api'
    else:
        endpoint_key = 'search_premium_full_api'
        #endpoint_key = 'search_lynxx_full_api'
    try:
        tweet_df = pd.read_csv(filename, dtype=str, encoding='ISO-8859-1')
    except FileNotFoundError:
        tweet_df = pd.DataFrame()
    # Extract the credentials for the endpoint.
    search_stream = load_credentials(filename='./credentials.yaml',
                                     yaml_key=endpoint_key,
                                     env_overwrite=False)
    # Collect tweets while we are permitted.
    # Todo: Still dont know how to catch the re-try limit error?
    while to_date > from_date:
        rule = gen_rule_payload(search,
                                from_date=from_date,
                                to_date=to_date,
                                results_per_call=100)
        try:
            tweets = collect_results(rule,
                                     max_results=2000,
                                     result_stream_args=search_stream)
        except Exception:
            # Best-effort stop on API/rate-limit errors (narrowed from bare except).
            break
        for idx, tweet in enumerate(tweets):
            tweet_df = tweet_df.append([json_normalize(tweet)],
                                       ignore_index=True,
                                       sort=False)
            if idx % 1000 == 0:
                print(f'{tweet["created_at"]}: {tweet["text"]}')
        tweet_df.to_csv(filename, index=False)
        tweet_df['created_at'] = pd.to_datetime(tweet_df['created_at'], utc=True)
        # Step the window back past the oldest tweet collected so far.
        mindate = min(tweet_df['created_at']).date() - timedelta(hours=1)
        to_date = mindate.strftime('%Y-%m-%d %H:%M')
        tweet_df['created_at'] = pd.to_datetime(tweet_df['created_at'])
    tweet_df.drop_duplicates(subset=['created_at', 'user.screen_name'],
                             keep='first',
                             inplace=True)
    tweet_df.sort_values(by='created_at', inplace=True)
    tweet_df.to_csv(filename, index=False)
def authenticate(self):
    """Set up both the premium 30-day search credentials and a tweepy client
    authenticated with the instance's API/access tokens."""
    keys_path = "{0}/.twitter_keys.yaml".format(ROOT_DIR)
    self.premium_search_args = searchtweets.load_credentials(
        keys_path,
        yaml_key="search_tweets_30_day",
        env_overwrite=False)
    oauth = tweepy.OAuthHandler(self.twitter_api_key, self.twitter_api_secret)
    oauth.set_access_token(self.twitter_access_token,
                           self.twitter_access_token_secret)
    self.api = tweepy.API(oauth)
def coll_cantera_neg():
    """Search Lima tweets mentioning each Peruvian bank/app and return the
    concatenated results (bcp, bbva, interbank, yape, scotiabank — in that
    order, matching the original concatenation)."""
    creds = load_credentials(filename="./twitter_keys.yaml",
                             yaml_key="search_tweets_30_day_dev",
                             env_overwrite=False)
    queries = [
        '(bcp OR BCPComunica)',
        '(bbva)',
        '(interbank)',
        '(yape)',
        '(scotiabank)',
    ]
    all_search = []
    for q in queries:
        all_search = all_search + search_lima(q, creds)
    return all_search
def fetch_and_parse_tweets():
    """Fetch up to 100 tweets from @hypertextadrien, parse each one, cache the
    parsed (non-None) results to CACHE_TWEETS_FILE_PATH with pickle, and
    return the parsed list.

    Fix vs. the original: ``pickle.dump(parsed_tweets, open(...))`` leaked the
    file handle; the file is now closed via a context manager.
    """
    search_args = searchtweets.load_credentials()
    tweets = searchtweets.collect_results(
        {"query": "from:hypertextadrien"},
        max_results=100,
        result_stream_args=search_args
    )
    parsed_tweets = [parse_tweet(status.all_text) for status in tweets]
    # Drop tweets the parser rejected.
    parsed_tweets = [t for t in parsed_tweets if t is not None]
    print("%s tweets were fetched." % len(parsed_tweets))
    with open(CACHE_TWEETS_FILE_PATH, "wb") as cache_file:
        pickle.dump(parsed_tweets, cache_file)
    return parsed_tweets
def __init__(self, search_query):
    """Build the premium search rule for *search_query* (capped at one week
    ago) and prime ``self.iter`` with the collected results; falls back to an
    empty iterator if collection fails."""
    print(self.__class__.__name__)
    self.premium_search_args = searchtweets.load_credentials()
    week_ago = (datetime.now() - timedelta(days=7)).strftime('%Y-%m-%d')
    self.rule = searchtweets.gen_rule_payload(search_query.query,
                                              to_date=week_ago)
    try:
        results = searchtweets.collect_results(
            self.rule, result_stream_args=self.premium_search_args)
    except Exception:
        results = []
    self.iter = iter(results)
def __init__(self):
    """Wire up the premium search credentials, the Twitter REST client and
    the Yelp API client from the shared credentials store."""
    __dir_path = os.path.dirname(os.path.realpath(__file__))
    credentials = get_credidentials()
    keys_file = "{}/{}".format(__dir_path, "twitter_keys.yaml")
    self.twitter_premium_api = load_credentials(
        filename=keys_file, yaml_key="search_tweets_api_30day")
    twitter_creds = credentials['twitter']
    self.twitter_api = Twitter(auth=OAuth(
        consumer_key=twitter_creds['consumer_key'],
        consumer_secret=twitter_creds['consumer_secret'],
        token=twitter_creds['access_token_key'],
        token_secret=twitter_creds['access_token_secret']))
    self.yelp_api = YelpAPI(credentials['yelp']['api_key'])
    self.__data_path = "../data/raw"
    logger.info("initiation started.")
def get_file(aname, cak, cask, etype, hashtag, keywords, fdate='00-00-0000', tdate='00-00-0000', ftime='00:00', ttime='00:00'):
    """Write a premium-search credentials yaml for environment *aname*, load
    it, and return a ResultStream for *keywords*.

    etype selects the endpoint: 'efa' = full archive (date-bounded rule),
    'tdays' = 30-day search.

    Fix vs. the original: an unknown *etype* silently produced
    ``endp = 'ERROR'`` and wrote broken credentials; it now raises ValueError.
    """
    if etype == 'efa':
        # Full archive scraping (refer to limits on README)
        endp = 'https://api.twitter.com/1.1/tweets/search/fullarchive/' + aname + '.json'
    elif etype == 'tdays':
        # 30 days scraping (refer to limits on README)
        endp = 'https://api.twitter.com/1.1/tweets/search/30day/' + aname + '.json'
    else:
        raise ValueError("etype must be 'efa' or 'tdays', got %r" % (etype,))
    # Creating a yaml credentials file
    config = dict(search_tweets_api=dict(account_type='premium',
                                         endpoint=endp,
                                         consumer_key=cak,
                                         consumer_secret=cask))
    cred_path = 'C:\\Users\\Samuktha\\Documents\\USC\\twitter\\proj\\cred.yaml'
    with open(cred_path, 'w') as config_file:
        yaml.dump(config, config_file, default_flow_style=False)
    # loading credentials
    premium_search_args = load_credentials(cred_path,
                                           yaml_key='search_tweets_api',
                                           env_overwrite=True)
    print(premium_search_args)
    if etype == 'efa':
        rule = gen_rule_payload(
            results_per_call=100,
            from_date=fdate + ' ' + ftime,  #"2019-07-06 01:00",
            to_date=tdate + ' ' + ttime,  #"2019-07-06 02:15",
            pt_rule=keywords,
        )
    else:
        rule = gen_rule_payload(results_per_call=100, pt_rule=keywords)
    # result stream
    rs = ResultStream(rule_payload=rule,
                      max_results=50,
                      **premium_search_args)
    return rs
def twitter_login(self, ACCESS_TOKEN, ACCESS_TOKEN_SECRET, CONSUMER_KEY, CONSUMER_SECRET):
    '''Authenticate the tweepy client (user/word search) and load premium
    credentials for the historical search API.

    :param ACCESS_TOKEN: Twitter access token
    :param ACCESS_TOKEN_SECRET: Twitter access token secret
    :param CONSUMER_KEY: Twitter consumer key
    :param CONSUMER_SECRET: Twitter consumer secret
    :return: None
    '''
    # Login used for search-by-users and search-by-words.
    oauth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
    oauth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
    self._auth = oauth
    # Credentials for the historical (premium) API search.
    self._premium_search_args = load_credentials(
        r"data/login.yaml",
        yaml_key="search_tweets_premium",
        env_overwrite=False)
def main():
    """CLI entry point: merge command-line args, config file and credentials,
    validate that all required keys are present, then stream tweets
    (optionally writing them to prefixed result files).

    Fix vs. the original: ``parse_cmd_args()`` was called twice, leaving the
    first ``parser`` unused; the single parser instance is now reused.
    """
    parser = parse_cmd_args()
    args_dict = vars(parser.parse_args())
    if args_dict.get("debug") is True:
        logger.setLevel(logging.DEBUG)
    logger.debug(json.dumps(args_dict, indent=4))
    if args_dict.get("config_filename") is not None:
        configfile_dict = read_config(args_dict["config_filename"])
    else:
        configfile_dict = {}
    creds_dict = load_credentials(filename=args_dict["credential_file"],
                                  account_type=args_dict["account_type"],
                                  yaml_key=args_dict["credential_yaml_key"],
                                  env_overwrite=args_dict["env_overwrite"])
    dict_filter = lambda x: {k: v for k, v in x.items() if v is not None}
    # Later dicts win: credentials override CLI args here.
    config_dict = merge_dicts(dict_filter(configfile_dict),
                              dict_filter(args_dict),
                              dict_filter(creds_dict))
    logger.debug(json.dumps(config_dict, indent=4))
    if len(dict_filter(config_dict).keys() & REQUIRED_KEYS) < len(REQUIRED_KEYS):
        print(REQUIRED_KEYS - dict_filter(config_dict).keys())
        logger.error("ERROR: not enough arguments for the program to work")
        sys.exit(1)
    stream_params = gen_params_from_config(config_dict)
    logger.debug(json.dumps(config_dict, indent=4))
    rs = ResultStream(tweetify=False, **stream_params)
    logger.debug(str(rs))
    if config_dict.get("filename_prefix") is not None:
        stream = write_result_stream(
            rs,
            filename_prefix=config_dict["filename_prefix"],
            results_per_file=config_dict["results_per_file"])
    else:
        stream = rs.stream()
    for tweet in stream:
        if config_dict["print_stream"] is True:
            print(json.dumps(tweet))
def getPremiumEndpointCreds(endpointType):
    """
    fetches credentials for some premium endpoint using an api key and secret
    which are already in the system's environment variables

    :parameter endpointType which premium endpoint to get the credentials for
        (30 day or full archive)
    :return credentials for some premium endpoint

    Fix vs. the original: the temporary environment variable is now removed
    in a finally block, so it no longer leaks if load_credentials raises.
    """
    os.environ[ENDPOINT_ENV_VAR] = endpointType
    try:
        return st.load_credentials(filename="NoCredsFile.yaml",
                                   account_type="premium",
                                   yaml_key="dummyYamlKey")
    finally:
        # cleaning up this temporary environment variable to avoid causing a side effect
        del os.environ[ENDPOINT_ENV_VAR]
def set_creds():
    """Write a template premium-credentials yaml, load it back, print and
    return the resulting search args.

    Fix vs. the original: the template was written to
    ``credentials/api-credentials.yaml`` but then loaded from
    ``api-credentials.yaml`` in the CWD; the same path is now used for both.
    """
    config = dict(search_tweets_api=dict(
        account_type='premium',
        endpoint=
        'https://api.twitter.com/1.1/tweets/search/fullarchive/<environment-label>.json',
        consumer_key='Add your consumer key',
        consumer_secret='Add your consumer secret'))
    cred_path = 'credentials/api-credentials.yaml'
    with open(cred_path, 'w') as config_file:
        yaml.dump(config, config_file, default_flow_style=False)
    premium_search_args = load_credentials(cred_path,
                                           yaml_key="search_tweets_api",
                                           env_overwrite=False)
    print(premium_search_args)
    return premium_search_args
def auth(dates):
    """Collect up to 100 English tweets about the configured candidate between
    dates[0] and dates[1]; prints each tweet's text and returns
    (tweets, queryString)."""
    premium_args = load_credentials(filename="credentials.yaml",
                                    yaml_key='search_tweets_api_dev',
                                    env_overwrite=False)
    # Change the below string to the candidate you're looking for info on. Don't remove the lang:en otherwise you'll
    # get results in any language
    queryString = 'Donald Trump lang:en'
    rule = gen_rule_payload(queryString,
                            results_per_call=100,
                            from_date=dates[0],
                            to_date=dates[1])
    print(rule)
    tweets = collect_results(rule,
                             max_results=100,
                             result_stream_args=premium_args)
    for tweet in tweets:
        print(tweet.all_text)
    return tweets, queryString
def __init__(self, topic: str, path_to_keys: str = './keys/twitter_keys.yaml'):
    """Set up premium search access for *topic* and load (or create) the
    topic's tweets DataFrame from ``<topic>_tweets.csv``.

    Fix vs. the original: ``self.topic`` was assigned twice; once is enough.
    """
    # set up access to API
    self.premium_search_args = load_credentials(
        path_to_keys, yaml_key="search_tweets_premium", env_overwrite=False)
    self.topic = topic
    # open topic_tweets.csv file that we will be modifying
    try:
        self.tweets_df = pd.read_csv("{}_tweets.csv".format(self.topic),
                                     index_col='id')
    except FileNotFoundError:
        self.tweets_df = pd.DataFrame()
def get_data(search_query, api_key, secret_key, to_date, from_date, filename):
    """Pull full-archive sandbox tweets matching *search_query* between
    *from_date* and *to_date* and append each as a JSON line to *filename*.

    Writes a twitter_keys.yaml credentials file from api_key/secret_key first,
    then streams results, printing a progress line every 1000 tweets.

    Reference: https://github.com/geduldig/TwitterAPI/tree/master/TwitterAPI
    Reference: https://developer.twitter.com/en/docs/tweets/search/overview
    """
    print_after_x = 1000  # progress heartbeat interval
    config = dict(
        search_tweets_api=dict(
            account_type='premium',
            # FIX: the original interpolated string constants through an
            # f-string ({'fullarchive'}/{'mangroveConservation'}); this plain
            # literal is the identical URL.
            endpoint="https://api.twitter.com/1.1/tweets/search/fullarchive/mangroveConservation.json",
            consumer_key=api_key,
            consumer_secret=secret_key
        )
    )
    with open('twitter_keys.yaml', 'w') as config_file:
        yaml.dump(config, config_file, default_flow_style=False)
    from searchtweets import load_credentials, gen_rule_payload, ResultStream
    premium_search_args = load_credentials("twitter_keys.yaml",
                                           yaml_key="search_tweets_api",
                                           env_overwrite=False)
    rule = gen_rule_payload(search_query,
                            results_per_call=100,
                            from_date=from_date,
                            to_date=to_date
                            )
    temp = ResultStream(rule_payload=rule,
                        max_results=100000,
                        **premium_search_args)
    with open(filename, 'a', encoding='utf-8') as temp_file:
        num = 0
        for tweet in temp.stream():
            num += 1
            if num % print_after_x == 0:
                print('{0}: {1}'.format(str(num), tweet['created_at']))
            json.dump(tweet, temp_file)
            temp_file.write('\n')
    print('done')
def save_old_tweets():
    """Append up to 1000 historical tweets from @NTOO_Org (one JSON object per
    line) to fullTweetsData.json."""
    from searchtweets import load_credentials, gen_rule_payload, ResultStream
    import json
    creds = load_credentials("twitter_keys_fullarchive.yaml",
                             yaml_key="search_tweets_api",
                             env_overwrite=False)
    rule = gen_rule_payload("from:NTOO_Org", results_per_call=100)
    stream = ResultStream(rule_payload=rule, max_results=1000, **creds)
    with open('fullTweetsData.json', 'a', encoding='utf-8') as f:
        for tweet in stream.stream():
            f.write(json.dumps(tweet))
            f.write('\n')
def read_stream(apiscope, label):
    """Write a premium-search credentials yaml for the given API scope and
    dev-environment label, then return a ResultStream over the hard-coded
    West Midlands rail-delay query (2021-01-01 .. 2021-01-30)."""
    consumer_key = api_key
    consumer_secret = api_secret_key
    dev_label = label
    scope = apiscope  # 'fullarchive' for full archive, '30day' for last 31 days
    search_query = 'delays, @WestMidRailway OR @NetworkRailBHM OR @networkrail'
    results_per_call = 100  # 100 for sandbox, 500 for paid tiers
    to_date = '2021-01-30'  # format YYYY-MM-DD HH:MM (hour and minutes optional)
    from_date = '2021-01-01'  # format YYYY-MM-DD HH:MM (hour and minutes optional)
    max_results = 10000  # Number of Tweets you want to collect
    endpoint = (f"https://api.twitter.com/1.1/tweets/search/"
                f"{scope}/{dev_label}.json")
    config = dict(search_tweets_api=dict(account_type='premium',
                                         endpoint=endpoint,
                                         consumer_key=consumer_key,
                                         consumer_secret=consumer_secret))
    with open('twitter_keys.yaml', 'w') as config_file:
        yaml.dump(config, config_file, default_flow_style=False)
    creds = load_credentials("twitter_keys.yaml",
                             yaml_key="search_tweets_api",
                             env_overwrite=False)
    rule = gen_rule_payload(search_query,
                            results_per_call=results_per_call,
                            from_date=from_date,
                            to_date=to_date)
    return ResultStream(rule_payload=rule, max_results=max_results, **creds)
def read_user_timeline(name='', from_date=pd.to_datetime('2020-1-1'), to_date=pd.to_datetime('2020-9-1'), method='tweepy'):
    """Fetch tweets for user *name*, either via premium full-archive search
    ('fullsearch') or by paging their timeline ('tweepy'); pickles the result
    to '<name>tweets.sav' and returns the tweet list.

    Raises ValueError for an unknown *method* (the original fell through and
    crashed with NameError at the return). Also closes the credential and
    pickle files via context managers (previously leaked).
    """
    if method == 'fullsearch':
        premium_search_args = load_credentials(".twitter_keys.yaml",
                                               account_type="premium",
                                               env_overwrite=False)
        rule = gen_rule_payload(
            "from:" + name,
            from_date=str(from_date.strftime('%Y-%m-%d')),  #UTC 2017-09-01 00:00
            to_date=str(to_date.strftime('%Y-%m-%d')),
            results_per_call=100)
        tweets = collect_results(rule,
                                 max_results=100,
                                 result_stream_args=premium_search_args
                                 )  # change this if you need to
    elif method == 'tweepy':
        with open("twitter_credentials.json", "r") as cred_file:
            creds = json.load(cred_file)
        api = Twitter(
            auth=OAuth(creds['ACCESS_TOKEN'], creds['ACCESS_SECRET'],
                       creds['CONSUMER_KEY'], creds['CONSUMER_SECRET']))
        n = 10
        for i in range(n):
            if (i == 0):
                tweets = api.statuses.user_timeline(screen_name="@" + name,
                                                    count=400)
                last_id = tweets[-1]['id']
            else:
                t = api.statuses.user_timeline(screen_name="@" + name,
                                               count=400,
                                               max_id=last_id)
                last_id = t[-1]['id']
                tweets.extend(t)
    else:
        raise ValueError("method must be 'fullsearch' or 'tweepy', got %r" % (method,))
    with open(name + 'tweets.sav', 'wb') as save_file:
        pickle.dump(tweets, save_file)
    return tweets
def searchTweetsAndWriteToFile(search_term, file_name, lang):
    """Search the 30-day sandbox for *search_term* in language *lang* (a
    LanguageEnum) and write one JSON tweet per line to *file_name*.

    Returns "" without searching when *search_term* is empty.

    Fixes vs. the original:
    - the emptiness guard was ``len(search_term > 0)``, which raises TypeError
      for any string (str > int comparison); it now tests the string directly,
    - the redundant ``fp.close()`` inside the with-block was removed.
    """
    if (not isinstance(lang, LanguageEnum)):
        raise TypeError("lang must be LanguageEnum instance")
    if not search_term:
        return ""
    premium_search_args = load_credentials("~/.twitter_keys.yaml",
                                           yaml_key="search_tweets_30_day_dev",
                                           env_overwrite=False)
    # testing with a sandbox account
    rule = gen_rule_payload(search_term + " lang:" + lang.value,
                            results_per_call=100)
    print(rule)
    tweets = collect_results(rule,
                             max_results=200,
                             result_stream_args=premium_search_args)
    with open(file_name, "w") as fp:
        for tweet in tweets:
            json.dump(tweet, fp)
            fp.write("\n")
def loadTweets (user):
    """Download up to 100 tweets posted by *user* between 2020-04-18 and
    2020-05-18 and save them as JSON lines to '<user>Twts.jsonl'."""
    print('Loading tweets from ' + user + '...')
    creds = st.load_credentials("twitter_keys.yaml",
                                yaml_key="search_tweets_api",
                                env_overwrite=False)
    rule = st.gen_rule_payload("from:" + user,
                               results_per_call=100,
                               from_date="2020-04-18",
                               to_date="2020-05-18")
    stream = st.ResultStream(rule_payload=rule, max_results=100, **creds)
    results = list(stream.stream())
    with open(user + 'Twts.jsonl', 'w', encoding='utf-8') as f:
        for tweet in results:
            f.write(json.dumps(tweet) + '\n')
    print('done - ' + str(len(results)) + " tweets saved")
def getRecentTweets():
    """Pull one page (max 100) of v2 recent-search tweets for a fixed
    multi-keyword query (retweets excluded) and write the combined data and
    included users to testJson.json.

    Fixes vs. the original:
    - the output file is now closed via a context manager (the handle leaked),
    - missing 'data'/'includes'/'users' keys in a response no longer raise
      TypeError; such pages contribute nothing.
    """
    endRecent = 'https://api.twitter.com/2/tweets/search/recent'
    # NOTE(review): loaded but unused in the original; kept because
    # load_credentials reads the key file as a side effect -- confirm.
    search_args_rec = load_credentials(".twitter_keys.yaml",
                                       yaml_key="search_tweets_v2_recent",
                                       env_overwrite=False)
    query = {
        "max_results": 100,
        "tweet.fields": "public_metrics,author_id,lang",
        "query":
        "happy -RT OR upset -RT OR lol -RT OR ugh -RT OR dog -RT OR cat -RT OR food -RT OR sucks -RT",
        "expansions": "author_id",
        "user.fields": "public_metrics"
    }
    rs = ResultStream(
        request_parameters=query,
        endpoint=endRecent,
        bearer_token=bt,
        max_tweets=100,
        max_requests=1,
    )
    obj = {'data': [], 'includes': []}
    for r in rs.stream():
        obj['data'].extend(r.get('data') or [])
        includes = r.get('includes') or {}
        obj['includes'].extend(includes.get('users') or [])
    with open('testJson.json', 'w') as out:
        json.dump(obj, out)