def search(queryString, outputpath, api_key_yaml, startTime="2016-01-01", endTime="2021-03-15", lang="en"):
    search_args = load_credentials(api_key_yaml,
                                   yaml_key="search_tweets_v2",
                                   env_overwrite=False)
    print("Query length (should be at most 1024 characters):")
    print(len(queryString + " -is:nullcast -is:retweet -is:verified -is:quote " + "lang:" + lang))
    # ,created_at,geo,in_reply_to_user_id,lang,author_id,conversation_id,public_metrics,entities,context_annotations
    query = gen_request_parameters(
        query=queryString.strip() + " -is:nullcast -is:retweet -is:verified -is:quote " + "lang:" + lang,
        media_fields="media_key,type",
        user_fields="id,description,location,name,entities,url,username,public_metrics,verified,withheld,protected",
        tweet_fields="id,text,created_at,geo,in_reply_to_user_id,lang,author_id,conversation_id,public_metrics,entities,context_annotations,attachments",
        start_time=startTime,
        end_time=endTime,
        stringify=False,
        expansions="author_id,attachments.media_keys",
        results_per_call=500)
    rs = ResultStream(request_parameters=query,
                      max_tweets=sys.maxsize,
                      max_requests=sys.maxsize,
                      **search_args)
    i = 0
    with open(outputpath, 'w') as outputcsv:
        writer = csv.writer(outputcsv)
        writer.writerow(headers)
        for tweet in rs.stream():
            # print(tweet)
            if "id" in tweet:
                writer.writerow(createRow(headers, tweet))
            if "users" in tweet:
                print("parsing users")
                dump_users_info(tweet, outputpath.replace(".csv", str(i) + "-users.csv"))
                i += 1
def arquive_search(self, query, start, end, dev_env, max_size=2500, max_call=100):
    self.settings['search_tweets_api']['endpoint'] = \
        f"https://api.twitter.com/1.1/tweets/search/fullarchive/{dev_env}.json"
    # persist the updated endpoint before loading credentials from the file
    with open('archive_keys.yaml', 'w') as config_file:
        yaml.dump(self.settings, config_file, default_flow_style=False)
    credentials = load_credentials("archive_keys.yaml",
                                   yaml_key="search_tweets_api",
                                   env_overwrite=False)
    q_rule = gen_rule_payload(query,
                              results_per_call=max_call,
                              from_date=start,
                              to_date=end)
    rs = ResultStream(rule_payload=q_rule, max_results=max_size, **credentials)
    with open('tweet_data_archive.csv', 'a', encoding='utf-8') as file:
        n = 0
        for tweet in rs.stream():
            n += 1
            if n % (max_size / 10) == 0:
                print('{0}: {1}'.format(str(n), tweet['created_at']))
            json.dump(tweet, file)
            file.write('\n')
def collect_tweets_in_files():
    """
    Use a ResultStream to collect tweets.
    We can configure the number of pages/tweets we want to obtain.
    """
    if not check_files():  # the output file should not already exist
        max_results = 10000
        max_pages = 300
        max_tweets = 15000
        rs = ResultStream(request_parameters=query,
                          max_results=max_results,
                          max_pages=max_pages,
                          **credentials)
        # Set how many tweets we want to collect
        rs.max_tweets = max_tweets
        tweets_2 = list(rs.stream())
        dataframe = pandas.DataFrame(tweets_2)
        dataframe.to_csv(saving_path)
    else:
        print(FileExistsError,
              'File already exists! Please check if you really want to overwrite the file.')
def main():
    args_dict = vars(parse_cmd_args().parse_args())
    if args_dict.get("debug") is True:
        logger.setLevel(logging.DEBUG)
        logger.debug("command line args dict:")
        logger.debug(json.dumps(args_dict, indent=4))

    if args_dict.get("config_filename") is not None:
        configfile_dict = read_config(args_dict["config_filename"])
    else:
        configfile_dict = {}

    extra_headers_str = args_dict.get("extra_headers")
    if extra_headers_str is not None:
        args_dict['extra_headers_dict'] = json.loads(extra_headers_str)
        del args_dict['extra_headers']

    logger.debug("config file ({}) arguments sans sensitive args:".format(
        args_dict["config_filename"]))
    logger.debug(json.dumps(_filter_sensitive_args(configfile_dict), indent=4))

    creds_dict = load_credentials(filename=args_dict["credential_file"],
                                  account_type=args_dict["account_type"],
                                  yaml_key=args_dict["credential_yaml_key"],
                                  env_overwrite=args_dict["env_overwrite"])

    dict_filter = lambda x: {k: v for k, v in x.items() if v is not None}

    config_dict = merge_dicts(dict_filter(configfile_dict),
                              dict_filter(creds_dict),
                              dict_filter(args_dict))

    logger.debug("combined dict (cli, config, creds) sans password:")
    logger.debug(json.dumps(_filter_sensitive_args(config_dict), indent=4))

    if len(dict_filter(config_dict).keys() & REQUIRED_KEYS) < len(REQUIRED_KEYS):
        print(REQUIRED_KEYS - dict_filter(config_dict).keys())
        logger.error("ERROR: not enough arguments for the program to work")
        sys.exit(1)

    stream_params = gen_params_from_config(config_dict)
    logger.debug(
        "full arguments passed to the ResultStream object sans password")
    logger.debug(json.dumps(_filter_sensitive_args(stream_params), indent=4))

    rs = ResultStream(tweetify=False, **stream_params)
    logger.debug(str(rs))

    if config_dict.get("filename_prefix") is not None:
        stream = write_result_stream(
            rs,
            filename_prefix=config_dict.get("filename_prefix"),
            results_per_file=config_dict.get("results_per_file"))
    else:
        stream = rs.stream()

    for tweet in stream:
        if config_dict["print_stream"] is True:
            print(json.dumps(tweet))
def gather_data(self, screen_name: str, user_id: int, rt_date: str, file_path: str):
    query_str = create_query_str(screen_name)
    # print(f'reconstructing timeline for @{screen_name}')
    time_range = get_start_and_end_date(rt_date)
    query_obj = create_query_obj(query_str, *time_range)
    rs = ResultStream(
        request_parameters=query_obj,
        # parameter changed from 2 -> 1 to avoid being rate limited within the project timeline
        max_requests=1,
        **self.academic_search_args
    )
    inbound_timeline = []
    replies = []
    retweets = []
    quotes = []
    for tweet in rs.stream():
        if "author_id" not in tweet:
            if "tweets" in tweet:  # tweets are found
                for t in tweet["tweets"]:
                    if int(t["author_id"]) == user_id:
                        if "referenced_tweets" in t:
                            ref_tweets = t["referenced_tweets"]
                            for ref in ref_tweets:
                                ref_type = ref["type"]
                                if ref_type == "replied_to":
                                    replies.append(ref["id"])
                                elif ref_type == "quoted":
                                    quotes.append(ref["id"])
                        else:
                            # normal tweet, which holds no info on the information strength
                            pass
                    else:
                        if "referenced_tweets" not in t:
                            # the only way this situation can occur is when the tweet is retweeted
                            # by the author and someone is replying to that retweet
                            retweets.append(t["author_id"])
                        else:
                            # this indicates a reply with a quote, or a reply to a reply
                            pass
    # print(f"done collecting the retweeted user objects, there are {len(retweets)} in total")
    # print(f"converting the {len(replies)} replied tweet objects to user ids")
    replies = self.gather_users(replies)
    # print(f"done collecting the replies user objects, there are {len(replies)} in total")
    # print(f"converting the {len(quotes)} quoted tweet objects to user ids")
    quotes = self.gather_users(quotes)
    # print(f"done collecting the quotes user objects, there are {len(quotes)} in total")
    # print(f"retweets: {len(retweets)}\treplies: {len(replies)}\tquotes: {len(quotes)}")
    dump_dict = {"replies": replies, "quotes": quotes, "retweets": retweets}
    with open(file_path, "w") as f:
        json.dump(dump_dict, f)
def _download_tweets(trend, enterprise_search_args):
    powertrack_rule = '(has:geo OR has:profile_geo) lang:en -is:retweet %s' % trend
    rule = gen_rule_payload(powertrack_rule, results_per_call=500)
    rs = ResultStream(rule_payload=rule, max_requests=2, **enterprise_search_args)
    for tweet in rs.stream():
        print(tweet)
        _store_tweet(tweet)
def get_tweets(trend, date):
    enddate = date + datetime.timedelta(days=1)
    username = "******"
    password = "******"
    endpoint = "https://gnip-api.twitter.com/search/fullarchive/accounts/greg-students/prod.json"
    bearer_token = ""
    rule = gen_rule_payload(trend + " lang:en",
                            from_date=date.isoformat(),
                            to_date=enddate.isoformat(),
                            results_per_call=500)
    # testing with a sandbox account
    rs = ResultStream(rule_payload=rule,
                      max_results=10000,
                      max_pages=10,
                      username=username,
                      endpoint=endpoint,
                      password=password)
    # tweets = collect_results(rule, result_stream_args=args, max_results=20000)
    return rs
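# A minimal sketch of consuming the ResultStream returned by get_tweets(),
# assuming the masked credentials above are filled in. The trend string and
# date below are illustrative placeholders, not values from the original code.
import datetime

rs = get_tweets("#WorldCup", datetime.date(2018, 7, 15))
tweets = list(rs.stream())
print(len(tweets), "tweets collected for the trend")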
def usersTweetsByIds():
    search_args1 = load_credentials(".twitter_keys.yaml",
                                    yaml_key="search_tweets_v2_id",
                                    env_overwrite=False)
    search_args2 = load_credentials(".twitter_keys.yaml",
                                    yaml_key="search_tweets_v2_user",
                                    env_overwrite=False)
    f = open('C:\\Users\\Josh\\Documents\\GitHub\\search-tweets-python\\enUsers_Tweets.json',
             'r', encoding='utf-8')
    obj = json.load(f)
    for u in obj['includes']:
        idList = u.get('tweetids')
        ids = ''
        idList = list(set(idList))
        if len(idList) == 0:
            u['tweets'] = []
            continue
        if len(idList) > 99:
            ids = ','.join(idList[0:99])
        else:
            ids = ','.join(idList)
        endTweet = 'https://api.twitter.com/2/tweets'
        query = {"ids": ids, "tweet.fields": "author_id,public_metrics,text"}
        rs = ResultStream(request_parameters=query,
                          endpoint=endTweet,
                          bearer_token=bt)
        tweets = []
        result = list(rs.stream())
        for r in result:
            tweets = r.get('data')
        u['tweets'] = tweets
    fo = open('Random_WithTweets.json', 'w', encoding='utf-8')
    json.dump(obj, fo)
def _download_tweets(trend):
    powertrack_rule = '%s (has:geo OR has:profile_geo) lang:en -is:retweet' % trend
    rule = gen_rule_payload(powertrack_rule,
                            results_per_call=500,
                            to_date=None,
                            from_date='201207220000')
    logging.info("PowerTrack rule: %s" % rule)
    rs = ResultStream(rule_payload=rule,
                      max_results=500,
                      max_requests=1,
                      **enterprise_search_args)
    for tweet in rs.stream():
        _push_tweet(tweet, trend)
def get_file(aname, cak, cask, etype, hashtag, keywords,
             fdate='00-00-0000', tdate='00-00-0000',
             ftime='00:00', ttime='00:00'):
    if etype == 'efa':
        # Full archive scraping (refer to limits on README)
        endp = 'https://api.twitter.com/1.1/tweets/search/fullarchive/' + aname + '.json'
    elif etype == 'tdays':
        # 30 days scraping (refer to limits on README)
        endp = 'https://api.twitter.com/1.1/tweets/search/30day/' + aname + '.json'
    else:
        endp = 'ERROR'

    # Creating a yaml credentials file
    config = dict(search_tweets_api=dict(account_type='premium',
                                         endpoint=endp,
                                         consumer_key=cak,
                                         consumer_secret=cask))
    with open('C:\\Users\\Samuktha\\Documents\\USC\\twitter\\proj\\cred.yaml', 'w') as config_file:
        yaml.dump(config, config_file, default_flow_style=False)

    # Loading credentials
    premium_search_args = load_credentials(
        'C:\\Users\\Samuktha\\Documents\\USC\\twitter\\proj\\cred.yaml',
        yaml_key='search_tweets_api',
        env_overwrite=True)
    print(premium_search_args)

    if etype == 'efa':
        rule = gen_rule_payload(
            results_per_call=100,
            from_date=fdate + ' ' + ftime,  # e.g. "2019-07-06 01:00"
            to_date=tdate + ' ' + ttime,    # e.g. "2019-07-06 02:15"
            pt_rule=keywords,
        )
    else:
        rule = gen_rule_payload(results_per_call=100, pt_rule=keywords)

    # Result stream
    rs = ResultStream(rule_payload=rule, max_results=50, **premium_search_args)
    return rs
def main():
    parser = parse_cmd_args()
    args_dict = vars(parse_cmd_args().parse_args())
    if args_dict.get("debug") is True:
        logger.setLevel(logging.DEBUG)
        logger.debug(json.dumps(args_dict, indent=4))

    if args_dict.get("config_filename") is not None:
        configfile_dict = read_config(args_dict["config_filename"])
    else:
        configfile_dict = {}

    creds_dict = load_credentials(filename=args_dict["credential_file"],
                                  account_type=args_dict["account_type"],
                                  yaml_key=args_dict["credential_yaml_key"],
                                  env_overwrite=args_dict["env_overwrite"])

    dict_filter = lambda x: {k: v for k, v in x.items() if v is not None}

    config_dict = merge_dicts(dict_filter(configfile_dict),
                              dict_filter(args_dict),
                              dict_filter(creds_dict))

    logger.debug(json.dumps(config_dict, indent=4))

    if len(dict_filter(config_dict).keys() & REQUIRED_KEYS) < len(REQUIRED_KEYS):
        print(REQUIRED_KEYS - dict_filter(config_dict).keys())
        logger.error("ERROR: not enough arguments for the program to work")
        sys.exit(1)

    stream_params = gen_params_from_config(config_dict)
    logger.debug(json.dumps(config_dict, indent=4))

    rs = ResultStream(tweetify=False, **stream_params)
    logger.debug(str(rs))

    if config_dict.get("filename_prefix") is not None:
        stream = write_result_stream(
            rs,
            filename_prefix=config_dict["filename_prefix"],
            results_per_file=config_dict["results_per_file"])
    else:
        stream = rs.stream()

    for tweet in stream:
        if config_dict["print_stream"] is True:
            print(json.dumps(tweet))
def tw_get_premium_search(self, keyword: str):
    with open(f'datasets/tw_{keyword.lower()}_searches_premium.json', 'w') as f:
        try:
            f.write('{"statuses": [')
            rule = gen_rule_payload(
                pt_rule="near:\"New York, NY\" within:50mi".format(),
                results_per_call=100,
                from_date="2018-07-01",
                to_date="2018-10-01")
            rule = gen_rule_payload(
                pt_rule="place:\"New York, NY\"".format(),
                results_per_call=100,
                from_date=(datetime.date.today() - datetime.timedelta(31)).isoformat(),
                to_date=datetime.date.today().isoformat())
            next_token = None
            while True:
                results = ResultStream(rule_payload=rule, **self.twitter_premium_api)
                results.next_token = next_token
                tweets = []
                try:
                    tweets = list(results.stream())
                except Exception as ex:
                    print(str(ex))
                for tweet in tweets:
                    f.write("%s," % json.dumps(tweet))
                if results.next_token is None:
                    break
                else:
                    next_token = results.next_token
            if next_token is not None:
                # drop the trailing comma before closing the JSON array
                f.seek(f.tell() - 1, os.SEEK_SET)
            f.write("]}")
        except Exception as ex:
            print("Error:\n" + str(ex))
def get_data(search_query, api_key, secret_key, to_date, from_date, filename):
    """
    Get Twitter data through the Twitter API full-archive search sandbox and
    write all tweets to a JSONL file, based on the search term, the geographic
    location of interest, the time period of interest, and personal Twitter
    account information.

    Reference: https://github.com/geduldig/TwitterAPI/tree/master/TwitterAPI
    Reference: https://developer.twitter.com/en/docs/tweets/search/overview
    """
    print_after_x = 1000
    config = dict(
        search_tweets_api=dict(
            account_type='premium',
            endpoint=f"https://api.twitter.com/1.1/tweets/search/{'fullarchive'}/{'mangroveConservation'}.json",
            consumer_key=api_key,
            consumer_secret=secret_key
        )
    )
    with open('twitter_keys.yaml', 'w') as config_file:
        yaml.dump(config, config_file, default_flow_style=False)

    from searchtweets import load_credentials, gen_rule_payload, ResultStream
    premium_search_args = load_credentials("twitter_keys.yaml",
                                           yaml_key="search_tweets_api",
                                           env_overwrite=False)
    rule = gen_rule_payload(search_query,
                            results_per_call=100,
                            from_date=from_date,
                            to_date=to_date)
    temp = ResultStream(rule_payload=rule,
                        max_results=100000,
                        **premium_search_args)
    with open(filename, 'a', encoding='utf-8') as temp_file:
        num = 0
        for tweet in temp.stream():
            num += 1
            if num % print_after_x == 0:
                print('{0}: {1}'.format(str(num), tweet['created_at']))
            json.dump(tweet, temp_file)
            temp_file.write('\n')
    print('done')
def save_old_tweets():
    from searchtweets import load_credentials, gen_rule_payload, ResultStream
    import json

    premium_search_args = load_credentials("twitter_keys_fullarchive.yaml",
                                           yaml_key="search_tweets_api",
                                           env_overwrite=False)
    query = "from:NTOO_Org"
    rule = gen_rule_payload(query, results_per_call=100)
    rs = ResultStream(rule_payload=rule,
                      max_results=1000,
                      **premium_search_args)
    with open('fullTweetsData.json', 'a', encoding='utf-8') as f:
        for tweet in rs.stream():
            json.dump(tweet, f)
            f.write('\n')
def read_stream(apiscope, label):
    API_KEY = api_key
    API_SECRET_KEY = api_secret_key
    DEV_ENVIRONMENT_LABEL = label
    API_SCOPE = apiscope  # 'fullarchive' for full archive, '30day' for the last 31 days

    SEARCH_QUERY = 'delays, @WestMidRailway OR @NetworkRailBHM OR @networkrail'
    RESULTS_PER_CALL = 100  # 100 for sandbox, 500 for paid tiers
    TO_DATE = '2021-01-30'    # format YYYY-MM-DD HH:MM (hour and minutes optional)
    FROM_DATE = '2021-01-01'  # format YYYY-MM-DD HH:MM (hour and minutes optional)
    MAX_RESULTS = 10000  # number of Tweets you want to collect

    # --------------------------- STOP -------------------------------- #
    # Don't edit anything below unless you know what you are doing.
    # --------------------------- STOP -------------------------------- #

    config = dict(search_tweets_api=dict(
        account_type='premium',
        endpoint=f"https://api.twitter.com/1.1/tweets/search/{API_SCOPE}/{DEV_ENVIRONMENT_LABEL}.json",
        consumer_key=API_KEY,
        consumer_secret=API_SECRET_KEY))
    with open('twitter_keys.yaml', 'w') as config_file:
        yaml.dump(config, config_file, default_flow_style=False)

    premium_search_args = load_credentials("twitter_keys.yaml",
                                           yaml_key="search_tweets_api",
                                           env_overwrite=False)
    rule = gen_rule_payload(SEARCH_QUERY,
                            results_per_call=RESULTS_PER_CALL,
                            from_date=FROM_DATE,
                            to_date=TO_DATE)
    rs = ResultStream(rule_payload=rule,
                      max_results=MAX_RESULTS,
                      **premium_search_args)
    return rs
def tweet_search(search_key, search_args):
    """
    Search for tweets containing `search_key` (e.g. "spectrumtv") and create a
    dict mapping the tweet timestamp (dictionary key, in epoch seconds) to the
    tweet author's screen name (tuple element 1), the tweet text (tuple
    element 2), and a placeholder for the sentiment value (tuple element 3).
    """
    print("searching for tweets containing \"{}\"".format(search_key))
    key_rule = gen_rule_payload(search_key, results_per_call=100)
    key_rs = ResultStream(rule_payload=key_rule,
                          max_results=500,
                          max_pages=1,
                          **search_args)
    key_results = list(key_rs.stream())
    key_tweets = {}
    for tweet in key_results:
        key_tweets[tweet.created_at_seconds] = (
            tweet.screen_name,
            tweet.all_text.replace('\n', ' '),
            ' '  # this space is a placeholder for the sentiment value
        )
    print("{} tweets found containing \"{}\"\n".format(len(key_results), search_key))
    return key_tweets
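# A hypothetical call to tweet_search(); the credential file and YAML key are
# placeholders (any premium/enterprise credentials accepted by load_credentials
# would do), and the "spectrumtv" term comes from the docstring above.
from searchtweets import load_credentials

search_args = load_credentials("~/.twitter_keys.yaml",
                               yaml_key="search_tweets_api",
                               env_overwrite=False)
spectrum_tweets = tweet_search("spectrumtv", search_args)
for epoch_seconds, (author, text, _sentiment) in sorted(spectrum_tweets.items()):
    print(epoch_seconds, author, text)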
def get_twitter_results(news_id, query, from_date, premium_search_args, filename,
                        to_date="202005260000"):
    query1 = "url:" + query + " lang:en"
    rule = gen_rule_payload(query1,
                            from_date=from_date,
                            to_date=to_date,
                            results_per_call=100)
    rs = ResultStream(rule_payload=rule, max_results=100, **premium_search_args)
    last_date = 0
    with open(filename, 'a', encoding='utf-8') as f:
        n = 0
        for tweet in rs.stream():
            news_tweet_json = {
                "news_id": news_id,
                "query": query,
                "tweet": tweet
            }
            n += 1
            if n % 10 == 0:
                print('{0}: {1}'.format(str(n), tweet['created_at']))
            json.dump(news_tweet_json, f)
            f.write('\n')
            last_date = datetime.strptime(tweet['created_at'],
                                          "%a %b %d %H:%M:%S +%f %Y").date()
    print(rs, type(last_date), last_date)
    print('done')
    return last_date
def getRecentTweets():
    endRecent = 'https://api.twitter.com/2/tweets/search/recent'
    search_args_rec = load_credentials(".twitter_keys.yaml",
                                       yaml_key="search_tweets_v2_recent",
                                       env_overwrite=False)
    query = {
        "max_results": 100,
        "tweet.fields": "public_metrics,author_id,lang",
        "query": "happy -RT OR upset -RT OR lol -RT OR ugh -RT OR dog -RT OR cat -RT OR food -RT OR sucks -RT",
        "expansions": "author_id",
        "user.fields": "public_metrics"
    }
    rs = ResultStream(
        request_parameters=query,
        endpoint=endRecent,
        bearer_token=bt,
        max_tweets=100,
        max_requests=1,
    )
    result = list(rs.stream())
    obj = {}
    obj['data'] = []
    obj['includes'] = []
    for r in result:
        obj['data'] = obj['data'] + r.get('data')
        obj['includes'] = obj['includes'] + r.get('includes').get('users')
    out = open('testJson.json', 'w')
    json.dump(obj, out)
def pull_data_for_handle(self, handle, date, days_before,
                         results_per_call=100, max_results=2500):
    # check that the handle can be found
    user_id = self.get_handle_id(handle)
    if user_id == 0:
        return 0
    from_date = self.subtract_from_datestring(date, days_before)
    rule = self.make_rule(handle, date, from_date, results_per_call)
    rs = ResultStream(rule_payload=rule,
                      max_results=max_results,
                      **self.endpoint_args)
    results_list = list(rs.stream())
    # results_list = temp_dict[list(temp_dict.keys())[0]]
    print('Found', len(results_list), 'tweets for', handle)
    if len(results_list) == max_results:
        print('Max results limit hit (' + str(max_results) + '). Consider changing the parameter')
    return self.strip_maxresults_from_query(rule), results_list
def main():
    args_dict = vars(parse_cmd_args().parse_args())
    if args_dict.get("debug") is True:
        logger.setLevel(logging.DEBUG)
        logger.debug("command line args dict:")
        logger.debug(json.dumps(args_dict, indent=4))

    if args_dict.get("config_filename") is not None:
        configfile_dict = read_config(args_dict["config_filename"])
    else:
        configfile_dict = {}

    extra_headers_str = args_dict.get("extra_headers")
    if extra_headers_str is not None:
        args_dict['extra_headers_dict'] = json.loads(extra_headers_str)
        del args_dict['extra_headers']

    logger.debug("config file ({}) arguments sans sensitive args:".format(
        args_dict["config_filename"]))
    logger.debug(json.dumps(_filter_sensitive_args(configfile_dict), indent=4))

    creds_dict = load_credentials(filename=args_dict["credential_file"],
                                  yaml_key=args_dict["credential_yaml_key"],
                                  env_overwrite=args_dict["env_overwrite"])

    dict_filter = lambda x: {k: v for k, v in x.items() if v is not None}

    config_dict = merge_dicts(dict_filter(configfile_dict),
                              dict_filter(creds_dict),
                              dict_filter(args_dict))

    logger.debug("combined dict (cli, config, creds):")
    logger.debug(json.dumps(_filter_sensitive_args(config_dict), indent=4))

    if len(dict_filter(config_dict).keys() & REQUIRED_KEYS) < len(REQUIRED_KEYS):
        print(REQUIRED_KEYS - dict_filter(config_dict).keys())
        logger.error("ERROR: not enough arguments for the script to work")
        sys.exit(1)

    stream_params = gen_params_from_config(config_dict)
    logger.debug(
        "full arguments passed to the ResultStream object sans credentials")
    logger.debug(json.dumps(_filter_sensitive_args(stream_params), indent=4))

    while True:
        start = time.time()

        rs = ResultStream(tweetify=False, **stream_params)
        logger.debug(str(rs))

        if config_dict.get("filename_prefix") is not None:
            stream = write_result_stream(
                rs,
                filename_prefix=config_dict.get("filename_prefix"),
                results_per_file=config_dict.get("results_per_file"))
        else:
            stream = rs.stream()

        first_tweet = True
        tweets_num = 0

        # Iterate through the Tweet array and handle output.
        for tweet in stream:
            tweets_num = tweets_num + 1
            # Get the Tweet ID from the first (newest) Tweet.
            if first_tweet:
                newest_id = tweet['id']
                first_tweet = False
            if config_dict["print_stream"] is True:
                print(json.dumps(tweet))

        print(f"{tweets_num} new Tweets. Newest_id: {newest_id}")

        # This polling script switches to since_id requests and removes the
        # start_time parameter if it was used for backfill.
        # Prepare the next query by setting the since_id request parameter.
        request_json = json.loads(stream_params['request_parameters'])
        if 'start_time' in request_json.keys():
            del request_json['start_time']
        request_json.update(since_id=newest_id)
        stream_params['request_parameters'] = json.dumps(request_json)

        duration = time.time() - start
        sleep_interval = (float(config_dict["interval"]) * 60) - duration
        if sleep_interval < 0:
            sleep_interval = (float(config_dict["interval"]) * 60)
        time.sleep(sleep_interval)
def extract_tweets():
    today = date.today()
    d1 = today.strftime("%d-%m-%Y")

    with open('config.json', 'r') as f:
        keys = json.load(f)

    config = dict(
        search_tweets_api=dict(
            account_type='premium',
            endpoint='https://api.twitter.com/1.1/tweets/search/30day/development1.json',
            consumer_key=keys['consumer_key'],
            consumer_secret=keys['consumer_secret'])
    )
    with open('twitter_keys_fullhistory.yaml', 'w') as config_file:
        yaml.dump(config, config_file, default_flow_style=False)

    premium_search_args = load_credentials("twitter_keys_fullhistory.yaml",
                                           yaml_key="search_tweets_api",
                                           env_overwrite=False)

    SEARCH_QUERY = 'to:Lloydsbank'
    RESULTS_PER_CALL = 100
    FROM_DATE = "2020-06-01"
    TO_DATE = "2020-06-10"
    MAX_RESULTS = 100000
    FILENAME = 'twitter_input_data_{}_{}.jsonl'.format(FROM_DATE, TO_DATE)  # where the Tweets should be saved
    PRINT_AFTER_X = 100

    rule = gen_rule_payload(SEARCH_QUERY,
                            results_per_call=RESULTS_PER_CALL,
                            from_date=FROM_DATE,
                            to_date=TO_DATE)
    rs = ResultStream(rule_payload=rule,
                      max_results=MAX_RESULTS,
                      **premium_search_args)

    with open(FILENAME, 'a', encoding='utf-8') as f:
        n = 0
        for tweet in rs.stream():
            n += 1
            if n % PRINT_AFTER_X == 0:
                print('{0}: {1}'.format(str(n), tweet['created_at']))
            json.dump(tweet, f)
            f.write('\n')

    new_tweets = []
    dates_created = []
    location = []
    user = []
    with open(FILENAME, 'rb') as f:
        for item in json_lines.reader(f):
            try:
                new_tweets.append(item['extended_tweet']['full_text'])
            except KeyError:
                new_tweets.append(item['text'])
            dates_created.append(item['created_at'])
            location.append(item['user']['location'])
            user.append(item['user']['id'])

    dataframe = pd.DataFrame(list(zip(user, location, dates_created, new_tweets)),
                             columns=['User', 'Location', 'date_created', 'text'])
    print(dataframe.head())
    dataframe.to_csv("tweets.csv", sep=",")
consumer_secret=API_SECRET_KEY))

with open('twitter_keys.yaml', 'w') as config_file:
    yaml.dump(config, config_file, default_flow_style=False)

import json
from searchtweets import load_credentials, gen_rule_payload, ResultStream

premium_search_args = load_credentials("twitter_keys.yaml",
                                       yaml_key="search_tweets_api",
                                       env_overwrite=False)

rule = gen_rule_payload(SEARCH_QUERY,
                        results_per_call=RESULTS_PER_CALL,
                        from_date=FROM_DATE,
                        to_date=TO_DATE)

rs = ResultStream(rule_payload=rule,
                  max_results=MAX_RESULTS,
                  **premium_search_args)

with open(FILENAME, 'a', encoding='utf-8') as f:
    n = 0
    for tweet in rs.stream():
        n += 1
        if n % PRINT_AFTER_X == 0:
            print('{0}: {1}'.format(str(n), tweet['created_at']))
        json.dump(tweet, f)
        f.write('\n')
print('done')
def pull_tweets(query, from_date, to_date, save_path, credentials_path,
                yaml_key, file_name=None, results_per_call=500,
                max_results=3000, verbose=False, **kwargs):
    """
    Pulls data (i.e., tweets and user info) from Twitter using its API. The
    data received from the API is stored in its original form (JSON) without
    performing any type of preprocessing.

    Parameters
    ----------
    query : str
        Query passed to the Twitter API to fetch Tweets.
    from_date : str or None
        Date format as specified by `convert_utc_time` for the starting time
        of your search.
    to_date : str or None
        Date format as specified by `convert_utc_time` for the end time of
        your search.
    save_path : str
        Path where the raw data will be stored.
    credentials_path : str
        Path for the yaml file with the Twitter API credentials.
    yaml_key : str
        Key within the yaml file containing the Twitter API credentials to be
        used.
    file_name : str or None, default=None
        Name of the json file saved containing the data dump. If None, the
        name will be assigned as a function of `query`, `from_date` and
        `to_date`.
    results_per_call : int, default=500
        Number of Tweets returned per call.
    max_results : int, default=3000
        Maximum number of Tweets to be pulled.
    verbose : int or bool, default=False
        Controls the verbosity when pulling data.

    Returns
    -------
    None : NoneType
    """
    logger = logging.getLogger(__name__)
    logger.propagate = verbose
    logger.info('Pulling raw Twitter data')

    search_args = load_credentials(filename=credentials_path, yaml_key=yaml_key)
    rule = gen_rule_payload(query,
                            results_per_call=results_per_call,
                            from_date=from_date,
                            to_date=to_date)
    rs = ResultStream(rule_payload=rule, max_results=max_results, **search_args)

    if file_name is None:
        file_name = f'SAMPLE_DATA_QUERY_{query}_' \
                    + f'FROMDATE_{from_date}_TODATE_{to_date}.json'

    with open(os.path.join(save_path, file_name), 'a', encoding='utf-8') as f:
        for tweet in rs.stream():
            json.dump(tweet, f)
            f.write('\n')

    logger.info('Data successfully saved at '
                + f'"{os.path.join(save_path, file_name)}"')
    return None
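# A hypothetical invocation of pull_tweets(); the query, date range, paths and
# YAML key below are placeholders rather than values from the original project.
pull_tweets(query="mangrove conservation lang:en",
            from_date="2020-01-01",
            to_date="2020-01-31",
            save_path="data/raw",
            credentials_path="~/.twitter_keys.yaml",
            yaml_key="search_tweets_api",
            results_per_call=500,
            max_results=3000,
            verbose=True)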
rule = gen_request_parameters(
    query=config['query'],
    results_per_call=config['results_per_call'],
    start_time=start_ts.isoformat(),
    end_time=end_ts.isoformat(),
    tweet_fields=tweetfields,
    user_fields=userfields,
    media_fields=mediafields,
    place_fields=placefields,
    expansions=expansions,
    stringify=False)

# result stream from the Twitter v2 API
rs = ResultStream(request_parameters=rule,
                  max_results=100000,
                  max_pages=1,
                  max_tweets=config['max_tweets'],
                  **search_creds)

# number of reconnection tries
tries = 10

# while loop to protect against 104 errors
while True:
    tries -= 1
    # attempt retrieving tweets
    try:
        # indicate which day is getting retrieved
        print('[INFO] - Retrieving tweets from ' + str(start_ts))
        # get json response to list
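# The fragment above is truncated mid-try. Below is a standalone sketch (not
# the original script's continuation) of the retry pattern it describes:
# guard rs.stream() against dropped connections and give up after a fixed
# number of attempts. The helper name, retry count and back-off are assumptions.
import time
import requests
from searchtweets import ResultStream


def stream_with_retries(request_parameters, search_creds, max_tries=10, backoff_seconds=30):
    """Collect tweets from a ResultStream, retrying on connection resets."""
    for attempt in range(1, max_tries + 1):
        rs = ResultStream(request_parameters=request_parameters,
                          max_tweets=100,
                          **search_creds)
        try:
            return list(rs.stream())
        except requests.exceptions.ConnectionError as err:
            print(f'[WARNING] - attempt {attempt} failed ({err}), retrying')
            time.sleep(backoff_seconds)
    raise RuntimeError('exhausted retries without a successful response')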
def write_stream(self):
    """
    Write the ResultStream object to disk using the write_ndjson utility.
    """
    stream = ResultStream(**self.premium_search_args,
                          rule_payload=self.rule,
                          max_results=62000)
    columns = []
    for _ in write_ndjson('US_apr02_apr09_some.json', stream.stream()):
        # exhaust the generator
        pass
import pprint
import csv

from searchtweets import load_credentials, gen_rule_payload, ResultStream

premium_search_args = load_credentials(
    filename='D:/Code/python/workspace/LTETwitter/cred.yaml',
    yaml_key='search_tweets_api',
    env_overwrite=False)

rule = gen_rule_payload("broadbalk",
                        from_date="2010-04-01",
                        to_date="2018-02-14",
                        results_per_call=100)

# testing with a sandbox account
rs = ResultStream(rule_payload=rule,
                  max_results=500,
                  max_pages=5,
                  **premium_search_args)

pp = pprint.PrettyPrinter(indent=4)
pp.pprint(rs)

tweets = list(rs.stream())

with open("tweets18-19.csv", "a", newline="", encoding="utf-8") as csvFile:
    writer = csv.writer(csvFile, quoting=csv.QUOTE_MINIMAL)
    for tweet in tweets:
        writer.writerow([
            tweet.created_at_datetime,
            tweet.favorite_count,
            tweet.quote_count,
            tweet.retweet_count,
            tweet.name,
            tweet.follower_count,
            tweet.geo_coordinates,
            tweet.profile_location,
            tweet.bio,
            tweet.user_id,
            tweet.screen_name,
            tweet.hashtags,
            tweet.in_reply_to_screen_name,
            tweet.all_text
        ])
from searchtweets import ResultStream

rs = ResultStream(rule_payload=rule,
                  max_results=1000,
                  **premium_search_args)
print(rs)
from searchtweets import ResultStream, gen_request_parameters, load_credentials

search_args = load_credentials("~/.twitter_keys.yaml",
                               yaml_key="search_tweets_v2",
                               env_overwrite=False)
query = gen_request_parameters("Electric Vehicle", results_per_call=100)
rs = ResultStream(request_parameters=query,
                  max_results=500,
                  max_pages=1,
                  **search_args)
tweets = list(rs.stream())
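# For completeness, a sketch of how the "search_tweets_v2" credentials loaded
# above could be written out programmatically, mirroring the premium config
# dumps in the earlier examples. The endpoint and bearer token values are
# placeholders; adjust them to your own project before running.
import os
import yaml

v2_config = dict(search_tweets_v2=dict(
    endpoint="https://api.twitter.com/2/tweets/search/recent",
    bearer_token="<YOUR_BEARER_TOKEN>"))
with open(os.path.expanduser("~/.twitter_keys.yaml"), "w") as config_file:
    yaml.dump(v2_config, config_file, default_flow_style=False)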