def get_tweet(item):
    """
    Takes a work item whose url points at a tweet and uses the twarc lib
    to harvest it. Searches for media in the tweet - if it can find any
    it also tries to download that media item.
    """
    item.agent_name = agent_name + "_1_get_tweet"
    if not os.path.exists(item.storage_folder):
        os.makedirs(item.storage_folder)
    my_content_types = []
    # Strip a trailing slash so rsplit() yields the bare tweet id.
    url = item.url
    if url.endswith("/"):
        url = url[:-1]
    __, __id = url.rsplit("/", 1)
    t = Twarc(twitter_consumer_key, twitter_consumer_secret,
              twitter_access_token, twitter_access_token_secret)
    for tweet in t.hydrate([__id]):
        get_assets(tweet, item.storage_folder)
        file_path = os.path.join(
            item.storage_folder,
            "{}_{}.json".format(time.strftime("%d-%m-%Y_%H-%M-%S"),
                                tweet['id']))
        with open(file_path, "w") as outfile:
            json.dump(tweet, outfile)
    item.completed = True
    return item
def dehydrate(self, tweet_ids: List[str]):
    t = Twarc(self.configuration["twitter"]["consumer_key"],
              self.configuration["twitter"]["consumer_secret"],
              self.configuration["twitter"]["access_token"],
              self.configuration["twitter"]["access_token_secret"],
              tweet_mode="extended")
    count: int = 0
    print("Reading tweets from Twitter")
    with tqdm(total=self.configuration["sampling"]["size"],
              unit="tweet") as written_progress_bar:
        with tqdm(total=len(tweet_ids), unit="tweet") as hydrate_progress_bar:
            for tweet in t.hydrate(tweet_ids):
                hydrate_progress_bar.update(1)
                if any(keyword in tweet["full_text"].lower()
                       for keyword in self.configuration["sampling"]["keywords"]):
                    append: bool = True
                    # Optionally restrict the sample to tweets with media.
                    if self.configuration["sampling"].get("only_media"):
                        if not self.contains_media(tweet):
                            append = False
                    # Optionally restrict the sample to whitelisted languages.
                    if len(self.configuration["sampling"]["languages"]) > 0:
                        if tweet["lang"] not in self.configuration["sampling"]["languages"]:
                            append = False
                    if append:
                        written_progress_bar.update(1)
                        count += 1
                        yield tweet
                        if count == self.configuration["sampling"]["size"]:
                            return
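A minimal usage sketch for the generator above. The host class and its constructor are not shown in the source, so `Sampler` here is a hypothetical name; the config keys mirror the ones the method reads:

config = {
    "twitter": {"consumer_key": "...", "consumer_secret": "...",
                "access_token": "...", "access_token_secret": "..."},
    "sampling": {"size": 10, "keywords": ["flu"], "languages": ["en"],
                 "only_media": False},
}
sampler = Sampler(config)  # hypothetical host class for dehydrate()
for tweet in sampler.dehydrate(["501064188211765249"]):
    print(tweet["full_text"])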
def get_training_data(self):
    '''
    :return: combined data (tweet info and TREC-IS data) as a dictionary
             {tweet_id: Tweet}
    '''
    # Load tweets previously retrieved by the TREC tweets downloader.
    # retrieved_tweets, f_name = self.load_Tweets()
    # retrieved_tweets, f_name = self.load_event_tweets()
    with open('data/all_tweets.pkl', 'rb') as file:
        retrieved_tweets = pickle.load(file)

    missed_tweets = []
    training_data = {}  # dict {'tweet id': Tweet}

    # Load TREC data: tweet IDs, tweet priority, tweet categories, indicator terms.
    events = json.load(open(self.trec_path))
    events = pd.DataFrame.from_dict(events['events'], orient='columns')
    for _, event in events.iterrows():
        for trec_tweet in event['tweets']:
            if trec_tweet['postID'] in retrieved_tweets:  # was the full tweet retrieved?
                retriev_tweet = retrieved_tweets[trec_tweet['postID']]
                training_data[trec_tweet['postID']] = Tweet(
                    id=retriev_tweet.id,
                    text=retriev_tweet.text,
                    metadata=retriev_tweet.metadata,
                    priority=trec_tweet['priority'],
                    indicatorTerms=trec_tweet['indicatorTerms'],
                    categories=trec_tweet['categories'],
                    event_type=trec_tweet['event_type'])
            else:  # record the tweet as missed so it can be hydrated below
                training_data[trec_tweet['postID']] = Tweet(
                    id=trec_tweet['postID'],
                    priority=trec_tweet['priority'],
                    indicatorTerms=trec_tweet['indicatorTerms'],
                    categories=trec_tweet['categories'],
                    event_type=trec_tweet['event_type'])
                missed_tweets.append(trec_tweet['postID'])

    # Retrieve the missed tweets with twarc and merge them into the training data.
    t = Twarc(self.consumer_key, self.consumer_secret,
              self.access_token, self.access_token_secret)
    tweets_twarc = t.hydrate(iter(missed_tweets))  # retrieve all tweets by ID
    for twtt in tweets_twarc:
        training_data[str(twtt['id'])].add_tweets_data(
            twtt['full_text'], {'created_at': twtt['created_at']})
    return training_data
def test_hydrate():
    ids = [
        "501064188211765249", "501064196642340864", "501064197632167936",
        "501064196931330049", "501064198005481472", "501064198009655296",
        "501064198059597824", "501064198513000450", "501064180468682752",
        "501064199142117378", "501064171707170816", "501064200186118145",
        "501064200035516416", "501064201041743872", "501064201251880961",
        "501064198973960192", "501064201256071168", "501064202027798529",
        "501064202245521409", "501064201503113216", "501064202363359232",
        "501064202295848960", "501064202380115971", "501064202904403970",
        "501064203135102977", "501064203508412416", "501064203516407810",
        "501064203546148864", "501064203697156096", "501064204191690752",
        "501064204288540672", "501064197396914176", "501064194309906436",
        "501064204989001728", "501064204980592642", "501064204661850113",
        "501064205400039424", "501064205089665024", "501064206666702848",
        "501064207274868736", "501064197686296576", "501064207623000064",
        "501064207824351232", "501064208083980290", "501064208277319680",
        "501064208398573568", "501064202794971136", "501064208789045248",
        "501064209535614976", "501064209551994881", "501064141332029440",
        "501064207387742210", "501064210177331200", "501064210395037696",
        "501064210693230592", "501064210840035329", "501064211855069185",
        "501064192024006657", "501064200316125184", "501064205642903552",
        "501064212547137536", "501064205382848512", "501064213843169280",
        "501064208562135042", "501064214211870720", "501064214467731457",
        "501064215160172545", "501064209648848896", "501064215990648832",
        "501064216241897472", "501064215759568897", "501064211858870273",
        "501064216522932227", "501064216930160640", "501064217667960832",
        "501064211997274114", "501064212303446016", "501064213675012096",
        "501064218343661568", "501064213951823873", "501064219467341824",
        "501064219677044738", "501064210080473088", "501064220415229953",
        "501064220847656960", "501064222340423681", "501064222772445187",
        "501064222923440130", "501064220121632768", "501064222948593664",
        "501064224936714240", "501064225096499201", "501064225142624256",
        "501064225314185216", "501064225926561794", "501064226451259392",
        "501064226816143361", "501064227302674433", "501064227344646144",
        "501064227688558592", "501064228288364546", "501064228627705857",
        "501064229764751360", "501064229915729921", "501064231304065026",
        "501064231366983681", "501064231387947008", "501064231488200704",
        "501064231941570561", "501064232188665856", "501064232449114112",
        "501064232570724352", "501064232700350464", "501064233186893824",
        "501064233438568450", "501064233774510081", "501064235107897344",
        "501064235175399425", "501064235456401410",
    ]
    t = Twarc()
    count = 0
    for tweet in t.hydrate(iter(ids)):
        assert tweet['id_str']
        count += 1
    assert count > 100  # may need to adjust as these might get deleted
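As a point of reference, hydrate() sends ids to the statuses/lookup endpoint in batches of up to 100 per request, so the id list above costs about two API calls. A one-liner for estimating request volume on larger id files:

import math
print(math.ceil(len(ids) / 100), "API requests needed")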
def clean_tweets(self):
    self.save_ids()
    t = Twarc()
    file = open(f'{self.output_path}/{self.hydrate_name}.csv', 'w',
                encoding="ISO-8859-1")
    file.write("i,id,year,month,day,tweet\n")
    i = 1
    for tweet in tqdm(t.hydrate(open(f'{self.output_path}/{self.ids_txt}.txt')),
                      desc="parsing tweet ..."):
        try:
            output = self.filter_tweet(tweet)
        except UnicodeDecodeError:
            # Tweets that cannot be represented in ISO-8859-1 are flagged.
            output = "error,error,error,error,error"
        file.write(f'{i},{output}\n')
        i += 1
    file.close()
    # Remove the intermediate id file once hydration is complete.
    os.remove(f'{self.output_path}/{self.ids_txt}.txt')
def hydrate(files, api_key):
    consumer_key = api_key['consumer_key']
    consumer_secret = api_key['consumer_secret']
    access_token = api_key['access_token']
    access_token_secret = api_key['access_token_secret']
    t = Twarc(consumer_key, consumer_secret, access_token, access_token_secret)
    file1 = open(mainpath + 'data/flutids.txt', 'w')
    keywords = ('flu', 'common flu', 'covid19 flu', 'coronavirus common flu')
    records = []
    for tweetIDs in files:
        for tweet in t.hydrate(open(mainpath + dataPath + "/" + tweetIDs)):
            txt = tweet['full_text']
            # Keep only English, non-retweet tweets that mention a keyword.
            if (tweet["lang"] == "en") and (not tweet['retweeted']
                                            and 'RT @' not in tweet['full_text']):
                if any(keyword in tweet["full_text"].lower()
                       for keyword in keywords):
                    tid = str(tweet['id_str'])
                    file1.write(tid + '\n')
                    screen_name = tweet['user']['screen_name']
                    # Rebuild the hashtag list per tweet; the original
                    # accumulated hashtags across tweets.
                    hashtags = [h["text"]
                                for h in tweet["entities"]["hashtags"]] or [""]
                    # Keep the last expanded url, if any.
                    url = ""
                    for urls in tweet["entities"]["urls"]:
                        url = str(urls["expanded_url"])
                    retweets = str(tweet['retweet_count'])
                    favorites = str(tweet['favorite_count'])
                    records.append([screen_name, txt, hashtags, url,
                                    retweets, favorites])
    df = pd.DataFrame(records, columns=['screen_name', 'tweet', 'hashtag',
                                        'url', '#retweets', '#favorites'])
    df.to_csv(mainpath + 'data/tweets.csv')
    file1.close()
def hydrate_tweets(data, consumer_key, consumer_secret, access_token,
                   access_token_secret):
    t = Twarc(consumer_key, consumer_secret, access_token, access_token_secret)
    tweet_text = []
    favorite_count = []
    retweet_count = []
    # Note: hydrate() skips tweets that have been deleted or protected, so
    # these lists can end up shorter than data['tweet_id'] (see the sketch
    # below for an id-keyed alternative).
    for tweet in t.hydrate(data['tweet_id']):
        tweet_text.append(tweet['full_text'])
        favorite_count.append(tweet['favorite_count'])
        retweet_count.append(tweet['retweet_count'])
    data['tweet_text'] = tweet_text
    data['favorite_count'] = favorite_count
    data['retweet_count'] = retweet_count
    data.to_csv("HydratedTweets")
    return data
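Because the column assignments above raise a length-mismatch error as soon as one id fails to resolve, a safer variant keys the results by tweet id. A hedged sketch, assuming `data` is a pandas DataFrame with a `tweet_id` column as in the function body:

by_id = {tweet["id_str"]: tweet
         for tweet in t.hydrate(data["tweet_id"].astype(str))}
data["tweet_text"] = [by_id.get(tid, {}).get("full_text")
                      for tid in data["tweet_id"].astype(str)]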
def twarc_provider(
    tweet_ids: typing.Iterable[int]
) -> typing.Tuple[typing.Set[int], typing.List[typing.Mapping]]:
    """Get a list of Tweets from their IDs sourced from the Twitter API.

    Uses Twarc Twitter API connector - https://github.com/DocNow/twarc.
    """
    # Twitter API consumer - handles rate limits for us
    t = Twarc(  # pylint: disable=invalid-name
        consumer_key=current_app.config['TWITTER_CONSUMER_KEY'],
        consumer_secret=current_app.config['TWITTER_CONSUMER_SECRET'],
        access_token=current_app.config['TWITTER_ACCESS_TOKEN'],
        access_token_secret=current_app.config['TWITTER_ACCESS_TOKEN_SECRET'],
    )
    found_tweets = list(t.hydrate(tweet_ids))
    found_tweet_ids = {tweet['id'] for tweet in found_tweets}
    return found_tweet_ids, found_tweets
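A brief usage sketch with illustrative ids; the call has to run inside a Flask application context, since `current_app` supplies the credentials:

requested = {501064188211765249, 501064196642340864}
found_ids, tweets = twarc_provider(requested)
missing = requested - found_ids  # ids that no longer resolve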
def engage_discourse(TID_PATH=Path('data/tweets_ids/'),
                     LM_PATH=Path('data/tweets/'),
                     USERS_PATH=Path('data/users/')):
    t = Twarc(credentials.CONSUMER_KEY, credentials.CONSUMER_SECRET,
              credentials.ACCESS_TOKEN, credentials.ACCESS_TOKEN_SECRET)
    chunksize = 24000
    # Collect the user ids that make up the corpus (first CSV column).
    users = []
    for doc in USERS_PATH.glob('*.*'):
        with open(doc) as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=',')
            next(csv_reader)  # skip the header row
            for row in csv_reader:
                users.append(row[0])
    print(f'{len(users)} users in the corpus.')
    # Hydrate each id file and process the tweets as they stream in.
    for doc in TID_PATH.glob('*.*'):
        start = time.time()
        for count, tweet in enumerate(t.hydrate(open(doc))):
            process_tweet(tweet)
            if count % 1000 == 0:
                print(f'{time.time()-start} seconds for {count} tweets.')
        print('Document done!')
    print('Files written!')
def readIdFile(input_file_name, tweets_num=0):
    t = Twarc(CONSUMER_KEY, CONSUMER_KEY_SECRET, ACCESS_TOKEN,
              ACCESS_TOKEN_SECRET)
    inputF = open(input_file_name, "r")
    i = 1
    subIdFileName = "tweets_id_" + str(i // 50000 + 1) + ".txt"
    subIdFile = open(subIdFileName, "a+")
    print("start read")
    line = inputF.readline()
    # Copy ids into 50,000-line chunk files, hydrating each chunk as it fills.
    # Parentheses added: the original condition mixed `and`/`or` without them,
    # so the tweets_num limit was not applied as intended.
    while line != "" and ((tweets_num > 0 and i < tweets_num + 1)
                          or tweets_num <= 0):
        subIdFile.write(line)
        line = inputF.readline()
        if i % 50000 == 0 or tweets_num == i or line == "":
            print("Read: " + subIdFileName)
            subIdFile.close()
            # send request
            tweets = t.hydrate(open(subIdFileName))
            tweetsClean(tweets)
            print("Finish read:" + subIdFileName)
            subIdFileName = "./tweets_id_" + str(i // 50000 + 1) + ".txt"
            subIdFile = open(subIdFileName, "w")
        i += 1
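The chunk files above exist only to feed hydrate() 50,000 ids at a time. A sketch of the same batching with itertools.islice, which avoids the intermediate files entirely (`tweetsClean` is the helper assumed from the snippet above):

from itertools import islice

def batched(iterable, size=50000):
    # Yield successive lists of up to `size` items from any iterable.
    it = iter(iterable)
    while True:
        batch = list(islice(it, size))
        if not batch:
            return
        yield batch

# for batch in batched(open(input_file_name)):
#     tweetsClean(t.hydrate(batch))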
ids_dir = main_dir + 'data/'  # create this folder in the main directory before running this script
target_dir = main_dir + 'data_full/'

# Twitter API Credentials
ACCESS_TOKEN = config.ACCESS_TOKEN
ACCESS_SECRET = config.ACCESS_SECRET
CONSUMER_KEY = config.CONSUMER_KEY
CONSUMER_SECRET = config.CONSUMER_SECRET

t = Twarc(CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN, ACCESS_SECRET)

tweet_ids = pd.read_csv(ids_dir + filename + ".csv", lineterminator='\n')
tweet_objects = []
for tweet in t.hydrate(tweet_ids.id.drop_duplicates()):
    tweet_objects.append(tweet)

df_full = pd.DataFrame(
    tweet_objects,
    columns=[
        'created_at', 'id', 'id_str', 'full_text', 'truncated',
        'display_text_range', 'entities', 'source', 'in_reply_to_status_id',
        'in_reply_to_status_id_str', 'in_reply_to_user_id',
        'in_reply_to_user_id_str', 'in_reply_to_screen_name', 'user', 'geo',
        'coordinates', 'place', 'contributors', 'is_quote_status',
        'retweet_count', 'favorite_count', 'favorited', 'retweeted',
        'possibly_sensitive', 'lang'
    ])
df_full.to_csv(target_dir + filename + '_full.csv', index=None)
def main(warc_file):
    twitter = Twarc()
    out = csv.writer(sys.stdout)
    out.writerow(json2csv.get_headings())
    for tweet in twitter.hydrate(tweet_ids(warc_file)):
        out.writerow(json2csv.get_row(tweet))
class TwitterApi(object):
    def __init__(self, consumer_key, consumer_secret, access_token_key="",
                 access_token_secret=""):
        """
        This method authenticates and creates a TwitterApi object. In case
        the system is unable to authenticate, an ApplicationError is raised.
        """
        auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
        try:
            auth.get_authorization_url()
        except TweepError as e:
            print("Unable to authenticate", str(e))
            raise ApplicationError(*error_list["AUTH_ERROR"])
        auth.set_access_token(access_token_key, access_token_secret)
        self._api = tweepy.API(auth_handler=auth)
        try:
            self._api_twarc = Twarc(cnst.CONSUMER_KEY, cnst.CONSUMER_SECRET,
                                    cnst.ACCESS_TOKEN_KEY,
                                    cnst.ACCESS_TOKEN_SECRET)
        except Exception as e:
            print("Unable to authenticate", str(e))
            raise ApplicationError(*error_list["AUTH_ERROR"])

    def get_tweet_from_id(self, tweet_id):
        """
        Given a valid tweet id, the method returns the tweet as a tweepy
        Status object.
        :error: in case the rate limit is reached, "LMT_RCHD_ERROR" is raised
        :returns: tweet object
        """
        try:
            tweet = self._api.get_status(tweet_id, tweet_mode="extended")
            return tweet
        except RateLimitError as r:
            print("Rate limit exceeded", str(r))
            raise ApplicationError(*error_list["LMT_RCHD_ERROR"])
        except TweepError as e:
            print("Error occurred", str(e))
            raise ApplicationError(*error_list["FTCH_ERR"])

    def _is_valid_url(self, tweet_url):
        url_without_share = tweet_url.split("?")[0]
        m = re.match("https://twitter.com/(.*)/status/(.*)", url_without_share)
        n = re.match("twitter.com/(.*)/status/(.*)", url_without_share)
        return m or n

    def get_tweet_from_url(self, tweet_url):
        """
        Given a tweet url this method extracts the tweet id from the url and
        then queries get_tweet_from_id to return a tweet object. The URL must
        be of the form https://twitter.com/[user]/status/[tweet_id] or
        twitter.com/[user]/status/[tweet_id]
        :error: for a malformed url, the application error "MAL_TWT_URL" is raised
        :returns: tweet object
        """
        o = self._is_valid_url(tweet_url)
        if type(tweet_url) is str and o and o.group(2).isnumeric():
            return self.get_tweet_from_id(int(o.group(2)))
        else:
            raise ApplicationError(*error_list["MAL_TWT_URL"])

    def get_original_tweet_from_url(self, tweet_url):
        """
        Given a url this method returns the source tweet object. That is, if
        tweet A is a retweet, this method returns the source tweet B for
        retweet A; if tweet A is a source tweet, it returns tweet A.

        The system is coded with the assumption that there can be a chain of
        quote tweets, hence this method searches for the original tweet by
        looping until is_quote_status is False.
        :error: Application error if there is no embedded url
        :error: Application error if the original tweet is older than 7 or
                30 days, depending on the endpoint
        """
        original_tweet, tweet = None, None
        while True:
            # On the first iteration tweet is None, so resolve the url first.
            if tweet is None:
                tweet = self.get_tweet_from_url(tweet_url)
            if not tweet.is_quote_status:
                # is_quote_status is False for the original tweet.
                original_tweet = tweet
                break
            else:
                tweet = self.get_tweet_from_id(tweet.quoted_status_id)
        if len(original_tweet.entities["urls"]) == 0:
            raise ApplicationError(*error_list["NO_EMBD_URL"])
        if (original_tweet.created_at - datetime.now()).days >= cnst.MAX_TWEET_CREATION_RANGE:
            raise ApplicationError(*error_list["EXPRD_TWT"])
        return original_tweet

    def get_replies(self, tweet, reply_limit=cnst.MAX_REPLY,
                    search_per_request=cnst.SEARCH_PER_REQUEST):
        """
        This method takes in the tweet object and returns replies to the
        tweet; the number of replies returned is capped by reply_limit.
        :error: AssertionError if the tweet object is not of type tweepy.Status
        :error: Application Error if the limit for the Twitter API is reached
        """
        assert type(tweet) == tweepy.Status
        reply_tweet_ids_list = list()
        # Get replies on the original tweet. We do not go deeper than
        # level-one replies.
        self.get_reply_ids_(tweet, reply_limit, search_per_request,
                            reply_tweet_ids_list)
        # Hydrate the collected ids and pull out the reply text.
        replies = list()
        for tweet in self._api_twarc.hydrate(reply_tweet_ids_list):
            try:
                replies.append(tweet['full_text'])
            except Exception as e:
                print(str(e))
        return replies

    def get_reply_ids_(self, tweet, reply_limit, search_per_request,
                       reply_tweet_ids_list):
        """
        Given a tweet this method returns a list of ids for the retweets
        with comments, using the premium api.
        :error: Application Error when the limit is reached
        :error: AssertionError if reply_tweet_ids_list is None or not a list
        """
        assert reply_tweet_ids_list is not None
        assert type(reply_tweet_ids_list) == list
        tweet_id = tweet.id
        user_name = tweet.user.screen_name
        search_string = "url:https%3A%2F%2Ftwitter.com%2F{}%2Fstatus%2F{} lang:en".format(
            user_name, tweet_id)
        replies = tweepy.Cursor(self._api.search_30_day, cnst.SEARCH_ENV,
                                search_string,
                                maxResults=search_per_request).items()
        try:
            startTime = datetime.now()
            for reply in replies:
                current_time = datetime.now()
                if reply.is_quote_status and reply.quoted_status.id == tweet.id:
                    reply_tweet_ids_list.append(reply.id)
                if len(reply_tweet_ids_list) == reply_limit or \
                        (current_time - startTime).total_seconds() >= cnst.MAX_TIME_REPLY_SEARCH:
                    break
            print("Returning", len(reply_tweet_ids_list), "replies")
            return reply_tweet_ids_list
        except tweepy.TweepError as e:
            raise ApplicationError(*error_list["LMT_RCHD_ERROR"])

    def get_reply_ids(self, tweet, reply_limit, search_per_request,
                      reply_tweet_ids_list):
        """
        Given a tweet this method returns a list of reply tweet ids for the
        given tweet. The upper limit for the tweets returned is defined by
        reply_limit.
        :error: Application Error when the limit is reached
        :error: AssertionError if reply_tweet_ids_list is None or not a list
        """
        assert reply_tweet_ids_list is not None
        assert type(reply_tweet_ids_list) == list
        tweet_id = tweet.id
        user_name = tweet.user.screen_name
        max_id = None
        replies = tweepy.Cursor(self._api.search, count=search_per_request,
                                q='to:{}'.format(user_name),
                                since_id=tweet_id, max_id=max_id,
                                tweet_mode='extended').items()
        try:
            startTime = datetime.now()
            for reply in replies:
                current_time = datetime.now()
                if reply.in_reply_to_status_id == tweet_id:
                    reply_tweet_ids_list.append(reply.id)
                if len(reply_tweet_ids_list) == reply_limit or \
                        (current_time - startTime).total_seconds() >= cnst.MAX_TIME_REPLY_SEARCH:
                    break
                max_id = reply.id
            return reply_tweet_ids_list
        except tweepy.TweepError as e:
            raise ApplicationError(*error_list["LMT_RCHD_ERROR"])
        after.strftime("%B%-d").lower() + ".csv", 'r') as csvfile:
    data = csv.reader(csvfile, delimiter=' ', quotechar='|')

totaldata = pd.read_csv(filename, header=None)
dataframe = totaldata[0]
numberfile = "number_corona_tweets" + date.strftime("%B%-d").lower() + ".txt"
readyfile = "ready_corona_tweets" + date.strftime("%B%-d").lower() + ".csv"
# Write the bare tweet ids to a file that twarc can hydrate from.
dataframe.to_csv(numberfile, index=False, header=None)
for tweet in t.hydrate(open(numberfile)):
    # Skip tweets without country-level place metadata.
    if tweet["place"] is None:
        continue
    if tweet["place"]["country"] is None:
        continue
    if tweet["place"]["country"] == "United States":
from twarc import Twarc
from File_manager import File_manager

# Usage: takes in a file called "ids" with ids of tweets to hydrate.
testing = {""}

# Replace with your keys and tokens
t = Twarc('CONSUMER_KEY', 'CONSUMER_SECRET',
          'ACCESS_TOKEN', 'ACCESS_TOKEN_SECRET')

out_file = File_manager.open_for_download("download")  # get file to write to
ids = open("ids", "r", 1)
for tweet in t.hydrate(ids):
    out_file.write(str(tweet) + "\n")
    testing.add("for loop executed")
out_file.close()
ids.close()
testing.add("hydrate executed")
print(testing)
from twarc import Twarc
import json

# Replace with your keys and tokens
consumer_key = "*********"
consumer_secret = "*********"
access_token = "*********"
access_token_secret = "*********"

output = open("sample1.json", 'w')
t = Twarc(consumer_key, consumer_secret, access_token, access_token_secret)

hydrated = []
count = 0
for tweet in t.hydrate(open('representatives.txt')):
    if count > 10000:
        break
    count += 1
    hydrated.append(tweet)
    # Checkpoint every 1,000 tweets. (The original tested
    # `count == (0 % 1000)` and called json.dump() without a file handle.)
    if count % 1000 == 0:
        output.write(json.dumps(hydrated))

print("done!")
print(count, " tweets pulled.")
output.write(json.dumps(hydrated))
from twarc import Twarc
import json

# input twitter credentials
consumer_key = '*********'
consumer_secret = '*********'
access_token = '*********'
access_token_secret = '*********'

t = Twarc(consumer_key, consumer_secret, access_token, access_token_secret)

data = []
for tweet in t.hydrate(open('../input_files/ids.txt')):
    data.append(json.dumps(tweet))

with open('output.json', 'w') as outfile:
    outfile.write("\n".join(data) + '\n')
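Since the output is written as line-delimited JSON (one tweet object per line), it can be read back without parsing one large document; a small sketch:

import json

with open('output.json') as infile:
    tweets = [json.loads(line) for line in infile]
print(len(tweets), "tweets loaded")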
twarc = Twarc()
tmp_df = pd.read_csv(LOG_FILE, names=["file"])
traversed = list(tmp_df.file.values)

with open(LOG_FILE, 'a+') as logf:
    for file in os.listdir(PATH):
        if file not in traversed:
            # The last two characters of the basename select the sample weight.
            file_postfix = str(file).split(".")[0][-2:]
            sample_size = weights[file_postfix]
            print("Extract from file: ", file, "for ", sample_size, " samples:")
            ids = sample_file(PATH + file, sample_size)
            output_file_name = str(file).split(".")[0] + "_contents.txt"
            # Log the file so it is skipped on the next run.
            w_ = csv.writer(logf)
            w_.writerow([file])
            # Extract content: keep only English retweets.
            with open(OUTPUT_PATH + output_file_name, 'w') as wf:
                w = csv.writer(wf, delimiter=',')
                for tweet in twarc.hydrate(ids):
                    if "retweeted_status" in tweet:
                        if tweet['lang'] != "en":
                            continue
                        row = [tweet['created_at'], tweet['id_str'],
                               tweet['retweeted_status']['full_text']]
                        w.writerow(row)
        else:
            print(file, " is processed already!")
from twarc import Twarc
import csv

# Replace with your keys and tokens
t = Twarc(consumer_key, consumer_secret, access_token, access_token_secret)

# Files
tweetList = r"NCHB2-idsab.txt"
# Python 3: open in text mode with newline='' (the original used "wb").
csv_write = csv.writer(open("tweets.csv", "w", newline='', encoding='utf-8'))

# Hydrate tweets (remove break to go through entire list)
i = 0
for tweet in t.hydrate(open(tweetList)):
    csv_write.writerow([tweet["id"], tweet["full_text"]])
    i += 1
    if i == 5:
        break
with open("config.yml", 'r') as ymlfile: cfg = yaml.load(ymlfile) for section in cfg: print(section) consumer_key = cfg['twitter']['consumer_key'] consumer_secret = cfg['twitter']['consumer_secret'] access_token = cfg['twitter']['access_token'] access_token_secret = cfg['twitter']['access_token_secret'] def ids(): for id in open("brexit_tweet_ids.csv"): yield id t = Twarc(consumer_key, consumer_secret, access_token, access_token_secret) keys = [ "text", "id", "created_at", "favorite_count", "lang", "place", "coordinates", "user", "entities", "geo", "retweeted", "retweet_count" ] with open('tweets123.txt', 'w') as outfile: for tweet in t.hydrate(ids()): tweet1 = {filter_key: tweet[filter_key] for filter_key in keys} values_json = json.dumps(tweet1, sort_keys=True) outfile.write(values_json + "\n") print(tweet1['text'])
url = "https://drive.google.com/file/d/1COJ1zrJE-acz0yZssIljRSAPyIRtS2EC/view?usp=sharing" r = requests.get(url) def reader_generator(reader): b = reader(1024 * 1024) while b: yield b b = reader(1024 * 1024) def raw_newline_count(fname): f = open(fname, 'rb') f_gen = reader_generator(f.raw.read) return sum(buf.count(b'\n') for buf in f_gen) if __name__ == "__main__": gzip_path = r.with_suffix('.jsonl.gz') if gzip_path.is_file(): return num_ids = raw_newline_count(r) with gzip.open(gzip_path, 'w') as output: with tqdm(total=num_ids) as pbar: for tweet in twarc.hydrate(r.open()): output.write(json.dumps(tweet).encode('utf8') + b"\n") pbar.update(1)
config = json.load(data_file)
logging.info('Finished parsing config.')
handle = MongoHandle(config)
logging.info('Initialized the Mongo connection.')
t = Twarc(config['twitter']['consumer_key'],
          config['twitter']['consumer_secret'],
          config['twitter']['access_token'],
          config['twitter']['access_token_secret'])
logging.info('Initialized Twitter connection.')

for source_file in os.listdir('./' + config['source_folder']):
    logging.info('Preparing to hydrate: ' + source_file)
    tweet_ids = open('./' + config['source_folder'] + '/' + source_file)
    new_tweet_ids = []
    logging.info('Parsing tweet ids.')
    start = time.time()
    # Skip ids that are already in Mongo so only new tweets are hydrated.
    for line in tweet_ids:
        line = line.strip()
        if not handle.is_written(line):
            new_tweet_ids.append(line)
    end = time.time()
    logging.info('Finished looking for new tweets in %.2f seconds.' % (end - start))
    handle.write(t.hydrate(new_tweet_ids), source_file)
    tweet_ids.close()
    logging.info('Finished hydrating: ' + source_file)

logging.info('Finished hydration task.')
handle.clean()
# Twitter auth for downloading tweets
CONSUMER_KEY = os.environ.get("TWITTER_CONSUMER_KEY")
CONSUMER_SECRET = os.environ.get("TWITTER_CONSUMER_SECRET")
ACCESS_TOKEN = os.environ.get("TWITTER_ACCESS_TOKEN")
ACCESS_TOKEN_SECRET = os.environ.get("TWITTER_ACCESS_TOKEN_SECRET")

# Concat and read all the CSVs
dir1 = "data/twitter-framing-master/congressional_tweets_dataset_2017/unlabeled/"
dir2 = "data/twitter-framing-master/congressional_tweets_dataset_2017/labeled/"
csv_files = glob.glob(os.path.join(dir1, "*.csv")) + glob.glob(
    os.path.join(dir2, "*.csv"))
HEADERS = [
    "tweet_id", "issue1", "issue2", "frame1", "frame2", "frame3", "party", "ts"
]
all_df = pd.concat(
    (pd.read_csv(f, names=HEADERS, header=None) for f in csv_files),
    ignore_index=True)

t = Twarc(CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
tweet_texts = {}
for tweet in t.hydrate(all_df["tweet_id"]):
    tweet_texts[tweet["id"]] = tweet["full_text"]

# Build a one-column frame indexed by tweet id. (The original built a
# one-row frame and transposed it; from_dict is the direct route.)
text_df = pd.DataFrame.from_dict(tweet_texts, orient="index", columns=["text"])

all_df = all_df.set_index("tweet_id")
joined = all_df.join(text_df)
joined.to_pickle("data/tweets.pkl")
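Ids whose tweets have been deleted or protected never come back from hydrate(), so the join leaves NaN in the text column for those rows; a quick sanity check before pickling might look like:

missing = joined["text"].isna().sum()
print(missing, "of", len(joined), "tweets could not be hydrated")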
import csv
import time

from twarc import Twarc

tweet_array = []
i = 0
foundid = ""

with open(r'''C:\Users\shann\OneDrive\Documents\FLU DATA\flu_annotations\2011-12\TESTPYTHON.csv''',
          'r') as f:
    tweet_array = list(csv.reader(f))

# hydrate the tweets (replace with your keys and tokens)
t = Twarc("CONSUMER_KEY", "CONSUMER_SECRET",
          "ACCESS_TOKEN", "ACCESS_TOKEN_SECRET")
for tweet in t.hydrate(
        open(r'''C:\Users\shann\OneDrive\Documents\FLU DATA\flu_annotations\2011-12\TESTPYTHON.csv''')):
    time.sleep(0.5)
    tweet_array[i][0] = tweet['id']
    print(tweet_array[i][0], " \n")
    tweet_array[i][1] = BMP(tweet['full_text'])  # text
    print(tweet_array[i][1], " \n")
    tweet_array[i][2] = tweet['user']['id']
    print(tweet_array[i][2], " \n")
    tweet_array[i][3] = BMP(tweet['user']['location'])  # encode it
    print(tweet_array[i][3], " \n")
    tweet_array[i][4] = BMP(tweet['created_at'])
ids = []
for file in filenames:
    with open(file, 'r') as tweetids:
        ids.append(tweetids.read())

# Write these merged ids
with open('ids.txt', 'w') as outfile:
    for i in ids[0:2]:
        outfile.write(str(i))

ids[0:2]

testids = [ids[0][0:19], ids[0][20:39], ids[0][40:59]]

jsontweets = []
# ids[0] is the raw file contents, so split it into one id per line;
# iterating the bare string would hand hydrate() single characters.
for tweet in t.hydrate(ids[0].splitlines()):
    jsontweets.append(tweet)
# for tweet in t.hydrate(open('ids.txt')):
#     print(tweet["text"])

jsontweets

testids = ['1245140084313206786', '1245140084350910464', '1245140084417941505']
jsontweets = []
for i in t.hydrate(testids):
    jsontweets.append(i)
jsontweets = json_normalize(jsontweets)
[consumer_key, consumer_secret, access_token,
 access_token_secret] = getKeysTokens(user)
t = Twarc(consumer_key, consumer_secret, access_token, access_token_secret)

curr_df = pd.read_csv(id_file)
all_ids = curr_df.iloc[:, 0]
last_idx = len(all_ids) - 1
print(id_file, "has", last_idx, "ids")
# Window of ids to hydrate on this iteration.
curr_idx = range(num_iter * tweet_limit, (num_iter + 1) * tweet_limit)

# setup output .csv
hydrated_file = filename[:-4] + '_hydrated.csv'
open(hydrated_file, 'w')
print('beginning to hydrate', target_length, 'number of tweets from', id_file)

# hydrate up to target_length number of tweets
while num_iter * tweet_limit < target_length + tweet_limit:
    tweets = t.hydrate(all_ids[curr_idx])
    for tweet in tweets:
        try:
            curr_tweet = dict()
            # ignore any non-English tweets or tweets that don't contain the word vaccine/vax
            if tweet['lang'] != "en":
                continue
            curr_tweet['text'] = tweet['full_text']
            curr_tweet['id'] = tweet['id']
            curr_tweet['place'] = tweet['place']['country'] if tweet['place'] else ""
            curr_tweet['created_at'] = tweet['created_at']
            curr_tweet['user_location'] = tweet['user']['location']
            curr_tweet['user_name'] = tweet['user']['name']
            curr_tweet['user_followers_count'] = tweet['user']['followers_count']
import csv

import preprocessor as p  # tweet-preprocessor, assumed from p.set_options/p.clean usage
from twarc import Twarc

# Replace with your keys and tokens
ACCESS_TOKEN = "*********"
ACCESS_TOKEN_SECRET = "*********"
CONSUMER_KEY = "*********"
CONSUMER_SECRET = "*********"

t = Twarc(CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
p.set_options(p.OPT.URL, p.OPT.EMOJI, p.OPT.MENTION, p.OPT.SMILEY)

# Keep only the first column (the tweet id) of each row.
with open('April1.tsv', 'r') as fin, open('April1_out.tsv', 'w') as fout:
    reader = csv.reader(fin, dialect='excel-tab')
    writer = csv.writer(fout, dialect='excel-tab')
    for row in reader:
        # delete indices in reverse order to avoid shifting earlier indices
        del row[1:]
        writer.writerow(row)

# t hydrate March1_out.tsv > March1.jsonl
with open('April1.csv', mode='w', encoding="utf-8") as corona_file:
    fieldnames = ['date', 'text', 'truncated']
    writer = csv.DictWriter(corona_file, fieldnames=fieldnames)
    writer.writeheader()
    for tweet in t.hydrate(open('April1_out.tsv')):
        # p.clean() returns the cleaned string; the original discarded it.
        writer.writerow({
            'date': tweet["created_at"],
            'text': p.clean(tweet["full_text"]),
            'truncated': tweet["truncated"]
        })