def test_since_id():
    t = Twarc()
    for tweet in t.search('obama'):
        id = tweet['id_str']
        break
    assert id
    time.sleep(5)
    for tweet in t.search('obama', since_id=id):
        assert tweet['id_str'] > id
def test_paging():
    # pages are 100 tweets big so if we can get 500 paging is working
    t = Twarc()
    count = 0
    for tweet in t.search('obama'):
        count += 1
        if count == 500:
            break
    assert count == 500
def test_search():
    count = 0
    t = Twarc()
    for tweet in t.search('obama'):
        assert tweet['id_str']
        count += 1
        if count == 10:
            break
    assert count == 10
def test_stream():
    t = Twarc()
    count = 0
    for tweet in t.stream("obama"):
        assert tweet['id_str']
        assert tweet['text']
        count += 1
        if count == 50:
            break
    assert count == 50
def test_max_id():
    t = Twarc()
    for tweet in t.search('obama'):
        id = tweet['id_str']
        break
    assert id
    time.sleep(5)
    count = 0
    for tweet in t.search('obama', max_id=id):
        count += 1
        assert tweet['id_str'] <= id
        if count > 100:
            break
def test_hydrate():
    ids = [
        "501064188211765249", "501064196642340864", "501064197632167936", "501064196931330049",
        "501064198005481472", "501064198009655296", "501064198059597824", "501064198513000450",
        "501064180468682752", "501064199142117378", "501064171707170816", "501064200186118145",
        "501064200035516416", "501064201041743872", "501064201251880961", "501064198973960192",
        "501064201256071168", "501064202027798529", "501064202245521409", "501064201503113216",
        "501064202363359232", "501064202295848960", "501064202380115971", "501064202904403970",
        "501064203135102977", "501064203508412416", "501064203516407810", "501064203546148864",
        "501064203697156096", "501064204191690752", "501064204288540672", "501064197396914176",
        "501064194309906436", "501064204989001728", "501064204980592642", "501064204661850113",
        "501064205400039424", "501064205089665024", "501064206666702848", "501064207274868736",
        "501064197686296576", "501064207623000064", "501064207824351232", "501064208083980290",
        "501064208277319680", "501064208398573568", "501064202794971136", "501064208789045248",
        "501064209535614976", "501064209551994881", "501064141332029440", "501064207387742210",
        "501064210177331200", "501064210395037696", "501064210693230592", "501064210840035329",
        "501064211855069185", "501064192024006657", "501064200316125184", "501064205642903552",
        "501064212547137536", "501064205382848512", "501064213843169280", "501064208562135042",
        "501064214211870720", "501064214467731457", "501064215160172545", "501064209648848896",
        "501064215990648832", "501064216241897472", "501064215759568897", "501064211858870273",
        "501064216522932227", "501064216930160640", "501064217667960832", "501064211997274114",
        "501064212303446016", "501064213675012096", "501064218343661568", "501064213951823873",
        "501064219467341824", "501064219677044738", "501064210080473088", "501064220415229953",
        "501064220847656960", "501064222340423681", "501064222772445187", "501064222923440130",
        "501064220121632768", "501064222948593664", "501064224936714240", "501064225096499201",
        "501064225142624256", "501064225314185216", "501064225926561794", "501064226451259392",
        "501064226816143361", "501064227302674433", "501064227344646144", "501064227688558592",
        "501064228288364546", "501064228627705857", "501064229764751360", "501064229915729921",
        "501064231304065026", "501064231366983681", "501064231387947008", "501064231488200704",
        "501064231941570561", "501064232188665856", "501064232449114112", "501064232570724352",
        "501064232700350464", "501064233186893824", "501064233438568450", "501064233774510081",
        "501064235107897344", "501064235175399425", "501064235456401410",
    ]
    t = Twarc()
    count = 0
    for tweet in t.hydrate(iter(ids)):
        assert tweet['id_str']
        count += 1
    assert count > 100  # may need to adjust as these might get deleted
def __init__(self, search_terms):
    logging.info("initializing TwitterStream Kafka")
    # globals to all instances
    self.t = Twarc(localConfig.client_key,
                   localConfig.client_secret,
                   localConfig.access_token,
                   localConfig.access_token_secret)
    self.search_terms = search_terms
def test_max_and_since_ids():
    t = Twarc()
    max_id = since_id = None
    count = 0
    for tweet in t.search('obama'):
        count += 1
        if not max_id:
            max_id = tweet['id_str']
        since_id = tweet['id_str']
        if count > 500:
            break
    count = 0
    for tweet in t.search('obama', max_id=max_id, since_id=since_id):
        count += 1
        assert tweet['id_str'] <= max_id
        assert tweet['id_str'] > since_id
def _create_twarc(self):
    self.twarc = Twarc(self.message["credentials"]["consumer_key"],
                       self.message["credentials"]["consumer_secret"],
                       self.message["credentials"]["access_token"],
                       self.message["credentials"]["access_token_secret"],
                       http_errors=self.http_errors,
                       connection_errors=self.connection_errors)
class TwitterStreamKafka(object):
    # WORKING TWITTER HOSE
    def __init__(self, search_terms):
        logging.info("initializing TwitterStream Kafka")
        # globals to all instances
        self.t = Twarc(localConfig.client_key,
                       localConfig.client_secret,
                       localConfig.access_token,
                       localConfig.access_token_secret)
        self.search_terms = search_terms

    # method to capture twitter stream
    def captureStream(self):
        for tweet in self.t.stream(",".join(self.search_terms)):
            result = producer.send_messages("betweezered", json.dumps(tweet))
    'may': '05',
    'june': '06',
    'july': '07',
    'august': '08',
    'september': '09',
    'october': '10',
    'november': '11',
    "december": '12'
}

# Initializing Twitter API keys
consumer_key = "IGMYSPiWpx0qLEjhYDrJqRuYp"
consumer_secret = "e5ypjtz2Xn49VsjPulIhrVEUduC0id1roNvzoqGfpy6CCRhBgs"
access_token = "1140054025791926272-sxzwfB5oCl8EBEPdhgewfuNP1oCemG"
access_token_secret = "0tIMUBEeurg9Qmd6e076SLoSbMK7opLaWaorkhpqa4Tn1"
t = Twarc(consumer_key, consumer_secret, access_token, access_token_secret)

# tweet_folder = "tweet_folder"
# last_week_folder = "last_week_folder"
# this_week_folder = "this_week_folder"
# news_path = r"../COVID19下集体理性量化分析与思考/数据/recovery-news-data.csv"
# result_path = "collective_rationalty"


# Check whether the folders for storing tweet data exist; create them if they don't
def makedir():
    if not os.path.exists(tweet_folder):
        os.makedirs(tweet_folder)
    if not os.path.exists(last_week_folder):
        os.makedirs(last_week_folder)
    if not os.path.exists(this_week_folder):
def daterange(start_date, end_date):
    for n in range(int((end_date - start_date).days)):
        yield start_date + timedelta(n)


start_date = date(2020, 4, 12)
# end date; datetime.date(datetime.now()) is a dynamic alternative that changes by date but depends on timezone
end_date = date(2020, 7, 13)

OAUTH_TOKEN = "1029186921438883845-AQjxqWPxZlURJ47eWFqRFRkSCkDPFh"
OAUTH_TOKEN_SECRET = "YgxeTz31ItxBrJubvwZpZaqa57LLhWRKLMM4t82pdEtsv"
CONSUMER_KEY = "Y70ckEEL2TdQzyq9NqI5RriiB"
CONSUMER_SECRET = "YWQJJlJyzXxkaPXCEdFrANgHFf4Dyd0PtkT4f5TvXFUJLUtpvU"

t = Twarc(CONSUMER_KEY, CONSUMER_SECRET, OAUTH_TOKEN, OAUTH_TOKEN_SECRET)

for singledate in daterange(start_date, end_date):
    after = singledate + timedelta(1)
    filename = singledate.strftime("%B%-d").lower() + "_" + after.strftime("%B%-d").lower() + ".csv"
    if (singledate.strftime("%B%-d").lower() != "march29"):
        with open(filename, 'r') as csvfile:
            data = csv.reader(csvfile, delimiter=' ', quotechar='|')
            totaldata = pd.read_csv(filename, header=None)
            dataframe = totaldata[0]
            sentimentstuff = totaldata[1]
            numberfile = "number_corona_tweets_state" + singledate.strftime("%B%-d").lower() + ".txt"
            readyfile = "sentiment" + singledate.strftime("%B%-d").lower() + ".csv"
from twarc import Twarc
import json
import time
from simplesentiment.stence import sentanceanalyser

t = Twarc('EZ4MUdjIR22V8y6TDia6vRrEf',
          'ARY5AgvJKvRWfb6nPeTugnvyKDY8VdQh0HdHpYLhcrUX2AvBdz',
          '1103185799841902592-g6OFAdGgV4vYkeg5KCK2gZwCmI3XzH',
          '6IW8bDnxeBZwLEDNa4GAEBWzvgDkAkh7bRRVrV4xcSfpc')


class TwarcCustom:
    """
    Get the replies of a user's most recent tweets as a list, given screen_name
    and count params. E.g. with count=2 for some screen_name, you get the replies
    of those 2 tweets, one plain-text entry per tweet.
    """

    def getTweetRepliesList(self, screen_name=None, count=1, limit=100):
        tweet_reply = []
        timeline = t.timeline(screen_name=screen_name, count=count)
        for tw in timeline:
            tweet = t.tweet(tw['id_str'])
            tweet_text = ""
            for index, tweet in zip(range(limit), t.replies(tweet)):
                tweet_text += tweet['full_text'] + " "
            tweet_reply.append(tweet_text)
        return tweet_reply

    """
    This returns the replies of all tweets as a single text.
    """

    def getTweetRepliesText(self, screen_name=None, count=1, pages=1):
# Dependencies
from twarc import Twarc
import tweepy
import utils
import keys
import sys

# Set up dependencies for Twitter APIs
twarc = Twarc(keys.consumer_key, keys.consumer_secret, keys.access_token,
              keys.access_token_secret)
auth = tweepy.OAuthHandler(keys.consumer_key, keys.consumer_secret)
auth.set_access_token(keys.access_token, keys.access_token_secret)
api = tweepy.API(auth)

arguments = sys.argv

# Get parameters from command line
if len(arguments) > 1:
    # If there are any arguments, join the hashtags with an OR in between
    # (skip argv[0], which is the script name)
    hashtags = ' OR '.join(map(str, arguments[1:]))
else:
    # If no arguments don't run
    print("No arguments passed")
    sys.exit(0)

# Search Twitter for tweets containing the hashtags
tweets = twarc.search(hashtags)
for tweet in tweets:
    user = tweet['user']
import json
import datetime
from twarc import Twarc

# Collection end
period_end = datetime.datetime(2017, 12, 31, 23, 59, 59, 999999)

# Twitter API keys - geobgu2
t = Twarc(
    'JA5KZiEuU8HDIFDtLXwkHCpdx',
    'NdGoBYXuYHbHOAInNHHumjz0xeCp8zEYfbm0RW0dzpvcRY8Ovc',
    '2782755278-ARD36i5dPBU6fxRdgvomZoxuCOI3ewVVGPizZCf',
    'ceN8O8yIVV2C7o6CJyLYYo3CNIm48Tnojpxj69pqqv36u'
)

# Twitter stream request
t = t.filter(locations="\-72.21437,41.19034,-69.64939,43.30924")

# Collect tweets
while datetime.datetime.now() < period_end:  # Loop until collection period ends
    day_start = datetime.datetime.now()
    day_end = datetime.datetime(day_start.year, day_start.month, day_start.day,
                                day_start.hour, 59, 59, 999999)
    fh = open("boston_geobgu2_" + day_start.strftime("%Y-%m-%d_%H:%M:%S") + ".json", "w")
    # fh.write("[")
    for tweet in t:  # Loop until hour ends
        x = tweet
        try:
            if x["geo"] != None:
                print(x["text"])
                fh.write(json.dumps(x))
                fh.write("\n")
if "media_url" in item: murl = item["media_url"] if murl not in urls: urls.append(murl) return urls # Main starts here if __name__ == '__main__': # Add your own API key values here fsecret = open('/Users/sara/twittersecrets.txt', 'r') secrets = fsecret.readline() access_token, access_token_secret, consumer_key, consumer_secret = \ [x.strip() for x in secrets.split(',')] twarc = Twarc(consumer_key, consumer_secret, access_token, access_token_secret) # Check that search terms were provided at the command line target_list = [] if (len(sys.argv) > 1): target_list = sys.argv[1:] else: print("No search terms provided. Exiting.") sys.exit(0) num_targets = len(target_list) for count, target in enumerate(target_list): print( str(count + 1) + "/" + str(num_targets) + " searching on target: " + target) # Create a separate save directory for each search query
    if month == 0:
        month += 1
    else:
        month += 2
    last_day = calendar.monthrange(2020, month)[1]
    start = 1
    end = 10
    if i % 3 == 1:
        start = 11
        end = 20
    elif i % 3 == 2:
        start = 21
        end = last_day
    t = Twarc(api_keys[i][0], api_keys[i][1], api_keys[i][2], api_keys[i][3],
              app_auth=True)
    args.append([t, month, start, end])


def get_and_save_data(id_col, t):
    """
    Use configured Twarc t to get full tweets given tweet ids id_col
    and save tweets in database.
    """
    for tweet in t.hydrate(id_col):
        x = None
        try:
            x = mycol.insert_one(tweet)
        except:
    searches = targets
else:
    print "Please add search targets in config/searcher_targets.txt"
    sys.exit(0)

print "Search targets: " + str(len(searches))

script_start_time_str = time.strftime("%Y-%m-%d %H:%M:%S")
output_dir_base = output_dir
current_label = ""
data = {}
associations = {}
frequencies = {}
max_s = len(searches)
for count, search in enumerate(searches):
    acct_name, consumer_key, consumer_secret, access_token, access_token_secret = get_account_sequential()
    t = Twarc(consumer_key, consumer_secret, access_token, access_token_secret)
    print "Signing in as: " + acct_name
    search = "\"" + search + "\""
    print(str(count) + "/" + str(max_s) + " searching: " + search)
    current_label = "search_" + str(count)
    output_dir = output_dir_base + str(count) + "/"
    if not os.path.exists(output_dir):
        print("Created directory: " + output_dir)
        os.makedirs(output_dir)
    fn = os.path.join(output_dir, "target.txt")
    with open(fn, "w") as f:
        f.write(search + "\n")
    dump_filename = output_dir + "raw.json"
    dump_file_handle = open(dump_filename, "a")
    data = {}
    set_counters()
weights['19'] = 3150
weights['20'] = 3150
weights['21'] = 3150
weights['22'] = 3150
weights['23'] = 3150


def sample_file(fileName, numSamples):
    df = pd.read_csv(fileName, names=["ids"])
    numSamples = len(df) if len(df) < numSamples else numSamples
    ids = df['ids'].sample(n=numSamples, random_state=1)
    return ids.values


# Use Twarc to extract covid19 related tweets
twarc = Twarc()

tmp_df = pd.read_csv(LOG_FILE, names=["file"])
traversed = list(tmp_df.file.values)

with open(LOG_FILE, 'a+') as logf:
    for file in os.listdir(PATH):
        if file not in traversed:
            file_postfix = str(file).split(".")[0][-2:]
            sample_size = weights[file_postfix]
            print("Extract from file: ", file, "for ", sample_size, " samples:")
            ids = sample_file(PATH + file, sample_size)
            output_file_name = str(file).split(".")[0] + "_contents.txt"
            # log
            w_ = csv.writer(logf)
            w_.writerow([file])
            # extract content
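            # Hypothetical sketch of the "extract content" step above (not part of
            # the original excerpt): hydrate the sampled ids with twarc and write
            # one tweet text per line to output_file_name.
            with open(output_file_name, "w", encoding="utf-8") as out:
                for tweet in twarc.hydrate(ids):
                    # full_text is present for tweets fetched in extended mode
                    text = tweet.get("full_text") or tweet.get("text", "")
                    out.write(text.replace("\n", " ") + "\n")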
from os.path import dirname, realpath, join, exists
from twarc import Twarc

twarc = Twarc("22GyvUC4Jg89Eh1PuKRh3mwRo",
              "m75gOSwIccfzYLWxwMCpHEldxgzYP83pTOSqAFbumQ5B6OF1vC",
              "852540250467467266-NoSAf6ZXmWZnr01CdUIfYBP5Z4cLZGJ",
              "kWP6L9F4YCUAsvwuaruuCUPMc4JqAE2jhJA8bhuuQSCSu")

TWEETS_TO_CRAWL = 10000
DATA_DIR = join(dirname(dirname(realpath(__file__))), "data")
TWEET_DATAFILE = join(DATA_DIR, "tweets.json")
USER_DATAFILE = join(DATA_DIR, "following.json")
MODEL_GRAPH_FILE = join(DATA_DIR, "graph.npy")
USER_DATA = join(DATA_DIR, "users.json")
RATINGS_FILE = join(DATA_DIR, "ratings.json")
USERNAMES_FILE = join(DATA_DIR, "usernames.json")
VERIFIED_USERS = join(DATA_DIR, "verified.json")
TOPUSERS_FILE = join(DATA_DIR, "topUsers.json")
TOPUSERS_OLD_FILE = join(DATA_DIR, "topUsersOld.json")
FOLLOW_FILE = join(DATA_DIR, "follow.json")
RETWEET_FILE = join(DATA_DIR, "retweets.json")
USER_CSV = join(DATA_DIR, "users.csv")
TWEETs_CSV = join(DATA_DIR, "tweets.csv")
USER_TWEET_CSV = join(DATA_DIR, "userTweetRelationship.csv")
TWEET_TWEET_CSV = join(DATA_DIR, "tweetTweetRelationship.csv")
RATINGS_CSV = join(DATA_DIR, "ratings.csv")
PLOT_FILE = join(DATA_DIR, "plot.png")
with open("config.yml", 'r') as ymlfile: cfg = yaml.load(ymlfile) for section in cfg: print(section) consumer_key = cfg['twitter']['consumer_key'] consumer_secret = cfg['twitter']['consumer_secret'] access_token = cfg['twitter']['access_token'] access_token_secret = cfg['twitter']['access_token_secret'] def ids(): for id in open("brexit_tweet_ids.csv"): yield id t = Twarc(consumer_key, consumer_secret, access_token, access_token_secret) keys = [ "text", "id", "created_at", "favorite_count", "lang", "place", "coordinates", "user", "entities", "geo", "retweeted", "retweet_count" ] with open('tweets123.txt', 'w') as outfile: for tweet in t.hydrate(ids()): tweet1 = {filter_key: tweet[filter_key] for filter_key in keys} values_json = json.dumps(tweet1, sort_keys=True) outfile.write(values_json + "\n") print(tweet1['text'])
This can be modified to do a lot more, can be integrated with wordcloud.py
for generating wordclouds on the fly. Or to perform sentiment analysis using
any text parser like aylien.
"""

from twarc import Twarc
import json
import fileinput
import sys

print (" # Loading keys")
consumer_key = 'INSERT YOUR CONSUMER KEY HERE'
consumer_secret = 'INSERT YOUR CONSUMER SECRET HERE'
access_token = 'INSERT YOUR TOKEN HERE'
access_token_secret = 'INSERT YOUR TOKEN SECRET HERE'
twarc_auth = Twarc(consumer_key, consumer_secret, access_token, access_token_secret)

print (" # Reading search terms")
with open('tweet_terms.txt', 'r') as tweet_terms_file_content:
    my_tweet_terms = [line.strip() for line in tweet_terms_file_content]
print (" # Search terms loaded")

if len(my_tweet_terms) > 0:
    twitter_query = ",".join(my_tweet_terms)
    print " # Search terms: " + twitter_query
    for tweet in twarc_auth.filter(track=twitter_query):
        with open('data_dump.json', 'a') as json_output_file:
            json.dump(tweet, json_output_file, indent=4, sort_keys=True)
else:
def main(warc_file):
    twitter = Twarc()
    out = csv.writer(sys.stdout)
    out.writerow(json2csv.get_headings())
    for tweet in twitter.hydrate(tweet_ids(warc_file)):
        out.writerow(json2csv.get_row(tweet))
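# main() relies on a tweet_ids() helper that is not shown in this excerpt. A
# minimal sketch of one possible implementation, assuming the WARC's response
# records are scanned for twitter.com status URLs (warcio and the regex below
# are my additions, not part of the original):
import re
from warcio.archiveiterator import ArchiveIterator

STATUS_URL_RE = re.compile(rb'twitter\.com/[^/"]+/status(?:es)?/(\d+)')


def tweet_ids(warc_file):
    """Yield unique tweet ids found in the WARC's response records."""
    seen = set()
    with open(warc_file, 'rb') as stream:
        for record in ArchiveIterator(stream):
            if record.rec_type != 'response':
                continue
            for match in STATUS_URL_RE.finditer(record.content_stream().read()):
                tweet_id = match.group(1).decode('ascii')
                if tweet_id not in seen:
                    seen.add(tweet_id)
                    yield tweet_id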
class TwitterRelationships():
    # Cut-down code to get twitter relationships for a set of hashtags.
    # Adapted from https://labsblog.f-secure.com/2018/02/16/searching-twitter-with-twarc/

    def __init__(self, secretsfile='/Users/sara/twittersecrets.txt'):
        fsecret = open(secretsfile, 'r')
        secrets = fsecret.readline()
        access_token, access_token_secret, consumer_key, consumer_secret = \
            [x.strip() for x in secrets.split(',')]
        self.twarc = Twarc(consumer_key, consumer_secret, access_token, access_token_secret)

    # Helper functions for saving csv and formatted txt files
    def write_data(self, data, filename, filetype='txt'):
        with io.open(filename, "w", encoding="utf-8") as handle:
            if filetype == 'txt':
                for item, count in data.most_common():
                    handle.write(str(count) + "\t" + item + "\n")
            else:  # write to csv
                handle.write(u"Source,Target,Weight\n")
                for source, targets in sorted(data.items()):
                    for target, count in sorted(targets.items()):
                        if source != target and source is not None and target is not None:
                            handle.write(source + u"," + target + u"," + str(count) + u"\n")
        return

    # Returns the screen_name of the user retweeted, or None
    def retweeted_user(self, status):
        if "retweeted_status" in status:
            orig_tweet = status["retweeted_status"]
            if "user" in orig_tweet and orig_tweet["user"] is not None:
                user = orig_tweet["user"]
                if "screen_name" in user and user["screen_name"] is not None:
                    return user["screen_name"]
        return

    # Returns a list of screen_names that the user interacted with in this Tweet
    def get_interactions(self, status):
        interactions = []
        if "in_reply_to_screen_name" in status:
            replied_to = status["in_reply_to_screen_name"]
            if replied_to is not None and replied_to not in interactions:
                interactions.append(replied_to)
        if "retweeted_status" in status:
            orig_tweet = status["retweeted_status"]
            if "user" in orig_tweet and orig_tweet["user"] is not None:
                user = orig_tweet["user"]
                if "screen_name" in user and user["screen_name"] is not None:
                    if user["screen_name"] not in interactions:
                        interactions.append(user["screen_name"])
        if "quoted_status" in status:
            orig_tweet = status["quoted_status"]
            if "user" in orig_tweet and orig_tweet["user"] is not None:
                user = orig_tweet["user"]
                if "screen_name" in user and user["screen_name"] is not None:
                    if user["screen_name"] not in interactions:
                        interactions.append(user["screen_name"])
        if "entities" in status:
            entities = status["entities"]
            if "user_mentions" in entities:
                for item in entities["user_mentions"]:
                    if item is not None and "screen_name" in item:
                        mention = item['screen_name']
                        if mention is not None and mention not in interactions:
                            interactions.append(mention)
        return interactions

    # Returns a list of hashtags found in the tweet
    def get_hashtags(self, status):
        hashtags = []
        if "entities" in status:
            entities = status["entities"]
            if "hashtags" in entities:
                for item in entities["hashtags"]:
                    if item is not None and "text" in item:
                        hashtag = item['text']
                        if hashtag is not None and hashtag not in hashtags:
                            hashtags.append(hashtag)
        return hashtags

    # Returns a list of URLs found in the Tweet
    def get_urls(self, status):
        urls = []
        if "entities" in status:
            entities = status["entities"]
            if "urls" in entities:
                for item in entities["urls"]:
                    if item is not None and "expanded_url" in item:
                        url = item['expanded_url']
                        if url is not None and url not in urls:
                            urls.append(url)
        return urls

    def get_image_urls(self, status):
        # Returns the URLs to any images found in the Tweet
        urls = []
        if "entities" in status:
            entities = status["entities"]
            if "media" in entities:
                for item in entities["media"]:
                    if item is not None:
                        if "media_url" in item:
                            murl = item["media_url"]
                            if murl not in urls:
                                urls.append(murl)
        return urls

    def fetch_images(self):
        # Iterate through image URLs, fetching each image if we haven't already
        pictures_dir = os.path.join(self.save_dir, self.dataname + '_' + "images")
        if not os.path.exists(pictures_dir):
            print("Creating directory: " + pictures_dir)
            os.makedirs(pictures_dir)
        for url in self.all_image_urls:
            m = re.search("^http:\/\/pbs\.twimg\.com\/media\/(.+)$", url)
            if m is not None:
                filename = m.group(1)
                print("Getting picture from: " + url)
                save_path = os.path.join(pictures_dir, filename)
                if not os.path.exists(save_path):
                    response = requests.get(url, stream=True)
                    with open(save_path, 'wb') as out_file:
                        shutil.copyfileobj(response.raw, out_file)
                    del response
        return

    def writedf(self, dataset, name, columns):
        filename = os.path.join(self.save_dir, self.dataname + '_' + name)
        with io.open(filename, "w", encoding="utf-8") as handle:
            handle.write('\t'.join(columns) + u"\n")
            for row in dataset:
                handle.write('\t'.join(row) + u"\n")
        return

    def save_datasets(self, fetch_images=True):
        csv_outputs = {
            "user_user_graph.csv": self.user_user_graph,
            "user_hashtag_graph.csv": self.user_hashtag_graph,
            "hashtag_hashtag_graph.csv": self.hashtag_hashtag_graph
        }
        for name, dataset in csv_outputs.items():
            filename = os.path.join(self.save_dir, self.dataname + '_' + name)
            self.write_data(dataset, filename, 'csv')
        text_outputs = {
            "hashtags.txt": self.hashtag_frequency_dist,
            "influencers.txt": self.influencer_frequency_dist,
            "mentioned.txt": self.mentioned_frequency_dist,
            "urls.txt": self.url_frequency_dist
        }
        for name, dataset in text_outputs.items():
            filename = os.path.join(self.save_dir, self.dataname + '_' + name)
            self.write_data(dataset, filename, 'txt')
        self.writedf(self.url_refs, "url_refs.csv", ['url', 'tweeturl'])
        self.writedf(self.image_refs, "image_refs.csv", ['url', 'tweeturl'])
        self.writedf(self.tweets, "tweets.csv",
                     ['url', 'screen_name', 'id', 'created_at', 'text'])
        if fetch_images:
            self.fetch_images()
        return

    def make_directories(self, target, rootdir='../data/twitter'):
        # Create a separate save directory for each search query
        # Since search queries can be a whole sentence, we'll check the length
        # and simply number it if the query is overly long
        self.dataname = datetime.now().strftime("%Y%m%d%H%M%S") + '_' + target.replace(" ", "_")
        self.save_dir = rootdir
        if not os.path.exists(rootdir):
            os.makedirs(rootdir)
        if len(target) < 30:
            self.save_dir += "/" + self.dataname
        else:
            self.save_dir += "/target_" + str(count + 1)
        if not os.path.exists(self.save_dir):
            print("Creating directory: " + self.save_dir)
            os.makedirs(self.save_dir)
        return

    def get_target_data(self, target):
        # Variables for capturing stuff
        self.tweets_captured = 0
        self.influencer_frequency_dist = Counter()
        self.mentioned_frequency_dist = Counter()
        self.hashtag_frequency_dist = Counter()
        self.url_frequency_dist = Counter()
        self.user_user_graph = {}
        self.user_hashtag_graph = {}
        self.hashtag_hashtag_graph = {}
        self.all_image_urls = []
        self.tweets = []
        self.tweet_count = 0
        self.url_refs = []
        self.image_refs = []
        # Start the search
        for status in self.twarc.search(target):
            # Output some status as we go, so we know something is happening
            sys.stdout.write("\r")
            sys.stdout.flush()
            sys.stdout.write("Collected " + str(self.tweet_count) + " tweets.")
            sys.stdout.flush()
            self.tweet_count += 1
            screen_name = None
            if "user" in status:
                if "screen_name" in status["user"]:
                    screen_name = status["user"]["screen_name"]
            retweeted = self.retweeted_user(status)
            if retweeted is not None:
                self.influencer_frequency_dist[retweeted] += 1
            else:
                self.influencer_frequency_dist[screen_name] += 1
            # Tweet text can be in either "text" or "full_text" field...
            text = None
            if "full_text" in status:
                text = status["full_text"]
            elif "text" in status:
                text = status["text"]
            id_str = None
            if "id_str" in status:
                id_str = status["id_str"]
            # Assemble the URL to the tweet we received...
            tweet_url = None
            if id_str is not None and screen_name is not None:
                tweet_url = "https://twitter.com/" + screen_name + "/status/" + id_str
            # if tweet_url is not None and text is not None:
            #     self.tweets[tweet_url] = text
            created_at = None
            if "created_at" in status:
                created_at = status["created_at"]
            self.tweets += [[tweet_url, screen_name, id_str, created_at, text]]  # capture everything
            # Record mapping graph between users
            interactions = self.get_interactions(status)
            if interactions is not None:
                for user in interactions:
                    self.mentioned_frequency_dist[user] += 1
                    if screen_name not in self.user_user_graph:
                        self.user_user_graph[screen_name] = {}
                    if user not in self.user_user_graph[screen_name]:
                        self.user_user_graph[screen_name][user] = 1
                    else:
                        self.user_user_graph[screen_name][user] += 1
            # Record mapping graph between users and hashtags
            hashtags = self.get_hashtags(status)
            if hashtags is not None:
                if len(hashtags) > 1:
                    hashtag_interactions = []
                    # This code creates pairs of hashtags in situations where multiple
                    # hashtags were found in a tweet
                    # This is used to create a graph of hashtag-hashtag interactions
                    for comb in combinations(sorted(hashtags), 2):
                        hashtag_interactions.append(comb)
                    if len(hashtag_interactions) > 0:
                        for inter in hashtag_interactions:
                            item1, item2 = inter
                            if item1 not in self.hashtag_hashtag_graph:
                                self.hashtag_hashtag_graph[item1] = {}
                            if item2 not in self.hashtag_hashtag_graph[item1]:
                                self.hashtag_hashtag_graph[item1][item2] = 1
                            else:
                                self.hashtag_hashtag_graph[item1][item2] += 1
                for hashtag in hashtags:
                    self.hashtag_frequency_dist[hashtag] += 1
                    if screen_name not in self.user_hashtag_graph:
                        self.user_hashtag_graph[screen_name] = {}
                    if hashtag not in self.user_hashtag_graph[screen_name]:
                        self.user_hashtag_graph[screen_name][hashtag] = 1
                    else:
                        self.user_hashtag_graph[screen_name][hashtag] += 1
            urls = self.get_urls(status)
            if urls is not None:
                for url in urls:
                    self.url_refs += [[url, tweet_url]]
                    self.url_frequency_dist[url] += 1
            image_urls = self.get_image_urls(status)
            if image_urls is not None:
                for url in image_urls:
                    self.image_refs += [[url, tweet_url]]
                    if url not in self.all_image_urls:
                        self.all_image_urls.append(url)
        self.save_datasets(fetch_images=True)
        return
from twarc import Twarc
import json

# input twitter credentials
consumer_key = '*********'
consumer_secret = '*********'
access_token = '*********'
access_token_secret = '*********'

t = Twarc(consumer_key, consumer_secret, access_token, access_token_secret)

data = []
for tweet in t.hydrate(open('../input_files/ids.txt')):
    data.append(json.dumps(tweet))

with open('output.json', 'w') as outfile:
    outfile.write("\n".join(data) + '\n')
# Twitter auth for downloading tweets
CONSUMER_KEY = os.environ.get("TWITTER_CONSUMER_KEY")
CONSUMER_SECRET = os.environ.get("TWITTER_CONSUMER_SECRET")
ACCESS_TOKEN = os.environ.get("TWITTER_ACCESS_TOKEN")
ACCESS_TOKEN_SECRET = os.environ.get("TWITTER_ACCESS_TOKEN_SECRET")

# Concat and read all the CSVs
dir1 = "data/twitter-framing-master/congressional_tweets_dataset_2017/unlabeled/"
dir2 = "data/twitter-framing-master/congressional_tweets_dataset_2017/labeled/"
csv_files = glob.glob(os.path.join(dir1, "*.csv")) + glob.glob(os.path.join(dir2, "*.csv"))
HEADERS = [
    "tweet_id", "issue1", "issue2", "frame1", "frame2", "frame3", "party", "ts"
]
all_df = pd.concat(
    (pd.read_csv(f, names=HEADERS, header=None) for f in csv_files),
    ignore_index=True)

t = Twarc(CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
tweet_texts = {}
for tweet in t.hydrate(all_df["tweet_id"]):
    tweet_texts[tweet["id"]] = tweet["full_text"]
text_df = pd.DataFrame(tweet_texts, index=[0]).transpose().rename(columns={
    "index": "tweet_id",
    0: "text"
})
all_df = all_df.set_index("tweet_id")
joined = all_df.join(text_df)
joined.to_pickle("data/tweets.pkl")
from twarc import Twarc

tw = Twarc()

# get training data
for tweet in tw.search("covid-19", lang='en'):
    try:
        screen_name = None
        if "screen_name" in tweet["user"]:
            screen_name = tweet["user"]["screen_name"]
        id_str = tweet["id_str"]
        tweet_url = None
        if id_str is not None and screen_name is not None:
            tweet_url = "https://twitter.com/" + screen_name + "/status/" + id_str
        # put training data into a txt file
        with open("trainingcovid-19.txt", "a+") as f:
            # Move read cursor to the start of file.
            f.seek(0)
            # If file is not empty then append '\n'
            data = f.read(100)
            if len(data) > 0:
                f.write("\n")
            # Append text at the end of file
            f.write(tweet['full_text'])
            f.write("\n")
            f.write(tweet_url)
    except UnicodeEncodeError:
        print("UnicodeEncodeError in finding training data")
# now we have to manually sort training data
from bottle import run, route, get, post, request, template, static_file
from twarc import Twarc
import pandas as pd

t = Twarc("JNaw7CRIGnQWxHH3C6tcpF0fP",
          "1opF4IfXrtzcUPOJUvnSr4wXbYpVGEJ8J4oBHAzEqRxV1p9FVO",
          "1055391684354203648-bmiuojBuJ8S0a4cQEGErobfaPVMIQV",
          "5R457jy32zTCVtwlQkZCKUtM9mMjgod9fw02g6zNWCOzW")

twdata = None


@get('/get_details')
def get_detail():
    return '''<!doctype html>
<html>
<head>
    <title>twitter</title>
    <style>
        body{
            background-image:url("https://thetrendingprof.com/wp-content/uploads/2013/11/twitter.jpg");
            background-size: 1300px 800px;
            background-repeat:no-repeat;
        }
        #rcorners3{
            border-radius: 80px 0px;
            background-image:url("http://www.hdwallpapers10.com/wp-content/uploads/2017/05/Black%20and%20White%20abstract%20Background%20Full%20HD-623x623.png");
            padding: 20px;
            width: 500px;
            height: 200px;
            opacity:0.8;
        }
from twarc import Twarc
import pprint
import json

consumer_key = "2NBPNFml9TtV3ValyhgZqP4ch"
consumer_secret = "qzCNGbr5I5vD2GAps7gdsQRNW4GbmlhODp0BokqFgCzLw2TjjV"
access_token = "931008641255084032-rMD6zn8esls7S1z4UiebC52Tb0gp8BM"
access_token_secret = "kpxBObeQfcpqbU8EikrionXFa1NbYpstYwPGA542av7K3"

output = open("sample1.json", 'w')
t = Twarc(consumer_key, consumer_secret, access_token, access_token_secret)

hydrated = []
count = 0
for tweet in t.hydrate(open('representatives.txt')):
    if count > 10000:
        break
    count += 1
    hydrated.append(tweet)
    if count % 1000 == 0:  # checkpoint what has been collected so far
        output.write(json.dumps(hydrated))

print("done!")
print(count, " tweets pulled.")
output.write(json.dumps(hydrated))
# Read in necessary libraries and Packages
import os
import pandas as pd
import json
import numpy as np
import matplotlib.pyplot as plt
from shapely.geometry import Point, Polygon
import geopandas as gpd
from twarc import Twarc

# Pass in credentials so I can connect to API
t = Twarc(key)

# The JSON file has thousands of JSON objects in it, so we need to first open the file,
# loop through each line, and use the json.loads() function to extract each object
tweets = []
with open('tweets.json') as f:
    for line in f:
        tweets.append(json.loads(line))

len(tweets)

# Printing out the first five tweets in the file
[print(tweets[i]['full_text'], '\n\n') for i in range(5)]

# If a Tweet was retweeted, the text may be shortened. For example, in this tweet below the 'full_text'
# is actually cut short, but in the retweeted status we can see the full text.
print(tweets[27]['user']['location'], tweets[27]['full_text']), tweets[27]['retweeted_status']['full_text']

"""### Locations from Tweets"""
#!/usr/bin/env python3
#
# Parts of code taken from stackoverflow
#

import gzip
import json
import requests
from tqdm import tqdm
from twarc import Twarc
from pathlib import Path

twarc = Twarc()

url = "https://drive.google.com/file/d/1COJ1zrJE-acz0yZssIljRSAPyIRtS2EC/view?usp=sharing"
r = requests.get(url)


def reader_generator(reader):
    b = reader(1024 * 1024)
    while b:
        yield b
        b = reader(1024 * 1024)


def raw_newline_count(fname):
    f = open(fname, 'rb')
    f_gen = reader_generator(f.raw.read)
    return sum(buf.count(b'\n') for buf in f_gen)
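# Illustrative usage of the helpers above (my addition, not part of the original
# script): raw_newline_count() gives tqdm a total line count so the hydration
# loop can show real progress. 'tweet_ids.txt' is a placeholder filename.
ids_path = 'tweet_ids.txt'
num_ids = raw_newline_count(ids_path)
with open(ids_path) as ids_file:
    for tweet in tqdm(twarc.hydrate(ids_file), total=num_ids):
        print(tweet['id_str'])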
class TwitterHarvester(BaseHarvester):
    def __init__(self, working_path, stream_restart_interval_secs=30 * 60, mq_config=None, debug=False,
                 connection_errors=5, http_errors=5, debug_warcprox=False, tries=3):
        BaseHarvester.__init__(self, working_path, mq_config=mq_config,
                               stream_restart_interval_secs=stream_restart_interval_secs,
                               debug=debug, debug_warcprox=debug_warcprox, tries=tries)
        self.twarc = None
        self.connection_errors = connection_errors
        self.http_errors = http_errors
        self.extract_media = False
        self.extract_web_resources = False
        self.extract_user_profile_images = False

    def harvest_seeds(self):
        # Create a twarc
        self._create_twarc()

        # Get harvest extract options.
        self.extract_media = self.message.get("options", {}).get("media", False)
        self.extract_web_resources = self.message.get("options", {}).get("web_resources", False)
        self.extract_user_profile_images = self.message.get("options", {}).get("user_images", False)

        # Dispatch message based on type.
        harvest_type = self.message.get("type")
        log.debug("Harvest type is %s", harvest_type)
        if harvest_type == "twitter_search":
            self.search()
        elif harvest_type == "twitter_filter":
            self.filter()
        elif harvest_type == "twitter_sample":
            self.sample()
        elif harvest_type == "twitter_user_timeline":
            self.user_timeline()
        else:
            raise KeyError

    def _create_twarc(self):
        self.twarc = Twarc(self.message["credentials"]["consumer_key"],
                           self.message["credentials"]["consumer_secret"],
                           self.message["credentials"]["access_token"],
                           self.message["credentials"]["access_token_secret"],
                           http_errors=self.http_errors,
                           connection_errors=self.connection_errors)

    def search(self):
        assert len(self.message.get("seeds", [])) == 1
        incremental = self.message.get("options", {}).get("incremental", False)
        query = self.message["seeds"][0]["token"]
        since_id = self.state_store.get_state(__name__, u"{}.since_id".format(query)) if incremental else None
        self._harvest_tweets(self.twarc.search(query, since_id=since_id))

    def filter(self):
        assert len(self.message.get("seeds", [])) == 1
        track = self.message["seeds"][0]["token"].get("track")
        follow = self.message["seeds"][0]["token"].get("follow")
        locations = self.message["seeds"][0]["token"].get("locations")
        self._harvest_tweets(self.twarc.filter(track=track, follow=follow, locations=locations))

    def sample(self):
        self._harvest_tweets(self.twarc.sample())

    def user_timeline(self):
        incremental = self.message.get("options", {}).get("incremental", False)
        for seed in self.message.get("seeds", []):
            seed_id = seed["id"]
            screen_name = seed.get("token")
            user_id = seed.get("uid")
            log.debug("Processing seed (%s) with screen name %s and user id %s", seed_id, screen_name, user_id)
            assert screen_name or user_id
            # If there is not a user_id, look it up.
            if screen_name and not user_id:
                user_id = self._lookup_user_id(screen_name)
                if user_id:
                    # Report back if nsid found
                    self.result.uids[seed_id] = user_id
                else:
                    msg = "User id not found for user {}".format(screen_name)
                    log.exception(msg)
                    self.result.warnings.append(Msg(CODE_TOKEN_NOT_FOUND, msg))
            # Otherwise, get the current screen_name
            else:
                new_screen_name = self._lookup_screen_name(user_id)
                if new_screen_name != screen_name:
                    self.result.token_updates[seed_id] = new_screen_name
                    screen_name = new_screen_name
            if user_id:
                try:
                    # Get since_id from state_store
                    since_id = self.state_store.get_state(
                        __name__, "timeline.{}.since_id".format(user_id)) if incremental else None
                    self._harvest_tweets(self.twarc.timeline(user_id=user_id, since_id=since_id))
                except HTTPError as e:
                    if e.response.status_code == 401:
                        msg = "Unauthorized for user {} (User ID: {}) because account is suspended or private".format(
                            screen_name, user_id)
                        log.exception(msg)
                        self.result.warnings.append(Msg(CODE_TOKEN_UNAUTHORIZED, msg))
                    else:
                        raise e

    def _lookup_screen_name(self, user_id):
        """
        Lookup a screen name given a user id.
        """
        users = list(self.twarc.user_lookup(user_ids=(user_id,)))
        assert len(users) in (0, 1)
        if users:
            return users[0]["screen_name"]
        return None

    def _lookup_user_id(self, screen_name):
        """
        Lookup a user id given a screen name.
        """
        users = list(self.twarc.user_lookup(screen_names=(screen_name,)))
        assert len(users) in (0, 1)
        if users:
            return users[0]["id_str"]
        return None

    def _harvest_tweets(self, tweets):
        # max_tweet_id = None
        for count, tweet in enumerate(tweets):
            if not count % 100:
                log.debug("Harvested %s tweets", count)
            self.result.harvest_counter["tweets"] += 1
            if self.stop_harvest_seeds_event.is_set():
                log.debug("Stopping since stop event set.")
                break

    def _process_entities(self, entities):
        if self.extract_web_resources:
            for url in entities.get("urls", []):
                # Exclude links for tweets
                if url["expanded_url"] and not status_re.match(url["expanded_url"]):
                    self.result.urls.append(url["expanded_url"])
        if self.extract_media:
            for media in entities.get("media", []):
                if media["media_url"]:
                    self.result.urls.append(media["media_url"])

    def process_warc(self, warc_filepath):
        # Dispatch message based on type.
        harvest_type = self.message.get("type")
        log.debug("Harvest type is %s", harvest_type)
        if harvest_type == "twitter_search":
            self.process_search_warc(warc_filepath)
        elif harvest_type == "twitter_filter":
            self._process_tweets(TwitterStreamWarcIter(warc_filepath))
        elif harvest_type == "twitter_sample":
            self._process_tweets(TwitterStreamWarcIter(warc_filepath))
        elif harvest_type == "twitter_user_timeline":
            self.process_user_timeline_warc(warc_filepath)
        else:
            raise KeyError

    def process_search_warc(self, warc_filepath):
        incremental = self.message.get("options", {}).get("incremental", False)
        query = self.message["seeds"][0]["token"]
        since_id = self.state_store.get_state(__name__, u"{}.since_id".format(query)) if incremental else None
        max_tweet_id = self._process_tweets(TwitterRestWarcIter(warc_filepath))
        # Update state store
        if incremental and max_tweet_id > since_id:
            self.state_store.set_state(__name__, u"{}.since_id".format(query), max_tweet_id)

    def process_user_timeline_warc(self, warc_filepath):
        incremental = self.message.get("options", {}).get("incremental", False)
        for count, status in enumerate(TwitterRestWarcIter(warc_filepath)):
            tweet = status.item
            if not count % 100:
                log.debug("Processing %s tweets", count)
            if "text" in tweet:
                user_id = tweet["user"]["id_str"]
                if incremental:
                    # Update state
                    key = "timeline.{}.since_id".format(user_id)
                    self.state_store.set_state(__name__, key,
                                               max(self.state_store.get_state(__name__, key), tweet.get("id")))
                self._process_tweet(tweet)

    def _process_tweets(self, warc_iter):
        max_tweet_id = None
        for count, status in enumerate(warc_iter):
            tweet = status.item
            if not count % 100:
                log.debug("Processing %s tweets", count)
            if "text" in tweet:
                max_tweet_id = max(max_tweet_id, tweet.get("id"))
                self._process_tweet(tweet)
        return max_tweet_id

    def _process_tweet(self, tweet):
        self.result.increment_stats("tweets")
        # For more info, see https://dev.twitter.com/overview/api/entities-in-twitter-objects
        statuses = [tweet]
        if "retweeted_status" in tweet:
            statuses.append(tweet["retweeted_status"])
        elif "quoted_status" in tweet:
            statuses.append(tweet["quoted_status"])
        for status in statuses:
            self._process_entities(status.get("entities", {}))
            self._process_entities(status.get("extended_entities", {}))
        if self.extract_user_profile_images:
            self.result.urls.append(tweet["user"]["profile_image_url"])
            self.result.urls.append(tweet["user"]["profile_background_image_url"])
            if "profile_banner_url" in tweet["user"]:
                self.result.urls.append(tweet["user"]["profile_banner_url"])
class TwitterHarvester(BaseHarvester):
    def __init__(self, process_interval_secs=1200, mq_config=None, debug=False):
        BaseHarvester.__init__(self, mq_config=mq_config, process_interval_secs=process_interval_secs,
                               debug=debug)
        self.twarc = None

    def harvest_seeds(self):
        # Create a twarc
        self._create_twarc()

        # Dispatch message based on type.
        harvest_type = self.message.get("type")
        log.debug("Harvest type is %s", harvest_type)
        if harvest_type == "twitter_search":
            self.search()
        elif harvest_type == "twitter_filter":
            self.filter()
        else:
            raise KeyError

    def _create_twarc(self):
        self.twarc = Twarc(self.message["credentials"]["consumer_key"],
                           self.message["credentials"]["consumer_secret"],
                           self.message["credentials"]["access_token"],
                           self.message["credentials"]["access_token_secret"])

    def search(self):
        incremental = self.message.get("options", {}).get("incremental", False)
        for seed in self.message.get("seeds", []):
            query = seed.get("token")
            # Get since_id from state_store
            since_id = self.state_store.get_state(__name__, "{}.since_id".format(query)) if incremental else None
            max_tweet_id = self._process_tweets(self.twarc.search(query, since_id=since_id))
            log.debug("Searching on %s since %s returned %s tweets.", query, since_id,
                      self.harvest_result.summary.get("tweet"))
            # Update state store
            if incremental and max_tweet_id:
                self.state_store.set_state(__name__, "{}.since_id".format(query), max_tweet_id)

    def filter(self):
        assert len(self.message.get("seeds", [])) == 1
        track = self.message["seeds"][0]["token"]
        self._process_tweets(self.twarc.stream(track))

    def _process_tweets(self, tweets):
        max_tweet_id = None
        for count, tweet in enumerate(tweets):
            if not count % 100:
                log.debug("Processed %s tweets", count)
            if self.stop_event.is_set():
                log.debug("Stopping since stop event set.")
                break
            if "text" in tweet:
                with self.harvest_result_lock:
                    max_tweet_id = max(max_tweet_id, tweet.get("id"))
                    self.harvest_result.increment_summary("tweet")
                    if "urls" in tweet["entities"]:
                        for url in tweet["entities"]["urls"]:
                            self.harvest_result.urls.append(url["expanded_url"])
                    if "media" in tweet["entities"]:
                        for media in tweet["entities"]["media"]:
                            self.harvest_result.urls.append(media["media_url"])
        return max_tweet_id
from os import path

import pandas as pd
from twarc import Twarc

from util.util import DataCollector
from util.util import create_dir, Config

keys = pd.read_csv('resources/tweet_keys_file.txt').iloc[0]
t = Twarc(keys['app_key'], keys['app_secret'], keys['oauth_token'], keys['oauth_token_secret'])

features = [
    'tweet_id', 'retweeted_id', 'created_at', 'favorite_count', 'retweet_count',
    'user_id', 'location', 'verified', 'followers_count', 'source', 'text', 'fake'
]


def collect_tweets(news_list, news_source, label, config: Config):
    create_dir(config.dump_location)
    create_dir("{}/{}".format(config.dump_location, news_source))
    create_dir("{}/{}/tweets".format(config.dump_location, news_source))

    for news in news_list:
        print('Downloading ' + news_source + ' ' + label + ' ' + news.news_id + ' tweets')
        create_dir("{}/{}/{}/{}".format(config.dump_location, news_source, label, news.news_id))
        data = pd.DataFrame(columns=features)
        news_dir = "{}/{}/tweets/{}.csv".format(config.dump_location, news_source, news.news_id)
def _create_twarc(self):
    self.twarc = Twarc(self.message["credentials"]["consumer_key"],
                       self.message["credentials"]["consumer_secret"],
                       self.message["credentials"]["access_token"],
                       self.message["credentials"]["access_token_secret"])
#!/usr/bin/env python3
#
# This script will walk through all the tweet id files and
# hydrate them with twarc. The line oriented JSON files will
# be placed right next to each tweet id file.

from pathlib import Path
from twarc import Twarc
from pyspark import SparkConf, SparkContext
import sys
from os import listdir

twarc = Twarc(consumer_key="ledLMTpVRfM",
              consumer_secret="Mza5q9YYc2KIK8rI0B0kss3",
              access_token="121996430w61PFl46Q7jRrgbrqkGLxy",
              access_token_secret="8ymranCmZ2UedmN")


def extractInfo(tweet):
    hydrated_info = {}
    hydrated_info['id'] = tweet['id_str']
    hydrated_info['favorite_count'] = tweet['favorite_count']
    hydrated_info['retweet_count'] = tweet['retweet_count']
    hydrated_info['geo'] = tweet['geo']
    hydrated_info['create_at'] = tweet['created_at']
    return hydrated_info


def main(input_dir, outpath):
    conf = SparkConf().setMaster("local").setAppName("Test")
if errs:
    raise RuntimeError(
        f"Required environment variables are undefined: {errs}. See README for details."
    )

# Check that tweet_id was provided
if len(sys.argv) != 2:
    raise RuntimeError(
        "Program should be called like: `python main.py <tweet_id>`")
tweet_id = sys.argv[1]

# Main
t = Twarc(
    consumer_key=consumer_key,
    consumer_secret=consumer_secret,
    access_token=access_token,
    access_token_secret=access_token_secret,
)
tweet = t.tweet(tweet_id)
if not tweet:
    raise RuntimeError(f"tweet with id {tweet_id} does not exist")

# replies is a generator object
replies = t.replies(tweet, True)

# List to hold dict of relevant photo data from each of the replies
photo_data = []
for reply in replies:
    # Photos will be in a list stored at reply['extended_entities']['media']
    print("Processing next reply")
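    # Hypothetical continuation of the loop body (the original is cut off here):
    # collect photo URLs from the reply's extended_entities, following Twitter's
    # documented tweet JSON layout. The keys stored in photo_data are assumptions.
    for item in reply.get("extended_entities", {}).get("media", []):
        if item.get("type") == "photo":
            photo_data.append({
                "tweet_id": reply["id_str"],
                "screen_name": reply["user"]["screen_name"],
                "media_url": item["media_url_https"],
            })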
# from the command line to tell it your Twitter API keys.
#

import gzip
import json
from tqdm import tqdm
from twarc import Twarc
from pathlib import Path
import datetime

print(datetime.datetime.now())

with open('config/cred.json') as json_file:
    cred = json.load(json_file)

twarc = Twarc(cred['CONSUMER_KEY'], cred['CONSUMER_SECRET'],
              cred['ACCESS_TOKEN'], cred['ACCESS_TOKEN_SECRET'])

# data_dirs = ['2020-01', '2020-02', '2020-03', '2020-04', '2020-05']
base_path = "data-ids/"
data_dirs = ['2020-05']

import threading
import queue

# Number of threads
n_thread = 5
all_ids = []

# Create queue
queue = queue.Queue()


class ThreadClass(threading.Thread):
            testneg = 0.0
            withoutclass = line.split()
            for eachword in withoutclass:
                if eachword in dictpos:
                    testpos += numpy.log10(dictpos[eachword]['probability'])
                else:
                    testpos += numpy.log10(a / (counterpos + (a * (counterpos + counterneg))))
                if eachword in dictneg:
                    testneg += numpy.log10(dictneg[eachword]['probability'])
                else:
                    testneg += numpy.log10(a / (counterneg + (a * (counterpos + counterneg))))
            # if it's good, write it into the result txt file
            if (testpos > testneg):
                with open("testingcovid-19result.txt", "a+") as f:
                    f.seek(0)
                    data = f.read(100)
                    if len(data) > 0:
                        f.write("\n")
                    f.write(line)
                positive = True
    except UnicodeEncodeError:
        print("UnicodeEncodeError in testing data")


if __name__ == '__main__':
    tw = Twarc()
    dictpos, dictneg, counterpos, counterneg, a = training()
    gettestingdata()
    testing(dictpos, dictneg, counterpos, counterneg, a)
# twitore server flask app

# localConfig
import localConfig

# python
import flask
from flask import Flask, render_template, g

# twarc
from twarc import Twarc

# global twarc instance
twarc_instance = Twarc(localConfig.client_key, localConfig.client_secret,
                       localConfig.access_token, localConfig.access_token_secret)

# crontab
from crontab import CronTab
mycron = CronTab(user=True)

# create app
app = flask.Flask(__name__)

# set session key
app.secret_key = 'twitore_is_the_bomb'

# Flask/MongoEngine
from flask.ext.mongoengine import MongoEngine
app.config['MONGODB_SETTINGS'] = {'db': 'twitore_dev'}
db = MongoEngine(app)
def crawl_feed(feed_dict, credentials):
    twarc = Twarc(credentials['consumer_key'], credentials['consumer_secret'],
                  credentials['access_token'], credentials['access_token_secret'])
    crawl_time = datetime.datetime.now()
    crawl_time_filename = crawl_time.strftime('%Y%m%d%I%M%S')
    crawl_time_html = crawl_time.strftime('%B %d, %Y')
    crawl_name = feed_dict['crawl_name']
    crawl_type = feed_dict['crawl_type']
    short_name = feed_dict['short_name']
    search_string = feed_dict['search_string']
    feed_dir = feed_dict['feed_dir']
    json_dir = join(feed_dir, 'json')
    html_dir = join(feed_dir, 'html')
    media_dir = join(feed_dir, 'media')
    logs_dir = join(feed_dir, 'logs')
    for directory in [feed_dir, json_dir, html_dir, media_dir, logs_dir]:
        if not os.path.exists(directory):
            os.makedirs(directory)
    log_file = join(logs_dir, 'twarc.log')
    formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
    logger = logging.getLogger(crawl_name)
    handler = logging.FileHandler(log_file)
    handler.setFormatter(formatter)
    logger.addHandler(handler)
    logger.setLevel(logging.INFO)
    base_filename = short_name + '-' + crawl_time_filename
    json_file = join(json_dir, base_filename + '.json')
    print "Searching Twitter API for {0}".format(search_string)
    print "Writing JSON and HTML files..."
    logger.info("starting search for %s", search_string)
    tweet_count = 0
    for tweet in twarc.search(search_string):
        with open(json_file, 'a') as json_out:
            json_out.write("{}\n".format(json.dumps(tweet)))
        if "id_str" in tweet:
            logger.info("archived https://twitter.com/%s/status/%s",
                        tweet['user']['screen_name'], tweet["id_str"])
        elif 'limit' in tweet:
            logger.warn("%s tweets undelivered", tweet["limit"]["track"])
        elif 'warning' in tweet:
            logger.warn(tweet['warning']['message'])
        else:
            logger.warn(json.dumps(tweet))
        tweet_count += 1
    if tweet_count == 0:
        logger.info("no new tweets matching %s", search_string)
        # Write an empty json file. Maybe don't do this?
        with open(json_file, 'w') as json_out:
            json_out.close()
    return base_filename, tweet_count, crawl_time_html
__location__ = os.path.dirname(os.path.realpath(__file__))
users = os.path.join(__location__, "apostrophe", "tweets.csv")
userList = []

with open(users, 'r', encoding='utf-8') as f:
    reader = csv.reader(f)
    rowCount = 0
    for row in reader:
        rowCount += 1
        if rowCount > 1:
            if not row[3] in userList:
                userList.append(row[3])

tweets = []
tweetContent = ""
for user in userList:
    t = Twarc()
    for tweet in t.search("from:" + user):
        print(tweet["full_text"])
        tweetContent += "%s\n" % str(tweet["full_text"])
        tweets.append(tweet)

outputFile = os.path.join(__location__, "possibleBotTweets.jsonl")
with open(outputFile, "w", encoding='utf-8') as output:
    for line in tweets:
        output.write("%s\n" % str(json.dumps(line)))

contentOutput = os.path.join(__location__, "possibleBotTweetContent.txt")
with open(contentOutput, "w", encoding='utf-8') as output2:
    output2.write(tweetContent)
from twarc import Twarc
import os
import csv

t = Twarc()


def load_seed_list(filepath):
    """
    Read user ids from a seed list downloaded from SFM into a set.
    """
    user_ids = set()
    # Encoding handles the BOM
    with open(filepath, encoding='utf-8-sig') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            # user id to screen name
            user_ids.add(row['Uid'])
    return user_ids


def get_followings(user_ids):
    existing_followed_user_ids = set()
    new_followed_user_ids = set()
    if os.path.exists('followed.csv'):
        with open('followed.csv') as followed_file:
            for line in followed_file:
                existing_followed_user_ids.add(user_id_from_line(line))
        print('Loaded {} existing followed users'.format(len(existing_followed_user_ids)))
    with open('follower_to_followed.csv', 'w') as follower_to_followed_file:
        for count, user_id in enumerate(user_ids):
            print('Getting following for {} ({})'.format(user_id, count + 1))
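            # Hypothetical continuation (not in the original excerpt): fetch the
            # accounts this user follows via twarc's friend_ids() and record
            # follower -> followed pairs. The CSV layout here is an assumption.
            for followed_id in t.friend_ids(user_id):
                new_followed_user_ids.add(followed_id)
                follower_to_followed_file.write('{},{}\n'.format(user_id, followed_id))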
from twarc import Twarc

client_key = 'client_key'
client_secret = 'client_secret'
access_token = '197456523-m2qIYWxkQTFKj0ModTQPcdByTnjryHwLRm9L8o5y'
access_token_secret = 'access_token_secret'

t = Twarc(client_key, client_secret, access_token, access_token_secret)

for tweet in t.search("resigncameron"):
    print(tweet["text"])
# This script was scheduled to run daily, so the filename to be processed is yesterday's date
filename = (date.today() - timedelta(days=1)).strftime("%m-%d-%Y")

# Main directory which contains the ids folder and the full data folder
main_dir = '/home/vca_rishik/rishik/COVID-19-tweets/'
ids_dir = main_dir + 'data/'

# Make sure you create this folder in the main directory before running this script
target_dir = main_dir + 'data_full/'

# Twitter API Credentials
ACCESS_TOKEN = config.ACCESS_TOKEN
ACCESS_SECRET = config.ACCESS_SECRET
CONSUMER_KEY = config.CONSUMER_KEY
CONSUMER_SECRET = config.CONSUMER_SECRET

t = Twarc(CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN, ACCESS_SECRET)

tweet_ids = pd.read_csv(ids_dir + filename + ".csv", lineterminator='\n')

tweet_objects = []
for tweet in t.hydrate(tweet_ids.id.drop_duplicates()):
    tweet_objects.append(tweet)

df_full = pd.DataFrame(
    tweet_objects,
    columns=[
        'created_at', 'id', 'id_str', 'full_text', 'truncated',
        'display_text_range', 'entities', 'source', 'in_reply_to_status_id',
        'in_reply_to_status_id_str', 'in_reply_to_user_id',
        'in_reply_to_user_id_str', 'in_reply_to_screen_name', 'user', 'geo',
        'coordinates', 'place', 'contributors', 'is_quote_status',
# This script will walk through all the tweet id files and
# hydrate them with twarc. The line oriented JSON files will
# be placed right next to each tweet id file.
#
# Note: you will need to install twarc, tqdm, and run twarc configure
# from the command line to tell it your Twitter API keys.
#

import gzip
import json

from tqdm import tqdm
from twarc import Twarc
from pathlib import Path

twarc = Twarc()

data_dirs = ['2020-01', '2020-02', '2020-03', '2020-04', '2020-05',
             '2020-06', '2020-07', '2020-08', '2020-09']


def main():
    for data_dir in data_dirs:
        for path in Path(data_dir).iterdir():
            if path.name.endswith('.txt'):
                hydrate(path)


def _reader_generator(reader):
    b = reader(1024 * 1024)
    while b:
        yield b
        b = reader(1024 * 1024)
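# main() calls a hydrate() helper that is not included in this excerpt. A minimal
# sketch of what it could look like, following the header comment's intent of
# writing line-oriented JSON next to each id file (the gzip output naming and
# progress total are assumptions, not the original implementation):
def hydrate(id_file):
    output_path = id_file.with_suffix('.jsonl.gz')
    if output_path.exists():
        return
    # count ids up front so tqdm can show a proper progress bar
    num_ids = sum(1 for _ in open(id_file))
    with gzip.open(output_path, 'wt') as output, open(id_file) as ids:
        for tweet in tqdm(twarc.hydrate(ids), total=num_ids, desc=id_file.name):
            output.write(json.dumps(tweet) + '\n')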
def crawl_feed(feed_dict, credentials):
    twarc = Twarc(credentials['consumer_key'], credentials['consumer_secret'],
                  credentials['access_token'], credentials['access_token_secret'])
    crawl_time = datetime.datetime.now()
    crawl_time_filename = crawl_time.strftime('%Y%m%d%I%M%S')
    crawl_time_html = crawl_time.strftime('%B %d, %Y')
    crawl_name = feed_dict['crawl_name']
    crawl_type = feed_dict['crawl_type']
    short_name = feed_dict['short_name']
    search_string = feed_dict['search_string']
    feed_dir = feed_dict['feed_dir']
    json_dir = join(feed_dir, 'json')
    html_dir = join(feed_dir, 'html')
    media_dir = join(feed_dir, 'media')
    logs_dir = join(feed_dir, 'logs')
    for directory in [feed_dir, json_dir, html_dir, media_dir, logs_dir]:
        if not os.path.exists(directory):
            os.makedirs(directory)
    log_file = join(logs_dir, 'twarc.log')
    formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
    logger = logging.getLogger(crawl_name)
    handler = logging.FileHandler(log_file)
    handler.setFormatter(formatter)
    logger.addHandler(handler)
    logger.setLevel(logging.INFO)
    base_filename = short_name + '-' + crawl_time_filename
    json_file = join(json_dir, base_filename + '.json')
    print("Searching Twitter API for {0}".format(search_string))
    print("Writing JSON and HTML files...")
    logger.info("starting search for %s", search_string)
    tweet_count = 0
    for tweet in twarc.search(search_string):
        with open(json_file, 'a') as json_out:
            json_out.write("{}\n".format(json.dumps(tweet)))
        if "id_str" in tweet:
            logger.info("archived https://twitter.com/%s/status/%s",
                        tweet['user']['screen_name'], tweet["id_str"])
        elif 'limit' in tweet:
            logger.warn("%s tweets undelivered", tweet["limit"]["track"])
        elif 'warning' in tweet:
            logger.warn(tweet['warning']['message'])
        else:
            logger.warn(json.dumps(tweet))
        tweet_count += 1
    if tweet_count == 0:
        logger.info("no new tweets matching %s", search_string)
        # Write an empty json file. Maybe don't do this?
        with open(json_file, 'w') as json_out:
            json_out.close()
    return base_filename, tweet_count, crawl_time_html