def test_since_id():
    t = Twarc()
    for tweet in t.search('obama'):
        id = tweet['id_str']
        break
    assert id
    time.sleep(5)
    for tweet in t.search('obama', since_id=id):
        assert tweet['id_str'] > id
def test_max_id():
    t = Twarc()
    for tweet in t.search('obama'):
        id = tweet['id_str']
        break
    assert id
    time.sleep(5)
    count = 0
    for tweet in t.search('obama', max_id=id):
        count += 1
        assert tweet['id_str'] <= id
        if count > 100:
            break
def test_max_and_since_ids():
    t = Twarc()
    max_id = since_id = None
    count = 0
    for tweet in t.search('obama'):
        count += 1
        if not max_id:
            max_id = tweet['id_str']
        since_id = tweet['id_str']
        if count > 500:
            break
    count = 0
    for tweet in t.search('obama', max_id=max_id, since_id=since_id):
        count += 1
        assert tweet['id_str'] <= max_id
        assert tweet['id_str'] > since_id
def test_paging():
    # pages are 100 tweets big so if we can get 500 paging is working
    t = Twarc()
    count = 0
    for tweet in t.search('obama'):
        count += 1
        if count == 500:
            break
    assert count == 500
def test_search():
    count = 0
    t = Twarc()
    for tweet in t.search('obama'):
        assert tweet['id_str']
        count += 1
        if count == 10:
            break
    assert count == 10
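# Added sketch (not from the original tests): Twarc() with no arguments relies
# on credentials twarc discovers on its own (for twarc 1.x, typically a config
# file written by `twarc configure`). Keys can also be passed explicitly; the
# values below are placeholders, not real credentials.
from twarc import Twarc

t = Twarc(
    consumer_key="...",
    consumer_secret="...",
    access_token="...",
    access_token_secret="...",
)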
def main(get_method=None, input_hashtags=None, storage_location=None):
    if not os.path.exists(storage_location):
        os.makedirs(storage_location, exist_ok=True)
    hashtag_query = input_hashtags.strip().replace(",", "+OR+")
    try:
        tweets = 0
        t = Twarc(
            consumer_key,
            consumer_secret,
            access_token,
            access_token_secret,
            tweet_mode="extended",
        )
        print(
            "Started storing tweets related to " + input_hashtags
            + " at " + storage_location
            + " since " + str(datetime.datetime.now())
        )
        if get_method == "populate":
            for tweet in t.search(hashtag_query, lang=language):
                # os.path.join takes separate path components; build the file
                # path the same way in both branches.
                with open(
                    os.path.join(storage_location, "tweet" + str(tweet["id"]) + ".json"),
                    "w",
                    encoding="utf8",
                ) as file:
                    json.dump(tweet, file)
                tweets += 1
        elif get_method == "track":
            for tweet in t.filter(hashtag_query):
                with open(
                    os.path.join(storage_location, "tweet" + str(tweet["id"]) + ".json"),
                    "w",
                    encoding="utf8",
                ) as file:
                    json.dump(tweet, file)
                tweets += 1
        else:
            print("No method defined, exiting...")
    except KeyboardInterrupt:
        print("Shutdown requested... successfully stored " + str(tweets) + " tweets")
    except BaseException:
        traceback.print_exc(file=sys.stdout)
    sys.exit(0)
def count_tweets(app_auth):
    """
    Search for covid_19 in tweets using the given context and return the
    number of tweets that were fetched in 10 minutes.
    """
    count = 0
    t = Twarc(app_auth=app_auth)
    start = None
    for tweet in t.search('covid_19'):
        # start the timer when we get the first tweet
        if start is None:
            start = datetime.now()
        count += 1
        if datetime.now() - start > timedelta(minutes=10):
            break
    t.client.close()
    return count
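# Hedged usage sketch (not part of the original): app auth and user auth have
# different search rate limits, so running count_tweets() under both settings
# and comparing the totals illustrates that difference. Assumes credentials
# are available to twarc (e.g. via its config file).
user_count = count_tweets(app_auth=False)
app_count = count_tweets(app_auth=True)
print("user auth: {} tweets, app auth: {} tweets in 10 minutes".format(
    user_count, app_count))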
def crawl_feed(feed_dict, credentials):
    twarc = Twarc(credentials['consumer_key'], credentials['consumer_secret'],
                  credentials['access_token'], credentials['access_token_secret'])
    crawl_time = datetime.datetime.now()
    crawl_time_filename = crawl_time.strftime('%Y%m%d%I%M%S')
    crawl_time_html = crawl_time.strftime('%B %d, %Y')
    crawl_name = feed_dict['crawl_name']
    crawl_type = feed_dict['crawl_type']
    short_name = feed_dict['short_name']
    search_string = feed_dict['search_string']
    feed_dir = feed_dict['feed_dir']
    json_dir = join(feed_dir, 'json')
    html_dir = join(feed_dir, 'html')
    media_dir = join(feed_dir, 'media')
    logs_dir = join(feed_dir, 'logs')
    for directory in [feed_dir, json_dir, html_dir, media_dir, logs_dir]:
        if not os.path.exists(directory):
            os.makedirs(directory)
    log_file = join(logs_dir, 'twarc.log')
    formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
    logger = logging.getLogger(crawl_name)
    handler = logging.FileHandler(log_file)
    handler.setFormatter(formatter)
    logger.addHandler(handler)
    logger.setLevel(logging.INFO)
    base_filename = short_name + '-' + crawl_time_filename
    json_file = join(json_dir, base_filename + '.json')
    print("Searching Twitter API for {0}".format(search_string))
    print("Writing JSON and HTML files...")
    logger.info("starting search for %s", search_string)
    tweet_count = 0
    if crawl_type == "timeline":
        for tweet in twarc.timeline(screen_name=search_string):
            with open(json_file, 'a') as json_out:
                json_out.write("{}\n".format(json.dumps(tweet)))
            if "id_str" in tweet:
                logger.info("archived https://twitter.com/%s/status/%s",
                            tweet['user']['screen_name'], tweet["id_str"])
            elif 'limit' in tweet:
                logger.warning("%s tweets undelivered", tweet["limit"]["track"])
            elif 'warning' in tweet:
                logger.warning(tweet['warning']['message'])
            else:
                logger.warning(json.dumps(tweet))
            tweet_count += 1
    else:
        for tweet in twarc.search(search_string):
            with open(json_file, 'a') as json_out:
                json_out.write("{}\n".format(json.dumps(tweet)))
            if "id_str" in tweet:
                logger.info("archived https://twitter.com/%s/status/%s",
                            tweet['user']['screen_name'], tweet["id_str"])
            elif 'limit' in tweet:
                logger.warning("%s tweets undelivered", tweet["limit"]["track"])
            elif 'warning' in tweet:
                logger.warning(tweet['warning']['message'])
            else:
                logger.warning(json.dumps(tweet))
            tweet_count += 1
    if tweet_count == 0:
        logger.info("no new tweets matching %s", search_string)
        # Write an empty json file. Maybe don't do this?
        open(json_file, 'w').close()
    return base_filename, tweet_count, crawl_time_html
print "Signing in as: " + acct_name search = "\"" + search + "\"" print(str(count) + "/" + str(max_s) + " searching: " + search) current_label = "search_" + str(count) output_dir = output_dir_base + str(count) + "/" if not os.path.exists(output_dir): print("Created directory: " + output_dir) os.makedirs(output_dir) fn = os.path.join(output_dir, "target.txt") with open(fn, "w") as f: f.write(search + "\n") dump_filename = output_dir + "raw.json" dump_file_handle = open(dump_filename, "a") data = {} set_counters() for status in t.search(search): captured_status = {} increment_counter("tweets_encountered") if "lang" in status: lang = status["lang"] increment_counter("tweets_" + lang) captured_status = capture_status_items(status) process_status(captured_status) if captured_status is not None: increment_counter("tweets_captured") increment_counter("tweets_processed") increment_counter("tweets_processed_this_interval") dump_file_handle.write(json.dumps(captured_status)) dump_file_handle.write("\n") periodic_events() sys.stdout.write("#")
class AnalyzerProcess(): def __init__(self, config, loggerObject, alerLoggerObject, rules, executionMode): self.logger = loggerObject self.alertLogger = alerLoggerObject self.rules = rules self.config = config self.executionMode = executionMode self.access_token = "insert Twitter API access token" self.access_token_secret = "insert Twitter API token secret" self.consumer_key = "insert Twitter API consumer key" self.consumer_secret = "insert Twitter API consumer secret" self.twarc = Twarc(self.consumer_key, self.consumer_secret, self.access_token, self.access_token_secret) self.currdir = "/home/centos/modosint-python3" + path.dirname(__file__) self.wcloud = "" self.stop_words = get_stop_words('spanish') newStopWords = ["http", "https", "co", "n'", "'", '"'] self.stop_words.extend(newStopWords) #Search Tweets that contais term in different Language def searchDifLanguage(self, text, language, ruleId): fichero = open("/var/log/modosint/analyzer-twitter/graylog.txt", "+a", encoding='utf8') with io.open("/var/log/modosint/analyzer-twitter/cache.txt", 'a+') as f: os.chmod("/var/log/modosint/analyzer-twitter/cache.txt", 0o777) traductor = Translator() translatedText = traductor.translate(text, dest=language) repeated = False if self.executionMode == "daemon": searchDif = self.twarc.search(translatedText.text) for tweet in searchDif: tweetTime = parser.parse(''.join(tweet['created_at'])) timeFormed = time.strptime( str(tweetTime.time()).split(',')[0], '%H:%M:%S') createdAtSeconds = datetime.timedelta( hours=timeFormed.tm_hour, minutes=timeFormed.tm_min, seconds=timeFormed.tm_sec).total_seconds() nowTimeUtc = datetime.datetime.utcnow().time() nowTimeFormed = time.strptime( str(nowTimeUtc).split('.')[0], '%H:%M:%S') nowTimeSeconds = datetime.timedelta( hours=nowTimeFormed.tm_hour, minutes=nowTimeFormed.tm_min, seconds=nowTimeFormed.tm_sec).total_seconds() if (nowTimeSeconds - createdAtSeconds < 300): #time in 5 minutes if 'retweeted_status' not in tweet: #avoid RT f.seek(0) #read temporary file (cache) content = f.readlines() content = [ x.strip('\n').strip('u') for x in content ] for i in range(len(content)): if tweet['id_str'] in content: repeated = True else: repeated = False if repeated == False: f.seek(0, 2) #write temporary file (cache) f.write(tweet['id_str']) f.write('\n') texto = tweet['full_text'] for c in texto: if c in emoji.UNICODE_EMOJI: texto = texto.replace(c, "") texto = u'' + texto try: emoji_pattern = re.compile( u"(\ud83d[\ude00-\ude4f])|" # emoticons u"(\ud83c[\udf00-\uffff])|" # symbols & pictographs (1 of 2) u"(\ud83d[\u0000-\uddff])|" # symbols & pictographs (2 of 2) u"(\ud83d[\ude80-\udeff])|" # transport & map symbols u"(\U0001F1E0-\U0001F1FF])|" u"(\U0001F600-\U0001F64F])|" # emoticons 2 u"(\U0001F300-\U0001F5FF])|" # symbols & pictographs u"(\U0001F680-\U0001F6FF])|" u"(\u2600-\u26FF])|" u"(\U0001F1F2\U0001F1F4)|" # Macau flag u"([\U0001F1E6-\U0001F1FF]{2})|" # flags u"([\U0001F600-\U0001F64F])" # emoticons 3 u"(\ud83c[\udde0-\uddff])" # flags (iOS) "+", flags=re.UNICODE) resultesp = traductor.translate( emoji_pattern.sub(r'', texto), dest='es') except ValueError: self.my_logger.debug( '[Emoji Error] Tweet can not be translated. Unrecognized emoji in tweet.' 
) tweetdata = { "CreatedTime": tweet['created_at'], "short_message": tweet['full_text'], "TranslatedTweet": resultesp.text, "Author": tweet['user']['screen_name'], "Retweets": tweet['retweet_count'], "Likes": tweet['favorite_count'], "Location": tweet['user']['location'], "Rule": ruleId, "full_message": "Tweet matched with RULE: " + ruleId } autotweet = json.dumps(tweetdata) fichero.write(autotweet + '\n') self.wcloud.write(resultesp.text + '\n') os.chmod( "/var/log/modosint/analyzer-twitter/wcloudRule" + ruleId + ".txt", 0o777) os.chmod( "/var/log/modosint/analyzer-twitter/graylog.txt", 0o777) else: break else: searchDif = self.twarc.search(translatedText.text) for tweet in searchDif: tweetTime = ''.join(tweet['created_at']) datetweet = parser.parse(tweetTime) if (datetweet.date() == datetime.datetime.now().date() or datetweet.date() == (datetime.datetime.now().date() - timedelta(1))): if 'retweeted_status' not in tweet: #avoid RT f.seek(0) #read temporary file (cache) content = f.readlines() content = [ x.strip('\n').strip('u') for x in content ] for i in range(len(content)): if tweet['id_str'] in content: repeated = True else: repeated = False if repeated == False: f.seek(0, 2) #write temporary file (cache) f.write(tweet['id_str']) f.write('\n') texto = tweet['full_text'] for c in texto: if c in emoji.UNICODE_EMOJI: texto = texto.replace(c, "") texto = u'' + texto try: emoji_pattern = re.compile( u"(\ud83d[\ude00-\ude4f])|" # emoticons u"(\ud83c[\udf00-\uffff])|" # symbols & pictographs (1 of 2) u"(\ud83d[\u0000-\uddff])|" # symbols & pictographs (2 of 2) u"(\ud83d[\ude80-\udeff])|" # transport & map symbols u"(\U0001F1E0-\U0001F1FF])|" u"(\U0001F600-\U0001F64F])|" # emoticons 2 u"(\U0001F300-\U0001F5FF])|" # symbols & pictographs u"(\U0001F680-\U0001F6FF])|" u"(\u2600-\u26FF])|" u"(\U0001F1F2\U0001F1F4)|" # Macau flag u"([\U0001F1E6-\U0001F1FF]{2})|" # flags u"([\U0001F600-\U0001F64F])" # emoticons 3 u"(\ud83c[\udde0-\uddff])" # flags (iOS) "+", flags=re.UNICODE) resultesp = traductor.translate( emoji_pattern.sub(r'', texto), dest='es') except ValueError: self.my_logger.debug( '[Emoji Error] Tweet can not be translated. Unrecognized emoji in tweet.' 
) tweetdata = { "CreatedTime": tweet['created_at'], "short_message": tweet['full_text'], "TranslatedTweet": resultesp.text, "Author": tweet['user']['screen_name'], "Retweets": tweet['retweet_count'], "Likes": tweet['favorite_count'], "Location": tweet['user']['location'], "Rule": ruleId, "full_message": "Tweet matched with RULE: " + ruleId } autotweet = json.dumps(tweetdata) fichero.write(autotweet + '\n') self.wcloud.write(resultesp.text + '\n') os.chmod( "/var/log/modosint/analyzer-twitter/wcloudRule" + ruleId + ".txt", 0o777) os.chmod( "/var/log/modosint/analyzer-twitter/graylog.txt", 0o777) else: break #Search Tweets that contains term or Hashtag def searchTweetOrHashtag(self, text, ruleId): fichero = open("/var/log/modosint/analyzer-twitter/graylog.txt", "+a", encoding='utf8') with io.open("/var/log/modosint/analyzer-twitter/cache.txt", 'a+') as f: os.chmod("/var/log/modosint/analyzer-twitter/cache.txt", 0o777) repeated = False if self.executionMode == "daemon": tweets = self.twarc.search(text) for tweet in tweets: tweetTime = parser.parse(''.join(tweet['created_at'])) timeFormed = time.strptime( str(tweetTime.time()).split(',')[0], '%H:%M:%S') createdAtSeconds = datetime.timedelta( hours=timeFormed.tm_hour, minutes=timeFormed.tm_min, seconds=timeFormed.tm_sec).total_seconds() nowTimeUtc = datetime.datetime.utcnow().time() nowTimeFormed = time.strptime( str(nowTimeUtc).split('.')[0], '%H:%M:%S') nowTimeSeconds = datetime.timedelta( hours=nowTimeFormed.tm_hour, minutes=nowTimeFormed.tm_min, seconds=nowTimeFormed.tm_sec).total_seconds() if (nowTimeSeconds - createdAtSeconds < 300): #time in 5 minutes if 'retweeted_status' not in tweet: #avoid RT f.seek(0) #read temporary file (cache) content = f.readlines() content = [ x.strip('\n').strip('u') for x in content ] for i in range(len(content)): if tweet['id_str'] in content: repeated = True else: repeated = False if repeated == False: f.seek(0, 2) #write temporary file (cache) f.write(tweet['id_str'] + '\n') tweetdata = { "CreatedTime": tweet['created_at'], "short_message": tweet['full_text'], "Author": tweet['user']['screen_name'], "Retweets": tweet['retweet_count'], "Likes": tweet['favorite_count'], "Location": tweet['user']['location'], "Rule": ruleId, "full_message": "Tweet matched with RULE: " + ruleId } autotweet = json.dumps(tweetdata) fichero.write(autotweet + '\n') self.wcloud.write(tweet['full_text'] + '\n') os.chmod( "/var/log/modosint/analyzer-twitter/wcloudRule" + ruleId + ".txt", 0o777) os.chmod( "/var/log/modosint/analyzer-twitter/graylog.txt", 0o777) else: break else: tweets = self.twarc.search(text) for tweet in tweets: #no daemon(tweets in this day and yesterday) tweetTime = ''.join(tweet['created_at']) datetweet = parser.parse(tweetTime) if (datetweet.date() == datetime.datetime.now().date() or datetweet.date() == (datetime.datetime.now().date() - timedelta(1))): if 'retweeted_status' not in tweet: #avoid RT f.seek(0) #read temporary file (cache) content = f.readlines() content = [ x.strip('\n').strip('u') for x in content ] for i in range(len(content)): if tweet['id_str'] in content: repeated = True else: repeated = False if repeated == False: f.seek(0, 2) #write temporary file (cache) f.write(tweet['id_str'] + '\n') tweetdata = { "CreatedTime": tweet['created_at'], "short_message": tweet['full_text'], "Author": tweet['user']['screen_name'], "Retweets": tweet['retweet_count'], "Likes": tweet['favorite_count'], "Location": tweet['user']['location'], "Rule": ruleId, "full_message": "Tweet matched with RULE: " + ruleId } 
autotweet = json.dumps(tweetdata) fichero.write(autotweet + '\n') self.wcloud.write(tweet['full_text'] + '\n') os.chmod( "/var/log/modosint/analyzer-twitter/wcloudRule" + ruleId + ".txt", 0o777) os.chmod( "/var/log/modosint/analyzer-twitter/graylog.txt", 0o777) else: break #Search All Tweets or timeline from @user def searchUserTweets(self, user, ruleId, fullstring): fichero = open("/var/log/modosint/analyzer-twitter/graylog.txt", "+a", encoding='utf8') with io.open("/var/log/modosint/analyzer-twitter/cache.txt", 'a+') as f: os.chmod("/var/log/modosint/analyzer-twitter/cache.txt", 0o777) tweets = self.twarc.timeline(None, user, None, None) repeated = False t_end = time.time() + 30 for tweet in tweets: if time.time() < t_end: for text in fullstring: if text in tweet['full_text']: f.seek(0) #read temporary file (cache) content = f.readlines() content = [ x.strip('\n').strip('u') for x in content ] for i in range(len(content)): if tweet['id_str'] in content: repeated = True else: repeated = False if repeated == False: f.seek(0, 2) #write temporary file (cache) f.write(tweet['id_str']) f.write('\n') tweetdata = { "CreatedTime": tweet['created_at'], "short_message": tweet['full_text'], "Author": tweet['user']['screen_name'], "Retweets": tweet['retweet_count'], "Likes": tweet['favorite_count'], "Location": tweet['user']['location'], "Rule": ruleId, "full_message": "Tweet matched with RULE: " + ruleId } autotweet = json.dumps(tweetdata) fichero.write(autotweet + '\n') self.wcloud.write(tweet['full_text'] + '\n') os.chmod( "/var/log/modosint/analyzer-twitter/wcloudRule" + ruleId + ".txt", 0o777) os.chmod( "/var/log/modosint/analyzer-twitter/graylog.txt", 0o777) else: break def create_wordcloud(self, text, ruleId): mask = np.array(Image.open(path.join(self.currdir, "twitter_mask.png"))) # create wordcloud object wc = WordCloud(background_color="white", max_words=200, mask=mask, stopwords=self.stop_words) try: # generate wordcloud wc.generate(text) # save wordcloud wc.to_file( path.join(self.currdir + "/WordCloud/Twitter/", "wcTwitterRule" + ruleId + ".png")) os.chmod( path.join(self.currdir + "/WordCloud/Twitter/", "wcTwitterRule" + ruleId + ".png"), 0o777) except ValueError as e: error = True # custom functionality def run(self): self.logger.info("working...") OSINTRules = self.rules for element in OSINTRules: ruleId = element.get('metadata', False).get('id', False) self.wcloud = open( "/var/log/modosint/analyzer-twitter/wcloudRule" + ruleId + ".txt", "a+") checkUsername = element.get('_username', False) checkString = element.get('_string', False) if checkUsername: user = (''.join(element['_username'])) if checkString: string = (','.join(element['_string'])) fullstring = element['_string'] checkLanguage = element.get('_language', False) if checkLanguage: language = (''.join(element['_language'])) self.searchDifLanguage(string, language, ruleId) else: self.searchTweetOrHashtag(string, ruleId) if checkUsername: self.searchUserTweets(user, ruleId, fullstring) if not os.path.exists(self.currdir + "/WordCloud"): os.makedirs(self.currdir + "/WordCloud/") os.chmod(self.currdir + "/WordCloud/", 0o777) if not os.path.exists(self.currdir + "/WordCloud/Twitter"): os.makedirs(self.currdir + "/WordCloud/Twitter/") os.chmod(self.currdir + "/WordCloud/Twitter/", 0o777) for element in OSINTRules: ruleId = element.get('metadata', False).get('id', False) file_content = open( "/var/log/modosint/analyzer-twitter/wcloudRule" + ruleId + ".txt", "r") file_content = file_content.readlines() 
self.create_wordcloud(str(file_content), ruleId) self.createPlotMentions() self.createPlotHashtag() self.alertLogger.info("Twitter Analyzer Job Finished succesfully.") def exportReferenceHashtag(self, mensaje): lista = re.findall(r'#\w+', mensaje) return lista if lista != [] else np.NaN def exportReferenceMentions(self, mensaje): lista = re.findall(r'@\w+', mensaje) return lista if lista != [] else np.NaN def createPlotMentions(self): with io.open('/var/log/modosint/analyzer-twitter/graylog.txt', 'r') as f: dataMentions = f.readlines() data_json = json.dumps( list(map(lambda entry: eval(entry[:-1]), dataMentions))) data_twitter = pd.read_json(data_json) referenceMentions = data_twitter.short_message.map( self.exportReferenceMentions) referenceMentions.dropna(inplace=True) referenceMentions.head() referenceMentions = list(referenceMentions) referenceMentions_list = list(itertools.chain(*referenceMentions)) count_referenceMentions = pd.Series( referenceMentions_list).value_counts() fig = plt.figure(figsize=(12, 8)) sns.barplot(y=count_referenceMentions.iloc[:20].index, x=count_referenceMentions.iloc[:20].values) fig.savefig(self.currdir + 'mentionsPlot.png') os.chmod(self.currdir + 'mentionsPlot.png', 0o777) def createPlotHashtag(self): with io.open('/var/log/modosint/analyzer-twitter/graylog.txt', 'r') as f: dataHashtag = f.readlines() data_json = json.dumps( list(map(lambda entry: eval(entry[:-1]), dataHashtag))) data_twitter = pd.read_json(data_json) referenceHash = data_twitter.short_message.map( self.exportReferenceHashtag) referenceHash.dropna(inplace=True) referenceHash.head() referenceHash = list(referenceHash) referenceHash_list = list(itertools.chain(*referenceHash)) count_referenceHash = pd.Series(referenceHash_list).value_counts() fig = plt.figure(figsize=(12, 8)) sns.barplot(y=count_referenceHash.iloc[:20].index, x=count_referenceHash.iloc[:20].values) fig.savefig(self.currdir + 'mentionsHashtag.png') os.chmod(self.currdir + 'mentionsHashtag.png', 0o777)
class TwitterHarvester(BaseHarvester): def __init__(self, working_path, stream_restart_interval_secs=30 * 60, mq_config=None, debug=False, connection_errors=5, http_errors=5, debug_warcprox=False, tries=3): BaseHarvester.__init__( self, working_path, mq_config=mq_config, stream_restart_interval_secs=stream_restart_interval_secs, debug=debug, debug_warcprox=debug_warcprox, tries=tries) self.twarc = None self.connection_errors = connection_errors self.http_errors = http_errors self.extract_media = False self.extract_web_resources = False self.extract_user_profile_images = False def harvest_seeds(self): # Create a twarc self._create_twarc() # Get harvest extract options. self.extract_media = self.message.get("options", {}).get("media", False) self.extract_web_resources = self.message.get("options", {}).get( "web_resources", False) self.extract_user_profile_images = self.message.get("options", {}).get( "user_images", False) # Dispatch message based on type. harvest_type = self.message.get("type") log.debug("Harvest type is %s", harvest_type) if harvest_type == "twitter_search": self.search() elif harvest_type == "twitter_filter": self.filter() elif harvest_type == "twitter_sample": self.sample() elif harvest_type == "twitter_user_timeline": self.user_timeline() else: raise KeyError def _create_twarc(self): self.twarc = Twarc(self.message["credentials"]["consumer_key"], self.message["credentials"]["consumer_secret"], self.message["credentials"]["access_token"], self.message["credentials"]["access_token_secret"], http_errors=self.http_errors, connection_errors=self.connection_errors) def search(self): assert len(self.message.get("seeds", [])) == 1 incremental = self.message.get("options", {}).get("incremental", False) query = self.message["seeds"][0]["token"] since_id = self.state_store.get_state( __name__, u"{}.since_id".format(query)) if incremental else None self._harvest_tweets(self.twarc.search(query, since_id=since_id)) def filter(self): assert len(self.message.get("seeds", [])) == 1 track = self.message["seeds"][0]["token"].get("track") follow = self.message["seeds"][0]["token"].get("follow") locations = self.message["seeds"][0]["token"].get("locations") self._harvest_tweets( self.twarc.filter(track=track, follow=follow, locations=locations, event=self.stop_harvest_seeds_event)) def sample(self): self._harvest_tweets(self.twarc.sample(self.stop_harvest_seeds_event)) def user_timeline(self): incremental = self.message.get("options", {}).get("incremental", False) for seed in self.message.get("seeds", []): seed_id = seed["id"] screen_name = seed.get("token") user_id = seed.get("uid") log.debug( "Processing seed (%s) with screen name %s and user id %s", seed_id, screen_name, user_id) assert screen_name or user_id # If there is not a user_id, look it up. 
if screen_name and not user_id: user_id = self._lookup_user_id(screen_name) if user_id: # Report back if nsid found self.result.uids[seed_id] = user_id else: msg = "User id not found for user {} because account is not found or suspended".format( screen_name) log.exception(msg) self.result.warnings.append( Msg(CODE_TOKEN_NOT_FOUND, msg, seed_id=seed_id)) # Otherwise, get the current screen_name else: new_screen_name = self._lookup_screen_name(user_id) # if can't find the screen_name, ignore get timeline if not new_screen_name: msg = "Screen name not found for user id {} because account is not found or suspended".format( user_id) log.exception(msg) self.result.warnings.append( Msg(CODE_TOKEN_NOT_FOUND, msg, seed_id=seed_id)) # reset the user_id, ignore the get timeline user_id = None if new_screen_name and new_screen_name != screen_name: self.result.token_updates[seed_id] = new_screen_name screen_name = new_screen_name if user_id: try: # Get since_id from state_store since_id = self.state_store.get_state( __name__, "timeline.{}.since_id".format( user_id)) if incremental else None self._harvest_tweets( self.twarc.timeline(user_id=user_id, since_id=since_id)) except HTTPError as e: if e.response.status_code == 401: account = "user {} (User ID: {})".format( screen_name, user_id ) if screen_name else "user ID: {}".format(user_id) msg = "Unauthorized for {} because account is suspended or protected".format( account) log.exception(msg) self.result.warnings.append( Msg(CODE_TOKEN_UNAUTHORIZED, msg, seed_id=seed_id)) else: raise e def _lookup_screen_name(self, user_id): """ Lookup a screen name given a user id. """ try: users = list(self.twarc.user_lookup(user_ids=(user_id, ))) assert len(users) in (0, 1) if users: return users[0]["screen_name"] except HTTPError as e: if e.response.status_code != 404: raise e return None def _lookup_user_id(self, screen_name): """ Lookup a user id given a screen name. """ try: users = list(self.twarc.user_lookup(screen_names=(screen_name, ))) assert len(users) in (0, 1) if users: return users[0]["id_str"] except HTTPError as e: if e.response.status_code != 404: raise e return None def _harvest_tweets(self, tweets): # max_tweet_id = None for count, tweet in enumerate(tweets): if not count % 100: log.debug("Harvested %s tweets", count) self.result.harvest_counter["tweets"] += 1 if self.stop_harvest_seeds_event.is_set(): log.debug("Stopping since stop event set.") break def _process_entities(self, entities): if self.extract_web_resources: for url in entities.get("urls", []): # Exclude links for tweets if url["expanded_url"] and not status_re.match( url["expanded_url"]): self.result.urls.append(url["expanded_url"]) if self.extract_media: for media in entities.get("media", []): if media["media_url"]: self.result.urls.append(media["media_url"]) def process_warc(self, warc_filepath): # Dispatch message based on type. 
harvest_type = self.message.get("type") log.debug("Harvest type is %s", harvest_type) if harvest_type == "twitter_search": self.process_search_warc(warc_filepath) elif harvest_type == "twitter_filter": self._process_tweets(TwitterStreamWarcIter(warc_filepath)) elif harvest_type == "twitter_sample": self._process_tweets(TwitterStreamWarcIter(warc_filepath)) elif harvest_type == "twitter_user_timeline": self.process_user_timeline_warc(warc_filepath) else: raise KeyError def process_search_warc(self, warc_filepath): incremental = self.message.get("options", {}).get("incremental", False) query = self.message["seeds"][0]["token"] since_id = self.state_store.get_state( __name__, u"{}.since_id".format(query)) if incremental else None max_tweet_id = self._process_tweets(TwitterRestWarcIter(warc_filepath)) # Update state store if incremental and max_tweet_id > since_id: self.state_store.set_state(__name__, u"{}.since_id".format(query), max_tweet_id) def process_user_timeline_warc(self, warc_filepath): incremental = self.message.get("options", {}).get("incremental", False) for count, status in enumerate(TwitterRestWarcIter(warc_filepath)): tweet = status.item if not count % 100: log.debug("Processing %s tweets", count) if "text" in tweet: user_id = tweet["user"]["id_str"] if incremental: # Update state key = "timeline.{}.since_id".format(user_id) self.state_store.set_state( __name__, key, max(self.state_store.get_state(__name__, key), tweet.get("id"))) self._process_tweet(tweet) def _process_tweets(self, warc_iter): max_tweet_id = None for count, status in enumerate(warc_iter): tweet = status.item if not count % 100: log.debug("Processing %s tweets", count) if "text" in tweet: max_tweet_id = max(max_tweet_id, tweet.get("id")) self._process_tweet(tweet) return max_tweet_id def _process_tweet(self, tweet): self.result.increment_stats("tweets") # For more info, see https://dev.twitter.com/overview/api/entities-in-twitter-objects statuses = [tweet] if "retweeted_status" in tweet: statuses.append(tweet["retweeted_status"]) elif "quoted_status" in tweet: statuses.append(tweet["quoted_status"]) for status in statuses: self._process_entities(status.get("entities", {})) self._process_entities(status.get("extended_entities", {})) if self.extract_user_profile_images: self.result.urls.append(tweet["user"]["profile_image_url"]) self.result.urls.append( tweet["user"]["profile_background_image_url"]) if "profile_banner_url" in tweet["user"]: self.result.urls.append(tweet["user"]["profile_banner_url"])
from models import *
from textblob import TextBlob
from twarc import Twarc
import json

init_db()
t = Twarc('BAVSuRNiZ0IGb5CzIlyHzT1Fd',
          'XVDj8C2SMLzMNaUTrJP3a8UqhHDvYQKZIRIJ9awHDVRBuqxtGD',
          '2433228254-RX73xhPl1dQCEBe7zxhz4cgIJQiL5rUKMofBnz5',
          '33Ju7kmGFPKUXT7AbZ5Nj2ADzaMSL832eur2qwnbFqomt')

pos_sum = 0
count = 0
tweets = list()
for tweet in t.search("to:realdonaldtrump"):
    print(tweet['id'])
    print(tweet['in_reply_to_status_id'])
    print(tweet['created_at'])
    print(tweet['text'].encode('utf-8'))
    blob = TextBlob(tweet['text'])
    print(blob.sentiment.polarity)
    print(pos_sum)
    tweet['sentiment'] = blob.sentiment.polarity
    tweets.append(tweet)
    if '#yayfortrump' in tweet['text'] and '#nayfortrump' not in tweet['text']:
        vote = 1
    elif '#yayfortrump' not in tweet['text'] and '#nayfortrump' in tweet['text']:
        vote = -1
    elif blob.sentiment.polarity > 0:
from twarc import Twarc

client_key = 'client_key'
client_secret = 'client_secret'
access_token = '197456523-m2qIYWxkQTFKj0ModTQPcdByTnjryHwLRm9L8o5y'
access_token_secret = 'access_token_secret'

t = Twarc(client_key, client_secret, access_token, access_token_secret)

for tweet in t.search("resigncameron"):
    print(tweet["text"])
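# Note (an assumption about twarc versions, not in the original snippet):
# newer twarc releases request extended tweets by default, in which case the
# text is under "full_text" rather than "text". A defensive lookup handles
# both shapes:
for tweet in t.search("resigncameron"):
    print(tweet.get("full_text", tweet.get("text")))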
class TwitterRelationships(): # Cut-down code to get twitter relationships for a set of hashtags. # Adapted from https://labsblog.f-secure.com/2018/02/16/searching-twitter-with-twarc/ def __init__(self, secretsfile='/Users/sara/twittersecrets.txt'): fsecret = open(secretsfile, 'r') secrets = fsecret.readline() access_token, access_token_secret, consumer_key, consumer_secret = \ [x.strip() for x in secrets.split(',')] self.twarc = Twarc(consumer_key, consumer_secret, access_token, access_token_secret) # Helper functions for saving csv and formatted txt files def write_data(self, data, filename, filetype='txt'): with io.open(filename, "w", encoding="utf-8") as handle: if filetype == 'txt': for item, count in data.most_common(): handle.write(str(count) + "\t" + item + "\n") else: #write to csv handle.write(u"Source,Target,Weight\n") for source, targets in sorted(data.items()): for target, count in sorted(targets.items()): if source != target and source is not None and target is not None: handle.write(source + u"," + target + u"," + str(count) + u"\n") return # Returns the screen_name of the user retweeted, or None def retweeted_user(self, status): if "retweeted_status" in status: orig_tweet = status["retweeted_status"] if "user" in orig_tweet and orig_tweet["user"] is not None: user = orig_tweet["user"] if "screen_name" in user and user["screen_name"] is not None: return user["screen_name"] return # Returns a list of screen_names that the user interacted with in this Tweet def get_interactions(self, status): interactions = [] if "in_reply_to_screen_name" in status: replied_to = status["in_reply_to_screen_name"] if replied_to is not None and replied_to not in interactions: interactions.append(replied_to) if "retweeted_status" in status: orig_tweet = status["retweeted_status"] if "user" in orig_tweet and orig_tweet["user"] is not None: user = orig_tweet["user"] if "screen_name" in user and user["screen_name"] is not None: if user["screen_name"] not in interactions: interactions.append(user["screen_name"]) if "quoted_status" in status: orig_tweet = status["quoted_status"] if "user" in orig_tweet and orig_tweet["user"] is not None: user = orig_tweet["user"] if "screen_name" in user and user["screen_name"] is not None: if user["screen_name"] not in interactions: interactions.append(user["screen_name"]) if "entities" in status: entities = status["entities"] if "user_mentions" in entities: for item in entities["user_mentions"]: if item is not None and "screen_name" in item: mention = item['screen_name'] if mention is not None and mention not in interactions: interactions.append(mention) return interactions # Returns a list of hashtags found in the tweet def get_hashtags(self, status): hashtags = [] if "entities" in status: entities = status["entities"] if "hashtags" in entities: for item in entities["hashtags"]: if item is not None and "text" in item: hashtag = item['text'] if hashtag is not None and hashtag not in hashtags: hashtags.append(hashtag) return hashtags # Returns a list of URLs found in the Tweet def get_urls(self, status): urls = [] if "entities" in status: entities = status["entities"] if "urls" in entities: for item in entities["urls"]: if item is not None and "expanded_url" in item: url = item['expanded_url'] if url is not None and url not in urls: urls.append(url) return urls def get_image_urls(self, status): # Returns the URLs to any images found in the Tweet urls = [] if "entities" in status: entities = status["entities"] if "media" in entities: for item in entities["media"]: if 
item is not None: if "media_url" in item: murl = item["media_url"] if murl not in urls: urls.append(murl) return urls def fetch_images(self): # Iterate through image URLs, fetching each image if we haven't already pictures_dir = os.path.join(self.save_dir, self.dataname + '_' + "images") if not os.path.exists(pictures_dir): print("Creating directory: " + pictures_dir) os.makedirs(pictures_dir) for url in self.all_image_urls: m = re.search("^http:\/\/pbs\.twimg\.com\/media\/(.+)$", url) if m is not None: filename = m.group(1) print("Getting picture from: " + url) save_path = os.path.join(pictures_dir, filename) if not os.path.exists(save_path): response = requests.get(url, stream=True) with open(save_path, 'wb') as out_file: shutil.copyfileobj(response.raw, out_file) del response return def writedf(self, dataset, name, columns): filename = os.path.join(self.save_dir, self.dataname + '_' + name) with io.open(filename, "w", encoding="utf-8") as handle: handle.write('\t'.join(columns) + u"\n") for row in dataset: handle.write('\t'.join(row) + u"\n") return def save_datasets(self, fetch_images=True): csv_outputs = { "user_user_graph.csv": self.user_user_graph, "user_hashtag_graph.csv": self.user_hashtag_graph, "hashtag_hashtag_graph.csv": self.hashtag_hashtag_graph } for name, dataset in csv_outputs.items(): filename = os.path.join(self.save_dir, self.dataname + '_' + name) self.write_data(dataset, filename, 'csv') text_outputs = { "hashtags.txt": self.hashtag_frequency_dist, "influencers.txt": self.influencer_frequency_dist, "mentioned.txt": self.mentioned_frequency_dist, "urls.txt": self.url_frequency_dist } for name, dataset in text_outputs.items(): filename = os.path.join(self.save_dir, self.dataname + '_' + name) self.write_data(dataset, filename, 'txt') self.writedf(self.url_refs, "url_refs.csv", ['url', 'tweeturl']) self.writedf(self.image_refs, "image_refs.csv", ['url', 'tweeturl']) self.writedf(self.tweets, "tweets.csv", ['url', 'screen_name', 'id', 'created_at', 'text']) if fetch_images: self.fetch_images() return def make_directories(self, target, rootdir='../data/twitter'): # Create a separate save directory for each search query # Since search queries can be a whole sentence, we'll check the length # and simply number it if the query is overly long self.dataname = datetime.now().strftime( "%Y%m%d%H%M%S") + '_' + target.replace(" ", "_") self.save_dir = rootdir if not os.path.exists(rootdir): os.makedirs(rootdir) if len(target) < 30: self.save_dir += "/" + self.dataname else: self.save_dir += "/target_" + str(count + 1) if not os.path.exists(self.save_dir): print("Creating directory: " + self.save_dir) os.makedirs(self.save_dir) return def get_target_data(self, target): # Variables for capturing stuff self.tweets_captured = 0 self.influencer_frequency_dist = Counter() self.mentioned_frequency_dist = Counter() self.hashtag_frequency_dist = Counter() self.url_frequency_dist = Counter() self.user_user_graph = {} self.user_hashtag_graph = {} self.hashtag_hashtag_graph = {} self.all_image_urls = [] self.tweets = [] self.tweet_count = 0 self.url_refs = [] self.image_refs = [] # Start the search for status in self.twarc.search(target): # Output some status as we go, so we know something is happening sys.stdout.write("\r") sys.stdout.flush() sys.stdout.write("Collected " + str(self.tweet_count) + " tweets.") sys.stdout.flush() self.tweet_count += 1 screen_name = None if "user" in status: if "screen_name" in status["user"]: screen_name = status["user"]["screen_name"] retweeted = 
self.retweeted_user(status) if retweeted is not None: self.influencer_frequency_dist[retweeted] += 1 else: self.influencer_frequency_dist[screen_name] += 1 # Tweet text can be in either "text" or "full_text" field... text = None if "full_text" in status: text = status["full_text"] elif "text" in status: text = status["text"] id_str = None if "id_str" in status: id_str = status["id_str"] # Assemble the URL to the tweet we received... tweet_url = None if id_str is not None and screen_name is not None: tweet_url = "https://twitter.com/" + screen_name + "/status/" + id_str # if tweet_url is not None and text is not None: # self.tweets[tweet_url] = text created_at = None if "created_at" in status: created_at = status["created_at"] self.tweets += [[tweet_url, screen_name, id_str, created_at, text]] #capture everything # Record mapping graph between users interactions = self.get_interactions(status) if interactions is not None: for user in interactions: self.mentioned_frequency_dist[user] += 1 if screen_name not in self.user_user_graph: self.user_user_graph[screen_name] = {} if user not in self.user_user_graph[screen_name]: self.user_user_graph[screen_name][user] = 1 else: self.user_user_graph[screen_name][user] += 1 # Record mapping graph between users and hashtags hashtags = self.get_hashtags(status) if hashtags is not None: if len(hashtags) > 1: hashtag_interactions = [] # This code creates pairs of hashtags in situations where multiple # hashtags were found in a tweet # This is used to create a graph of hashtag-hashtag interactions for comb in combinations(sorted(hashtags), 2): hashtag_interactions.append(comb) if len(hashtag_interactions) > 0: for inter in hashtag_interactions: item1, item2 = inter if item1 not in self.hashtag_hashtag_graph: self.hashtag_hashtag_graph[item1] = {} if item2 not in self.hashtag_hashtag_graph[item1]: self.hashtag_hashtag_graph[item1][item2] = 1 else: self.hashtag_hashtag_graph[item1][item2] += 1 for hashtag in hashtags: self.hashtag_frequency_dist[hashtag] += 1 if screen_name not in self.user_hashtag_graph: self.user_hashtag_graph[screen_name] = {} if hashtag not in self.user_hashtag_graph[screen_name]: self.user_hashtag_graph[screen_name][hashtag] = 1 else: self.user_hashtag_graph[screen_name][hashtag] += 1 urls = self.get_urls(status) if urls is not None: for url in urls: self.url_refs += [[url, tweet_url]] self.url_frequency_dist[url] += 1 image_urls = self.get_image_urls(status) if image_urls is not None: for url in image_urls: self.image_refs += [[url, tweet_url]] if url not in self.all_image_urls: self.all_image_urls.append(url) self.save_datasets(fetch_images=True) return
class TwitterHarvester(BaseHarvester):
    def __init__(self, process_interval_secs=1200, mq_config=None, debug=False):
        BaseHarvester.__init__(self, mq_config=mq_config,
                               process_interval_secs=process_interval_secs,
                               debug=debug)
        self.twarc = None

    def harvest_seeds(self):
        # Create a twarc
        self._create_twarc()

        # Dispatch message based on type.
        harvest_type = self.message.get("type")
        log.debug("Harvest type is %s", harvest_type)
        if harvest_type == "twitter_search":
            self.search()
        elif harvest_type == "twitter_filter":
            self.filter()
        else:
            raise KeyError

    def _create_twarc(self):
        self.twarc = Twarc(self.message["credentials"]["consumer_key"],
                           self.message["credentials"]["consumer_secret"],
                           self.message["credentials"]["access_token"],
                           self.message["credentials"]["access_token_secret"])

    def search(self):
        incremental = self.message.get("options", {}).get("incremental", False)

        for seed in self.message.get("seeds", []):
            query = seed.get("token")
            # Get since_id from state_store
            since_id = self.state_store.get_state(__name__, "{}.since_id".format(query)) if incremental else None

            max_tweet_id = self._process_tweets(self.twarc.search(query, since_id=since_id))
            log.debug("Searching on %s since %s returned %s tweets.", query,
                      since_id, self.harvest_result.summary.get("tweet"))

            # Update state store
            if incremental and max_tweet_id:
                self.state_store.set_state(__name__, "{}.since_id".format(query), max_tweet_id)

    def filter(self):
        assert len(self.message.get("seeds", [])) == 1

        track = self.message["seeds"][0]["token"]

        self._process_tweets(self.twarc.stream(track))

    def _process_tweets(self, tweets):
        max_tweet_id = None

        for count, tweet in enumerate(tweets):
            if not count % 100:
                log.debug("Processed %s tweets", count)
            if self.stop_event.is_set():
                log.debug("Stopping since stop event set.")
                break
            if "text" in tweet:
                with self.harvest_result_lock:
                    max_tweet_id = max(max_tweet_id, tweet.get("id"))
                    self.harvest_result.increment_summary("tweet")
                    if "urls" in tweet["entities"]:
                        for url in tweet["entities"]["urls"]:
                            self.harvest_result.urls.append(url["expanded_url"])
                    if "media" in tweet["entities"]:
                        for media in tweet["entities"]["media"]:
                            self.harvest_result.urls.append(media["media_url"])
        return max_tweet_id
NAME THE TWEET DOCUMENTS THAT YOU WOULD LIKE THE TWEETS TO BE DUMPED IN
'''
file_dir = 'tweet_files'
if not os.path.isdir(file_dir):
    os.mkdir(file_dir)

'''
HERE IS WHERE THE TWITTER SCRAPING STARTS. PLEASE DO NOT TOUCH ANYTHING BELOW.
IF YOU WANT TO CHANGE THE TWARC FUNCTION, YOU MAY DO SO AS LONG AS YOU MAKE SURE
THE CORRESPONDING PARAMETERS ARE CHANGED.
'''
t = Twarc(consumer_key, consumer_secret, access_token, access_token_secret)

for topic in topic_list:
    # Remove any spaces and hashtags to name the output file
    temp_str = topic.replace('#', '')
    temp_str = temp_str.replace(' ', '_')
    file_name = temp_str + '.txt'
    file_write = os.path.join(file_dir, file_name)
    print('Started scraping tweets for the topic {0}'.format(topic))
    # writing to output file
    with open(file_write, 'w') as writefile:
        for tweet in t.search(topic, lang=language):
            line = tweet['full_text']
            # writing tweet to file
            writefile.write(line)
    print('Just finished scraping tweets for the topic {0}'.format(topic))
from twarc import Twarc
import json

CONSUMER_KEY = 'CfHUyBhlMaLv5Mn8r2IziXpLs'
CONSUMER_SECRET = 'PqqtbhbyNb5mcJ2dHkSIT2wupOMuEqfSINGYvV8KDIOPuqgDkN'
ACCESS_TOKEN = '29202483-qK6twPLeurVc8Ls8zBxdFtaFGyzm76LUBbtXOMMk1'
ACCESS_TOKEN_SECRET = 'aOIFdI1TVJjsIPWNO1rAFx2IECzVSCPY4kOnEKBA0pCdA'

w = open('tweetDay5.json', 'w')
t = Twarc(CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
count = 1
for tweet in t.search("google fi"):
    w.write(json.dumps(tweet))
    w.write('\n')
    print(count)
    count += 1
    if count > 1000:
        break
auth.set_access_token(keys.access_token, keys.access_token_secret)
api = tweepy.API(auth)

arguments = sys.argv

# Get parameters from command line
if len(arguments) > 1:
    # If there are any arguments, join them with an OR in between
    # (skip arguments[0], which is the script name itself)
    hashtags = ' OR '.join(map(str, arguments[1:]))
else:
    # If no arguments, don't run
    print("No arguments passed")
    sys.exit(0)

# Search Twitter for tweets containing the hashtags
tweets = twarc.search(hashtags)

for tweet in tweets:
    user = tweet['user']
    # Check if tweet (not retweet) and determine if the user is an influencer
    if utils.not_retweet(tweet) and utils.is_influencer(tweet):
        # Check if you follow the influencer and if you've already sent a follow request
        if user['following'] == False and user['follow_request_sent'] == False:
            # Follow
            print('following: ' + user['name'])
            api.create_friendship(user['id'])
        # Check if they have more retweets than likes, like the tweet if not liked yet, and vice versa
        if tweet['retweet_count'] >= tweet['favorite_count'] and tweet[
print("Creating directory: " + save_dir) os.makedirs(save_dir) # Variables for capturing stuff tweets_captured = 0 influencer_frequency_dist = Counter() mentioned_frequency_dist = Counter() hashtag_frequency_dist = Counter() url_frequency_dist = Counter() user_user_graph = {} user_hashtag_graph = {} hashtag_hashtag_graph = {} all_image_urls = [] tweets = {} tweet_count = 0 # Start the search for status in twarc.search(target): # Output some status as we go, so we know something is happening sys.stdout.write("\r") sys.stdout.flush() sys.stdout.write("Collected " + str(tweet_count) + " tweets.") sys.stdout.flush() tweet_count += 1 screen_name = None if "user" in status: if "screen_name" in status["user"]: screen_name = status["user"]["screen_name"] retweeted = retweeted_user(status) if retweeted is not None: influencer_frequency_dist[retweeted] += 1
class TwitterHarvester(BaseHarvester): def __init__(self, working_path, stream_restart_interval_secs=30 * 60, mq_config=None, debug=False, connection_errors=5, http_errors=5, debug_warcprox=False, tries=3): BaseHarvester.__init__(self, working_path, mq_config=mq_config, stream_restart_interval_secs=stream_restart_interval_secs, debug=debug, debug_warcprox=debug_warcprox, tries=tries) self.twarc = None self.connection_errors = connection_errors self.http_errors = http_errors self.extract_media = False self.extract_web_resources = False self.extract_user_profile_images = False def harvest_seeds(self): # Create a twarc self._create_twarc() # Get harvest extract options. self.extract_media = self.message.get("options", {}).get("media", False) self.extract_web_resources = self.message.get("options", {}).get("web_resources", False) self.extract_user_profile_images = self.message.get("options", {}).get("user_images", False) # Dispatch message based on type. harvest_type = self.message.get("type") log.debug("Harvest type is %s", harvest_type) if harvest_type == "twitter_search": self.search() elif harvest_type == "twitter_filter": self.filter() elif harvest_type == "twitter_sample": self.sample() elif harvest_type == "twitter_user_timeline": self.user_timeline() else: raise KeyError def _create_twarc(self): self.twarc = Twarc(self.message["credentials"]["consumer_key"], self.message["credentials"]["consumer_secret"], self.message["credentials"]["access_token"], self.message["credentials"]["access_token_secret"], http_errors=self.http_errors, connection_errors=self.connection_errors) def search(self): assert len(self.message.get("seeds", [])) == 1 incremental = self.message.get("options", {}).get("incremental", False) query = self.message["seeds"][0]["token"] since_id = self.state_store.get_state(__name__, u"{}.since_id".format(query)) if incremental else None self._harvest_tweets(self.twarc.search(query, since_id=since_id)) def filter(self): assert len(self.message.get("seeds", [])) == 1 track = self.message["seeds"][0]["token"].get("track") follow = self.message["seeds"][0]["token"].get("follow") locations = self.message["seeds"][0]["token"].get("locations") self._harvest_tweets(self.twarc.filter(track=track, follow=follow, locations=locations)) def sample(self): self._harvest_tweets(self.twarc.sample()) def user_timeline(self): incremental = self.message.get("options", {}).get("incremental", False) for seed in self.message.get("seeds", []): seed_id = seed["id"] screen_name = seed.get("token") user_id = seed.get("uid") log.debug("Processing seed (%s) with screen name %s and user id %s", seed_id, screen_name, user_id) assert screen_name or user_id # If there is not a user_id, look it up. 
if screen_name and not user_id: user_id = self._lookup_user_id(screen_name) if user_id: # Report back if nsid found self.result.uids[seed_id] = user_id else: msg = "User id not found for user {}".format(screen_name) log.exception(msg) self.result.warnings.append(Msg(CODE_TOKEN_NOT_FOUND, msg)) # Otherwise, get the current screen_name else: new_screen_name = self._lookup_screen_name(user_id) if new_screen_name != screen_name: self.result.token_updates[seed_id] = new_screen_name screen_name = new_screen_name if user_id: try: # Get since_id from state_store since_id = self.state_store.get_state(__name__, "timeline.{}.since_id".format( user_id)) if incremental else None self._harvest_tweets(self.twarc.timeline(user_id=user_id, since_id=since_id)) except HTTPError as e: if e.response.status_code == 401: msg = "Unauthorized for user {} (User ID: {}) because account is suspended or private".format(screen_name, user_id) log.exception(msg) self.result.warnings.append(Msg(CODE_TOKEN_UNAUTHORIZED, msg)) else: raise e def _lookup_screen_name(self, user_id): """ Lookup a screen name given a user id. """ users = list(self.twarc.user_lookup(user_ids=(user_id,))) assert len(users) in (0, 1) if users: return users[0]["screen_name"] return None def _lookup_user_id(self, screen_name): """ Lookup a user id given a screen name. """ users = list(self.twarc.user_lookup(screen_names=(screen_name,))) assert len(users) in (0, 1) if users: return users[0]["id_str"] return None def _harvest_tweets(self, tweets): # max_tweet_id = None for count, tweet in enumerate(tweets): if not count % 100: log.debug("Harvested %s tweets", count) self.result.harvest_counter["tweets"] += 1 if self.stop_harvest_seeds_event.is_set(): log.debug("Stopping since stop event set.") break def _process_entities(self, entities): if self.extract_web_resources: for url in entities.get("urls", []): # Exclude links for tweets if url["expanded_url"] and not status_re.match(url["expanded_url"]): self.result.urls.append(url["expanded_url"]) if self.extract_media: for media in entities.get("media", []): if media["media_url"]: self.result.urls.append(media["media_url"]) def process_warc(self, warc_filepath): # Dispatch message based on type. 
harvest_type = self.message.get("type") log.debug("Harvest type is %s", harvest_type) if harvest_type == "twitter_search": self.process_search_warc(warc_filepath) elif harvest_type == "twitter_filter": self._process_tweets(TwitterStreamWarcIter(warc_filepath)) elif harvest_type == "twitter_sample": self._process_tweets(TwitterStreamWarcIter(warc_filepath)) elif harvest_type == "twitter_user_timeline": self.process_user_timeline_warc(warc_filepath) else: raise KeyError def process_search_warc(self, warc_filepath): incremental = self.message.get("options", {}).get("incremental", False) query = self.message["seeds"][0]["token"] since_id = self.state_store.get_state(__name__, u"{}.since_id".format(query)) if incremental else None max_tweet_id = self._process_tweets(TwitterRestWarcIter(warc_filepath)) # Update state store if incremental and max_tweet_id > since_id: self.state_store.set_state(__name__, u"{}.since_id".format(query), max_tweet_id) def process_user_timeline_warc(self, warc_filepath): incremental = self.message.get("options", {}).get("incremental", False) for count, status in enumerate(TwitterRestWarcIter(warc_filepath)): tweet = status.item if not count % 100: log.debug("Processing %s tweets", count) if "text" in tweet: user_id = tweet["user"]["id_str"] if incremental: # Update state key = "timeline.{}.since_id".format(user_id) self.state_store.set_state(__name__, key, max(self.state_store.get_state(__name__, key), tweet.get("id"))) self._process_tweet(tweet) def _process_tweets(self, warc_iter): max_tweet_id = None for count, status in enumerate(warc_iter): tweet = status.item if not count % 100: log.debug("Processing %s tweets", count) if "text" in tweet: max_tweet_id = max(max_tweet_id, tweet.get("id")) self._process_tweet(tweet) return max_tweet_id def _process_tweet(self, tweet): self.result.increment_stats("tweets") # For more info, see https://dev.twitter.com/overview/api/entities-in-twitter-objects statuses = [tweet] if "retweeted_status" in tweet: statuses.append(tweet["retweeted_status"]) elif "quoted_status" in tweet: statuses.append(tweet["quoted_status"]) for status in statuses: self._process_entities(status.get("entities", {})) self._process_entities(status.get("extended_entities", {})) if self.extract_user_profile_images: self.result.urls.append(tweet["user"]["profile_image_url"]) self.result.urls.append(tweet["user"]["profile_background_image_url"]) if "profile_banner_url" in tweet["user"]: self.result.urls.append(tweet["user"]["profile_banner_url"])
class TwitterHarvester(BaseHarvester):
    def __init__(self, working_path, stream_restart_interval_secs=30 * 60, mq_config=None, debug=False,
                 connection_errors=5, http_errors=5, debug_warcprox=False, tries=3):
        BaseHarvester.__init__(self, working_path, mq_config=mq_config,
                               stream_restart_interval_secs=stream_restart_interval_secs,
                               debug=debug, debug_warcprox=debug_warcprox, tries=tries)
        self.twarc = None
        self.connection_errors = connection_errors
        self.http_errors = http_errors

    def harvest_seeds(self):
        # Create a twarc
        self._create_twarc()

        # Dispatch message based on type.
        harvest_type = self.message.get("type")
        log.debug("Harvest type is %s", harvest_type)

        if harvest_type == "twitter_search":
            self.search()
        elif harvest_type == "twitter_filter":
            self.filter()
        elif harvest_type == "twitter_sample":
            self.sample()
        elif harvest_type == "twitter_user_timeline":
            self.user_timeline()
        else:
            raise KeyError

    def _create_twarc(self):
        self.twarc = Twarc(self.message["credentials"]["consumer_key"],
                           self.message["credentials"]["consumer_secret"],
                           self.message["credentials"]["access_token"],
                           self.message["credentials"]["access_token_secret"],
                           http_errors=self.http_errors,
                           connection_errors=self.connection_errors,
                           tweet_mode="extended")

    def search(self):
        assert len(self.message.get("seeds", [])) == 1

        incremental = self.message.get("options", {}).get("incremental", False)
        since_id = self.state_store.get_state(__name__, u"{}.since_id".format(self._search_id())) \
            if incremental else None

        query, geocode = self._search_parameters()
        self._harvest_tweets(self.twarc.search(query, geocode=geocode, since_id=since_id))

    def _search_parameters(self):
        if type(self.message["seeds"][0]["token"]) is dict:
            query = self.message["seeds"][0]["token"].get("query")
            geocode = self.message["seeds"][0]["token"].get("geocode")
        else:
            query = self.message["seeds"][0]["token"]
            geocode = None
        return query, geocode

    def _search_id(self):
        query, geocode = self._search_parameters()
        if query and not geocode:
            return query
        if geocode and not query:
            return geocode
        return ":".join([query, geocode])

    def filter(self):
        assert len(self.message.get("seeds", [])) == 1

        track = self.message["seeds"][0]["token"].get("track")
        follow = self.message["seeds"][0]["token"].get("follow")
        locations = self.message["seeds"][0]["token"].get("locations")
        language = self.message["seeds"][0]["token"].get("language")

        self._harvest_tweets(self.twarc.filter(track=track, follow=follow, locations=locations,
                                               lang=language, event=self.stop_harvest_seeds_event))

    def sample(self):
        self._harvest_tweets(self.twarc.sample(self.stop_harvest_seeds_event))

    def user_timeline(self):
        incremental = self.message.get("options", {}).get("incremental", False)

        for seed in self.message.get("seeds", []):
            seed_id = seed["id"]
            screen_name = seed.get("token")
            user_id = seed.get("uid")
            log.debug("Processing seed (%s) with screen name %s and user id %s", seed_id, screen_name, user_id)
            assert screen_name or user_id

            # If there is not a user_id, look it up.
            if screen_name and not user_id:
                result, user = self._lookup_user(screen_name, "screen_name")
                if result == "OK":
                    user_id = user["id_str"]
                    self.result.uids[seed_id] = user_id
                else:
                    msg = u"User id not found for {} because account is {}".format(
                        screen_name, self._result_to_reason(result))
                    log.exception(msg)
                    self.result.warnings.append(Msg("token_{}".format(result), msg, seed_id=seed_id))
            # Otherwise, get the current screen_name
            else:
                result, user = self._lookup_user(user_id, "user_id")
                if result == "OK":
                    new_screen_name = user["screen_name"]
                    if new_screen_name and new_screen_name != screen_name:
                        self.result.token_updates[seed_id] = new_screen_name
                else:
                    msg = u"User {} (User ID: {}) not found because account is {}".format(
                        screen_name, user_id, self._result_to_reason(result))
                    log.exception(msg)
                    self.result.warnings.append(Msg("uid_{}".format(result), msg, seed_id=seed_id))
                    user_id = None

            if user_id:
                # Get since_id from state_store
                since_id = self.state_store.get_state(__name__, "timeline.{}.since_id".format(user_id)) \
                    if incremental else None
                self._harvest_tweets(self.twarc.timeline(user_id=user_id, since_id=since_id))

    def _lookup_user(self, id, id_type):
        url = "https://api.twitter.com/1.1/users/show.json"
        params = {id_type: id}

        # USER_DELETED: 404 and {"errors": [{"code": 50, "message": "User not found."}]}
        # USER_PROTECTED: 200 and user object with "protected": true
        # USER_SUSPENDED: 403 and {"errors":[{"code":63,"message":"User has been suspended."}]}
        result = "OK"
        user = None
        try:
            resp = self.twarc.get(url, params=params, allow_404=True)
            user = resp.json()
            if user['protected']:
                result = "unauthorized"
        except requests.exceptions.HTTPError as e:
            try:
                resp_json = e.response.json()
            except json.decoder.JSONDecodeError:
                raise e
            if e.response.status_code == 404 and self._has_error_code(resp_json, 50):
                result = "not_found"
            elif e.response.status_code == 403 and self._has_error_code(resp_json, 63):
                result = "suspended"
            else:
                raise e
        return result, user

    @staticmethod
    def _has_error_code(resp, code):
        if isinstance(code, int):
            code = (code,)
        for error in resp['errors']:
            if error['code'] in code:
                return True
        return False

    @staticmethod
    def _result_to_reason(result):
        if result == "unauthorized":
            return "protected"
        elif result == "suspended":
            return "suspended"
        return "not found or deleted"

    def _harvest_tweets(self, tweets):
        # max_tweet_id = None
        for count, tweet in enumerate(tweets):
            if not count % 100:
                log.debug("Harvested %s tweets", count)
            self.result.harvest_counter["tweets"] += 1
            if self.stop_harvest_seeds_event.is_set():
                log.debug("Stopping since stop event set.")
                break

    def process_warc(self, warc_filepath):
        # Dispatch message based on type.
        harvest_type = self.message.get("type")
        log.debug("Harvest type is %s", harvest_type)

        if harvest_type == "twitter_search":
            self.process_search_warc(warc_filepath)
        elif harvest_type == "twitter_filter":
            self._process_tweets(TwitterStreamWarcIter(warc_filepath))
        elif harvest_type == "twitter_sample":
            self._process_tweets(TwitterStreamWarcIter(warc_filepath))
        elif harvest_type == "twitter_user_timeline":
            self.process_user_timeline_warc(warc_filepath)
        else:
            raise KeyError

    def process_search_warc(self, warc_filepath):
        incremental = self.message.get("options", {}).get("incremental", False)
        since_id = self.state_store.get_state(__name__, u"{}.since_id".format(self._search_id())) \
            if incremental else None

        max_tweet_id = self._process_tweets(TwitterRestWarcIter(warc_filepath))

        # Update state store
        if incremental and (max_tweet_id or 0) > (since_id or 0):
            self.state_store.set_state(__name__, u"{}.since_id".format(self._search_id()), max_tweet_id)

    def process_user_timeline_warc(self, warc_filepath):
        incremental = self.message.get("options", {}).get("incremental", False)

        for count, status in enumerate(TwitterRestWarcIter(warc_filepath)):
            tweet = status.item
            if not count % 100:
                log.debug("Processing %s tweets", count)
            if "text" in tweet or "full_text" in tweet:
                user_id = tweet["user"]["id_str"]
                if incremental:
                    # Update state
                    key = "timeline.{}.since_id".format(user_id)
                    self.state_store.set_state(__name__, key,
                                               max(self.state_store.get_state(__name__, key) or 0,
                                                   tweet.get("id")))
                self._process_tweet(tweet)

    def _process_tweets(self, warc_iter):
        max_tweet_id = None

        for count, status in enumerate(warc_iter):
            tweet = status.item
            if not count % 100:
                log.debug("Processing %s tweets", count)
            if "text" in tweet or "full_text" in tweet:
                max_tweet_id = max(max_tweet_id or 0, tweet.get("id"))
                self._process_tweet(tweet)
        return max_tweet_id

    def _process_tweet(self, _):
        self.result.increment_stats("tweets")
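For orientation, here is a sketch of the kind of harvest message the class expects, inferred from the fields that harvest_seeds(), _create_twarc(), and search() read above. The exact schema is defined elsewhere in SFM, so treat the values as illustrative only.

# Illustrative only: keys mirror what the harvester code reads; values are placeholders.
message = {
    "type": "twitter_search",
    "credentials": {
        "consumer_key": "...",
        "consumer_secret": "...",
        "access_token": "...",
        "access_token_secret": "...",
    },
    "seeds": [
        # token may be a plain query string or a dict with "query"/"geocode"
        {"id": "seed-1", "token": {"query": "obama", "geocode": None}},
    ],
    "options": {"incremental": True},
}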
from twarc import Twarc

tw = Twarc()

# Get training data: English-language tweets mentioning covid-19.
for tweet in tw.search("covid-19", lang='en'):
    try:
        screen_name = None
        if "screen_name" in tweet["user"]:
            screen_name = tweet["user"]["screen_name"]
        id_str = tweet["id_str"]
        tweet_url = None
        if id_str is not None and screen_name is not None:
            tweet_url = "https://twitter.com/" + screen_name + "/status/" + id_str
        # Put training data into a txt file.
        with open("trainingcovid-19.txt", "a+") as f:
            # Move read cursor to the start of file.
            f.seek(0)
            # If file is not empty then append '\n'.
            data = f.read(100)
            if len(data) > 0:
                f.write("\n")
            # Append the tweet text and its URL at the end of the file.
            f.write(tweet['full_text'])
            f.write("\n")
            if tweet_url:
                f.write(tweet_url)
    except UnicodeEncodeError:
        print("UnicodeEncodeError in finding training data")

# Now we have to manually sort the training data.
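A plain text file keeps only the tweet text and URL. As a minimal sketch (the output file name is made up), the same collection could also be saved as one JSON object per line, the JSONL layout used by other snippets in this document, so the full tweet objects remain available for later reprocessing:

import json
from twarc import Twarc

tw = Twarc()  # assumes credentials are already configured, e.g. via `twarc configure`

# Hypothetical companion file holding the complete tweet objects.
with open("trainingcovid-19.jsonl", "a", encoding="utf-8") as out:
    for tweet in tw.search("covid-19", lang="en"):
        out.write(json.dumps(tweet) + "\n")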
import csv
import json
import os

from twarc import Twarc

__location__ = os.path.dirname(os.path.realpath(__file__))
users = os.path.join(__location__, "apostrophe", "tweets.csv")

# Collect the unique user names from the fourth column, skipping the header row.
userList = []
with open(users, 'r', encoding='utf-8') as f:
    reader = csv.reader(f)
    rowCount = 0
    for row in reader:
        rowCount += 1
        if rowCount > 1 and row[3] not in userList:
            userList.append(row[3])

# One Twarc client is enough for all of the searches.
t = Twarc()
tweets = []
tweetContent = ""
for user in userList:
    for tweet in t.search("from:" + user):
        print(tweet["full_text"])
        tweetContent += "%s\n" % str(tweet["full_text"])
        tweets.append(tweet)

outputFile = os.path.join(__location__, "possibleBotTweets.jsonl")
with open(outputFile, "w", encoding='utf-8') as output:
    for line in tweets:
        output.write("%s\n" % json.dumps(line))

contentOutput = os.path.join(__location__, "possibleBotTweetContent.txt")
with open(contentOutput, "w", encoding='utf-8') as output2:
    output2.write(tweetContent)
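Standard search only reaches back about a week, so "from:user" queries miss older tweets. If a fuller per-account history is wanted, twarc's timeline method (which pages back through roughly the user's last 3,200 tweets) is an alternative. A minimal sketch reusing the same userList; the output file name is made up:

import json
from twarc import Twarc

t = Twarc()
with open("possibleBotTimelines.jsonl", "w", encoding="utf-8") as out:
    for user in userList:
        # timeline() pages through the user's recent tweets, newest first.
        for tweet in t.timeline(screen_name=user):
            out.write(json.dumps(tweet) + "\n")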
# Excerpt: fields added to each tweet dict during preprocessing; tweet_dic and fp
# are defined by the caller.
tweet_dic['poi_id'] = 18839785
tweet_dic['user']['verified'] = False  # CHANGE
print(tweet_dic['poi_name'])
tweet_dic['country'] = "India"
full_text = tweet_dic['full_text']
tweet_dic['text_copy'] = demoji.replace(full_text)
tweet_dic['tweet_emotions'] = list(demoji.findall(full_text).keys())
json.dump(tweet_dic, fp, ensure_ascii=False)
fp.write("\n")

# Collect Hindi-language tweets for each name, skipping retweets, and append them
# to file_name as JSON lines until roughly 2,500 tweets have been written in total.
t = Twarc(consumer_key, consumer_secret, access_key, access_secret)
max_number = 0
for name in name_list:
    print(name)
    with open(file_name, "a", encoding='utf-8') as file:
        for tweet in t.search(q=str(name), lang='hi'):
            # print("In")
            # preprocessing(tweet, file)
            if 'retweeted_status' in tweet.keys():
                print("It's a retweet")
                continue
            else:
                json.dump(tweet, file, ensure_ascii=False)
                file.write("\n")
                max_number += 1
                print("{} number {}".format(name, max_number))
                if max_number > 2500:
                    break
    time.sleep(10)
preprocessing(file_name, file_processed)
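The two demoji calls above do the emoji handling: findall maps each emoji in the text to a short description, and replace strips them. A small sketch, with a made-up sample string (older demoji releases also need demoji.download_codes() run once before use):

import demoji

text = "lockdown extended 😷🙏"  # made-up example text

demoji.findall(text)   # e.g. {'😷': 'face with medical mask', '🙏': 'folded hands'}
demoji.replace(text)   # 'lockdown extended '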
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from twarc import Twarc

# consumer_key, consumer_secret, access_token, and access_token_secret must be
# defined with your Twitter API credentials.
t = Twarc(consumer_key, consumer_secret, access_token, access_token_secret)

for tweet in t.search("ferguson"):
    print(tweet["text"])
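Hard-coding keys in a script makes them easy to leak. A sketch of the same example reading the keys from environment variables at runtime; the variable names are illustrative, and twarc can alternatively load keys from the config file written by `twarc configure`, in which case Twarc() needs no arguments:

import os
from twarc import Twarc

# Hypothetical environment variable names.
t = Twarc(os.environ["TWITTER_CONSUMER_KEY"],
          os.environ["TWITTER_CONSUMER_SECRET"],
          os.environ["TWITTER_ACCESS_TOKEN"],
          os.environ["TWITTER_ACCESS_TOKEN_SECRET"])

for tweet in t.search("ferguson"):
    # Extended-mode tweets carry "full_text"; fall back to "text" otherwise.
    print(tweet.get("full_text") or tweet.get("text"))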
def crawl_feed(feed_dict, credentials):
    twarc = Twarc(credentials['consumer_key'], credentials['consumer_secret'],
                  credentials['access_token'], credentials['access_token_secret'])

    crawl_time = datetime.datetime.now()
    crawl_time_filename = crawl_time.strftime('%Y%m%d%I%M%S')
    crawl_time_html = crawl_time.strftime('%B %d, %Y')

    crawl_name = feed_dict['crawl_name']
    crawl_type = feed_dict['crawl_type']
    short_name = feed_dict['short_name']
    search_string = feed_dict['search_string']
    feed_dir = feed_dict['feed_dir']
    json_dir = join(feed_dir, 'json')
    html_dir = join(feed_dir, 'html')
    media_dir = join(feed_dir, 'media')
    logs_dir = join(feed_dir, 'logs')

    for directory in [feed_dir, json_dir, html_dir, media_dir, logs_dir]:
        if not os.path.exists(directory):
            os.makedirs(directory)

    log_file = join(logs_dir, 'twarc.log')
    formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
    logger = logging.getLogger(crawl_name)
    handler = logging.FileHandler(log_file)
    handler.setFormatter(formatter)
    logger.addHandler(handler)
    logger.setLevel(logging.INFO)

    base_filename = short_name + '-' + crawl_time_filename
    json_file = join(json_dir, base_filename + '.json')

    print("Searching Twitter API for {0}".format(search_string))
    print("Writing JSON and HTML files...")
    logger.info("starting search for %s", search_string)

    tweet_count = 0
    for tweet in twarc.search(search_string):
        with open(json_file, 'a') as json_out:
            json_out.write("{}\n".format(json.dumps(tweet)))
        if "id_str" in tweet:
            logger.info("archived https://twitter.com/%s/status/%s",
                        tweet['user']['screen_name'], tweet["id_str"])
        elif 'limit' in tweet:
            logger.warning("%s tweets undelivered", tweet["limit"]["track"])
        elif 'warning' in tweet:
            logger.warning(tweet['warning']['message'])
        else:
            logger.warning(json.dumps(tweet))
        tweet_count += 1

    if tweet_count == 0:
        logger.info("no new tweets matching %s", search_string)
        # Write an empty json file. Maybe don't do this?
        with open(json_file, 'w') as json_out:
            pass

    return base_filename, tweet_count, crawl_time_html
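A minimal usage sketch for crawl_feed. The credential values, names, and paths below are placeholders, but the dictionary keys are the ones the function actually reads:

credentials = {
    'consumer_key': '...',            # placeholder values
    'consumer_secret': '...',
    'access_token': '...',
    'access_token_secret': '...',
}

feed = {
    'crawl_name': 'ferguson-crawl',   # illustrative names and paths
    'crawl_type': 'search',
    'short_name': 'ferguson',
    'search_string': '#ferguson',
    'feed_dir': '/data/feeds/ferguson',
}

base_filename, tweet_count, crawl_date = crawl_feed(feed, credentials)
print("wrote {} tweets to {}.json".format(tweet_count, base_filename))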