def get_account(item):
    """
    Uses the Twarc library to surface all the tweets twarc can see for a
    Twitter username. Searches all tweets for media; if any is found, it
    also tries to download that media item.
    """
    item.agent_name = agent_name + "_1_get_account"
    if not os.path.exists(item.storage_folder):
        os.makedirs(item.storage_folder)
    t = Twarc(twitter_consumer_key, twitter_consumer_secret,
              twitter_access_token, twitter_access_token_secret)
    # Derive the screen name from the profile URL.
    name = item.url.strip().replace("https://twitter.com/", "").replace("?", "")
    file_path = os.path.join(
        item.storage_folder,
        "{}_{}.json".format(time.strftime("%d-%m-%Y_%H-%M-%S"), name))
    tweets = []
    for tweet in t.timeline(screen_name=name):
        tweets.append(tweet)
    tweets = filter_tweets_by_start_date(tweets, item.date_range)
    for tweet in tweets:
        get_assets(tweet, item.storage_folder)
    with open(file_path, "w") as outfile:
        json.dump(tweets, outfile)
    item.completed = True
    return item
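# A minimal usage sketch for get_account. _ExampleItem is a hypothetical
# stand-in for the pipeline's item record; the module-level credentials,
# agent_name, filter_tweets_by_start_date and get_assets are assumed to
# exist as referenced in the function above.
class _ExampleItem:
    def __init__(self, url, storage_folder, date_range=None):
        self.url = url
        self.storage_folder = storage_folder
        self.date_range = date_range
        self.agent_name = ""
        self.completed = False

# item = get_account(_ExampleItem("https://twitter.com/example", "/tmp/example"))
# print(item.completed)  # True once the timeline JSON has been written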
def collect_timelines(input_file, output_file, credentials_file):
    # Credentials file: one value per line, in Twarc's positional order.
    with open(credentials_file) as fp:
        credentials = tuple(map(str.strip, fp.readlines()))
    twarc_obj = Twarc(*credentials)
    df = pd.read_csv(input_file, sep="\t")
    with open(output_file, "w+") as fp:
        total = 0
        found_users = 0
        pbar = tqdm.tqdm(df.values)
        for uid, tid, u_statuses in pbar:
            found = 0
            pbar.set_description("User {}".format(uid))
            try:
                for tweet_json in twarc_obj.timeline(user_id="{}".format(uid)):
                    found += 1
                    # Cap each timeline at 190 tweets.
                    if found > 190:
                        break
                    total += 1
                    print(json.dumps(tweet_json), file=fp)
                    pbar.set_postfix(found=found_users + 1, total=total)
            except requests.exceptions.HTTPError as e:
                pbar.write("Error for uid={}. {}".format(uid, e))
            else:
                found_users += 1
        pbar.close()
    print("Collected {} tweets.".format(total))
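# Usage sketch for collect_timelines. The credentials file is splatted into
# Twarc(), so it is assumed to hold four lines: consumer key, consumer
# secret, access token, access token secret. The TSV is assumed to have one
# row per user (user id, tweet id, statuses count); only the id is used.
#
# collect_timelines("users.tsv", "timelines.jsonl", "credentials.txt")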
def get_interactions(consumer_key, consumer_secret, access_token, access_token_secret):
    """
    Arguments are Twitter API credentials. To get them you can go to
    http://apps.twitter.com/. Saves pickled lists of tweet authors and the
    users they mention, and a list of the users considered.
    """
    from twarc import Twarc
    from tqdm import tqdm
    import pickle

    t = Twarc(consumer_key, consumer_secret, access_token, access_token_secret)
    list_ids = ["1335885096063295488", "1288082572195639296", "1287444819015618561",
                "1283739792702713856", "1081734288368898048", "910757441855459328",
                "193445218", "90205656", "85315110"]
    users = set(m['screen_name'] for lid in list_ids for m in t.list_members(lid))
    users_to_exclude = ['premierleague', 'SpursOfficial', 'Arsenal', 'ManCity',
                        'sterling7', 'kylewalker2', 'HKane', 'benmendy23',
                        'dele_official', 'RobHolding95', 'm8arteta']
    # discard() will not raise if an excluded user is absent from the lists.
    for u in users_to_exclude:
        users.discard(u)

    authors = []
    mentions = []
    for user in tqdm(users):
        tweets = list(t.timeline(screen_name=user))
        # One (author, mention) pair per user mentioned in each tweet.
        m = [u['screen_name'] for tw in tweets for u in tw['entities']['user_mentions']]
        mentions.append(m)
        authors.append([user] * len(m))

    flat_a = [item for sublist in authors for item in sublist]
    flat_m = [item for sublist in mentions for item in sublist]
    pickle.dump(flat_a, open('authors.p', 'wb'))
    pickle.dump(flat_m, open('mentions.p', 'wb'))
    pickle.dump(users, open('users.p', 'wb'))
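# Follow-up sketch: the pickles produced above pair up element-wise, so the
# author -> mention edge list can be rebuilt like this (same file names as
# written by get_interactions).
import pickle

def load_interaction_edges():
    with open('authors.p', 'rb') as fa, open('mentions.p', 'rb') as fm:
        flat_a = pickle.load(fa)
        flat_m = pickle.load(fm)
    # Each (author, mentioned_user) pair is one directed interaction edge.
    return list(zip(flat_a, flat_m))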
def read_timelines(after_date: datetime, handles: List[str]):
    consumer_key = os.environ.get('CONSUMER_KEY')
    consumer_secret = os.environ.get('CONSUMER_SECRET')
    access_token_key = os.environ.get('ACCESS_TOKEN')
    access_token_secret = os.environ.get('ACCESS_TOKEN_SECRET')
    # bearer_token = os.environ.get('BEARER_TOKEN')
    twarc = Twarc(consumer_key, consumer_secret, access_token_key, access_token_secret)
    for handle in handles:
        print(f'Scanning twitter handle @{handle}')
        for tweet in twarc.timeline(screen_name=handle):
            created_at = parse_twitter_datetime(tweet['created_at'])
            print(f'Found tweet created at {created_at}')
            yield tweet
            # Timelines arrive newest-first, so stop once we pass after_date.
            if created_at <= after_date:
                break
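# Usage sketch: read_timelines is a generator, so tweets can be consumed
# lazily. CONSUMER_KEY etc. are assumed to be set in the environment, and
# parse_twitter_datetime is assumed to return a datetime comparable to
# after_date.
#
# from datetime import datetime, timedelta
# cutoff = datetime.utcnow() - timedelta(days=7)
# for tweet in read_timelines(cutoff, ['example_handle']):
#     print(tweet['id_str'])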
def get_tweets(tcconfig, up_to_pages=1, source_id="dgSHiftCodes"):
    logger.debug("get_tweets args: up_to_pages={}, source_id={}".format(up_to_pages, source_id))
    if tcconfig is not None:
        logger.info("Setting Twitter client credential config")
        ct = tcconfig["consumer_key"]
        cs = tcconfig["consumer_secret"]
        at = tcconfig["access_token"]
        ats = tcconfig["access_token_secret"]
        # Do not log the credential values themselves.
    else:
        logger.error("No Twitter client config argument provided")
        raise ValueError("tcconfig cannot be None")
    twsclient = Twarc(ct, cs, at, ats)
    return twsclient.timeline(screen_name=source_id, max_pages=up_to_pages)
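# Usage sketch with a hypothetical credential dict; the keys mirror exactly
# what get_tweets reads from tcconfig.
sample_tcconfig = {
    "consumer_key": "...",
    "consumer_secret": "...",
    "access_token": "...",
    "access_token_secret": "...",
}
# get_tweets returns the twarc timeline generator, so iterate to fetch:
# for tweet in get_tweets(sample_tcconfig, up_to_pages=2):
#     print(tweet["id_str"])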
def UserTimeLine_Extract(variables_dict, target):
    """Collect a user's timeline with Twarc and return the tweets as JSON strings."""
    # This creates an instance of Twarc.
    credentials = variables_dict['credentials']
    t = Twarc(consumer_key=credentials['consumer_key'],
              consumer_secret=credentials['consumer_secret'],
              access_token=credentials['access_token'],
              access_token_secret=credentials['access_token_secret'])

    tweet_list = []
    # Go through the user timeline.
    # for tweet in t.timeline(user_id='1339835893'):
    for tweet in t.timeline(screen_name=target):
        tweet_json = json.dumps(tweet)
        tweet_list.append(tweet_json)
        # Tweet info.
        print("{} is created at {} with the following text:".format(tweet['id_str'], tweet['created_at']))
        print("{}".format(tweet['text']))
        print("by {}.\n".format(tweet['user']['screen_name']))
    return tweet_list
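# Usage sketch: variables_dict is assumed to carry the credentials under a
# 'credentials' key, matching what UserTimeLine_Extract reads above.
#
# config = {'credentials': {'consumer_key': '...',
#                           'consumer_secret': '...',
#                           'access_token': '...',
#                           'access_token_secret': '...'}}
# tweets = UserTimeLine_Extract(config, 'example_handle')  # list of JSON strings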
consumer_secret = "2x8Q0WyWNV86XEWRAuYhJB0kUu4M9BosgemxMjnPbiu00t5HE7" access_key = "602847795-0GJCA5vujrexWTCfK6ZtxZD2MZ8pCuA1zBKO5fNa" access_secret = "6cN8sgBp7DiDITJbg0uCSlWoeY84YoLJs5HOxzxmqtjEj" prev_date = timedelta(days=10) today = datetime.now().date() time_range = today - prev_date t = Twarc(consumer_key, consumer_secret, access_key, access_secret) for name in usa_list: print(name) file_name = r"C:\\Users\\ravik\\OneDrive\\Desktop\\UsertimelineReplies\\" + str(name) + ".json" max_poi_tweet = 0 with open( file_name, "a", encoding='utf-8') as file: for tweet in t.timeline(screen_name=name): if 'retweeted_status' in tweet.keys(): print("Its a retweet") continue if max_poi_tweet > 3000: break json.dump(tweet, file, ensure_ascii=False) file.write("\n") max_poi_tweet +=1 max_replies = 0 if datetime.strptime(tweet['created_at'], "%a %b %d %H:%M:%S %z %Y").date() >= time_range: for reply in t.replies(tweet): #print("In") #preprocessing(tweet, file) if 'retweeted_status' in tweet.keys(): print("Its a retweet")
# Compiled once at module level; matches emoji, pictographs, and flag
# sequences so they can be stripped before translation.
EMOJI_PATTERN = re.compile(
    u"(\ud83d[\ude00-\ude4f])|"       # emoticons
    u"(\ud83c[\udf00-\uffff])|"       # symbols & pictographs (1 of 2)
    u"(\ud83d[\u0000-\uddff])|"       # symbols & pictographs (2 of 2)
    u"(\ud83d[\ude80-\udeff])|"       # transport & map symbols
    u"([\U0001F1E0-\U0001F1FF])|"
    u"([\U0001F600-\U0001F64F])|"     # emoticons 2
    u"([\U0001F300-\U0001F5FF])|"     # symbols & pictographs
    u"([\U0001F680-\U0001F6FF])|"
    u"([\u2600-\u26FF])|"
    u"(\U0001F1F2\U0001F1F4)|"        # Macau flag
    u"([\U0001F1E6-\U0001F1FF]{2})|"  # flags
    u"([\U0001F600-\U0001F64F])|"     # emoticons 3
    u"(\ud83c[\udde0-\uddff])"        # flags (iOS)
    "+", flags=re.UNICODE)


class AnalyzerProcess():

    def __init__(self, config, loggerObject, alerLoggerObject, rules, executionMode):
        self.logger = loggerObject
        self.alertLogger = alerLoggerObject
        self.rules = rules
        self.config = config
        self.executionMode = executionMode
        self.access_token = "insert Twitter API access token"
        self.access_token_secret = "insert Twitter API token secret"
        self.consumer_key = "insert Twitter API consumer key"
        self.consumer_secret = "insert Twitter API consumer secret"
        self.twarc = Twarc(self.consumer_key, self.consumer_secret,
                           self.access_token, self.access_token_secret)
        self.currdir = "/home/centos/modosint-python3" + path.dirname(__file__)
        self.wcloud = ""
        self.stop_words = get_stop_words('spanish')
        self.stop_words.extend(["http", "https", "co", "n'", "'", '"'])

    def _is_recent(self, tweet):
        """True if the tweet was created within the last 5 minutes (daemon mode).
        Compares seconds since midnight (UTC)."""
        tweetTime = parser.parse(''.join(tweet['created_at']))
        timeFormed = time.strptime(str(tweetTime.time()).split(',')[0], '%H:%M:%S')
        createdAtSeconds = datetime.timedelta(
            hours=timeFormed.tm_hour, minutes=timeFormed.tm_min,
            seconds=timeFormed.tm_sec).total_seconds()
        nowTimeUtc = datetime.datetime.utcnow().time()
        nowTimeFormed = time.strptime(str(nowTimeUtc).split('.')[0], '%H:%M:%S')
        nowTimeSeconds = datetime.timedelta(
            hours=nowTimeFormed.tm_hour, minutes=nowTimeFormed.tm_min,
            seconds=nowTimeFormed.tm_sec).total_seconds()
        return nowTimeSeconds - createdAtSeconds < 300

    @staticmethod
    def _is_today_or_yesterday(tweet):
        """True if the tweet was created today or yesterday (one-shot mode)."""
        datetweet = parser.parse(''.join(tweet['created_at']))
        today = datetime.datetime.now().date()
        return datetweet.date() in (today, today - timedelta(1))

    @staticmethod
    def _seen_before(f, tweet):
        """Check the cache file for this tweet id; record it if unseen."""
        f.seek(0)  # read temporary file (cache)
        content = [x.strip('\n').strip('u') for x in f.readlines()]
        if tweet['id_str'] in content:
            return True
        f.seek(0, 2)  # append to temporary file (cache)
        f.write(tweet['id_str'] + '\n')
        return False

    def _write_match(self, fichero, tweet, ruleId, translated=None):
        """Write one matched tweet to the graylog file and the wordcloud corpus."""
        tweetdata = {
            "CreatedTime": tweet['created_at'],
            "short_message": tweet['full_text'],
            "Author": tweet['user']['screen_name'],
            "Retweets": tweet['retweet_count'],
            "Likes": tweet['favorite_count'],
            "Location": tweet['user']['location'],
            "Rule": ruleId,
            "full_message": "Tweet matched with RULE: " + ruleId,
        }
        if translated is not None:
            tweetdata["TranslatedTweet"] = translated
        fichero.write(json.dumps(tweetdata) + '\n')
        self.wcloud.write((translated if translated is not None else tweet['full_text']) + '\n')
        os.chmod("/var/log/modosint/analyzer-twitter/wcloudRule" + ruleId + ".txt", 0o777)
        os.chmod("/var/log/modosint/analyzer-twitter/graylog.txt", 0o777)

    # Search tweets that contain the term translated into another language.
    def searchDifLanguage(self, text, language, ruleId):
        fichero = open("/var/log/modosint/analyzer-twitter/graylog.txt", "+a", encoding='utf8')
        with io.open("/var/log/modosint/analyzer-twitter/cache.txt", 'a+') as f:
            os.chmod("/var/log/modosint/analyzer-twitter/cache.txt", 0o777)
            traductor = Translator()
            translatedText = traductor.translate(text, dest=language)
            daemon = self.executionMode == "daemon"
            for tweet in self.twarc.search(translatedText.text):
                # Daemon mode keeps tweets from the last 5 minutes; one-shot
                # mode keeps tweets from today and yesterday.
                in_window = self._is_recent(tweet) if daemon else self._is_today_or_yesterday(tweet)
                if not in_window:
                    break
                if 'retweeted_status' in tweet:  # avoid RT
                    continue
                if self._seen_before(f, tweet):
                    continue
                texto = tweet['full_text']
                for c in texto:
                    if c in emoji.UNICODE_EMOJI:
                        texto = texto.replace(c, "")
                try:
                    resultesp = traductor.translate(EMOJI_PATTERN.sub(r'', texto), dest='es')
                except ValueError:
                    # Skip tweets the translator cannot handle.
                    self.logger.debug('[Emoji Error] Tweet cannot be translated: unrecognized emoji in tweet.')
                    continue
                self._write_match(fichero, tweet, ruleId, translated=resultesp.text)

    # Search tweets that contain a term or hashtag.
    def searchTweetOrHashtag(self, text, ruleId):
        fichero = open("/var/log/modosint/analyzer-twitter/graylog.txt", "+a", encoding='utf8')
        with io.open("/var/log/modosint/analyzer-twitter/cache.txt", 'a+') as f:
            os.chmod("/var/log/modosint/analyzer-twitter/cache.txt", 0o777)
            daemon = self.executionMode == "daemon"
            for tweet in self.twarc.search(text):
                in_window = self._is_recent(tweet) if daemon else self._is_today_or_yesterday(tweet)
                if not in_window:
                    break
                if 'retweeted_status' in tweet:  # avoid RT
                    continue
                if self._seen_before(f, tweet):
                    continue
                self._write_match(fichero, tweet, ruleId)

    # Search all tweets or the timeline from @user.
    def searchUserTweets(self, user, ruleId, fullstring):
        fichero = open("/var/log/modosint/analyzer-twitter/graylog.txt", "+a", encoding='utf8')
        with io.open("/var/log/modosint/analyzer-twitter/cache.txt", 'a+') as f:
            os.chmod("/var/log/modosint/analyzer-twitter/cache.txt", 0o777)
            tweets = self.twarc.timeline(screen_name=user)
            t_end = time.time() + 30  # stop scanning the timeline after 30 seconds
            for tweet in tweets:
                if time.time() >= t_end:
                    break
                for text in fullstring:
                    if text in tweet['full_text']:
                        if not self._seen_before(f, tweet):
                            self._write_match(fichero, tweet, ruleId)

    def create_wordcloud(self, text, ruleId):
        mask = np.array(Image.open(path.join(self.currdir, "twitter_mask.png")))
        # Create the wordcloud object.
        wc = WordCloud(background_color="white", max_words=200, mask=mask,
                       stopwords=self.stop_words)
        try:
            # Generate and save the wordcloud.
            wc.generate(text)
            wc.to_file(path.join(self.currdir + "/WordCloud/Twitter/",
                                 "wcTwitterRule" + ruleId + ".png"))
            os.chmod(path.join(self.currdir + "/WordCloud/Twitter/",
                               "wcTwitterRule" + ruleId + ".png"), 0o777)
        except ValueError:
            # Empty corpus for this rule; nothing to draw.
            pass

    # Custom functionality.
    def run(self):
        self.logger.info("working...")
        OSINTRules = self.rules
        for element in OSINTRules:
            ruleId = element.get('metadata', False).get('id', False)
            self.wcloud = open("/var/log/modosint/analyzer-twitter/wcloudRule" + ruleId + ".txt", "a+")
            checkUsername = element.get('_username', False)
            checkString = element.get('_string', False)
            fullstring = []  # default so username-only rules don't hit a NameError
            if checkUsername:
                user = ''.join(element['_username'])
            if checkString:
                string = ','.join(element['_string'])
                fullstring = element['_string']
                checkLanguage = element.get('_language', False)
                if checkLanguage:
                    language = ''.join(element['_language'])
                    self.searchDifLanguage(string, language, ruleId)
                else:
                    self.searchTweetOrHashtag(string, ruleId)
            if checkUsername:
                self.searchUserTweets(user, ruleId, fullstring)
        if not os.path.exists(self.currdir + "/WordCloud"):
            os.makedirs(self.currdir + "/WordCloud/")
            os.chmod(self.currdir + "/WordCloud/", 0o777)
        if not os.path.exists(self.currdir + "/WordCloud/Twitter"):
            os.makedirs(self.currdir + "/WordCloud/Twitter/")
            os.chmod(self.currdir + "/WordCloud/Twitter/", 0o777)
        for element in OSINTRules:
            ruleId = element.get('metadata', False).get('id', False)
            with open("/var/log/modosint/analyzer-twitter/wcloudRule" + ruleId + ".txt", "r") as wf:
                file_content = wf.readlines()
            self.create_wordcloud(str(file_content), ruleId)
        self.createPlotMentions()
        self.createPlotHashtag()
        self.alertLogger.info("Twitter Analyzer Job Finished successfully.")

    def exportReferenceHashtag(self, mensaje):
        lista = re.findall(r'#\w+', mensaje)
        return lista if lista != [] else np.NaN

    def exportReferenceMentions(self, mensaje):
        lista = re.findall(r'@\w+', mensaje)
        return lista if lista != [] else np.NaN

    def createPlotMentions(self):
        with io.open('/var/log/modosint/analyzer-twitter/graylog.txt', 'r') as f:
            dataMentions = f.readlines()
        # Each graylog line was written with json.dumps, so parse it with
        # json.loads rather than eval.
        data_json = json.dumps([json.loads(entry) for entry in dataMentions])
        data_twitter = pd.read_json(data_json)
        referenceMentions = data_twitter.short_message.map(self.exportReferenceMentions)
        referenceMentions.dropna(inplace=True)
        referenceMentions = list(referenceMentions)
        referenceMentions_list = list(itertools.chain(*referenceMentions))
        count_referenceMentions = pd.Series(referenceMentions_list).value_counts()
        fig = plt.figure(figsize=(12, 8))
        sns.barplot(y=count_referenceMentions.iloc[:20].index,
                    x=count_referenceMentions.iloc[:20].values)
        fig.savefig(self.currdir + 'mentionsPlot.png')
        os.chmod(self.currdir + 'mentionsPlot.png', 0o777)

    def createPlotHashtag(self):
        with io.open('/var/log/modosint/analyzer-twitter/graylog.txt', 'r') as f:
            dataHashtag = f.readlines()
        data_json = json.dumps([json.loads(entry) for entry in dataHashtag])
        data_twitter = pd.read_json(data_json)
        referenceHash = data_twitter.short_message.map(self.exportReferenceHashtag)
        referenceHash.dropna(inplace=True)
        referenceHash = list(referenceHash)
        referenceHash_list = list(itertools.chain(*referenceHash))
        count_referenceHash = pd.Series(referenceHash_list).value_counts()
        fig = plt.figure(figsize=(12, 8))
        sns.barplot(y=count_referenceHash.iloc[:20].index,
                    x=count_referenceHash.iloc[:20].values)
        fig.savefig(self.currdir + 'mentionsHashtag.png')
        os.chmod(self.currdir + 'mentionsHashtag.png', 0o777)
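# A minimal driver sketch for AnalyzerProcess. The rule schema shown here is
# inferred from what run() reads ('metadata', '_string', '_username',
# '_language') and is otherwise hypothetical; any executionMode other than
# "daemon" selects the one-shot (today/yesterday) search window.
#
# import logging
# logger = logging.getLogger("modosint")
# rules = [{"metadata": {"id": "1"}, "_string": ["example term"]}]
# AnalyzerProcess(config={}, loggerObject=logger, alerLoggerObject=logger,
#                 rules=rules, executionMode="oneshot").run()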
class TwitterHarvester(BaseHarvester):
    def __init__(self, working_path, stream_restart_interval_secs=30 * 60,
                 mq_config=None, debug=False, connection_errors=5,
                 http_errors=5, debug_warcprox=False, tries=3):
        BaseHarvester.__init__(self, working_path, mq_config=mq_config,
                               stream_restart_interval_secs=stream_restart_interval_secs,
                               debug=debug, debug_warcprox=debug_warcprox,
                               tries=tries)
        self.twarc = None
        self.connection_errors = connection_errors
        self.http_errors = http_errors
        self.extract_media = False
        self.extract_web_resources = False
        self.extract_user_profile_images = False

    def harvest_seeds(self):
        # Create a twarc
        self._create_twarc()

        # Get harvest extract options.
        self.extract_media = self.message.get("options", {}).get("media", False)
        self.extract_web_resources = self.message.get("options", {}).get("web_resources", False)
        self.extract_user_profile_images = self.message.get("options", {}).get("user_images", False)

        # Dispatch message based on type.
        harvest_type = self.message.get("type")
        log.debug("Harvest type is %s", harvest_type)
        if harvest_type == "twitter_search":
            self.search()
        elif harvest_type == "twitter_filter":
            self.filter()
        elif harvest_type == "twitter_sample":
            self.sample()
        elif harvest_type == "twitter_user_timeline":
            self.user_timeline()
        else:
            raise KeyError

    def _create_twarc(self):
        self.twarc = Twarc(self.message["credentials"]["consumer_key"],
                           self.message["credentials"]["consumer_secret"],
                           self.message["credentials"]["access_token"],
                           self.message["credentials"]["access_token_secret"],
                           http_errors=self.http_errors,
                           connection_errors=self.connection_errors)

    def search(self):
        assert len(self.message.get("seeds", [])) == 1
        incremental = self.message.get("options", {}).get("incremental", False)
        query = self.message["seeds"][0]["token"]
        since_id = self.state_store.get_state(__name__, u"{}.since_id".format(query)) if incremental else None
        self._harvest_tweets(self.twarc.search(query, since_id=since_id))

    def filter(self):
        assert len(self.message.get("seeds", [])) == 1
        track = self.message["seeds"][0]["token"].get("track")
        follow = self.message["seeds"][0]["token"].get("follow")
        locations = self.message["seeds"][0]["token"].get("locations")
        self._harvest_tweets(self.twarc.filter(track=track, follow=follow,
                                               locations=locations,
                                               event=self.stop_harvest_seeds_event))

    def sample(self):
        self._harvest_tweets(self.twarc.sample(self.stop_harvest_seeds_event))

    def user_timeline(self):
        incremental = self.message.get("options", {}).get("incremental", False)
        for seed in self.message.get("seeds", []):
            seed_id = seed["id"]
            screen_name = seed.get("token")
            user_id = seed.get("uid")
            log.debug("Processing seed (%s) with screen name %s and user id %s",
                      seed_id, screen_name, user_id)
            assert screen_name or user_id
            # If there is not a user_id, look it up.
            if screen_name and not user_id:
                user_id = self._lookup_user_id(screen_name)
                if user_id:
                    # Report back if nsid found
                    self.result.uids[seed_id] = user_id
                else:
                    msg = "User id not found for user {} because account is not found or suspended".format(screen_name)
                    log.exception(msg)
                    self.result.warnings.append(Msg(CODE_TOKEN_NOT_FOUND, msg, seed_id=seed_id))
            # Otherwise, get the current screen_name
            else:
                new_screen_name = self._lookup_screen_name(user_id)
                # If the screen name can't be found, skip fetching the timeline.
                if not new_screen_name:
                    msg = "Screen name not found for user id {} because account is not found or suspended".format(user_id)
                    log.exception(msg)
                    self.result.warnings.append(Msg(CODE_TOKEN_NOT_FOUND, msg, seed_id=seed_id))
                    # Reset the user_id so the timeline fetch is skipped.
                    user_id = None
                if new_screen_name and new_screen_name != screen_name:
                    self.result.token_updates[seed_id] = new_screen_name
                    screen_name = new_screen_name
            if user_id:
                try:
                    # Get since_id from state_store
                    since_id = self.state_store.get_state(
                        __name__, "timeline.{}.since_id".format(user_id)) if incremental else None
                    self._harvest_tweets(self.twarc.timeline(user_id=user_id, since_id=since_id))
                except HTTPError as e:
                    if e.response.status_code == 401:
                        account = "user {} (User ID: {})".format(screen_name, user_id) \
                            if screen_name else "user ID: {}".format(user_id)
                        msg = "Unauthorized for {} because account is suspended or protected".format(account)
                        log.exception(msg)
                        self.result.warnings.append(Msg(CODE_TOKEN_UNAUTHORIZED, msg, seed_id=seed_id))
                    else:
                        raise e

    def _lookup_screen_name(self, user_id):
        """Lookup a screen name given a user id."""
        try:
            users = list(self.twarc.user_lookup(user_ids=(user_id,)))
            assert len(users) in (0, 1)
            if users:
                return users[0]["screen_name"]
        except HTTPError as e:
            if e.response.status_code != 404:
                raise e
        return None

    def _lookup_user_id(self, screen_name):
        """Lookup a user id given a screen name."""
        try:
            users = list(self.twarc.user_lookup(screen_names=(screen_name,)))
            assert len(users) in (0, 1)
            if users:
                return users[0]["id_str"]
        except HTTPError as e:
            if e.response.status_code != 404:
                raise e
        return None

    def _harvest_tweets(self, tweets):
        # max_tweet_id = None
        for count, tweet in enumerate(tweets):
            if not count % 100:
                log.debug("Harvested %s tweets", count)
            self.result.harvest_counter["tweets"] += 1
            if self.stop_harvest_seeds_event.is_set():
                log.debug("Stopping since stop event set.")
                break

    def _process_entities(self, entities):
        if self.extract_web_resources:
            for url in entities.get("urls", []):
                # Exclude links for tweets
                if url["expanded_url"] and not status_re.match(url["expanded_url"]):
                    self.result.urls.append(url["expanded_url"])
        if self.extract_media:
            for media in entities.get("media", []):
                if media["media_url"]:
                    self.result.urls.append(media["media_url"])

    def process_warc(self, warc_filepath):
        # Dispatch message based on type.
        harvest_type = self.message.get("type")
        log.debug("Harvest type is %s", harvest_type)
        if harvest_type == "twitter_search":
            self.process_search_warc(warc_filepath)
        elif harvest_type == "twitter_filter":
            self._process_tweets(TwitterStreamWarcIter(warc_filepath))
        elif harvest_type == "twitter_sample":
            self._process_tweets(TwitterStreamWarcIter(warc_filepath))
        elif harvest_type == "twitter_user_timeline":
            self.process_user_timeline_warc(warc_filepath)
        else:
            raise KeyError

    def process_search_warc(self, warc_filepath):
        incremental = self.message.get("options", {}).get("incremental", False)
        query = self.message["seeds"][0]["token"]
        since_id = self.state_store.get_state(__name__, u"{}.since_id".format(query)) if incremental else None
        max_tweet_id = self._process_tweets(TwitterRestWarcIter(warc_filepath))
        # Update state store; treat a missing id as 0 to avoid comparing None.
        if incremental and (max_tweet_id or 0) > (since_id or 0):
            self.state_store.set_state(__name__, u"{}.since_id".format(query), max_tweet_id)

    def process_user_timeline_warc(self, warc_filepath):
        incremental = self.message.get("options", {}).get("incremental", False)
        for count, status in enumerate(TwitterRestWarcIter(warc_filepath)):
            tweet = status.item
            if not count % 100:
                log.debug("Processing %s tweets", count)
            if "text" in tweet:
                user_id = tweet["user"]["id_str"]
                if incremental:
                    # Update state
                    key = "timeline.{}.since_id".format(user_id)
                    self.state_store.set_state(
                        __name__, key,
                        max(self.state_store.get_state(__name__, key) or 0, tweet.get("id")))
                self._process_tweet(tweet)

    def _process_tweets(self, warc_iter):
        max_tweet_id = None
        for count, status in enumerate(warc_iter):
            tweet = status.item
            if not count % 100:
                log.debug("Processing %s tweets", count)
            if "text" in tweet:
                max_tweet_id = max(max_tweet_id or 0, tweet.get("id"))
                self._process_tweet(tweet)
        return max_tweet_id

    def _process_tweet(self, tweet):
        self.result.increment_stats("tweets")
        # For more info, see https://dev.twitter.com/overview/api/entities-in-twitter-objects
        statuses = [tweet]
        if "retweeted_status" in tweet:
            statuses.append(tweet["retweeted_status"])
        elif "quoted_status" in tweet:
            statuses.append(tweet["quoted_status"])
        for status in statuses:
            self._process_entities(status.get("entities", {}))
            self._process_entities(status.get("extended_entities", {}))
        if self.extract_user_profile_images:
            self.result.urls.append(tweet["user"]["profile_image_url"])
            self.result.urls.append(tweet["user"]["profile_background_image_url"])
            if "profile_banner_url" in tweet["user"]:
                self.result.urls.append(tweet["user"]["profile_banner_url"])
class TwitterHarvester(BaseHarvester):
    def __init__(self, working_path, stream_restart_interval_secs=30 * 60,
                 mq_config=None, debug=False, connection_errors=5,
                 http_errors=5, debug_warcprox=False, tries=3):
        BaseHarvester.__init__(self, working_path, mq_config=mq_config,
                               stream_restart_interval_secs=stream_restart_interval_secs,
                               debug=debug, debug_warcprox=debug_warcprox,
                               tries=tries)
        self.twarc = None
        self.connection_errors = connection_errors
        self.http_errors = http_errors

    def harvest_seeds(self):
        # Create a twarc
        self._create_twarc()

        # Dispatch message based on type.
        harvest_type = self.message.get("type")
        log.debug("Harvest type is %s", harvest_type)
        if harvest_type == "twitter_search":
            self.search()
        elif harvest_type == "twitter_filter":
            self.filter()
        elif harvest_type == "twitter_sample":
            self.sample()
        elif harvest_type == "twitter_user_timeline":
            self.user_timeline()
        else:
            raise KeyError

    def _create_twarc(self):
        self.twarc = Twarc(self.message["credentials"]["consumer_key"],
                           self.message["credentials"]["consumer_secret"],
                           self.message["credentials"]["access_token"],
                           self.message["credentials"]["access_token_secret"],
                           http_errors=self.http_errors,
                           connection_errors=self.connection_errors,
                           tweet_mode="extended")

    def search(self):
        assert len(self.message.get("seeds", [])) == 1
        incremental = self.message.get("options", {}).get("incremental", False)
        since_id = self.state_store.get_state(
            __name__, u"{}.since_id".format(self._search_id())) if incremental else None
        query, geocode = self._search_parameters()
        self._harvest_tweets(self.twarc.search(query, geocode=geocode, since_id=since_id))

    def _search_parameters(self):
        if type(self.message["seeds"][0]["token"]) is dict:
            query = self.message["seeds"][0]["token"].get("query")
            geocode = self.message["seeds"][0]["token"].get("geocode")
        else:
            query = self.message["seeds"][0]["token"]
            geocode = None
        return query, geocode

    def _search_id(self):
        query, geocode = self._search_parameters()
        if query and not geocode:
            return query
        if geocode and not query:
            return geocode
        return ":".join([query, geocode])

    def filter(self):
        assert len(self.message.get("seeds", [])) == 1
        track = self.message["seeds"][0]["token"].get("track")
        follow = self.message["seeds"][0]["token"].get("follow")
        locations = self.message["seeds"][0]["token"].get("locations")
        language = self.message["seeds"][0]["token"].get("language")
        self._harvest_tweets(self.twarc.filter(track=track, follow=follow,
                                               locations=locations, lang=language,
                                               event=self.stop_harvest_seeds_event))

    def sample(self):
        self._harvest_tweets(self.twarc.sample(self.stop_harvest_seeds_event))

    def user_timeline(self):
        incremental = self.message.get("options", {}).get("incremental", False)
        for seed in self.message.get("seeds", []):
            seed_id = seed["id"]
            screen_name = seed.get("token")
            user_id = seed.get("uid")
            log.debug("Processing seed (%s) with screen name %s and user id %s",
                      seed_id, screen_name, user_id)
            assert screen_name or user_id
            # If there is not a user_id, look it up.
            if screen_name and not user_id:
                result, user = self._lookup_user(screen_name, "screen_name")
                if result == "OK":
                    user_id = user["id_str"]
                    self.result.uids[seed_id] = user_id
                else:
                    msg = u"User id not found for {} because account is {}".format(
                        screen_name, self._result_to_reason(result))
                    log.exception(msg)
                    self.result.warnings.append(Msg("token_{}".format(result), msg, seed_id=seed_id))
            # Otherwise, get the current screen_name
            else:
                result, user = self._lookup_user(user_id, "user_id")
                if result == "OK":
                    new_screen_name = user["screen_name"]
                    if new_screen_name and new_screen_name != screen_name:
                        self.result.token_updates[seed_id] = new_screen_name
                else:
                    msg = u"User {} (User ID: {}) not found because account is {}".format(
                        screen_name, user_id, self._result_to_reason(result))
                    log.exception(msg)
                    self.result.warnings.append(Msg("uid_{}".format(result), msg, seed_id=seed_id))
                    user_id = None
            if user_id:
                # Get since_id from state_store
                since_id = self.state_store.get_state(
                    __name__, "timeline.{}.since_id".format(user_id)) if incremental else None
                self._harvest_tweets(self.twarc.timeline(user_id=user_id, since_id=since_id))

    def _lookup_user(self, id, id_type):
        url = "https://api.twitter.com/1.1/users/show.json"
        params = {id_type: id}
        # USER_DELETED: 404 and {"errors": [{"code": 50, "message": "User not found."}]}
        # USER_PROTECTED: 200 and user object with "protected": true
        # USER_SUSPENDED: 403 and {"errors":[{"code":63,"message":"User has been suspended."}]}
        result = "OK"
        user = None
        try:
            resp = self.twarc.get(url, params=params, allow_404=True)
            user = resp.json()
            if user['protected']:
                result = "unauthorized"
        except requests.exceptions.HTTPError as e:
            try:
                resp_json = e.response.json()
            except json.decoder.JSONDecodeError:
                raise e
            if e.response.status_code == 404 and self._has_error_code(resp_json, 50):
                result = "not_found"
            elif e.response.status_code == 403 and self._has_error_code(resp_json, 63):
                result = "suspended"
            else:
                raise e
        return result, user

    @staticmethod
    def _has_error_code(resp, code):
        if isinstance(code, int):
            code = (code,)
        for error in resp['errors']:
            if error['code'] in code:
                return True
        return False

    @staticmethod
    def _result_to_reason(result):
        if result == "unauthorized":
            return "protected"
        elif result == "suspended":
            return "suspended"
        return "not found or deleted"

    def _harvest_tweets(self, tweets):
        # max_tweet_id = None
        for count, tweet in enumerate(tweets):
            if not count % 100:
                log.debug("Harvested %s tweets", count)
            self.result.harvest_counter["tweets"] += 1
            if self.stop_harvest_seeds_event.is_set():
                log.debug("Stopping since stop event set.")
                break

    def process_warc(self, warc_filepath):
        # Dispatch message based on type.
        harvest_type = self.message.get("type")
        log.debug("Harvest type is %s", harvest_type)
        if harvest_type == "twitter_search":
            self.process_search_warc(warc_filepath)
        elif harvest_type == "twitter_filter":
            self._process_tweets(TwitterStreamWarcIter(warc_filepath))
        elif harvest_type == "twitter_sample":
            self._process_tweets(TwitterStreamWarcIter(warc_filepath))
        elif harvest_type == "twitter_user_timeline":
            self.process_user_timeline_warc(warc_filepath)
        else:
            raise KeyError

    def process_search_warc(self, warc_filepath):
        incremental = self.message.get("options", {}).get("incremental", False)
        since_id = self.state_store.get_state(
            __name__, u"{}.since_id".format(self._search_id())) if incremental else None
        max_tweet_id = self._process_tweets(TwitterRestWarcIter(warc_filepath))
        # Update state store
        if incremental and (max_tweet_id or 0) > (since_id or 0):
            self.state_store.set_state(__name__, u"{}.since_id".format(self._search_id()), max_tweet_id)

    def process_user_timeline_warc(self, warc_filepath):
        incremental = self.message.get("options", {}).get("incremental", False)
        for count, status in enumerate(TwitterRestWarcIter(warc_filepath)):
            tweet = status.item
            if not count % 100:
                log.debug("Processing %s tweets", count)
            if "text" in tweet or "full_text" in tweet:
                user_id = tweet["user"]["id_str"]
                if incremental:
                    # Update state
                    key = "timeline.{}.since_id".format(user_id)
                    self.state_store.set_state(
                        __name__, key,
                        max(self.state_store.get_state(__name__, key) or 0, tweet.get("id")))
                self._process_tweet(tweet)

    def _process_tweets(self, warc_iter):
        max_tweet_id = None
        for count, status in enumerate(warc_iter):
            tweet = status.item
            if not count % 100:
                log.debug("Processing %s tweets", count)
            if "text" in tweet or "full_text" in tweet:
                max_tweet_id = max(max_tweet_id or 0, tweet.get("id"))
                self._process_tweet(tweet)
        return max_tweet_id

    def _process_tweet(self, _):
        self.result.increment_stats("tweets")
def user_info_crawler(screen_name, user_dir, user_profile_f, user_profileimg_f,
                      user_tweets_f, user_clean_tweets_f):
    try:
        # Create the Twarc client up front; it is needed for both the
        # profile lookup and the timeline crawl.
        t = Twarc(consumer_key, consumer_secret, access_token, access_token_secret)

        # Crawl the user profile.
        if not os.path.exists(os.path.join(user_dir, user_profile_f)):
            user_profile_data = t.user_lookup(ids=[screen_name], id_type="screen_name")
            for user_profile in user_profile_data:
                with open(os.path.join(user_dir, user_profile_f), 'w') as outfile:
                    json.dump(user_profile, outfile)

        # Crawl the user profile image.
        with open(os.path.join(user_dir, user_profile_f), 'r') as rf:
            user_profile_json = json.load(rf)
        if not os.path.exists(os.path.join(user_dir, user_profileimg_f)):
            # Extract the profile image URL.
            user_profileimg_url = user_profile_json['profile_image_url']

            def image_converter(user_profileimg_url):
                tmp_file = '../data/user/tmp' + user_profileimg_url[-4:]
                if sys.version_info[0] == 2:
                    urllib.urlretrieve(user_profileimg_url, tmp_file)
                elif sys.version_info[0] == 3:
                    urlretrieve(user_profileimg_url, tmp_file)
                from PIL import Image
                im = Image.open(tmp_file)
                rgb_im = im.convert('RGB')
                rgb_im.save(os.path.join(user_dir, user_profileimg_f))
                os.remove(tmp_file)

            if user_profileimg_url:
                user_profileimg_url = user_profileimg_url.replace('_normal', '_bigger')
                image_converter(user_profileimg_url)

        # Crawl the user's tweets.
        if not os.path.exists(os.path.join(user_dir, user_tweets_f)):
            user_timeline_data = t.timeline(screen_name=screen_name)
            with open(os.path.join(user_dir, user_tweets_f), 'a') as outfile:
                for user_timeline in user_timeline_data:
                    json.dump(user_timeline, outfile)
                    outfile.write('\n')

        # Clean the user's tweets.
        if not os.path.exists(os.path.join(user_dir, user_clean_tweets_f)):
            tweet_raw_lines = []
            with open(os.path.join(user_dir, user_tweets_f), 'r') as rf:
                for line in rf:
                    tweet_raw_lines.append(json.loads(line)['full_text'])
            clean_tweets = process_raw_tweets(tweet_raw_lines)
            with open(os.path.join(user_dir, user_clean_tweets_f), 'w') as wf:
                for tweet in clean_tweets:
                    if len(tweet) > 0:
                        wf.write(tweet + '\n')
        return user_profile_json
    except Exception as e:
        # print(e)
        print("Could not predict user's role. Check account info, few tweets, incorrect image format...")
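# Usage sketch (hypothetical paths and file names; each *_f argument is a
# file created inside user_dir, and the module-level credentials plus
# process_raw_tweets are assumed to exist):
#
# profile = user_info_crawler('example_handle', '../data/user/example_handle',
#                             'profile.json', 'profile.jpg',
#                             'tweets.jsonl', 'tweets_clean.txt')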
def crawl_feed(feed_dict, credentials):
    twarc = Twarc(credentials['consumer_key'], credentials['consumer_secret'],
                  credentials['access_token'], credentials['access_token_secret'])

    crawl_time = datetime.datetime.now()
    crawl_time_filename = crawl_time.strftime('%Y%m%d%I%M%S')
    crawl_time_html = crawl_time.strftime('%B %d, %Y')

    crawl_name = feed_dict['crawl_name']
    crawl_type = feed_dict['crawl_type']
    short_name = feed_dict['short_name']
    search_string = feed_dict['search_string']
    feed_dir = feed_dict['feed_dir']
    json_dir = join(feed_dir, 'json')
    html_dir = join(feed_dir, 'html')
    media_dir = join(feed_dir, 'media')
    logs_dir = join(feed_dir, 'logs')

    for directory in [feed_dir, json_dir, html_dir, media_dir, logs_dir]:
        if not os.path.exists(directory):
            os.makedirs(directory)

    log_file = join(logs_dir, 'twarc.log')
    formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
    logger = logging.getLogger(crawl_name)
    handler = logging.FileHandler(log_file)
    handler.setFormatter(formatter)
    logger.addHandler(handler)
    logger.setLevel(logging.INFO)

    base_filename = short_name + '-' + crawl_time_filename
    json_file = join(json_dir, base_filename + '.json')

    print("Searching Twitter API for {0}".format(search_string))
    print("Writing JSON and HTML files...")
    logger.info("starting search for %s", search_string)
    tweet_count = 0

    # Timeline crawls treat search_string as a screen name; anything else
    # runs a standard search. Both branches log and archive identically.
    if crawl_type == "timeline":
        tweets = twarc.timeline(screen_name=search_string)
    else:
        tweets = twarc.search(search_string)
    for tweet in tweets:
        with open(json_file, 'a') as json_out:
            json_out.write("{}\n".format(json.dumps(tweet)))
        if "id_str" in tweet:
            logger.info("archived https://twitter.com/%s/status/%s",
                        tweet['user']['screen_name'], tweet["id_str"])
        elif 'limit' in tweet:
            logger.warning("%s tweets undelivered", tweet["limit"]["track"])
        elif 'warning' in tweet:
            logger.warning(tweet['warning']['message'])
        else:
            logger.warning(json.dumps(tweet))
        tweet_count += 1

    if tweet_count == 0:
        logger.info("no new tweets matching %s", search_string)
        # Write an empty json file. Maybe don't do this?
        open(json_file, 'w').close()

    return base_filename, tweet_count, crawl_time_html
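# Usage sketch with a hypothetical feed definition; the keys mirror exactly
# what crawl_feed reads from feed_dict.
example_feed = {
    'crawl_name': 'example-crawl',
    'crawl_type': 'timeline',          # anything other than "timeline" runs a search
    'short_name': 'example',
    'search_string': 'example_handle', # screen name for timeline crawls, query otherwise
    'feed_dir': '/tmp/feeds/example',
}
# credentials = {'consumer_key': '...', 'consumer_secret': '...',
#                'access_token': '...', 'access_token_secret': '...'}
# base_filename, tweet_count, crawl_time_html = crawl_feed(example_feed, credentials)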
class TwitterHarvester(BaseHarvester):
    def __init__(self, working_path, stream_restart_interval_secs=30 * 60,
                 mq_config=None, debug=False, connection_errors=5,
                 http_errors=5, debug_warcprox=False, tries=3):
        BaseHarvester.__init__(self, working_path, mq_config=mq_config,
                               stream_restart_interval_secs=stream_restart_interval_secs,
                               debug=debug, debug_warcprox=debug_warcprox,
                               tries=tries)
        self.twarc = None
        self.connection_errors = connection_errors
        self.http_errors = http_errors
        self.extract_media = False
        self.extract_web_resources = False
        self.extract_user_profile_images = False

    def harvest_seeds(self):
        # Create a twarc
        self._create_twarc()

        # Get harvest extract options.
        self.extract_media = self.message.get("options", {}).get("media", False)
        self.extract_web_resources = self.message.get("options", {}).get("web_resources", False)
        self.extract_user_profile_images = self.message.get("options", {}).get("user_images", False)

        # Dispatch message based on type.
        harvest_type = self.message.get("type")
        log.debug("Harvest type is %s", harvest_type)
        if harvest_type == "twitter_search":
            self.search()
        elif harvest_type == "twitter_filter":
            self.filter()
        elif harvest_type == "twitter_sample":
            self.sample()
        elif harvest_type == "twitter_user_timeline":
            self.user_timeline()
        else:
            raise KeyError

    def _create_twarc(self):
        self.twarc = Twarc(self.message["credentials"]["consumer_key"],
                           self.message["credentials"]["consumer_secret"],
                           self.message["credentials"]["access_token"],
                           self.message["credentials"]["access_token_secret"],
                           http_errors=self.http_errors,
                           connection_errors=self.connection_errors)

    def search(self):
        assert len(self.message.get("seeds", [])) == 1
        incremental = self.message.get("options", {}).get("incremental", False)
        query = self.message["seeds"][0]["token"]
        since_id = self.state_store.get_state(__name__, u"{}.since_id".format(query)) if incremental else None
        self._harvest_tweets(self.twarc.search(query, since_id=since_id))

    def filter(self):
        assert len(self.message.get("seeds", [])) == 1
        track = self.message["seeds"][0]["token"].get("track")
        follow = self.message["seeds"][0]["token"].get("follow")
        locations = self.message["seeds"][0]["token"].get("locations")
        self._harvest_tweets(self.twarc.filter(track=track, follow=follow, locations=locations))

    def sample(self):
        self._harvest_tweets(self.twarc.sample())

    def user_timeline(self):
        incremental = self.message.get("options", {}).get("incremental", False)
        for seed in self.message.get("seeds", []):
            seed_id = seed["id"]
            screen_name = seed.get("token")
            user_id = seed.get("uid")
            log.debug("Processing seed (%s) with screen name %s and user id %s",
                      seed_id, screen_name, user_id)
            assert screen_name or user_id
            # If there is not a user_id, look it up.
            if screen_name and not user_id:
                user_id = self._lookup_user_id(screen_name)
                if user_id:
                    # Report back if nsid found
                    self.result.uids[seed_id] = user_id
                else:
                    msg = "User id not found for user {}".format(screen_name)
                    log.exception(msg)
                    self.result.warnings.append(Msg(CODE_TOKEN_NOT_FOUND, msg))
            # Otherwise, get the current screen_name
            else:
                new_screen_name = self._lookup_screen_name(user_id)
                # Guard against a failed lookup before recording the update.
                if new_screen_name and new_screen_name != screen_name:
                    self.result.token_updates[seed_id] = new_screen_name
                    screen_name = new_screen_name
            if user_id:
                try:
                    # Get since_id from state_store
                    since_id = self.state_store.get_state(
                        __name__, "timeline.{}.since_id".format(user_id)) if incremental else None
                    self._harvest_tweets(self.twarc.timeline(user_id=user_id, since_id=since_id))
                except HTTPError as e:
                    if e.response.status_code == 401:
                        msg = "Unauthorized for user {} (User ID: {}) because account is suspended or private".format(
                            screen_name, user_id)
                        log.exception(msg)
                        self.result.warnings.append(Msg(CODE_TOKEN_UNAUTHORIZED, msg))
                    else:
                        raise e

    def _lookup_screen_name(self, user_id):
        """Lookup a screen name given a user id."""
        users = list(self.twarc.user_lookup(user_ids=(user_id,)))
        assert len(users) in (0, 1)
        if users:
            return users[0]["screen_name"]
        return None

    def _lookup_user_id(self, screen_name):
        """Lookup a user id given a screen name."""
        users = list(self.twarc.user_lookup(screen_names=(screen_name,)))
        assert len(users) in (0, 1)
        if users:
            return users[0]["id_str"]
        return None

    def _harvest_tweets(self, tweets):
        # max_tweet_id = None
        for count, tweet in enumerate(tweets):
            if not count % 100:
                log.debug("Harvested %s tweets", count)
            self.result.harvest_counter["tweets"] += 1
            if self.stop_harvest_seeds_event.is_set():
                log.debug("Stopping since stop event set.")
                break

    def _process_entities(self, entities):
        if self.extract_web_resources:
            for url in entities.get("urls", []):
                # Exclude links for tweets
                if url["expanded_url"] and not status_re.match(url["expanded_url"]):
                    self.result.urls.append(url["expanded_url"])
        if self.extract_media:
            for media in entities.get("media", []):
                if media["media_url"]:
                    self.result.urls.append(media["media_url"])

    def process_warc(self, warc_filepath):
        # Dispatch message based on type.
        harvest_type = self.message.get("type")
        log.debug("Harvest type is %s", harvest_type)
        if harvest_type == "twitter_search":
            self.process_search_warc(warc_filepath)
        elif harvest_type == "twitter_filter":
            self._process_tweets(TwitterStreamWarcIter(warc_filepath))
        elif harvest_type == "twitter_sample":
            self._process_tweets(TwitterStreamWarcIter(warc_filepath))
        elif harvest_type == "twitter_user_timeline":
            self.process_user_timeline_warc(warc_filepath)
        else:
            raise KeyError

    def process_search_warc(self, warc_filepath):
        incremental = self.message.get("options", {}).get("incremental", False)
        query = self.message["seeds"][0]["token"]
        since_id = self.state_store.get_state(__name__, u"{}.since_id".format(query)) if incremental else None
        max_tweet_id = self._process_tweets(TwitterRestWarcIter(warc_filepath))
        # Update state store; treat a missing id as 0 to avoid comparing None.
        if incremental and (max_tweet_id or 0) > (since_id or 0):
            self.state_store.set_state(__name__, u"{}.since_id".format(query), max_tweet_id)

    def process_user_timeline_warc(self, warc_filepath):
        incremental = self.message.get("options", {}).get("incremental", False)
        for count, status in enumerate(TwitterRestWarcIter(warc_filepath)):
            tweet = status.item
            if not count % 100:
                log.debug("Processing %s tweets", count)
            if "text" in tweet:
                user_id = tweet["user"]["id_str"]
                if incremental:
                    # Update state
                    key = "timeline.{}.since_id".format(user_id)
                    self.state_store.set_state(
                        __name__, key,
                        max(self.state_store.get_state(__name__, key) or 0, tweet.get("id")))
                self._process_tweet(tweet)

    def _process_tweets(self, warc_iter):
        max_tweet_id = None
        for count, status in enumerate(warc_iter):
            tweet = status.item
            if not count % 100:
                log.debug("Processing %s tweets", count)
            if "text" in tweet:
                max_tweet_id = max(max_tweet_id or 0, tweet.get("id"))
                self._process_tweet(tweet)
        return max_tweet_id

    def _process_tweet(self, tweet):
        self.result.increment_stats("tweets")
        # For more info, see https://dev.twitter.com/overview/api/entities-in-twitter-objects
        statuses = [tweet]
        if "retweeted_status" in tweet:
            statuses.append(tweet["retweeted_status"])
        elif "quoted_status" in tweet:
            statuses.append(tweet["quoted_status"])
        for status in statuses:
            self._process_entities(status.get("entities", {}))
            self._process_entities(status.get("extended_entities", {}))
        if self.extract_user_profile_images:
            self.result.urls.append(tweet["user"]["profile_image_url"])
            self.result.urls.append(tweet["user"]["profile_background_image_url"])
            if "profile_banner_url" in tweet["user"]:
                self.result.urls.append(tweet["user"]["profile_banner_url"])
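# A sketch of the harvest message shape these TwitterHarvester variants
# consume, inferred from the fields read above (values hypothetical; the
# surrounding framework normally supplies self.message, the state store,
# and the result object):
#
# message = {
#     "type": "twitter_user_timeline",
#     "options": {"incremental": True, "media": False},
#     "credentials": {"consumer_key": "...", "consumer_secret": "...",
#                     "access_token": "...", "access_token_secret": "..."},
#     "seeds": [{"id": "seed1", "token": "example_handle", "uid": None}],
# }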