def callback():
    """OAuth callback endpoint: exchange the request token for an access token.

    Reads oauth_token/oauth_verifier from the query string, exchanges them for
    an access token, then renders the home page for the authorized user.
    """
    oauth_token = request.args.get('oauth_token')
    oauth_verifier = request.args.get('oauth_verifier')
    # SECURITY(review): printing OAuth credentials to stdout leaks secrets into
    # logs — consider removing or masking these debug prints.
    print("callback token:\n" + oauth_token + "\n")
    print("callback verifier:\n" + oauth_verifier + "\n")
    access_token, oauth_token_secret, user_id, screen_name = \
        Twitter.get_access_token(oauth_token, oauth_verifier)
    user = {'username': screen_name}
    posts = []
    # Fix: the original passed the undefined name `oauth`; the token secret
    # obtained above is the value that pairs with access_token.
    Twitter.create_sandbox_account(access_token, oauth_token_secret)
    return render_template('index.html', title='Home', user=user, posts=posts)
def make_illegal_video_info(tweet_id: str, json_name: str) -> None:
    """Dump a tweet fixture whose video_info variants have been emptied.

    Used to produce "illegal" test data: every media entry keeps its shape but
    carries no playable variants.
    """
    status = Twitter().api.get_status(id=tweet_id, tweet_mode='extended')
    # Media entries are dicts, so mutating them in place updates the status.
    for media in status.extended_entities['media']:
        media['video_info']['variants'] = []
    out_path = f'{JSON_DIR}/twitter/tweet/{json_name}.json'
    with open(out_path, 'w', newline='\n') as fp:
        json.dump(status, fp, default=TwitterJson.json_dump_default, indent=2)
def make_illegal_extended_url(tweet_id: str, json_name: str) -> None:
    """Dump a tweet fixture whose URL entities have empty expanded_url fields.

    Each entity's url is replaced by its own expanded_url, then expanded_url is
    blanked, yielding deliberately "illegal" test data.
    """
    tweet = Twitter().api.get_status(id=tweet_id, tweet_mode='extended')
    for i, _ in enumerate(tweet.entities['urls']):
        # Fix: copy each entity's OWN expanded_url. The original indexed [0],
        # which clobbered every entity with the first entity's expansion.
        tweet.entities['urls'][i]['url'] = tweet.entities['urls'][i][
            'expanded_url']
        tweet.entities['urls'][i]['expanded_url'] = ''
    json_path = f'{JSON_DIR}/twitter/tweet/{json_name}.json'
    with open(json_path, 'w', newline='\n') as f:
        json.dump(tweet, f, default=TwitterJson.json_dump_default, indent=2)
def fetch_failed_upload_media(tweet_id: str) -> List[Tuple[str, str]]:
    """Return (url, description) pairs for every media URL of the given tweet."""
    twitter = Twitter()
    status: tweepy.Status = twitter.api.get_status(
        id=tweet_id, tweet_mode='extended')
    medias: Dict[str, TweetMedia] = twitter.get_tweet_medias(status)
    media: TweetMedia = medias[tweet_id]
    description: str = Twitter.make_tweet_description(status)
    # Every URL shares the same tweet description.
    return [(url, description) for url in media.urls]
def make(tweet_id: str, json_name: str) -> None:
    """Fetch a single tweet and store it as a JSON fixture under JSON_DIR."""
    status = Twitter().api.get_status(id=tweet_id, tweet_mode='extended')
    destination = f'{JSON_DIR}/twitter/tweet/{json_name}.json'
    with open(destination, 'w', newline='\n') as fp:
        json.dump(status, fp, default=TwitterJson.json_dump_default, indent=2)
def __init__(self) -> None:
    """Wire up crawler dependencies.

    GooglePhotos is constructed only when SAVE_MODE is 'google', so local-only
    runs never touch the Google API.
    """
    self._download_dir: str = './download'
    os.makedirs(self._download_dir, exist_ok=True)
    self.twitter: Twitter = Twitter()
    self.store: Store = Store()
    self._save_mode: str = Env.get_environment('SAVE_MODE', default='local')
    if self._save_mode == 'google':
        self.google_photos: GooglePhotos = GooglePhotos()
def setUp(self) -> None:
    """Prepare dummy credentials and fresh mocks before each test."""
    # Dummy credentials so Twitter() can be constructed without real secrets.
    os.environ['TWITTER_CONSUMER_KEY'] = 'DUMMY'
    os.environ['TWITTER_CONSUMER_SECRET'] = 'DUMMY'
    os.environ['TWITTER_ACCESS_TOKEN'] = 'DUMMY'
    os.environ['TWITTER_ACCESS_TOKEN_SECRET'] = 'DUMMY'
    self.mock_cursor = mock.MagicMock(tweepy.Cursor)
    self.mock_instagram = mock.MagicMock(Instagram)
    # NOTE(review): mock_cursor / mock_instagram / mock_twitter_func look like
    # module-level patch objects defined outside this view — confirm.
    mock_cursor.reset_mock()
    mock_cursor.pages.reset_mock(side_effect=True)
    mock_instagram.reset_mock()
    mock_twitter_func.reset_mock()
    # Return mock when instantiating
    mock_cursor.return_value = self.mock_cursor
    mock_instagram.return_value = self.mock_instagram
    self.twitter = Twitter()
def make(self, json_name: str) -> None:
    """Dump a timeline or favorites page as a JSON fixture.

    Unknown json_name values are ignored (nothing is fetched or written).
    """
    self.user.since_id = 1
    if json_name == 'timeline':
        tweets: list = Twitter().api.user_timeline(
            id=TEST_USER_ID, count=TEST_TWEET_COUNT, tweet_mode="extended")
    elif json_name == 'fav':
        tweets = Twitter().api.favorites(
            id=TEST_USER_ID, count=TEST_TWEET_COUNT, tweet_mode="extended")
    else:
        return
    out_path = f'{JSON_DIR}/twitter/tweets/{json_name}.json'
    with open(out_path, 'w', newline='\n') as fp:
        json.dump(tweets, fp, default=TwitterJson.json_dump_default, indent=2)
def datatable():
    """Render ENGAGEMENT data for the requested campaign/table as a table."""
    account_number = 'gq1iff'  # fixed account for this view
    args = request.args
    campaign_number = args.get('campaign_number')
    table_name = args.get('table_name')
    # NOTE(review): start_time / end_time are read but never used — presumably
    # intended for a date filter; confirm before removing.
    start_time = args.get('start_time')
    end_time = args.get('end_time')
    values_dict = Twitter.get_data_from_user(account_number, campaign_number,
                                             'ENGAGEMENT', table_name)
    return render_template('datatable.html', title='Data Table',
                           data=values_dict)
def test_difference_tweet_medias(self) -> None:
    """difference_tweet_medias reports only tweets absent from the old set."""
    old: Dict[str, TweetMedia] = TwitterTestUtils.load_target_media_tweets(
        json_name='old')
    new: Dict[str, TweetMedia] = TwitterTestUtils.load_target_media_tweets(
        json_name='new')
    diff: Dict[str, TweetMedia] = Twitter.difference_tweet_medias(new, old)
    assert len(diff) == 1
def make(self, json_name: str) -> None:
    """Dump get_target_tweets output for one crawl mode as a JSON fixture.

    The fixture name doubles as the MODE_SPECIFIED value.
    """
    self.user.since_id = 1
    os.environ['MODE_SPECIFIED'] = json_name
    medias: Dict[str, TweetMedia] = Twitter().get_target_tweets(self.user)
    out_path = f'{JSON_DIR}/twitter/target_tweet_medias/{json_name}.json'
    with open(out_path, 'w', newline='\n') as fp:
        json.dump(medias, fp, default=TwitterJson.json_dump_default, indent=2)
def test_get_instagram_url(self, json_name: str, has_url: bool) -> None:
    """_get_instagram_url returns a well-formed URL or an empty string."""
    tweet: tweepy.Status = TwitterTestUtils.load_tweet(json_name=json_name)
    assert has_attributes(tweet, 'entities')
    # noinspection PyProtectedMember
    url: str = Twitter._get_instagram_url(tweet.entities)
    assert isinstance(url, str)
    if not has_url:
        assert len(url) == 0
        return
    url_re = re.compile(r'^https?://([\w-]+\.)+[\w-]+/?([\w\-./?%&=+]*)?$')
    assert url_re.fullmatch(url) is not None
def test_get_video_url(self, json_name: str, has_url: bool) -> None:
    """_get_video_url yields a valid URL per media entry, or '' for illegal data."""
    tweet: tweepy.Status = TwitterTestUtils.load_tweet(json_name=json_name)
    assert has_attributes(
        tweet, 'extended_entities') and 'media' in tweet.extended_entities
    url_re = re.compile(r'^https?://([\w-]+\.)+[\w-]+/?([\w\-./?%&=+]*)?$')
    for media in tweet.extended_entities['media']:
        # noinspection PyProtectedMember
        candidate: str = Twitter._get_video_url(media)
        if has_url:
            assert url_re.fullmatch(candidate) is not None
        else:
            assert len(candidate) == 0
def backup_media(self, tweet_medias: Dict[str, TweetMedia]) -> None:
    """Save media for tweets not yet in the store; record failed uploads.

    For each unseen tweet: save every media URL, register the tweet in the
    store, and persist any (url, description) pairs that failed to upload so
    they can be retried later.
    """
    if not tweet_medias:
        logger.info('No new tweet media.')
        return
    # Keep only ids the store has not recorded yet.
    target_tweet_ids = self.store.fetch_not_added_tweet_ids(
        list(tweet_medias.keys()))
    if not target_tweet_ids:
        logger.info('No new tweet media.')
        return
    logger.info(f'Target tweet media count={len(target_tweet_ids)}')
    if self._save_mode == 'google':
        self.google_photos.init_album()
    # NOTE(review): the trailing comma unpacks single-element rows — presumably
    # fetch_not_added_tweet_ids returns sequences like [(tweet_id,), ...];
    # confirm against the store implementation.
    for tweet_id, in target_tweet_ids:
        target_tweet_media: TweetMedia = tweet_medias[tweet_id]
        target_tweet: tweepy.Status = target_tweet_media.tweet
        failed_upload_medias: List[Tuple[str, str]] = []
        target_tweet_media.show_info()
        for url in target_tweet_media.urls:
            description: str = Twitter.make_tweet_description(target_tweet)
            is_saved: bool = self.save_media(url, description,
                                             target_tweet.user.screen_name)
            if not is_saved:
                # Remember the failure; it is persisted after the loop.
                failed_upload_medias.append((url, description))
                logger.warning(
                    f'Save failed. tweet_id={tweet_id}, media_url={url}')
                continue
        # The tweet is registered even when some uploads failed; the failed
        # URLs are stored separately below for retry.
        self.store_tweet_info(target_tweet)
        if not failed_upload_medias:
            logger.debug(
                f'All media upload succeeded. urls={target_tweet_media.urls}'
            )
            continue
        self.store_failed_upload_media(target_tweet, failed_upload_medias)
def __init__(self):
    """Create crawler collaborators and ensure the download directory exists."""
    self._download_dir = './download'
    os.makedirs(self._download_dir, exist_ok=True)
    self.twitter = Twitter()
    self.store = Store()
    self.google_photos = GooglePhotos()
class TestTwitter:
    """Tests for the Twitter wrapper (URL helpers, media extraction, crawling)."""

    # Populated in setUp before each test.
    twitter: Twitter
    mock_cursor: mock.MagicMock
    mock_instagram: mock.MagicMock

    def __init__(self) -> None:
        # Fixed test user shared by all test cases.
        self.user: TwitterUser = TwitterUser(id=TEST_TWITTER_ID)

    def setUp(self) -> None:
        """Install dummy credentials and fresh mocks before each test."""
        os.environ['TWITTER_CONSUMER_KEY'] = 'DUMMY'
        os.environ['TWITTER_CONSUMER_SECRET'] = 'DUMMY'
        os.environ['TWITTER_ACCESS_TOKEN'] = 'DUMMY'
        os.environ['TWITTER_ACCESS_TOKEN_SECRET'] = 'DUMMY'
        self.mock_cursor = mock.MagicMock(tweepy.Cursor)
        self.mock_instagram = mock.MagicMock(Instagram)
        # NOTE(review): mock_cursor / mock_instagram / mock_twitter_func look
        # like module-level patch objects defined outside this view — confirm.
        mock_cursor.reset_mock()
        mock_cursor.pages.reset_mock(side_effect=True)
        mock_instagram.reset_mock()
        mock_twitter_func.reset_mock()
        # Return mock when instantiating
        mock_cursor.return_value = self.mock_cursor
        mock_instagram.return_value = self.mock_instagram
        self.twitter = Twitter()

    @staticmethod
    def tearDown() -> None:
        """Remove the dummy credentials so they don't leak into other tests."""
        delete_env('TWITTER_CONSUMER_KEY')
        delete_env('TWITTER_CONSUMER_SECRET')
        delete_env('TWITTER_ACCESS_TOKEN')
        delete_env('TWITTER_ACCESS_TOKEN_SECRET')

    @nose2.tools.params(
        ('test.jpg', 'test.jpg?name=orig'),
        ('test.jpg?foo=bar', 'test.jpg?foo=bar&name=orig'),
        ('test.jpg?name=100', 'test.jpg?name=orig'),
        ('test.jpg?name=aBc789', 'test.jpg?name=orig'),
        ('test.jpg?name=aBc789&foo=aaa', 'test.jpg?name=orig&foo=aaa'))
    def test_make_original_image_url(self, url: str, ans: str) -> None:
        """make_original_image_url forces name=orig onto the query string."""
        original_url: str = Twitter.make_original_image_url(url)
        assert original_url == ans

    @nose2.tools.params(('is_fav_rt_quoted', True),
                        ('is_not_fav_rt_quoted', False)
                        # Maybe quoted_status never be empty
                        )
    def test_is_quoted(self, json_name: str, ans: bool) -> None:
        """is_quoted matches the fixture's quote status."""
        tweet: tweepy.Status = TwitterTestUtils.load_tweet(json_name=json_name)
        result: bool = Twitter.is_quoted(tweet)
        assert result is ans

    @nose2.tools.params(('is_fav_rt_quoted', True),
                        ('is_not_fav_rt_quoted', False)
                        # Maybe favorited is always included
                        )
    def test_is_favorited(self, json_name: str, ans: bool) -> None:
        """is_favorited matches the fixture's favorite status."""
        tweet: tweepy.Status = TwitterTestUtils.load_tweet(json_name=json_name)
        result: bool = Twitter.is_favorited(tweet)
        assert result is ans

    @nose2.tools.params(('is_fav_rt_quoted', True),
                        ('is_not_fav_rt_quoted', False)
                        # Maybe retweeted_status never be empty
                        )
    def test_is_retweeted(self, json_name: str, ans: bool) -> None:
        """is_retweeted matches the fixture's retweet status."""
        tweet: tweepy.Status = TwitterTestUtils.load_tweet(json_name=json_name)
        result: bool = Twitter.is_retweeted(tweet)
        assert result is ans

    @nose2.tools.params(('has_images', True),
                        ('has_illegal_images', False)
                        # Maybe media_url_https is always included
                        )
    def test_get_photo_url(self, json_name: str, has_url: bool) -> None:
        """_get_photo_url yields a valid URL per media, or '' for illegal data."""
        tweet: tweepy.Status = TwitterTestUtils.load_tweet(json_name=json_name)
        assert has_attributes(
            tweet, 'extended_entities') and 'media' in tweet.extended_entities
        pattern = re.compile(
            r'^https?://([\w-]+\.)+[\w-]+/?([\w\-./?%&=+]*)?$')
        for media in tweet.extended_entities['media']:
            # noinspection PyProtectedMember
            url: str = Twitter._get_photo_url(media)
            if has_url:
                assert pattern.fullmatch(url) is not None
            else:
                assert len(url) == 0

    @nose2.tools.params(('has_video', True), ('has_illegal_video', False))
    def test_get_video_url(self, json_name: str, has_url: bool) -> None:
        """_get_video_url yields a valid URL per media, or '' for illegal data."""
        tweet: tweepy.Status = TwitterTestUtils.load_tweet(json_name=json_name)
        assert has_attributes(
            tweet, 'extended_entities') and 'media' in tweet.extended_entities
        pattern = re.compile(
            r'^https?://([\w-]+\.)+[\w-]+/?([\w\-./?%&=+]*)?$')
        for media in tweet.extended_entities['media']:
            # noinspection PyProtectedMember
            url: str = Twitter._get_video_url(media)
            if has_url:
                assert pattern.fullmatch(url) is not None
            else:
                assert len(url) == 0

    @nose2.tools.params(('has_instagram_url', True),
                        ('has_not_images', False)
                        # Maybe urls never be empty
                        )
    def test_has_instagram_url(self, json_name: str, has_url: bool) -> None:
        """_has_instagram_url mirrors whether the fixture links Instagram."""
        tweet: tweepy.Status = TwitterTestUtils.load_tweet(json_name=json_name)
        assert has_attributes(tweet, 'entities')
        # noinspection PyProtectedMember
        assert Twitter._has_instagram_url(tweet.entities) is has_url

    @nose2.tools.params(('has_images', False), ('has_instagram_url', True),
                        ('has_illegal_instagram_url', True))
    def test_get_instagram_url(self, json_name: str, has_url: bool) -> None:
        """_get_instagram_url returns a well-formed URL or an empty string."""
        tweet: tweepy.Status = TwitterTestUtils.load_tweet(json_name=json_name)
        assert has_attributes(tweet, 'entities')
        # noinspection PyProtectedMember
        url: str = Twitter._get_instagram_url(tweet.entities)
        assert isinstance(url, str)
        if has_url:
            pattern = re.compile(
                r'^https?://([\w-]+\.)+[\w-]+/?([\w\-./?%&=+]*)?$')
            assert pattern.fullmatch(url) is not None
        else:
            assert len(url) == 0

    @nose2.tools.params('has_images', 'has_video'
                        # Maybe extended_entities never be empty
                        )
    def test_get_twitter_media_urls(self, json_name: str) -> None:
        """_get_twitter_media_urls extracts only well-formed media URLs."""
        tweet: tweepy.Status = TwitterTestUtils.load_tweet(json_name=json_name)
        assert has_attributes(tweet, 'extended_entities')
        # noinspection PyProtectedMember
        media_url_list: List[str] = self.twitter._get_twitter_media_urls(
            tweet.extended_entities)
        assert len(media_url_list) != 0
        pattern = re.compile(
            r'^https?://([\w-]+\.)+[\w-]+/?([\w\-./?%&=+]*)?$')
        for url in media_url_list:
            assert pattern.fullmatch(url) is not None

    @nose2.tools.params(('has_images', 'Twitter'), ('has_not_images', None),
                        ('has_instagram_url', 'Instagram'))
    def test_get_tweet_medias(self, json_name: str,
                              media_type: Optional[str]) -> None:
        """get_tweet_medias maps tweet ids to TweetMedia with non-empty urls."""
        self.mock_instagram.get_media_urls.return_value = [INSTAGRAM_DUMMY_URL]
        tweet: tweepy.Status = TwitterTestUtils.load_tweet(json_name=json_name)
        target_tweet_medias: Dict[
            str, TweetMedia] = self.twitter.get_tweet_medias(tweet)
        if media_type is None:
            # No media in the fixture: the result must be empty.
            assert len(target_tweet_medias) == 0
            return
        for key, value in target_tweet_medias.items():
            assert isinstance(key, str)
            assert isinstance(value, TweetMedia)
            assert len(value.urls) != 0

    def test_make_tweet_permalink(self) -> None:
        """make_tweet_permalink builds the canonical status URL."""
        tweet: tweepy.Status = TwitterTestUtils.load_tweet(
            json_name='has_images')
        permalink: str = self.twitter.make_tweet_permalink(tweet)
        assert f'https://twitter.com/{tweet.user.screen_name}/status/{tweet.id_str}' == permalink

    def test_make_tweet_description(self) -> None:
        """make_tweet_description joins name, handle and text with newlines."""
        tweet: tweepy.Status = TwitterTestUtils.load_tweet(
            json_name='has_images')
        description: str = self.twitter.make_tweet_description(tweet)
        assert f'{tweet.user.name}\n' \
               f'@{tweet.user.screen_name}\n' \
               f'{tweet.full_text}' == description

    def test_difference_tweet_medias(self) -> None:
        """difference_tweet_medias reports only tweets absent from the old set."""
        old_tweets: Dict[
            str, TweetMedia] = TwitterTestUtils.load_target_media_tweets(
                json_name='old')
        new_tweets: Dict[
            str, TweetMedia] = TwitterTestUtils.load_target_media_tweets(
                json_name='new')
        target_tweet_medias: Dict[
            str, TweetMedia] = Twitter.difference_tweet_medias(
                new_tweets, old_tweets)
        assert len(target_tweet_medias) == 1

    @nose2.tools.params(
        6,
    )
    def test_get_favorite_media(self, count: int) -> None:
        """get_favorite_media yields the expected count and logs its start line."""
        self.mock_cursor.pages.side_effect = MockTweepyCursor.pages
        self.mock_instagram.get_media_urls.return_value = [INSTAGRAM_DUMMY_URL]
        with LogCapture(level=logging.INFO) as log:
            target_tweet_medias: Dict[
                str, TweetMedia] = self.twitter.get_favorite_media(self.user)
            log.check((
                'app.twitter', 'INFO',
                f'Get favorite tweet media. user={self.user.id}. '
                f'pages={self.twitter.tweet_page}, count={self.twitter.tweet_count}'
            ))
        assert len(target_tweet_medias) == count
        for tweet_id, tweet_media in target_tweet_medias.items():
            assert isinstance(tweet_id, str)
            assert isinstance(tweet_media, TweetMedia)

    @mock.patch('app.crawler.Twitter.get_tweet_medias', mock_twitter_func)
    def test_get_favorite_media__exception(self) -> None:
        """A per-tweet exception is logged and yields an empty result."""
        self.mock_cursor.pages.side_effect = MockTweepyCursor.pages
        self.mock_instagram.get_media_urls.return_value = [INSTAGRAM_DUMMY_URL]
        mock_twitter_func.side_effect = Exception()
        with LogCapture(level=logging.ERROR) as log:
            target_tweet_medias: Dict[
                str, TweetMedia] = self.twitter.get_favorite_media(self.user)
            assert LogCaptureHelper.check_contain(
                log, ('app.twitter', 'ERROR',
                      'Get tweet media error. exception=()'))
        assert len(target_tweet_medias) == 0

    @nose2.tools.params(
        ('rt', 7),
        ('rtfav', 7),
        ('rrrt', 7),
        ('mixed', 3),
    )
    def test_get_rt_media(self, mode: str, count: int) -> None:
        """get_rt_media yields the expected count for each crawl mode."""
        self.mock_cursor.pages.side_effect = MockTweepyCursor.pages
        self.mock_instagram.get_media_urls.return_value = [INSTAGRAM_DUMMY_URL]
        self.twitter.mode = mode
        target_tweet_medias: Dict[str, TweetMedia] = self.twitter.get_rt_media(
            self.user)
        assert len(target_tweet_medias) == count
        for tweet_id, tweet_media in target_tweet_medias.items():
            assert isinstance(tweet_id, str)
            assert isinstance(tweet_media, TweetMedia)

    @mock.patch('app.crawler.Twitter.get_tweet_medias', mock_twitter_func)
    def test_get_rt_media__exception(self) -> None:
        """A per-tweet exception is logged and yields an empty result."""
        self.mock_cursor.pages.side_effect = MockTweepyCursor.pages
        self.mock_instagram.get_media_urls.return_value = [INSTAGRAM_DUMMY_URL]
        mock_twitter_func.side_effect = Exception()
        with LogCapture(level=logging.ERROR) as log:
            target_tweet_medias: Dict[str, TweetMedia] = self.twitter.get_rt_media(
                self.user)
            assert LogCaptureHelper.check_contain(
                log, ('app.twitter', 'ERROR',
                      'Get tweet media error. exception=()'))
        assert len(target_tweet_medias) == 0

    @nose2.tools.params(
        ('rt', 7),
        ('fav', 6),
        ('rtfav', 10),
        ('rrrt', 7),
        ('mixed', 3),
    )
    def test_get_target_tweets(self, mode: str, count: int) -> None:
        """get_target_tweets combines sources per mode with the expected count."""
        self.mock_cursor.pages.side_effect = MockTweepyCursor.pages
        self.mock_instagram.get_media_urls.return_value = [INSTAGRAM_DUMMY_URL]
        self.twitter.mode = mode
        target_tweet_medias: Dict[str, TweetMedia] = self.twitter.get_target_tweets(
            self.user)
        assert len(target_tweet_medias) == count
        for tweet_id, tweet_media in target_tweet_medias.items():
            assert isinstance(tweet_id, str)
            assert isinstance(tweet_media, TweetMedia)
def test_has_instagram_url(self, json_name: str, has_url: bool) -> None:
    """_has_instagram_url mirrors whether the fixture links Instagram."""
    status: tweepy.Status = TwitterTestUtils.load_tweet(json_name=json_name)
    assert has_attributes(status, 'entities')
    # noinspection PyProtectedMember
    result = Twitter._has_instagram_url(status.entities)
    assert result is has_url
def test_is_retweeted(self, json_name: str, ans: bool) -> None:
    """is_retweeted matches the fixture's retweet status."""
    status: tweepy.Status = TwitterTestUtils.load_tweet(json_name=json_name)
    assert Twitter.is_retweeted(status) is ans
def test_make_original_image_url(self, url: str, ans: str) -> None:
    """make_original_image_url forces name=orig onto the query string."""
    assert Twitter.make_original_image_url(url) == ans
def twitter():
    """Start the OAuth dance: redirect the user to Twitter's authorize page."""
    request_token, _request_token_secret = Twitter.get_request_token()
    authorize_url = 'https://api.twitter.com/oauth/authorize?oauth_token=%s' % (
        request_token)
    return redirect(authorize_url)
class Crawler:
    """Periodically crawls users' retweet media and backs it up to Google Photos."""

    def __init__(self):
        """Create collaborators and ensure the download directory exists."""
        self.twitter = Twitter()
        self.store = Store()
        self.google_photos = GooglePhotos()
        self._download_dir = './download'
        os.makedirs(self._download_dir, exist_ok=True)

    @staticmethod
    @retry(urllib.error.HTTPError, tries=3, delay=1)
    def download_media(media_url, download_path):
        """Download one media file, retrying up to 3 times on HTTP errors."""
        urllib.request.urlretrieve(media_url, download_path)

    def upload_google_photos(self, media_path):
        """Upload a file to Google Photos; return True on success, else False.

        Fix: the original wrapped this in a one-shot ``while True: ... break``
        loop that never iterated; a plain try/except is equivalent.
        """
        try:
            self.google_photos.upload_media(media_path)
        except HttpError as error:
            print(f'HTTP status={error.resp.reason}', file=sys.stderr)
            traceback.print_exc()
            return False
        except Exception as error:
            print(f'Error reason={error}', file=sys.stderr)
            traceback.print_exc()
            return False
        return True

    def make_download_path(self, url):
        """Map a media URL (query string stripped) to a local download path."""
        # Fix: raw string — '\?' is an invalid escape in a plain string literal.
        url = re.sub(r'\?.*$', '', url)
        return f'{self._download_dir}/{os.path.basename(url)}'

    def backup_media(self, media_tweet_dicts):
        """Download and upload every media URL of each not-yet-stored tweet."""
        for tweet_id, tweet_status in media_tweet_dicts.items():
            if self.store.is_added_tweet(tweet_id):
                continue
            print(tweet_status)
            for url in tweet_status['urls']:
                # download
                download_path = self.make_download_path(url)
                if url.startswith(("https://pbs.twimg.com/media",
                                   "http://pbs.twimg.com/media")):
                    url = self.twitter.make_original_image_url(url)
                try:
                    Crawler.download_media(url, download_path)
                except urllib.error.HTTPError:
                    traceback.print_exc()
                    print(f'download failed. tweet_id={tweet_id}, media_url={url}',
                          file=sys.stderr)
                    continue
                # upload
                is_uploaded = self.upload_google_photos(download_path)
                if not is_uploaded:
                    print(f'upload failed. tweet_id={tweet_id}, media_url={url}',
                          file=sys.stderr)
                    continue
                # delete — fix: remove the file directly instead of spawning a
                # shell with an interpolated path (`rm -f {path}` via Popen).
                os.remove(download_path)
                # store update
                # NOTE(review): recorded per-URL in the original statement
                # order; duplicates are absorbed by the except below — consider
                # moving this after the URL loop.
                try:
                    self.store.insert_tweet_info(tweet_id,
                                                 tweet_status['user_id'],
                                                 tweet_status['tweet_date'])
                except Exception as e:
                    print(f'Insert failed. tweet_id={tweet_id}', e.args,
                          file=sys.stderr)
                    traceback.print_exc()

    def crawling_rt(self, user):
        """Fetch retweet media for one user and back it up."""
        media_tweet_dicts = self.twitter.get_rt_media(user)
        self.backup_media(media_tweet_dicts)

    def main(self):
        """Crawl every configured user forever, sleeping INTERVAL minutes."""
        interval_minutes = int(Env.get_environment('INTERVAL', default='5'))
        user_ids = Env.get_environment('TWITTER_USER_IDS')
        user_list = [TwitterUser(user_id) for user_id in user_ids.split(',')]
        while True:
            try:
                for user in user_list:
                    self.crawling_rt(user)
            # Fix: bare `except:` also swallowed SystemExit/KeyboardInterrupt,
            # making the process unkillable via Ctrl-C.
            except Exception:
                traceback.print_exc()
            time.sleep(interval_minutes * 60)