def index():
    form = UrlEntry()
    if form.validate_on_submit():
        url_to_prospect = format_url(form.url.data)
        domain_data = models.DomainData.query.filter_by(domain_url=url_to_prospect).first()
        crawler = Crawler()
        # Re-scrape an existing domain record, or create one from scratch.
        if domain_data:
            domain_data = crawler.scrape_domain_data(url_to_prospect, domain_data)
        else:
            domain_data = crawler.scrape_domain_data(url_to_prospect)
        pages_to_scrape = crawler.spider_site(domain_data.domain_url)
        pages_data = [crawler.scrape_page_data(page_to_scrape, domain_data)
                      for page_to_scrape in pages_to_scrape]
        ranker = Ranker()
        domain_data.ranking = ranker.rank_site(domain_data)
        domain_data.level = ranker.domain_level_calculator(domain_data.ranking)
        db.session.add(domain_data)
        db.session.add_all(pages_data)
        db.session.commit()
        return redirect(url_for('siteinspect', site_name=domain_data.site_name))
    return render_template("index.html", form=form)
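The view above relies on a format_url helper that is not shown. A minimal sketch of what it might do, assuming the goal is to normalize user input to a scheme-plus-host URL before the database lookup (the real helper may differ):

from urllib.parse import urlparse

def format_url(raw_url: str) -> str:
    # Hypothetical implementation: normalize user input so that
    # equivalent entries map to the same DomainData row.
    url = raw_url.strip()
    if not url.startswith(('http://', 'https://')):
        url = f'http://{url}'  # assume http when no scheme was typed
    parsed = urlparse(url)
    return f'{parsed.scheme}://{parsed.netloc}'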
def test_crawler_downloadOneUrlNewspaperThread_method_returns_correct_result(self):
    c = Crawler(
        "https://politica.elpais.com/politica/2017/08/29/actualidad/1504006030_167758.html"
    )
    c.downloadOneUrlThread("alienigenaviolanenes.html")
    self.assertTrue(os.path.exists("alienigenaviolanenes.html"))
    self.assertEqual(len(c.files), 1)
async def run(loop):
    # Enable 'use_index' to use Elasticsearch (Part 3).
    manager = PersistManager(use_index=True)
    crawler = Crawler(loop=loop, manager=manager)
    await crawler.get_history()   # Retrieve the 5-minute history (Part 1)
    await crawler.run_updates()   # Constant updates (Part 2)
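A coroutine like run() still needs an event loop to drive it. A minimal, hypothetical entry point, assuming nothing beyond the standard asyncio API:

import asyncio

if __name__ == '__main__':
    # Create a loop, pass it to run(), and block until the crawl finishes.
    loop = asyncio.get_event_loop()
    loop.run_until_complete(run(loop))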
def main(keywords: List[str], access_token: str, access_token_secret: str,
         config: str = None, api_type: str = None, token: int = 0,
         verbose: bool = False):
    """An entry point to the twitter crawler application."""
    loglevel = 'DEBUG' if verbose else 'INFO'
    LOG.setLevel(loglevel)
    LOG.info(msg=f"Argument {config} {api_type}")
    crawler_config = None
    if config:
        crawler_config = read_config(config)
    # Explicit keyword/token arguments take precedence over the config file.
    if keywords and access_token and access_token_secret:
        crawler_config = construct_config(keywords, access_token, access_token_secret)
    if crawler_config:
        LOG.debug(crawler_config)
        LOG.debug(f"Api Type - {api_type}")
        crawler = Crawler.create_crawler_instance(api_type, crawler_config, int(token))
        crawler.execute()
    else:
        # Without a config file or explicit credentials there is nothing to run.
        click.echo("Option is required")
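main() reads like the body of a click command, but the decorators are not shown. A hypothetical wiring, with option names inferred from the parameters above (the real CLI may declare them differently):

import click

@click.command()
@click.option('--keywords', multiple=True)
@click.option('--access-token')
@click.option('--access-token-secret')
@click.option('--config')
@click.option('--api-type')
@click.option('--token', default=0)
@click.option('--verbose', is_flag=True)
def cli(keywords, access_token, access_token_secret, config, api_type, token, verbose):
    # Delegate to the entry point defined above.
    main(list(keywords), access_token, access_token_secret,
         config, api_type, token, verbose)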
def test_crawler_urlsLevelHost_method_returns_correct_result(self):
    c = Crawler("http://www.elpais.es")
    c.urlsLevelHost(1)
    uno = len(c.urls)
    c.urlsLevelHost(2)
    dos = len(c.urls)
    # Crawling one level deeper should discover at least as many URLs.
    self.assertGreaterEqual(dos, uno)
    self.assertTrue(dos > 1)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("url")
    parser.add_argument("--use-web-api", action='store_true')
    args = parser.parse_args()
    if not args.use_web_api:
        # Crawl locally with the asyncio-based crawler.
        from app.crawler import Crawler
        import asyncio
        import logging

        loop = asyncio.get_event_loop()
        c = Crawler(logging_level=logging.INFO)
        found_domains = loop.run_until_complete(c.crawl(args.url))
        print(c._domains)
    else:
        # Delegate the crawl to the web API instead.
        import requests

        found_domains = requests.post("http://localhost/count_domains",
                                      json={"urls": [args.url]})
        found_domains = list(found_domains.json().values())[0]
        print(f"found {found_domains} domains")
class Getter():
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        # Check whether the proxy pool has reached its size limit.
        return self.redis.count() >= POOL_UPPER_THRESHOLD

    def run(self):
        print('Getter is starting')
        if not self.is_over_threshold():
            for site_name in self.crawler.__CrawlName__:
                proxies = self.crawler.get_raw_proxies(site_name)
                sys.stdout.flush()
                for item in proxies:
                    self.redis.add(item)
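Getter only tops up the pool once per run() call, so something has to invoke it periodically. A minimal scheduling sketch, assuming only the Getter class defined above (the interval is an arbitrary choice):

import time

def schedule_getter(cycle_seconds: int = 20) -> None:
    # Re-run the getter on a fixed interval; is_over_threshold()
    # inside run() keeps the pool from growing past its limit.
    getter = Getter()
    while True:
        getter.run()
        time.sleep(cycle_seconds)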
def test_crawler_downloadInit_method_returns_correct_result(self):
    c = Crawler("http://www.gnu.org")
    self.assertEqual(c.url, "http://www.gnu.org")
    self.assertEqual(
        c.title,
        "The GNU Operating System and the Free Software Movement")
def test_crawler_urlsLevel1Host_method_returns_correct_result(self):
    c = Crawler("http://www.elpais.es")
    c.urlsLevel1Host()
    self.assertTrue(len(c.urls) > 1)
def test_crawler_downloadOneUrlThread_method_returns_correct_result(self):
    c = Crawler("http://www.elpais.es")
    c.downloadOneUrlThread("elpais.html")
    self.assertTrue(os.path.exists("elpais.html"))
def test_crawler_downloadOneUrl_method_returns_correct_result(self):
    c = Crawler("http://www.urjc.es")
    c.downloadOneUrl("urjc.html")
    self.assertTrue(os.path.exists("urjc.html"))
def main():
    proxies = Crawler.run()
    Sniffer(proxies).verify()
import logging

import pymongo

from app.crawler import Crawler

# Load settings.
import settings

# Setup logging.
logging.basicConfig(level=settings.LOG_LEVEL)
logging.getLogger("requests").setLevel(logging.WARNING)
logger = logging.getLogger(__name__)

# Connect to database.
client = pymongo.MongoClient(settings.DB_HOST, settings.DB_PORT)
db = client['aiatthewebscale']

crawler = Crawler(settings)
for runId in range(8050, 8050 + 1, 1):
    for i in range(1, 100000 + 1, 1):
        logger.info('At interaction {i} for runId {runId}'.format(i=i, runId=runId))
        context = crawler.get(runId, i)
        # Build and store the event record.
        event = {'runid': runId, 'i': i}
        event.update(context)
        # insert_one replaces the deprecated Collection.insert in pymongo 3+.
        db['contexts'].insert_one(event)
class TestCrawler:
    crawler: Crawler

    @mock.patch('app.crawler.GooglePhotos', mock_google_photos)
    @mock.patch('app.crawler.Twitter', mock_twitter)
    @mock.patch('app.crawler.Store', mock_store)
    @mock.patch('os.makedirs', mock_makedirs)
    def setUp(self) -> None:
        self.clear_env()
        mock_google_photos.reset_mock()
        mock_google_photos.upload_media.reset_mock(side_effect=True)
        mock_twitter.reset_mock(side_effect=True)
        mock_twitter.make_original_image_url.reset_mock(side_effect=True)
        mock_store.reset_mock()
        mock_store.fetch_not_added_tweet_ids.reset_mock(return_value=True)
        mock_store.fetch_all_failed_upload_medias.reset_mock(return_value=True)
        mock_store.insert_tweet_info.reset_mock(side_effect=True)
        mock_store.insert_failed_upload_media.reset_mock(side_effect=True)
        mock_request.reset_mock(side_effect=True)
        mock_request.urlretrieve.reset_mock(side_effect=True)
        mock_makedirs.reset_mock()
        mock_rmtree.reset_mock()
        mock_sleep.reset_mock(side_effect=True)
        mock_crawler_func.reset_mock(side_effect=True, return_value=True)
        mock_crawler_func2.reset_mock(side_effect=True, return_value=True)
        mock_google_photos.return_value = mock_google_photos
        mock_twitter.return_value = mock_twitter
        mock_store.return_value = mock_store
        os.environ['SAVE_MODE'] = 'google'
        self.crawler = Crawler()

    def tearDown(self) -> None:
        self.clear_env()

    @staticmethod
    def clear_env() -> None:
        delete_env('TWITTER_USER_IDS')
        delete_env('INTERVAL')
        delete_env('MODE_SPECIFIED')
        delete_env('TWEET_COUNT')
        delete_env('TWEET_PAGES')
        delete_env('SAVE_MODE')
        delete_env('LOGGING_LEVEL')
        delete_env('DATABASE_URL')
        delete_env('DATABASE_SSLMODE')
        delete_env('TZ')
        delete_env('TWITTER_CONSUMER_KEY')
        delete_env('TWITTER_CONSUMER_SECRET')
        delete_env('TWITTER_ACCESS_TOKEN')
        delete_env('TWITTER_ACCESS_TOKEN_SECRET')
        delete_env('GOOGLE_CLIENT_ID')
        delete_env('GOOGLE_CLIENT_SECRET')
        delete_env('GOOGLE_REFRESH_TOKEN')
        delete_env('GOOGLE_ALBUM_TITLE')

    @staticmethod
    def load_failed_upload_media(json_name: str) -> List[Tuple[str, str]]:
        json_path = f'{JSON_DIR}/crawler/failed_upload_media/{json_name}.json'
        return [tuple(failed_upload_media)
                for failed_upload_media in load_json(json_path)]  # type: ignore

    @staticmethod
    def load_fetch_all_failed_upload_media(json_name: str) -> List[Tuple[str, str, str]]:
        json_path = f'{JSON_DIR}/crawler/fetch_all_failed_upload_media/{json_name}.json'
        return [tuple(failed_upload_media_info)
                for failed_upload_media_info in load_json(json_path)]  # type: ignore

    @mock.patch('os.makedirs', mock_makedirs)
    @nose2.tools.params(
        ('https://test.com/test.jpg', 'download_dir/path/test.jpg'),
    )
    def test_download_media(self, media_url: str, download_path: str) -> None:
        mock_makedirs.reset_mock()
        self.crawler.download_media(media_url, download_path)
        mock_makedirs.assert_called_once_with(os.path.dirname(download_path), exist_ok=True)
        mock_request.urlretrieve.assert_called_once_with(media_url, download_path)

    @nose2.tools.params(
        ('download_dir/path/test.jpg', 'test description', True),
    )
    def test_upload_google_photos(self, media_path: str, description: str, ans: bool) -> None:
        is_upload: bool = self.crawler.upload_google_photos(media_path, description)
        assert is_upload is ans
        mock_google_photos.upload_media.assert_called_once_with(media_path, description)

    @nose2.tools.params(
        (500, 'Server Error'),
    )
    def test_upload_google_photos__http_error(self, status: int, reason: str) -> None:
        res: dict = {'status': status, 'reason': reason}
        error_response = httplib2.Response(res)
        error_response.reason = reason
        mock_google_photos.upload_media.side_effect = HttpError(resp=error_response, content=b"{}")
        with LogCapture() as log:
            is_upload: bool = self.crawler.upload_google_photos('media_path', 'description')
            msg = f'HTTP status={reason}'
            log.check(('app.crawler', 'ERROR', msg))
        assert is_upload is False

    @nose2.tools.params(
        'Any exception'
    )
    def test_upload_google_photos__any_exception(self, reason: str) -> None:
        mock_google_photos.upload_media.side_effect = Exception(reason)
        self.crawler.google_photos = mock_google_photos
        with LogCapture() as log:
            is_upload: bool = self.crawler.upload_google_photos('media_path', 'description')
            msg = f'Error reason={reason}'
            log.check(('app.crawler', 'ERROR', msg))
        assert is_upload is False
        mock_google_photos.upload_media.reset_mock(side_effect=True)

    @nose2.tools.params(
        ('https://test.com/test.jpg', 'test_user', 'test_user/test.jpg')
    )
    def test_make_download_path(self, url: str, user_id: str, ans: str) -> None:
        download_path: str = self.crawler.make_download_path(url, user_id)
        # noinspection PyProtectedMember
        assert download_path == f'{self.crawler._download_dir}/{ans}'

    @mock.patch('shutil.rmtree', mock_rmtree)
    @mock.patch('os.makedirs', mock_makedirs)
    @nose2.tools.params(
        ('https://test.com/test.jpg', 'other', 'google'),
        ('https://pbs.twimg.com/media/test.png', 'Twitter', 'google'),
        ('http://pbs.twimg.com/media/test.jpg', 'Twitter', 'local')
    )
    def test_save_media(self, url: str, media_type: str, save_mode: str) -> None:
        self.crawler._save_mode = save_mode
        mock_twitter.make_original_image_url.side_effect = Twitter.make_original_image_url
        # make log msg
        msg_url = url
        if media_type == 'Twitter':
            msg_url = f'{url}?name=orig'
        download_path = f'{TEST_DOWNLOAD_DIR_PATH}/{os.path.basename(url)}'
        download_file_msg = f'Download file. url={msg_url}, path={download_path}'
        delete_msg = f'Delete directory. path={TEST_DOWNLOAD_DIR_PATH}'
        with LogCapture() as log:
            is_save = self.crawler.save_media(url, TEST_DESCRIPTION, TEST_USER_ID)
            if save_mode == 'local':
                log.check(('app.crawler', 'DEBUG', download_file_msg))
            elif save_mode == 'google':
                log.check(('app.crawler', 'DEBUG', download_file_msg),
                          ('app.crawler', 'DEBUG', delete_msg))
        assert is_save is True
        if save_mode == 'local':
            assert mock_google_photos.upload_media.call_count == 0
        elif save_mode == 'google':
            assert mock_google_photos.upload_media.call_count == 1

    @mock.patch('time.sleep', mock_sleep)  # for retry
    def test_save_media__download_failed(self) -> None:
        mock_request.urlretrieve.side_effect = urllib.error.HTTPError(
            TEST_MEDIA_URL, code='500', msg='', hdrs='', fp=None)
        with LogCapture(level=logging.ERROR) as log:
            is_save = self.crawler.save_media(TEST_MEDIA_URL, TEST_DESCRIPTION, TEST_USER_ID)
            log.check(('app.crawler', 'ERROR', f'Download failed. media_url={TEST_MEDIA_URL}'))
        assert is_save is False

    @mock.patch('time.sleep', mock_sleep)  # for retry
    def test_save_media__upload_failed(self) -> None:
        mock_google_photos.upload_media.side_effect = Exception()
        with LogCapture(level=logging.ERROR) as log:
            is_save = self.crawler.save_media(TEST_MEDIA_URL, TEST_DESCRIPTION, TEST_USER_ID)
            log.check(('app.crawler', 'ERROR', 'Error reason='),
                      ('app.crawler', 'ERROR', f'upload failed. media_url={TEST_MEDIA_URL}'))
        assert is_save is False

    @mock.patch('tests.test_twitter.TweetMedia', mock_media_tweet)
    @mock.patch('app.crawler.Crawler.store_tweet_info', mock_crawler_func)
    @nose2.tools.params(
        'local',
        'google'
    )
    def test_backup_media(self, save_mode: str) -> None:
        mock_store.fetch_not_added_tweet_ids.return_value = [(TEST_TWEET_ID,)]
        self.crawler._save_mode = save_mode
        target_media_tweets: Dict[str, TweetMedia] = TwitterTestUtils.load_target_media_tweets(TEST_MEDIA_TWEETS)
        target_media_tweet = target_media_tweets[TEST_TWEET_ID]
        with LogCapture(level=logging.DEBUG) as log:
            self.crawler.backup_media(target_media_tweets)
            log.check(('app.crawler', 'INFO', f'Target tweet media count={TEST_TARGET_ID_COUNT}'),
                      ('app.crawler', 'DEBUG', f'All media upload succeeded. urls={target_media_tweet.urls}'))
        mock_crawler_func.assert_called_once_with(target_media_tweets[TEST_TWEET_ID].tweet)
        if save_mode == 'local':
            assert mock_google_photos.init_album.call_count == 0
        elif save_mode == 'google':
            assert mock_google_photos.init_album.call_count == 1

    def test_backup_media__no_new_tweet(self) -> None:
        with LogCapture(level=logging.INFO) as log:
            self.crawler.backup_media({})
            log.check(('app.crawler', 'INFO', 'No new tweet media.'))

    def test_backup_media__no_new_tweet_ids(self) -> None:
        mock_store.fetch_not_added_tweet_ids.return_value = []
        target_media_tweets: Dict[str, TweetMedia] = TwitterTestUtils.load_target_media_tweets(TEST_MEDIA_TWEETS)
        with LogCapture(level=logging.INFO) as log:
            self.crawler.backup_media(target_media_tweets)
            log.check(('app.crawler', 'INFO', 'No new tweet media.'))

    @mock.patch('app.crawler.Crawler.store_failed_upload_media', mock_crawler_func)
    def test_backup_media__save_failed(self) -> None:
        mock_store.fetch_not_added_tweet_ids.return_value = [(TEST_TWEET_ID,)]
        target_media_tweets: Dict[str, TweetMedia] = TwitterTestUtils.load_target_media_tweets(TEST_MEDIA_TWEETS)
        target_media_tweet = target_media_tweets[TEST_TWEET_ID]
        url = target_media_tweet.urls[0]
        with mock.patch('app.crawler.Crawler.save_media', return_value=False):
            with LogCapture(level=logging.WARNING) as log:
                self.crawler.backup_media(target_media_tweets)
                log.check(('app.crawler', 'WARNING',
                           f'Save failed. tweet_id={TEST_TWEET_ID}, media_url={url}'))
        target_tweet: tweepy.Status = target_media_tweets[TEST_TWEET_ID].tweet
        failed_upload_media: List[Tuple[str, str]] = self.load_failed_upload_media('one')
        mock_crawler_func.assert_called_once_with(target_tweet, failed_upload_media)

    def test_store_tweet_info(self) -> None:
        target_tweet: tweepy.Status = TwitterTestUtils.load_tweet(json_name=TEST_TWEET)
        self.crawler.store_tweet_info(target_tweet)
        mock_store.insert_tweet_info.assert_called_once_with(
            target_tweet.id_str, target_tweet.user.screen_name, str(target_tweet.created_at))

    def test_store_tweet_info__exception(self) -> None:
        mock_store.insert_tweet_info.side_effect = Exception()
        target_tweet: tweepy.Status = TwitterTestUtils.load_tweet(json_name=TEST_TWEET)
        with LogCapture(level=logging.ERROR) as log:
            self.crawler.store_tweet_info(target_tweet)
            log.check(('app.crawler', 'ERROR',
                       f'Insert failed. tweet_id={target_tweet.id_str}, exception=()'))

    def test_store_failed_upload_media(self) -> None:
        target_tweet: tweepy.Status = TwitterTestUtils.load_tweet(json_name='has_video')
        failed_upload_media: List[Tuple[str, str]] = self.load_failed_upload_media('one')
        self.crawler.store_failed_upload_media(target_tweet, failed_upload_media)
        failed_url, description = failed_upload_media[0]
        mock_store.insert_failed_upload_media.assert_called_once_with(
            failed_url, description, target_tweet.user.screen_name)

    def test_store_failed_upload_media__three(self) -> None:
        target_tweet: tweepy.Status = TwitterTestUtils.load_tweet(json_name='has_instagram_url')
        failed_upload_media: List[Tuple[str, str]] = self.load_failed_upload_media('three')
        self.crawler.store_failed_upload_media(target_tweet, failed_upload_media)
        assert mock_store.insert_failed_upload_media.call_count == 3

    def test_store_failed_upload_media__exception(self) -> None:
        mock_store.insert_failed_upload_media.side_effect = Exception()
        target_tweet: tweepy.Status = TwitterTestUtils.load_tweet(json_name='has_video')
        failed_upload_media: List[Tuple[str, str]] = self.load_failed_upload_media('one')
        failed_url, description = failed_upload_media[0]
        with LogCapture(level=logging.ERROR) as log:
            self.crawler.store_failed_upload_media(target_tweet, failed_upload_media)
            log.check(('app.crawler', 'ERROR',
                       f'Insert failed. failed_url={failed_url}, description={description},'
                       f' exception=()'))

    @mock.patch('app.crawler.Crawler.save_media', mock_crawler_func)
    def test_retry_backup_media(self) -> None:
        all_failed_upload_media: List[Tuple[str, str, str]] = self.load_fetch_all_failed_upload_media('one')
        mock_store.fetch_all_failed_upload_medias.return_value = all_failed_upload_media
        url, description, user_id = all_failed_upload_media[0]
        with LogCapture(level=logging.INFO) as log:
            self.crawler.retry_backup_media()
            log.check(('app.crawler', 'INFO', f'Retry Save media. media_url={url}'))
        mock_store.fetch_all_failed_upload_medias.assert_called_once_with()
        mock_store.delete_failed_upload_media.assert_called_once_with(url)
        mock_crawler_func.assert_called_once_with(url, description, user_id)

    @mock.patch('app.crawler.Crawler.save_media', mock_crawler_func)
    def test_retry_backup_media__three(self) -> None:
        all_failed_upload_media: List[Tuple[str, str, str]] = self.load_fetch_all_failed_upload_media('three')
        mock_store.fetch_all_failed_upload_medias.return_value = all_failed_upload_media
        self.crawler.retry_backup_media()
        assert mock_crawler_func.call_count == 3

    def test_retry_backup_media__save_failed(self) -> None:
        all_failed_upload_media: List[Tuple[str, str, str]] = self.load_fetch_all_failed_upload_media('one')
        mock_store.fetch_all_failed_upload_medias.return_value = all_failed_upload_media
        url, _, _ = all_failed_upload_media[0]
        with mock.patch('app.crawler.Crawler.save_media', return_value=False):
            with LogCapture(level=logging.WARNING) as log:
                self.crawler.retry_backup_media()
                log.check(('app.crawler', 'WARNING', f'Retry Save failed. media_url={url}'))
        mock_store.delete_failed_upload_media.assert_not_called()

    @mock.patch('app.crawler.Crawler.save_media', mock_crawler_func)
    def test_retry_backup_media__exception(self) -> None:
        mock_crawler_func.side_effect = Exception()
        all_failed_upload_media: List[Tuple[str, str, str]] = self.load_fetch_all_failed_upload_media('one')
        mock_store.fetch_all_failed_upload_medias.return_value = all_failed_upload_media
        url, _, _ = all_failed_upload_media[0]
        with LogCapture(level=logging.ERROR) as log:
            self.crawler.retry_backup_media()
            log.check(('app.crawler', 'ERROR', f'Retry backup failed. failed_url={url}, exception=()'))

    @mock.patch('app.crawler.Crawler.backup_media', mock_crawler_func)
    @mock.patch('app.crawler.Crawler.retry_backup_media', mock_crawler_func2)
    def test_crawling_tweets(self) -> None:
        mock_twitter.get_target_tweets.return_value = {}
        user = TwitterUser(id=TEST_TWITTER_ID)
        self.crawler.crawling_tweets(user)
        mock_twitter.get_target_tweets.assert_called_once_with(user)
        mock_crawler_func.assert_called_once_with({})
        mock_crawler_func2.assert_called_once_with()

    @mock.patch('time.sleep', mock_sleep)
    @mock.patch('app.crawler.Crawler.crawling_tweets', mock_crawler_func)
    @nose2.tools.params(
        '10',
        None
    )
    def test_main(self, interval: Optional[str]) -> None:
        mock_sleep.side_effect = Exception()
        setattr(mock_twitter, 'mode', DEFAULT_MODE)
        test_interval: str = DEFAULT_INTERVAL
        if interval:
            os.environ['INTERVAL'] = interval
            test_interval = interval
        os.environ['TWITTER_USER_IDS'] = TEST_TWITTER_ID
        with LogCapture(level=logging.INFO) as log:
            with nose2.tools.such.helper.assertRaises(Exception):
                self.crawler.main()
            log.check(('app.crawler', 'INFO',
                       f'Crawling start. user = {TEST_TWITTER_ID}, mode={DEFAULT_MODE}'),
                      ('app.crawler', 'INFO', f'Interval. sleep {test_interval} minutes.'))
        mock_crawler_func.assert_called_once_with(TwitterUser(id=TEST_TWITTER_ID))

    @mock.patch('time.sleep', mock_sleep)
    @mock.patch('app.crawler.Crawler.crawling_tweets', mock_crawler_func)
    def test_main__exception(self) -> None:
        mock_sleep.side_effect = Exception()
        mock_crawler_func.side_effect = Exception()
        setattr(mock_twitter, 'mode', DEFAULT_MODE)
        os.environ['TWITTER_USER_IDS'] = TEST_TWITTER_ID
        with LogCapture(level=logging.ERROR) as log:
            with nose2.tools.such.helper.assertRaises(Exception):
                self.crawler.main()
            log.check(('app.crawler', 'ERROR', 'Crawling error exception=()'))
from flask import Flask
from flask_apscheduler import APScheduler

from config import Config
from app.models import Database
from app.crawler import Crawler

scheduler = APScheduler()
config = Config()
db = Database()
crawler = Crawler()


def create_app():
    app = Flask(__name__)
    app.config.from_object(config)

    scheduler.init_app(app)
    scheduler.start()

    # Register blueprints (imported here to avoid circular imports).
    from app.api import api as api_blueprint
    app.register_blueprint(api_blueprint)

    return app
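The factory above returns a configured app but nothing runs it. A minimal, hypothetical entry point using only the standard Flask API and the create_app() defined above (host and port are arbitrary):

if __name__ == '__main__':
    app = create_app()
    # Flask's built-in server, for local development only.
    app.run(host='127.0.0.1', port=5000)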
from app.crawler import Crawler
from app.db import DB

db = DB()

if __name__ == '__main__':
    crawler = Crawler(db)
    crawler.run()