def test_execute(self):
    """
    Check that a full crawl run fetches the 19 profiles, a document entry
    for every member, and at least one group document
    :return:
    """
    crawler = FileCrawler()
    crawl_controller = CrawlController(crawler, "d0b7f41f-ad37-3b47-ab70-9feac35557cc")
    crawl_controller.execute()

    self.assertEqual(len(crawl_controller.profiles), 19)
    for member in crawl_controller.members:
        self.assertIn(member.profile_id, crawl_controller.profile_documents)
    self.assertGreater(len(crawl_controller.group_documents), 0)
    self.assertTrue(crawl_controller.succeeded)
def test_crawl_group_members(self):
    """
    Check that the crawler successfully fetches the 19 group members
    :return:
    """
    crawler = FileCrawler()
    crawl_controller = CrawlController(crawler, "d0b7f41f-ad37-3b47-ab70-9feac35557cc")

    self.assertIsNotNone(crawl_controller.members)
    self.assertEqual(len(crawl_controller.members), 0)

    crawl_controller.crawl_group_members()
    self.assertEqual(len(crawl_controller.members), 19)
def test_all_with_pipeline_data(self):
    sqlite_in_memory = SQLiteConfiguration("")
    data_controller = DataController(sqlite_in_memory)
    data_controller.run_schema()

    crawler = FileCrawler()
    crawl_controller = CrawlController(
        crawler, "d0b7f41f-ad37-3b47-ab70-9feac35557cc")
    analysis_controller = AnalysisController()
    pipeline_controller = PipelineController(
        data_controller=data_controller,
        crawl_controller=crawl_controller,
        analysis_controller=analysis_controller)

    # Clean run
    pipeline_controller.execute()

    # Test slim profiles
    slim_profiles = data_controller.api_data.get_profiles_slim()
    self.assertEqual(len(slim_profiles), 19)

    # Test fields
    fields = data_controller.api_data.get_fields()
    self.assertEqual(len(fields), 14)
def __init__(self, *args, **kwargs):
    super(MendeleyCache, self).__init__(*args, **kwargs)

    # Read configuration
    self.configuration = ServiceConfiguration()
    self.configuration.load()
    log.info("Configuration has been loaded")

    # Create service controllers
    self.data_controller = DataController(self.configuration.database)
    self.data_controller.assert_schema()
    log.info("Schema has been checked")

    # Create crawler based on configuration
    self.crawler = None
    """:type : AbstractCrawler"""
    if not self.configuration.uses_mendeley:
        log.info("Pipeline uses FileCrawler")
        self.crawler = FileCrawler()
    else:
        from mendeleycache.crawler.sdk_crawler import SDKCrawler
        log.info("Pipeline uses SDKCrawler")
        self.crawler = SDKCrawler(
            app_id=self.configuration.crawler.app_id,
            app_secret=self.configuration.crawler.app_secret)

    # Create the pipeline
    self.crawl_controller = CrawlController(
        self.crawler, self.configuration.crawler.research_group)
    self.analysis_controller = AnalysisController()
    self.pipeline_controller = PipelineController(
        data_controller=self.data_controller,
        crawl_controller=self.crawl_controller,
        analysis_controller=self.analysis_controller)
    log.info("Pipeline has been initialized")

    # Create the routing controllers
    self.fields_controller = FieldsController(self, self.data_controller)
    self.profiles_controller = ProfilesController(
        self, self.data_controller, self.configuration.cache)
    self.publications_controller = DocumentsController(
        self, self.data_controller)
    self.cache_controller = CacheController(
        self, self.data_controller, self.pipeline_controller, self.configuration)
    self.root_controller = RootController(
        self, self.data_controller, self.configuration)

    # Register the routes
    self.register_routes()
    log.info("Routes have been registered")
    log.info("MendeleyCache has been initialized")
def test_crawl_profiles(self):
    """
    Check that the crawler successfully fetches the profiles and that every
    member has at least one entry in the profile document dict
    :return:
    """
    crawler = FileCrawler()
    crawl_controller = CrawlController(crawler, "d0b7f41f-ad37-3b47-ab70-9feac35557cc")
    crawl_controller.crawl_group_members()
    crawl_controller.crawl_profiles()

    self.assertIsNotNone(crawl_controller.profiles)
    self.assertEqual(len(crawl_controller.profiles), 19)
    for member in crawl_controller.members:
        self.assertIn(member.profile_id, crawl_controller.profile_documents)
def test_execute(self):
    sqlite_in_memory = SQLiteConfiguration("")
    data_controller = DataController(sqlite_in_memory)
    data_controller.run_schema()

    crawler = FileCrawler()
    crawl_controller = CrawlController(
        crawler, "d0b7f41f-ad37-3b47-ab70-9feac35557cc")
    analysis_controller = AnalysisController()
    pipeline_controller = PipelineController(
        data_controller=data_controller,
        crawl_controller=crawl_controller,
        analysis_controller=analysis_controller)

    # Clean run shall not crash
    pipeline_controller.execute()

    # Second run shall not crash either
    pipeline_controller.execute()

    # Third run shall not crash either
    pipeline_controller.execute()
def sample_pipeline(app_id=None, app_secret=None):
    from mendeleycache.crawler.sdk_crawler import SDKCrawler

    sqlite_in_memory = SQLiteConfiguration("")
    data_controller = DataController(sqlite_in_memory)
    data_controller.run_schema()

    # Use the FileCrawler unless SDK credentials are provided
    crawler = None
    if app_id is None and app_secret is None:
        crawler = FileCrawler()
    else:
        crawler = SDKCrawler(app_id=app_id, app_secret=app_secret)

    crawl_controller = CrawlController(crawler, "d0b7f41f-ad37-3b47-ab70-9feac35557cc")
    analysis_controller = AnalysisController()
    pipeline_controller = PipelineController(
        data_controller=data_controller,
        crawl_controller=crawl_controller,
        analysis_controller=analysis_controller)

    # Clean run shall not crash
    pipeline_controller.execute()

    rows = data_controller.engine.execute("SELECT * FROM profile").fetchall()
    print()
    print("Profiles:")
    for row in rows:
        print(row)

    rows = data_controller.engine.execute(
        "SELECT * FROM cache_profile").fetchall()
    print()
    print("Cache profiles:")
    for row in rows:
        print(row)

    rows = data_controller.engine.execute(
        "SELECT id, owner_mendeley_id, title, authors, tags FROM document"
    ).fetchall()
    print()
    print("Documents:")
    for row in rows:
        print(row)

    rows = data_controller.engine.execute(
        "SELECT * FROM cache_document").fetchall()
    print()
    print("Cache documents:")
    for row in rows:
        print(row)

    rows = data_controller.engine.execute(
        "SELECT * FROM cache_field").fetchall()
    print()
    print("Cache fields:")
    for row in rows:
        print(row)

    rows = data_controller.engine.execute(
        "SELECT * FROM cache_document_has_cache_field").fetchall()
    print()
    print("LINK: Cache document -> Cache field:")
    for row in rows:
        print(row)

    rows = data_controller.engine.execute(
        "SELECT * FROM cache_profile_has_cache_document").fetchall()
    print()
    print("LINK: Cache profile -> Cache document:")
    for row in rows:
        print(row)

    print()
log.critical("Database is not initialized") exit() crawler = None if not config.uses_mendeley: log.info("Pipeline uses FileCrawler") crawler = FileCrawler() else: from mendeleycache.crawler.sdk_crawler import SDKCrawler log.info("Pipeline uses SDKCrawler".format( app_id=config.crawler.app_id, app_secret=config.crawler.app_secret)) crawler = SDKCrawler(app_id=config.crawler.app_id, app_secret=config.crawler.app_secret) crawl_controller = CrawlController(crawler, config.crawler.research_group) analysis_controller = AnalysisController() pipeline_controller = PipelineController( data_controller=data_controller, crawl_controller=crawl_controller, analysis_controller=analysis_controller) pipeline_controller.execute() # Show file-crawler sample data elif command == "sample-file-pipeline": sample_pipeline() # Trigger the pipeline with the mendeley sdk crawler elif command == "sample-sdk-pipeline": if not len(sys.argv) >= 4: log.critical(
def test_crawl_group_documents(self):
    """
    Check that the crawler fetches at least one group document
    :return:
    """
    crawler = FileCrawler()
    crawl_controller = CrawlController(crawler, "d0b7f41f-ad37-3b47-ab70-9feac35557cc")
    crawl_controller.crawl_group_members()
    crawl_controller.crawl_group_documents()

    self.assertGreater(len(crawl_controller.group_documents), 0)