Ejemplo n.º 1
0
    def test_all_with_pipeline_data(self):
        sqlite_in_memory = SQLiteConfiguration("")
        data_controller = DataController(sqlite_in_memory)
        data_controller.run_schema()

        crawler = FileCrawler()
        crawl_controller = CrawlController(
            crawler, "d0b7f41f-ad37-3b47-ab70-9feac35557cc")

        analysis_controller = AnalysisController()

        pipeline_controller = PipelineController(
            data_controller=data_controller,
            crawl_controller=crawl_controller,
            analysis_controller=analysis_controller)

        # Clean run
        pipeline_controller.execute()

        # Test slim profiles
        slim_profiles = data_controller.api_data.get_profiles_slim()
        self.assertEqual(len(slim_profiles), 19)

        # Test fields
        fields = data_controller.api_data.get_fields()
        self.assertEqual(len(fields), 14)
Ejemplo n.º 2
0
 def test_execute(self):
     crwler = FileCrawler()
     crwler_controller = CrawlController(crwler, "d0b7f41f-ad37-3b47-ab70-9feac35557cc")
     crwler_controller.execute()
     self.assertEqual(len(crwler_controller.profiles), 19)
     for member in crwler_controller.members:
         self.assertIn(member.profile_id, crwler_controller.profile_documents)
     self.assertGreater(len(crwler_controller.group_documents), 0)
     self.assertTrue(crwler_controller.succeeded)
    def __init__(self, *args, **kwargs):
        super(MendeleyCache, self).__init__(*args, **kwargs)

        # Read configuration
        self.configuration = ServiceConfiguration()
        self.configuration.load()
        log.info("Configuration has been loaded")

        # Create service controllers
        self.data_controller = DataController(self.configuration.database)
        self.data_controller.assert_schema()
        log.info("Schema has been checked")

        # Create crawler based on configuration
        self.crawler = None
        """:type : AbstractCrawler"""
        if not self.configuration.uses_mendeley:
            log.info("Pipeline uses FileCrawler")
            self.crawler = FileCrawler()
        else:
            from mendeleycache.crawler.sdk_crawler import SDKCrawler
            log.info("Pipeline uses SDKCrawler".format(
                app_id=self.configuration.crawler.app_id,
                app_secret=self.configuration.crawler.app_secret))
            self.crawler = SDKCrawler(
                app_id=self.configuration.crawler.app_id,
                app_secret=self.configuration.crawler.app_secret)

        # Create the pipeline
        self.crawl_controller = CrawlController(
            self.crawler, self.configuration.crawler.research_group)
        self.analysis_controller = AnalysisController()
        self.pipeline_controller = PipelineController(
            data_controller=self.data_controller,
            crawl_controller=self.crawl_controller,
            analysis_controller=self.analysis_controller)
        log.info("Pipeline has been initialized")

        # Create the routing controllers
        self.fields_controller = FieldsController(self, self.data_controller)
        self.profiles_controller = ProfilesController(self,
                                                      self.data_controller,
                                                      self.configuration.cache)
        self.publications_controller = DocumentsController(
            self, self.data_controller)
        self.cache_controller = CacheController(self, self.data_controller,
                                                self.pipeline_controller,
                                                self.configuration)
        self.root_controller = RootController(self, self.data_controller,
                                              self.configuration)

        # Register the routes
        self.register_routes()
        log.info("Routes have been registered")
        log.info("MendeleyCache has been initialized")
Ejemplo n.º 4
0
 def test_crawl_group_members(self):
     """
     Check if the crawler successfully fetches the 22 group members
     :return:
     """
     crwler = FileCrawler()
     crwler_controller = CrawlController(crwler, "d0b7f41f-ad37-3b47-ab70-9feac35557cc")
     self.assertIsNotNone(crwler_controller.members)
     self.assertEqual(len(crwler_controller.members), 0)
     crwler_controller.crawl_group_members()
     self.assertEqual(len(crwler_controller.members), 19)
Ejemplo n.º 5
0
 def test_crawl_profiles(self):
     """
     Check if the crawler successfully fetches profiles and for all profiles at least an entry in the doc dict
     :return:
     """
     crwler = FileCrawler()
     crwler_controller = CrawlController(crwler, "d0b7f41f-ad37-3b47-ab70-9feac35557cc")
     crwler_controller.crawl_group_members()
     crwler_controller.crawl_profiles()
     self.assertIsNotNone(crwler_controller.profiles)
     self.assertEqual(len(crwler_controller.profiles), 19)
     for member in crwler_controller.members:
         self.assertIn(member.profile_id, crwler_controller.profile_documents)
Ejemplo n.º 6
0
    def test_execute(self):
        sqlite_in_memory = SQLiteConfiguration("")
        data_controller = DataController(sqlite_in_memory)
        data_controller.run_schema()

        crawler = FileCrawler()
        crawl_controller = CrawlController(
            crawler, "d0b7f41f-ad37-3b47-ab70-9feac35557cc")

        analysis_controller = AnalysisController()

        pipeline_controller = PipelineController(
            data_controller=data_controller,
            crawl_controller=crawl_controller,
            analysis_controller=analysis_controller)

        # Clean run shall not crash
        pipeline_controller.execute()

        # Second run shall not crash either
        pipeline_controller.execute()

        # Third run shall not crash either
        pipeline_controller.execute()
Ejemplo n.º 7
0
def sample_pipeline(app_id=None, app_secret=None):
    from mendeleycache.crawler.sdk_crawler import SDKCrawler

    sqlite_in_memory = SQLiteConfiguration("")
    data_controller = DataController(sqlite_in_memory)
    data_controller.run_schema()

    crawler = None
    if app_id is None and app_secret is None:
        crawler = FileCrawler()
    else:
        crawler = SDKCrawler(app_id=app_id, app_secret=app_secret)

    crawl_controller = CrawlController(crawler,
                                       "d0b7f41f-ad37-3b47-ab70-9feac35557cc")

    analysis_controller = AnalysisController()

    pipeline_controller = PipelineController(
        data_controller=data_controller,
        crawl_controller=crawl_controller,
        analysis_controller=analysis_controller)

    # Clean run shall not crash
    pipeline_controller.execute()

    rows = data_controller.engine.execute("SELECT * FROM profile").fetchall()
    print()
    print("Profiles:")
    for row in rows:
        print(row)

    rows = data_controller.engine.execute(
        "SELECT * FROM cache_profile").fetchall()
    print()
    print("Cache profiles:")
    for row in rows:
        print(row)

    rows = data_controller.engine.execute(
        "SELECT id, owner_mendeley_id, title, authors, tags FROM document"
    ).fetchall()
    print()
    print("Documents:")
    for row in rows:
        print(row)

    rows = data_controller.engine.execute(
        "SELECT * FROM cache_document").fetchall()
    print()
    print("Cache documents:")
    for row in rows:
        print(row)

    rows = data_controller.engine.execute(
        "SELECT * FROM cache_field").fetchall()
    print()
    print("Cache fields:")
    for row in rows:
        print(row)

    rows = data_controller.engine.execute(
        "SELECT * FROM cache_document_has_cache_field").fetchall()
    print()
    print("LINK: Cache document -> Cache field:")
    for row in rows:
        print(row)

    rows = data_controller.engine.execute(
        "SELECT * FROM cache_profile_has_cache_document").fetchall()
    print()
    print("LINK: Cache profile -> Cache document:")
    for row in rows:
        print(row)

    print()
Ejemplo n.º 8
0
            log.critical("Database is not initialized")
            exit()

        crawler = None
        if not config.uses_mendeley:
            log.info("Pipeline uses FileCrawler")
            crawler = FileCrawler()
        else:
            from mendeleycache.crawler.sdk_crawler import SDKCrawler
            log.info("Pipeline uses SDKCrawler".format(
                app_id=config.crawler.app_id,
                app_secret=config.crawler.app_secret))
            crawler = SDKCrawler(app_id=config.crawler.app_id,
                                 app_secret=config.crawler.app_secret)

        crawl_controller = CrawlController(crawler,
                                           config.crawler.research_group)
        analysis_controller = AnalysisController()
        pipeline_controller = PipelineController(
            data_controller=data_controller,
            crawl_controller=crawl_controller,
            analysis_controller=analysis_controller)
        pipeline_controller.execute()

    # Show file-crawler sample data
    elif command == "sample-file-pipeline":
        sample_pipeline()

    # Trigger the pipeline with the mendeley sdk crawler
    elif command == "sample-sdk-pipeline":
        if not len(sys.argv) >= 4:
            log.critical(
Ejemplo n.º 9
0
 def test_crawl_group_documents(self):
     crwler = FileCrawler()
     crwler_controller = CrawlController(crwler, "d0b7f41f-ad37-3b47-ab70-9feac35557cc")
     crwler_controller.crawl_group_members()
     crwler_controller.crawl_group_documents()
     self.assertGreater(len(crwler_controller.group_documents), 0)