Example #1
0
    def test_all_with_pipeline_data(self):
        """End-to-end: execute the full pipeline, then verify the API counts."""
        config = SQLiteConfiguration("")
        data_ctrl = DataController(config)
        data_ctrl.run_schema()

        pipeline = PipelineController(
            data_controller=data_ctrl,
            crawl_controller=CrawlController(
                FileCrawler(), "d0b7f41f-ad37-3b47-ab70-9feac35557cc"),
            analysis_controller=AnalysisController())

        # Clean run
        pipeline.execute()

        # Slim profiles shall match the sample data
        self.assertEqual(len(data_ctrl.api_data.get_profiles_slim()), 19)

        # Fields shall match the sample data
        self.assertEqual(len(data_ctrl.api_data.get_fields()), 14)
Example #2
0
    def test_update_cache_fields(self):
        """update_cache_fields shall tolerate empty input and insert rows.

        Fix: the local variable `id` shadowed the builtin id(); renamed to
        `field_id`.
        """
        sqlite_in_memory = SQLiteConfiguration("")
        data_controller = DataController(sqlite_in_memory)
        data_controller.run_schema()

        # The call shall not crash with empty input
        r = data_controller.crawl_data.update_cache_fields(dict())
        self.assertIsNone(r)

        field1 = CacheField(title="field 1", unified_title="field1")
        field2 = CacheField(title="field 2", unified_title="field2")
        unified_field_title_to_field = dict()
        unified_field_title_to_field["field1"] = field1
        unified_field_title_to_field["field2"] = field2

        data_controller.crawl_data.update_cache_fields(
            unified_field_title_to_field)

        # Check data count in the table
        cnt = data_controller.engine.execute(
            "SELECT COUNT(*) FROM cache_field").fetchone()
        self.assertEqual(cnt[0], 2)

        # Then query rows
        field_id = generate_id('field1')
        row = data_controller.engine.execute("SELECT id, title "
                                             "FROM cache_field "
                                             "WHERE id='%s' " % field_id).fetchone()

        # Check first row
        self.assertEqual(row["id"], field_id)
        self.assertEqual(row['title'], "field 1")
Example #3
0
    def test_get_profiles_slim(self):
        """get_profiles_slim shall not crash on an empty database."""
        config = SQLiteConfiguration("")
        controller = DataController(config)
        controller.run_schema()

        # Empty schema: the call must simply succeed
        controller.api_data.get_profiles_slim()
Example #4
0
    def test_get_profiles_slim(self):
        """get_profiles_slim shall not crash on an empty database."""
        configuration = SQLiteConfiguration("")
        ctrl = DataController(configuration)
        ctrl.run_schema()

        # No data inserted: the query must still succeed
        ctrl.api_data.get_profiles_slim()
Example #5
0
    def test_all_with_pipeline_data(self):
        """Run the whole pipeline once and check the resulting API counts."""
        configuration = SQLiteConfiguration("")
        ctrl = DataController(configuration)
        ctrl.run_schema()

        pipeline = PipelineController(
            data_controller=ctrl,
            crawl_controller=CrawlController(
                FileCrawler(), "d0b7f41f-ad37-3b47-ab70-9feac35557cc"),
            analysis_controller=AnalysisController())

        # Clean run
        pipeline.execute()

        # Test slim profiles
        self.assertEqual(len(ctrl.api_data.get_profiles_slim()), 19)

        # Test fields
        self.assertEqual(len(ctrl.api_data.get_fields()), 14)
Example #6
0
    def test_update_cache_fields(self):
        """update_cache_fields shall tolerate empty input and insert rows.

        Fix: the local variable `id` shadowed the builtin id(); renamed to
        `field_id`.
        """
        sqlite_in_memory = SQLiteConfiguration("")
        data_controller = DataController(sqlite_in_memory)
        data_controller.run_schema()

        # The call shall not crash with empty input
        r = data_controller.crawl_data.update_cache_fields(dict())
        self.assertIsNone(r)

        field1 = CacheField(title="field 1", unified_title="field1")
        field2 = CacheField(title="field 2", unified_title="field2")
        unified_field_title_to_field = dict()
        unified_field_title_to_field["field1"] = field1
        unified_field_title_to_field["field2"] = field2

        data_controller.crawl_data.update_cache_fields(unified_field_title_to_field)

        # Check data count in the table
        cnt = data_controller.engine.execute("SELECT COUNT(*) FROM cache_field").fetchone()
        self.assertEqual(cnt[0], 2)

        # Then query rows
        field_id = generate_id('field1')
        row = data_controller.engine.execute(
            "SELECT id, title "
            "FROM cache_field "
            "WHERE id='%s' " % field_id
        ).fetchone()

        # Check first row
        self.assertEqual(row["id"], field_id)
        self.assertEqual(row['title'], "field 1")
Example #7
0
    def test_link_fields_to_documents(self):
        """link_fields_to_documents shall accept an empty mapping."""
        config = SQLiteConfiguration("")
        controller = DataController(config)
        controller.run_schema()

        # Empty input must neither crash nor return a value
        result = controller.crawl_data.link_fields_to_documents(dict())
        self.assertIsNone(result)
Example #8
0
    def test_link_fields_to_documents(self):
        """link_fields_to_documents shall accept an empty mapping."""
        configuration = SQLiteConfiguration("")
        ctrl = DataController(configuration)
        ctrl.run_schema()

        # The call with no links must succeed and return nothing
        self.assertIsNone(ctrl.crawl_data.link_fields_to_documents(dict()))
Example #9
0
    def test_update_profiles(self):
        """update_profiles shall insert rows and update them on a re-run."""
        config = SQLiteConfiguration("")
        controller = DataController(config)
        controller.run_schema()

        # Empty input must neither crash nor return a value
        self.assertIsNone(controller.crawl_data.update_profiles([]))

        def fetch_profiles():
            # Assert the row count, then return the full rows for field checks
            count = controller.engine.execute(
                "SELECT COUNT(*) FROM profile").fetchone()
            self.assertEqual(count[0], 2)
            return controller.engine.execute(
                "SELECT id, mendeley_id, cache_profile_id, first_name, last_name, display_name "
                "FROM profile ").fetchall()

        controller.crawl_data.update_profiles([
            Profile("id1", "Hans", "Mustermann", "", ""),
            Profile("id2", "Max", "Mustermann", "", "")])
        rows = fetch_profiles()

        # Check first row
        self.assertEqual(rows[0]["mendeley_id"], "id1")
        self.assertEqual(rows[0]["cache_profile_id"],
                         generate_id("hansmustermann"))
        self.assertEqual(rows[0]['first_name'], "Hans")
        self.assertEqual(rows[0]['last_name'], "Mustermann")

        # Check second row
        self.assertEqual(rows[1]["mendeley_id"], "id2")
        self.assertEqual(rows[1]["cache_profile_id"],
                         generate_id("maxmustermann"))
        self.assertEqual(rows[1]['first_name'], "Max")
        self.assertEqual(rows[1]['last_name'], "Mustermann")

        # Re-run with a changed last name: must update in place, not duplicate
        controller.crawl_data.update_profiles([
            Profile("id1", "Hans", "Supermann", "", ""),
            Profile("id2", "Max", "Mustermann", "", "")])
        rows = fetch_profiles()

        # First row carries the updated name
        self.assertEqual(rows[0]["mendeley_id"], "id1")
        self.assertEqual(rows[0]["cache_profile_id"],
                         generate_id("hanssupermann"))
        self.assertEqual(rows[0]['first_name'], "Hans")
        self.assertEqual(rows[0]['last_name'], "Supermann")
Example #10
0
    def test_get_documents_by_profile_ids_and_field_ids(self):
        """The query shall not crash for empty or unknown id combinations."""
        config = SQLiteConfiguration("")
        controller = DataController(config)
        controller.run_schema()

        # Every combination of empty / unknown ids must simply succeed
        query = controller.api_data.get_documents_by_profile_ids_and_field_ids
        query([], [])
        query([42, 43], [])
        query([], [42, 43])
        query([42, 43], [44, 45])
    def __init__(self, *args, **kwargs):
        """Wire up configuration, data layer, crawler, pipeline and routes."""
        super(MendeleyCache, self).__init__(*args, **kwargs)

        # Read configuration
        self.configuration = ServiceConfiguration()
        self.configuration.load()
        log.info("Configuration has been loaded")

        # Create service controllers
        self.data_controller = DataController(self.configuration.database)
        self.data_controller.assert_schema()
        log.info("Schema has been checked")

        # Create crawler based on configuration
        self.crawler = None
        """:type : AbstractCrawler"""
        if not self.configuration.uses_mendeley:
            log.info("Pipeline uses FileCrawler")
            self.crawler = FileCrawler()
        else:
            # Imported lazily so the SDK is only required when actually used
            from mendeleycache.crawler.sdk_crawler import SDKCrawler
            # BUG FIX: the former .format(app_id=..., app_secret=...) call had
            # no placeholders in the string, so the arguments were evaluated
            # and silently discarded (and adding placeholders would have
            # leaked the secret into the log).
            log.info("Pipeline uses SDKCrawler")
            self.crawler = SDKCrawler(
                app_id=self.configuration.crawler.app_id,
                app_secret=self.configuration.crawler.app_secret)

        # Create the pipeline
        self.crawl_controller = CrawlController(
            self.crawler, self.configuration.crawler.research_group)
        self.analysis_controller = AnalysisController()
        self.pipeline_controller = PipelineController(
            data_controller=self.data_controller,
            crawl_controller=self.crawl_controller,
            analysis_controller=self.analysis_controller)
        log.info("Pipeline has been initialized")

        # Create the routing controllers
        self.fields_controller = FieldsController(self, self.data_controller)
        self.profiles_controller = ProfilesController(self,
                                                      self.data_controller,
                                                      self.configuration.cache)
        self.publications_controller = DocumentsController(
            self, self.data_controller)
        self.cache_controller = CacheController(self, self.data_controller,
                                                self.pipeline_controller,
                                                self.configuration)
        self.root_controller = RootController(self, self.data_controller,
                                              self.configuration)

        # Register the routes
        self.register_routes()
        log.info("Routes have been registered")
        log.info("MendeleyCache has been initialized")
Example #12
0
    def test_update_profiles(self):
        """update_profiles shall insert rows and update them on a re-run."""
        configuration = SQLiteConfiguration("")
        ctrl = DataController(configuration)
        ctrl.run_schema()

        # Empty input must neither crash nor return a value
        self.assertIsNone(ctrl.crawl_data.update_profiles([]))

        def load_rows():
            # Assert the row count, then return the rows for field checks
            count = ctrl.engine.execute("SELECT COUNT(*) FROM profile").fetchone()
            self.assertEqual(count[0], 2)
            return ctrl.engine.execute(
                "SELECT id, mendeley_id, cache_profile_id, first_name, last_name, display_name "
                "FROM profile "
            ).fetchall()

        ctrl.crawl_data.update_profiles([
            Profile("id1", "Hans", "Mustermann", "", ""),
            Profile("id2", "Max", "Mustermann", "", "")])
        rows = load_rows()

        # Check first row
        self.assertEqual(rows[0]["mendeley_id"], "id1")
        self.assertEqual(rows[0]["cache_profile_id"], generate_id("hansmustermann"))
        self.assertEqual(rows[0]['first_name'], "Hans")
        self.assertEqual(rows[0]['last_name'], "Mustermann")

        # Check second row
        self.assertEqual(rows[1]["mendeley_id"], "id2")
        self.assertEqual(rows[1]["cache_profile_id"], generate_id("maxmustermann"))
        self.assertEqual(rows[1]['first_name'], "Max")
        self.assertEqual(rows[1]['last_name'], "Mustermann")

        # Re-run with a changed last name: must update in place, not duplicate
        ctrl.crawl_data.update_profiles([
            Profile("id1", "Hans", "Supermann", "", ""),
            Profile("id2", "Max", "Mustermann", "", "")])
        rows = load_rows()

        # First row carries the updated name
        self.assertEqual(rows[0]["mendeley_id"], "id1")
        self.assertEqual(rows[0]["cache_profile_id"], generate_id("hanssupermann"))
        self.assertEqual(rows[0]['first_name'], "Hans")
        self.assertEqual(rows[0]['last_name'], "Supermann")
Example #13
0
    def test_get_documents_by_profile_ids_and_field_ids(self):
        """The query shall not crash for empty or unknown id combinations."""
        configuration = SQLiteConfiguration("")
        ctrl = DataController(configuration)
        ctrl.run_schema()

        # Every combination of empty / unknown ids must simply succeed
        for profile_ids, field_ids in (
                ([], []),
                ([42, 43], []),
                ([], [42, 43]),
                ([42, 43], [44, 45])):
            ctrl.api_data.get_documents_by_profile_ids_and_field_ids(
                profile_ids, field_ids)
Example #14
0
    def test_update_cache_profiles(self):
        """update_cache_profiles shall unify profiles and link them later.

        Fix: the local variable `id` shadowed the builtin id(); renamed to
        `profile_id`.
        """
        sqlite_in_memory = SQLiteConfiguration("")
        data_controller = DataController(sqlite_in_memory)
        data_controller.run_schema()

        # The call shall not crash with empty input
        r = data_controller.crawl_data.update_cache_profiles(dict())
        self.assertIsNone(r)

        profile1 = Profile("id1", "Hans", "Mustermann", "Longer Real Name", "")
        profile2 = Profile("id2", "Max", "Mustermann", "", "")
        profile3 = Profile("id3", "Hans", "Mustermann", "", "")

        unified_name_to_profiles = dict()
        unified_name_to_profiles["hansmustermann"] = [profile1, profile3]
        unified_name_to_profiles["maxmustermann"] = [profile2]

        data_controller.crawl_data.update_cache_profiles(
            unified_name_to_profiles)

        # Check data count in the table
        cnt = data_controller.engine.execute(
            "SELECT COUNT(*) FROM cache_profile").fetchone()
        self.assertEqual(cnt[0], 2)

        # Then query same title row
        profile_id = generate_id('hansmustermann')
        row = data_controller.engine.execute("SELECT id, profile_id, name "
                                             "FROM cache_profile "
                                             "WHERE id='%s'" % profile_id).fetchone()

        self.assertEqual(row["id"], profile_id)
        self.assertEqual(row["name"], "Hans Mustermann")

        # Now update profiles and check if the link is set correctly
        data_controller.crawl_data.update_profiles(
            [profile1, profile2, profile3])
        row = data_controller.engine.execute(
            "SELECT cp.name as name "
            "FROM cache_profile cp, profile p "
            "WHERE cp.profile_id = p.id "
            "AND cp.id='%s' " % profile_id).fetchone()
        self.assertEqual(row["name"], "Hans Mustermann")
Example #15
0
    def test_update_cache_profiles(self):
        """update_cache_profiles shall unify profiles and link them later.

        Fix: the local variable `id` shadowed the builtin id(); renamed to
        `profile_id`.
        """
        sqlite_in_memory = SQLiteConfiguration("")
        data_controller = DataController(sqlite_in_memory)
        data_controller.run_schema()

        # The call shall not crash with empty input
        r = data_controller.crawl_data.update_cache_profiles(dict())
        self.assertIsNone(r)

        profile1 = Profile("id1", "Hans", "Mustermann", "Longer Real Name", "")
        profile2 = Profile("id2", "Max", "Mustermann", "", "")
        profile3 = Profile("id3", "Hans", "Mustermann", "", "")

        unified_name_to_profiles = dict()
        unified_name_to_profiles["hansmustermann"] = [profile1, profile3]
        unified_name_to_profiles["maxmustermann"] = [profile2]

        data_controller.crawl_data.update_cache_profiles(unified_name_to_profiles)

        # Check data count in the table
        cnt = data_controller.engine.execute("SELECT COUNT(*) FROM cache_profile").fetchone()
        self.assertEqual(cnt[0], 2)

        # Then query same title row
        profile_id = generate_id('hansmustermann')
        row = data_controller.engine.execute(
            "SELECT id, profile_id, name "
            "FROM cache_profile "
            "WHERE id='%s'" % profile_id
        ).fetchone()

        self.assertEqual(row["id"], profile_id)
        self.assertEqual(row["name"], "Hans Mustermann")

        # Now update profiles and check if the link is set correctly
        data_controller.crawl_data.update_profiles([profile1, profile2, profile3])
        row = data_controller.engine.execute(
            "SELECT cp.name as name "
            "FROM cache_profile cp, profile p "
            "WHERE cp.profile_id = p.id "
            "AND cp.id='%s' " % profile_id
        ).fetchone()
        self.assertEqual(row["name"], "Hans Mustermann")
Example #16
0
    def __init__(self, *args, **kwargs):
        """Wire up configuration, data layer, crawler, pipeline and routes."""
        super(MendeleyCache, self).__init__(*args, **kwargs)

        # Read configuration
        self.configuration = ServiceConfiguration()
        self.configuration.load()
        log.info("Configuration has been loaded")

        # Create service controllers
        self.data_controller = DataController(self.configuration.database)
        self.data_controller.assert_schema()
        log.info("Schema has been checked")

        # Create crawler based on configuration
        self.crawler = None
        """:type : AbstractCrawler"""
        if not self.configuration.uses_mendeley:
            log.info("Pipeline uses FileCrawler")
            self.crawler = FileCrawler()
        else:
            # Imported lazily so the SDK is only required when actually used
            from mendeleycache.crawler.sdk_crawler import SDKCrawler

            # BUG FIX: the former .format(app_id=..., app_secret=...) call had
            # no placeholders in the string, so the arguments were evaluated
            # and silently discarded (and adding placeholders would have
            # leaked the secret into the log).
            log.info("Pipeline uses SDKCrawler")
            self.crawler = SDKCrawler(
                app_id=self.configuration.crawler.app_id, app_secret=self.configuration.crawler.app_secret
            )

        # Create the pipeline
        self.crawl_controller = CrawlController(self.crawler, self.configuration.crawler.research_group)
        self.analysis_controller = AnalysisController()
        self.pipeline_controller = PipelineController(
            data_controller=self.data_controller,
            crawl_controller=self.crawl_controller,
            analysis_controller=self.analysis_controller,
        )
        log.info("Pipeline has been initialized")

        # Create the routing controllers
        self.fields_controller = FieldsController(self, self.data_controller)
        self.profiles_controller = ProfilesController(self, self.data_controller, self.configuration.cache)
        self.publications_controller = DocumentsController(self, self.data_controller)
        self.cache_controller = CacheController(
            self, self.data_controller, self.pipeline_controller, self.configuration
        )
        self.root_controller = RootController(self, self.data_controller, self.configuration)

        # Register the routes
        self.register_routes()
        log.info("Routes have been registered")
        log.info("MendeleyCache has been initialized")
Example #17
0
    def test_errors(self):
        """Executing invalid SQL shall raise a DBAPIError.

        Fix: replaced the hand-rolled try/self.fail pattern (which also bound
        an unused exception variable `e`) with the idiomatic assertRaises
        context manager.
        """
        sqlite_in_memory = SQLiteConfiguration("")
        ctrl = DataController(sqlite_in_memory)

        # Try completely dumb sql: the engine must reject it
        with self.assertRaises(DBAPIError):
            with ctrl.engine.connect() as conn:
                conn.execute("COMPLETELY WRONG COMMAND")
Example #18
0
    def test_execute(self):
        """Repeated pipeline runs shall be crash-free (idempotent)."""
        config = SQLiteConfiguration("")
        data_ctrl = DataController(config)
        data_ctrl.run_schema()

        pipeline = PipelineController(
            data_controller=data_ctrl,
            crawl_controller=CrawlController(
                FileCrawler(), "d0b7f41f-ad37-3b47-ab70-9feac35557cc"),
            analysis_controller=AnalysisController())

        # Neither the clean run nor the repeated runs shall crash
        for _ in range(3):
            pipeline.execute()
Example #19
0
    def test_execute(self):
        """Repeated pipeline runs shall be crash-free (idempotent)."""
        configuration = SQLiteConfiguration("")
        ctrl = DataController(configuration)
        ctrl.run_schema()

        pipeline = PipelineController(
            data_controller=ctrl,
            crawl_controller=CrawlController(
                FileCrawler(), "d0b7f41f-ad37-3b47-ab70-9feac35557cc"),
            analysis_controller=AnalysisController())

        # First, second and third run shall all succeed
        for _ in range(3):
            pipeline.execute()
Example #20
0
    def test_update_documents(self):
        """update_documents shall insert rows and update them on a re-run."""
        config = SQLiteConfiguration("")
        controller = DataController(config)
        controller.run_schema()

        # Empty input must neither crash nor return a value
        self.assertIsNone(controller.crawl_data.update_documents([]))

        def make_document(**overrides):
            # Baseline document; callers override individual fields
            attributes = dict(
                core_id="doc1",
                core_profile_id="id1",
                core_title="title1",
                core_type="conference_proceedings",
                core_created=datetime.now(),
                core_last_modified=datetime.now(),
                core_abstract="blabla",
                core_source="ACM xy",
                core_year=2015,
                core_authors=[],
                core_keywords=[],
                doc_website="",
                conf_website="",
                conf_pages="",
                conf_month=0,
                conf_city="",
                tags=[])
            attributes.update(overrides)
            return Document(**attributes)

        def fetch_documents():
            # Assert the row count, then return the rows for field checks
            count = controller.engine.execute(
                "SELECT COUNT(*) FROM document").fetchone()
            self.assertEqual(count[0], 2)
            return controller.engine.execute(
                "SELECT id, mendeley_id, cache_document_id, owner_mendeley_id,"
                " title, doc_type, created, last_modified, abstract, source, pub_year "
                "FROM document ").fetchall()

        document1 = make_document(
            core_authors=[("Hans", "Mustermann"), ("Nicht", "Existent")],
            tags=["t ag- 1"])
        document2 = make_document(
            core_id="doc2",
            core_profile_id="id2",
            core_title="title2",
            core_abstract="blabla2",
            core_source="ACM xyz",
            core_year=2014)
        controller.crawl_data.update_documents([document1, document2])
        rows = fetch_documents()

        # Check first row
        self.assertEqual(rows[0]["mendeley_id"], "doc1")
        self.assertEqual(rows[0]["cache_document_id"], generate_id("title1"))
        self.assertEqual(rows[0]["owner_mendeley_id"], "id1")
        self.assertEqual(rows[0]['title'], "title1")
        self.assertEqual(rows[0]["doc_type"], "conference_proceedings")
        self.assertEqual(rows[0]['abstract'], "blabla")
        self.assertEqual(rows[0]["source"], "ACM xy")
        self.assertEqual(rows[0]["pub_year"], 2015)

        # Check second row
        self.assertEqual(rows[1]["mendeley_id"], "doc2")
        self.assertEqual(rows[1]["cache_document_id"], generate_id("title2"))
        self.assertEqual(rows[1]["owner_mendeley_id"], "id2")
        self.assertEqual(rows[1]['title'], "title2")
        self.assertEqual(rows[1]["doc_type"], "conference_proceedings")
        self.assertEqual(rows[1]['abstract'], "blabla2")
        self.assertEqual(rows[1]["source"], "ACM xyz")
        self.assertEqual(rows[1]["pub_year"], 2014)

        # Re-run with a changed first document: must update, not duplicate
        document1 = make_document(
            core_title="newtitle1",
            core_abstract="blablaNew",
            core_source="ACM xyz1")
        document2 = make_document(
            core_id="doc2",
            core_profile_id="id2",
            core_title="title2",
            core_abstract="blabla2",
            core_source="ACM xyz",
            core_year=2014)

        controller.crawl_data.update_documents([document1, document2])
        rows = fetch_documents()

        # First row carries the updated values
        self.assertEqual(rows[0]["mendeley_id"], "doc1")
        self.assertEqual(rows[0]["cache_document_id"],
                         generate_id("newtitle1"))
        self.assertEqual(rows[0]["owner_mendeley_id"], "id1")
        self.assertEqual(rows[0]['title'], "newtitle1")
        self.assertEqual(rows[0]["doc_type"], "conference_proceedings")
        self.assertEqual(rows[0]['abstract'], "blablaNew")
        self.assertEqual(rows[0]["source"], "ACM xyz1")
        self.assertEqual(rows[0]["pub_year"], 2015)
Example #21
0
 def test_assert_drop(self):
     """assert_schema shall (re)create the schema whenever it is missing."""
     config = SQLiteConfiguration("")
     controller = DataController(config)

     # Fresh database: assert_schema must initialize it
     controller.assert_schema()
     self.assertTrue(controller.is_initialized())

     # Dropping everything must be detected ...
     controller.drop_all()
     self.assertFalse(controller.is_initialized())
     # ... and assert_schema must repair it
     controller.assert_schema()
     self.assertTrue(controller.is_initialized())

     # A single missing table must also be detected and repaired
     with controller.engine.begin() as conn:
         conn.execute("DROP TABLE cache_document")
     self.assertFalse(controller.is_initialized())
     controller.assert_schema()
     self.assertTrue(controller.is_initialized())
Example #22
0
 def test_is_initialized(self):
     """After run_schema the controller shall report an initialized schema."""
     config = SQLiteConfiguration("")
     controller = DataController(config)
     controller.run_schema()
     self.assertTrue(controller.is_initialized())
Example #23
0
    def test_run_schema(self):
        """run_schema shall create every schema table exactly once."""
        sqlite_in_memory = SQLiteConfiguration("")
        ctrl = DataController(sqlite_in_memory)

        tables = (
            'profile',
            'document',
            'cache_profile',
            'cache_document',
            'cache_field',
            'cache_document_has_cache_field',
            'cache_profile_has_cache_document',
            'update_log',
        )

        # First check that none of the tables exists
        for table in tables:
            self.assertFalse(ctrl.table_exists(table))

        # Create schema
        ctrl.run_schema()

        # After schema creation all tables need to exist
        for table in tables:
            self.assertTrue(ctrl.table_exists(table))
Example #24
0
 def test_is_initialized(self):
     """After run_schema the controller shall report an initialized schema."""
     configuration = SQLiteConfiguration("")
     data_ctrl = DataController(configuration)
     data_ctrl.run_schema()
     self.assertTrue(data_ctrl.is_initialized())
Example #25
0
 def test_assert_drop(self):
     """assert_schema shall (re)create the schema whenever it is missing."""
     configuration = SQLiteConfiguration("")
     data_ctrl = DataController(configuration)

     # Fresh database: assert_schema must initialize it
     data_ctrl.assert_schema()
     self.assertTrue(data_ctrl.is_initialized())

     # Dropping everything must be detected and repaired
     data_ctrl.drop_all()
     self.assertFalse(data_ctrl.is_initialized())
     data_ctrl.assert_schema()
     self.assertTrue(data_ctrl.is_initialized())

     # A single missing table must also be detected and repaired
     with data_ctrl.engine.begin() as conn:
         conn.execute("DROP TABLE cache_document")
     self.assertFalse(data_ctrl.is_initialized())
     data_ctrl.assert_schema()
     self.assertTrue(data_ctrl.is_initialized())
Example #26
0
def sample_pipeline(app_id=None, app_secret=None):
    """Run the full crawl/analysis/store pipeline against an in-memory SQLite
    database and dump every resulting table to stdout.

    :param app_id: Mendeley application id. When omitted together with
        ``app_secret`` the offline :class:`FileCrawler` is used instead.
    :param app_secret: Mendeley application secret (see ``app_id``).
    """
    from mendeleycache.crawler.sdk_crawler import SDKCrawler

    sqlite_in_memory = SQLiteConfiguration("")
    data_controller = DataController(sqlite_in_memory)
    data_controller.run_schema()

    # Fall back to the file-based crawler when no credentials are given
    if app_id is None and app_secret is None:
        crawler = FileCrawler()
    else:
        crawler = SDKCrawler(app_id=app_id, app_secret=app_secret)

    crawl_controller = CrawlController(crawler, "d0b7f41f-ad37-3b47-ab70-9feac35557cc")

    analysis_controller = AnalysisController()

    pipeline_controller = PipelineController(
        data_controller=data_controller,
        crawl_controller=crawl_controller,
        analysis_controller=analysis_controller
    )

    # Clean run shall not crash
    pipeline_controller.execute()

    def dump_table(header, query):
        # Print one table: blank line, header line, then one row per line
        rows = data_controller.engine.execute(query).fetchall()
        print()
        print(header)
        for row in rows:
            print(row)

    # Previously this was the same four-line pattern copy-pasted seven times
    dump_table("Profiles:", "SELECT * FROM profile")
    dump_table("Cache profiles:", "SELECT * FROM cache_profile")
    dump_table("Documents:",
               "SELECT id, owner_mendeley_id, title, authors, tags FROM document")
    dump_table("Cache documents:", "SELECT * FROM cache_document")
    dump_table("Cache fields:", "SELECT * FROM cache_field")
    dump_table("LINK: Cache document -> Cache field:",
               "SELECT * FROM cache_document_has_cache_field")
    dump_table("LINK: Cache profile -> Cache document:",
               "SELECT * FROM cache_profile_has_cache_document")

    print()
Example #27
0
    def test_update_cache_documents(self):
        """update_cache_documents must create one cache_document row per
        unified title and update_documents must link the imported documents
        back to that cache row."""
        sqlite_in_memory = SQLiteConfiguration("")
        data_controller = DataController(sqlite_in_memory)
        data_controller.run_schema()

        # The call shall not crash with empty input
        r = data_controller.crawl_data.update_cache_documents(dict())
        self.assertIsNone(r)

        # document1 and document3 share the unified title "sametitle1" and
        # therefore have to be merged into a single cache document
        document1 = Document(
            core_id="doc1",
            core_profile_id="id1",
            core_title="sametitle1",
            core_type="conference_proceedings",
            core_created=datetime.now(),
            core_last_modified=datetime.now(),
            core_abstract="Older Abtract",
            core_source="Older source",
            core_year=2015,
            core_authors=[],
            core_keywords=[],
            doc_website="",
            conf_website="",
            conf_pages="",
            conf_month=0,
            conf_city="",
            tags=[]
        )
        document2 = Document(
            core_id="doc2",
            core_profile_id="id2",
            core_title="title2",
            core_type="conference_proceedings",
            core_created=datetime.now(),
            core_last_modified=datetime.now(),
            core_abstract="blabla2",
            core_source="ACM xyz",
            core_year=2014,
            core_authors=[],
            core_keywords=[],
            doc_website="",
            conf_website="",
            conf_pages="",
            conf_month=0,
            conf_city="",
            tags=[]
        )
        document3 = Document(
            core_id="doc3",
            core_profile_id="id3",
            core_title="sametitle1",
            core_type="conference_proceedings",
            core_created=datetime.now(),
            core_last_modified=datetime.now(),
            core_abstract="Newer abstract",
            core_source="Newer source",
            core_year=2015,
            core_authors=[],
            core_keywords=[],
            doc_website="",
            conf_website="",
            conf_pages="",
            conf_month=0,
            conf_city="",
            tags=[]
        )
        unified_document_title_to_documents = dict()
        # BUG FIX: the key used to read "samtetitle1" (typo) while the
        # documents and the assertions below all use "sametitle1"
        unified_document_title_to_documents["sametitle1"] = [document1, document3]
        unified_document_title_to_documents["title2"] = [document2]

        # Trigger cache document update
        data_controller.crawl_data.update_cache_documents(unified_document_title_to_documents)

        # Check data count in the table
        cnt = data_controller.engine.execute("SELECT COUNT(*) FROM cache_document").fetchone()
        self.assertEqual(cnt[0], 2)

        # Then query sametitle row
        id = generate_id('sametitle1')
        row = data_controller.engine.execute(
            "SELECT id, document_id, title "
            "FROM cache_document "
            "WHERE id='%s'" % id
        ).fetchone()
        self.assertEqual(row["id"], id)
        self.assertEqual(row["title"], "sametitle1")

        # Now update documents and check if the link is set correctly
        data_controller.crawl_data.update_documents([document1, document2, document3])
        row = data_controller.engine.execute(
            "SELECT cd.title as title "
            "FROM cache_document cd, document d "
            "WHERE cd.document_id = d.id "
            "AND cd.id='%s' " % id
        ).fetchone()
        self.assertEqual(row["title"], "sametitle1")
Example #28
0
        # Create suites
        all = loader.discover(start_dir=project_root)

        # Run suites
        runner.run(all)

    elif command == "prepare":
        log.info("Preparing environment for gunicorn workers")
        # Read configuration
        configuration = ServiceConfiguration()
        configuration.load()
        log.info("Configuration has been loaded")

        # Create data controller and assert schema
        # That will remove the race conditions of the gunicorn worker if it's done on every startup
        data_controller = DataController(configuration.database)
        data_controller.assert_schema()

    # Pipeline runner
    elif command == "pipeline":
        config = ServiceConfiguration()
        config.load()

        data_controller = DataController(config.database)
        if not data_controller.is_initialized():
            log.critical("Database is not initialized")
            exit()

        crawler = None
        if not config.uses_mendeley:
            log.info("Pipeline uses FileCrawler")
Example #29
0
    def test_update_documents(self):
        """Documents are inserted on the first update and overwritten — not
        duplicated — by a subsequent update with the same mendeley ids."""
        config = SQLiteConfiguration("")
        data_controller = DataController(config)
        data_controller.run_schema()

        # The call shall not crash with empty input
        self.assertIsNone(data_controller.crawl_data.update_documents([]))

        def make_document(core_id, core_profile_id, core_title,
                          core_abstract, core_source, core_year,
                          core_authors=None, tags=None):
            # All test documents share the same boilerplate fields
            return Document(
                core_id=core_id,
                core_profile_id=core_profile_id,
                core_title=core_title,
                core_type="conference_proceedings",
                core_created=datetime.now(),
                core_last_modified=datetime.now(),
                core_abstract=core_abstract,
                core_source=core_source,
                core_year=core_year,
                core_authors=core_authors if core_authors is not None else [],
                core_keywords=[],
                doc_website="",
                conf_website="",
                conf_pages="",
                conf_month=0,
                conf_city="",
                tags=tags if tags is not None else []
            )

        first = make_document(
            "doc1", "id1", "title1", "blabla", "ACM xy", 2015,
            core_authors=[("Hans", "Mustermann"), ("Nicht", "Existent")],
            tags=["t ag- 1"]
        )
        second = make_document("doc2", "id2", "title2", "blabla2", "ACM xyz", 2014)
        data_controller.crawl_data.update_documents([first, second])

        # Check data count in the table
        count = data_controller.engine.execute("SELECT COUNT(*) FROM document").fetchone()
        self.assertEqual(count[0], 2)

        select_documents = (
            "SELECT id, mendeley_id, cache_document_id, owner_mendeley_id,"
            " title, doc_type, created, last_modified, abstract, source, pub_year "
            "FROM document "
        )
        rows = data_controller.engine.execute(select_documents).fetchall()

        # Verify both rows field by field
        expected = [
            ("doc1", generate_id("title1"), "id1", "title1", "blabla", "ACM xy", 2015),
            ("doc2", generate_id("title2"), "id2", "title2", "blabla2", "ACM xyz", 2014),
        ]
        for row, (mendeley_id, cache_id, owner, title, abstract, source, year) in zip(rows, expected):
            self.assertEqual(row["mendeley_id"], mendeley_id)
            self.assertEqual(row["cache_document_id"], cache_id)
            self.assertEqual(row["owner_mendeley_id"], owner)
            self.assertEqual(row["title"], title)
            self.assertEqual(row["doc_type"], "conference_proceedings")
            self.assertEqual(row["abstract"], abstract)
            self.assertEqual(row["source"], source)
            self.assertEqual(row["pub_year"], year)

        # Re-import with a modified first document: rows must be replaced,
        # not duplicated
        first = make_document("doc1", "id1", "newtitle1", "blablaNew", "ACM xyz1", 2015)
        second = make_document("doc2", "id2", "title2", "blabla2", "ACM xyz", 2014)
        data_controller.crawl_data.update_documents([first, second])

        # Check data count in the table
        count = data_controller.engine.execute("SELECT COUNT(*) FROM document").fetchone()
        self.assertEqual(count[0], 2)

        rows = data_controller.engine.execute(select_documents).fetchall()

        # Check first row (the one that was updated)
        self.assertEqual(rows[0]["mendeley_id"], "doc1")
        self.assertEqual(rows[0]["cache_document_id"], generate_id("newtitle1"))
        self.assertEqual(rows[0]["owner_mendeley_id"], "id1")
        self.assertEqual(rows[0]["title"], "newtitle1")
        self.assertEqual(rows[0]["doc_type"], "conference_proceedings")
        self.assertEqual(rows[0]["abstract"], "blablaNew")
        self.assertEqual(rows[0]["source"], "ACM xyz1")
        self.assertEqual(rows[0]["pub_year"], 2015)
Example #30
0
        # Create suites
        all = loader.discover(start_dir=project_root)

        # Run suites
        runner.run(all)

    elif command == "prepare":
        log.info("Preparing environment for gunicorn workers")
        # Read configuration
        configuration = ServiceConfiguration()
        configuration.load()
        log.info("Configuration has been loaded")

        # Create data controller and assert schema
        # That will remove the race conditions of the gunicorn worker if it's done on every startup
        data_controller = DataController(configuration.database)
        data_controller.assert_schema()

    # Pipeline runner
    elif command == "pipeline":
        config = ServiceConfiguration()
        config.load()

        data_controller = DataController(config.database)
        if not data_controller.is_initialized():
            log.critical("Database is not initialized")
            exit()

        crawler = None
        if not config.uses_mendeley:
            log.info("Pipeline uses FileCrawler")
Example #31
0
def sample_pipeline(app_id=None, app_secret=None):
    """Run the full crawl/analysis/store pipeline against an in-memory SQLite
    database and dump every resulting table to stdout.

    :param app_id: Mendeley application id. When omitted together with
        ``app_secret`` the offline :class:`FileCrawler` is used instead.
    :param app_secret: Mendeley application secret (see ``app_id``).
    """
    from mendeleycache.crawler.sdk_crawler import SDKCrawler

    sqlite_in_memory = SQLiteConfiguration("")
    data_controller = DataController(sqlite_in_memory)
    data_controller.run_schema()

    # Fall back to the file-based crawler when no credentials are given
    if app_id is None and app_secret is None:
        crawler = FileCrawler()
    else:
        crawler = SDKCrawler(app_id=app_id, app_secret=app_secret)

    crawl_controller = CrawlController(crawler,
                                       "d0b7f41f-ad37-3b47-ab70-9feac35557cc")

    analysis_controller = AnalysisController()

    pipeline_controller = PipelineController(
        data_controller=data_controller,
        crawl_controller=crawl_controller,
        analysis_controller=analysis_controller)

    # Clean run shall not crash
    pipeline_controller.execute()

    def dump_table(header, query):
        # Print one table: blank line, header line, then one row per line
        rows = data_controller.engine.execute(query).fetchall()
        print()
        print(header)
        for row in rows:
            print(row)

    # Previously this was the same four-line pattern copy-pasted seven times
    dump_table("Profiles:", "SELECT * FROM profile")
    dump_table("Cache profiles:", "SELECT * FROM cache_profile")
    dump_table("Documents:",
               "SELECT id, owner_mendeley_id, title, authors, tags FROM document")
    dump_table("Cache documents:", "SELECT * FROM cache_document")
    dump_table("Cache fields:", "SELECT * FROM cache_field")
    dump_table("LINK: Cache document -> Cache field:",
               "SELECT * FROM cache_document_has_cache_field")
    dump_table("LINK: Cache profile -> Cache document:",
               "SELECT * FROM cache_profile_has_cache_document")

    print()
Example #32
0
    def test_update_cache_documents(self):
        """update_cache_documents must create one cache_document row per
        unified title and update_documents must link the imported documents
        back to that cache row."""
        sqlite_in_memory = SQLiteConfiguration("")
        data_controller = DataController(sqlite_in_memory)
        data_controller.run_schema()

        # The call shall not crash with empty input
        r = data_controller.crawl_data.update_cache_documents(dict())
        self.assertIsNone(r)

        # document1 and document3 share the unified title "sametitle1" and
        # therefore have to be merged into a single cache document
        document1 = Document(core_id="doc1",
                             core_profile_id="id1",
                             core_title="sametitle1",
                             core_type="conference_proceedings",
                             core_created=datetime.now(),
                             core_last_modified=datetime.now(),
                             core_abstract="Older Abtract",
                             core_source="Older source",
                             core_year=2015,
                             core_authors=[],
                             core_keywords=[],
                             doc_website="",
                             conf_website="",
                             conf_pages="",
                             conf_month=0,
                             conf_city="",
                             tags=[])
        document2 = Document(core_id="doc2",
                             core_profile_id="id2",
                             core_title="title2",
                             core_type="conference_proceedings",
                             core_created=datetime.now(),
                             core_last_modified=datetime.now(),
                             core_abstract="blabla2",
                             core_source="ACM xyz",
                             core_year=2014,
                             core_authors=[],
                             core_keywords=[],
                             doc_website="",
                             conf_website="",
                             conf_pages="",
                             conf_month=0,
                             conf_city="",
                             tags=[])
        document3 = Document(core_id="doc3",
                             core_profile_id="id3",
                             core_title="sametitle1",
                             core_type="conference_proceedings",
                             core_created=datetime.now(),
                             core_last_modified=datetime.now(),
                             core_abstract="Newer abstract",
                             core_source="Newer source",
                             core_year=2015,
                             core_authors=[],
                             core_keywords=[],
                             doc_website="",
                             conf_website="",
                             conf_pages="",
                             conf_month=0,
                             conf_city="",
                             tags=[])
        unified_document_title_to_documents = dict()
        # BUG FIX: the key used to read "samtetitle1" (typo) while the
        # documents and the assertions below all use "sametitle1"
        unified_document_title_to_documents["sametitle1"] = [
            document1, document3
        ]
        unified_document_title_to_documents["title2"] = [document2]

        # Trigger cache document update
        data_controller.crawl_data.update_cache_documents(
            unified_document_title_to_documents)

        # Check data count in the table
        cnt = data_controller.engine.execute(
            "SELECT COUNT(*) FROM cache_document").fetchone()
        self.assertEqual(cnt[0], 2)

        # Then query sametitle row
        id = generate_id('sametitle1')
        row = data_controller.engine.execute("SELECT id, document_id, title "
                                             "FROM cache_document "
                                             "WHERE id='%s'" % id).fetchone()
        self.assertEqual(row["id"], id)
        self.assertEqual(row["title"], "sametitle1")

        # Now update documents and check if the link is set correctly
        data_controller.crawl_data.update_documents(
            [document1, document2, document3])
        row = data_controller.engine.execute(
            "SELECT cd.title as title "
            "FROM cache_document cd, document d "
            "WHERE cd.document_id = d.id "
            "AND cd.id='%s' " % id).fetchone()
        self.assertEqual(row["title"], "sametitle1")
Example #33
0
    def test_run_schema(self):
        """run_schema must create every table of the cache schema."""
        config = SQLiteConfiguration("")
        controller = DataController(config)

        tables = (
            'profile',
            'document',
            'cache_profile',
            'cache_document',
            'cache_field',
            'cache_document_has_cache_field',
            'cache_profile_has_cache_document',
            'update_log',
        )

        # First check that none of the tables exists
        for table in tables:
            self.assertFalse(controller.table_exists(table))

        # Create schema
        controller.run_schema()

        # After schema creation all tables need to exist
        for table in tables:
            self.assertTrue(controller.table_exists(table))