def test_all_with_pipeline_data(self):
    """Run the full pipeline on file-crawler fixtures and verify API counts."""
    config = SQLiteConfiguration("")
    data = DataController(config)
    data.run_schema()

    pipeline = PipelineController(
        data_controller=data,
        crawl_controller=CrawlController(
            FileCrawler(), "d0b7f41f-ad37-3b47-ab70-9feac35557cc"),
        analysis_controller=AnalysisController())

    # A clean pipeline run shall succeed
    pipeline.execute()

    # The fixtures yield 19 slim profiles ...
    slim = data.api_data.get_profiles_slim()
    self.assertEqual(len(slim), 19)

    # ... and 14 fields
    field_rows = data.api_data.get_fields()
    self.assertEqual(len(field_rows), 14)
def test_update_cache_fields(self):
    """update_cache_fields shall upsert cache_field rows keyed by unified title.

    Also verifies that an empty mapping is a no-op returning None.
    """
    sqlite_in_memory = SQLiteConfiguration("")
    data_controller = DataController(sqlite_in_memory)
    data_controller.run_schema()

    # The call shall not crash with empty input
    r = data_controller.crawl_data.update_cache_fields(dict())
    self.assertIsNone(r)

    field1 = CacheField(title="field 1", unified_title="field1")
    field2 = CacheField(title="field 2", unified_title="field2")
    unified_field_title_to_field = dict()
    unified_field_title_to_field["field1"] = field1
    unified_field_title_to_field["field2"] = field2
    data_controller.crawl_data.update_cache_fields(unified_field_title_to_field)

    # Check data count in the table
    cnt = data_controller.engine.execute(
        "SELECT COUNT(*) FROM cache_field").fetchone()
    self.assertEqual(cnt[0], 2)

    # Then query rows (renamed from ``id`` to avoid shadowing the builtin)
    field_id = generate_id('field1')
    row = data_controller.engine.execute(
        "SELECT id, title "
        "FROM cache_field "
        "WHERE id='%s' " % field_id).fetchone()

    # Check first row
    self.assertEqual(row["id"], field_id)
    self.assertEqual(row['title'], "field 1")
def test_get_profiles_slim(self):
    """get_profiles_slim shall tolerate a freshly created, empty schema."""
    controller = DataController(SQLiteConfiguration(""))
    controller.run_schema()
    # Querying an empty database shall simply return without crashing
    controller.api_data.get_profiles_slim()
def test_all_with_pipeline_data(self):
    """Execute the crawl/analysis pipeline and check the cached API data."""
    data_controller = DataController(SQLiteConfiguration(""))
    data_controller.run_schema()

    crawl_controller = CrawlController(
        FileCrawler(), "d0b7f41f-ad37-3b47-ab70-9feac35557cc")
    analysis_controller = AnalysisController()
    pipeline_controller = PipelineController(
        data_controller=data_controller,
        crawl_controller=crawl_controller,
        analysis_controller=analysis_controller)

    # Clean run
    pipeline_controller.execute()

    # Test slim profiles: fixture data produces exactly 19
    self.assertEqual(19, len(data_controller.api_data.get_profiles_slim()))

    # Test fields: fixture data produces exactly 14
    self.assertEqual(14, len(data_controller.api_data.get_fields()))
def test_update_cache_fields(self):
    """update_cache_fields shall upsert cache_field rows and tolerate {}."""
    sqlite_in_memory = SQLiteConfiguration("")
    data_controller = DataController(sqlite_in_memory)
    data_controller.run_schema()

    # The call shall not crash with empty input
    r = data_controller.crawl_data.update_cache_fields(dict())
    self.assertIsNone(r)

    field1 = CacheField(title="field 1", unified_title="field1")
    field2 = CacheField(title="field 2", unified_title="field2")
    unified_field_title_to_field = dict()
    unified_field_title_to_field["field1"] = field1
    unified_field_title_to_field["field2"] = field2
    data_controller.crawl_data.update_cache_fields(unified_field_title_to_field)

    # Check data count in the table
    cnt = data_controller.engine.execute(
        "SELECT COUNT(*) FROM cache_field").fetchone()
    self.assertEqual(cnt[0], 2)

    # Then query rows; ``field_id`` avoids shadowing the builtin ``id``
    field_id = generate_id('field1')
    row = data_controller.engine.execute(
        "SELECT id, title "
        "FROM cache_field "
        "WHERE id='%s' " % field_id
    ).fetchone()

    # Check first row
    self.assertEqual(row["id"], field_id)
    self.assertEqual(row['title'], "field 1")
def test_link_fields_to_documents(self):
    """link_fields_to_documents shall treat an empty mapping as a no-op."""
    controller = DataController(SQLiteConfiguration(""))
    controller.run_schema()
    # Empty input shall not crash and shall return None
    self.assertIsNone(controller.crawl_data.link_fields_to_documents(dict()))
def test_update_profiles(self):
    """update_profiles shall insert new rows and upsert existing ones.

    The second call re-submits mendeley_id "id1" with a changed last name;
    the row count must stay at 2 and cache_profile_id must be recomputed
    from the new unified name.
    """
    sqlite_in_memory = SQLiteConfiguration("")
    data_controller = DataController(sqlite_in_memory)
    data_controller.run_schema()
    # The call shall not crash with empty input
    r = data_controller.crawl_data.update_profiles([])
    self.assertIsNone(r)
    profile1 = Profile("id1", "Hans", "Mustermann", "", "")
    profile2 = Profile("id2", "Max", "Mustermann", "", "")
    data_controller.crawl_data.update_profiles([profile1, profile2])
    # Check data count in the table
    cnt = data_controller.engine.execute(
        "SELECT COUNT(*) FROM profile").fetchone()
    self.assertEqual(cnt[0], 2)
    # Then query rows
    rows = data_controller.engine.execute(
        "SELECT id, mendeley_id, cache_profile_id, first_name, last_name, display_name "
        "FROM profile ").fetchall()
    # Check first row
    self.assertEqual(rows[0]["mendeley_id"], "id1")
    self.assertEqual(rows[0]["cache_profile_id"],
                     generate_id("hansmustermann"))
    self.assertEqual(rows[0]['first_name'], "Hans")
    self.assertEqual(rows[0]['last_name'], "Mustermann")
    # Check second row
    self.assertEqual(rows[1]["mendeley_id"], "id2")
    self.assertEqual(rows[1]["cache_profile_id"],
                     generate_id("maxmustermann"))
    self.assertEqual(rows[1]['first_name'], "Max")
    self.assertEqual(rows[1]['last_name'], "Mustermann")
    # Re-submit "id1" with a new last name: expect an update, not an insert
    profile1 = Profile("id1", "Hans", "Supermann", "", "")
    profile2 = Profile("id2", "Max", "Mustermann", "", "")
    data_controller.crawl_data.update_profiles([profile1, profile2])
    # Check data count in the table (still 2 -> upsert semantics)
    cnt = data_controller.engine.execute(
        "SELECT COUNT(*) FROM profile").fetchone()
    self.assertEqual(cnt[0], 2)
    # Then query rows
    rows = data_controller.engine.execute(
        "SELECT id, mendeley_id, cache_profile_id, first_name, last_name, display_name "
        "FROM profile ").fetchall()
    # Check first row: name and derived cache_profile_id were updated
    self.assertEqual(rows[0]["mendeley_id"], "id1")
    self.assertEqual(rows[0]["cache_profile_id"],
                     generate_id("hanssupermann"))
    self.assertEqual(rows[0]['first_name'], "Hans")
    self.assertEqual(rows[0]['last_name'], "Supermann")
def test_get_documents_by_profile_ids_and_field_ids(self):
    """The document query shall tolerate empty and non-matching id lists."""
    controller = DataController(SQLiteConfiguration(""))
    controller.run_schema()
    query = controller.api_data.get_documents_by_profile_ids_and_field_ids
    # None of these combinations shall crash on an empty database
    query([], [])
    query([42, 43], [])
    query([], [42, 43])
    query([42, 43], [44, 45])
def __init__(self, *args, **kwargs):
    """Initialize the service: config, schema check, crawler, pipeline, routes."""
    super(MendeleyCache, self).__init__(*args, **kwargs)
    # Read configuration
    self.configuration = ServiceConfiguration()
    self.configuration.load()
    log.info("Configuration has been loaded")
    # Create service controllers
    self.data_controller = DataController(self.configuration.database)
    self.data_controller.assert_schema()
    log.info("Schema has been checked")
    # Create crawler based on configuration
    self.crawler = None
    """:type : AbstractCrawler"""
    if not self.configuration.uses_mendeley:
        log.info("Pipeline uses FileCrawler")
        self.crawler = FileCrawler()
    else:
        # Imported lazily so the Mendeley SDK is only required when used
        from mendeleycache.crawler.sdk_crawler import SDKCrawler
        # BUG FIX: the old message called str.format on a string without
        # placeholders -- a no-op (and would have logged the app secret
        # had placeholders ever been added).
        log.info("Pipeline uses SDKCrawler")
        self.crawler = SDKCrawler(
            app_id=self.configuration.crawler.app_id,
            app_secret=self.configuration.crawler.app_secret)
    # Create the pipeline
    self.crawl_controller = CrawlController(
        self.crawler, self.configuration.crawler.research_group)
    self.analysis_controller = AnalysisController()
    self.pipeline_controller = PipelineController(
        data_controller=self.data_controller,
        crawl_controller=self.crawl_controller,
        analysis_controller=self.analysis_controller)
    log.info("Pipeline has been initialized")
    # Create the routing controllers
    self.fields_controller = FieldsController(self, self.data_controller)
    self.profiles_controller = ProfilesController(
        self, self.data_controller, self.configuration.cache)
    self.publications_controller = DocumentsController(
        self, self.data_controller)
    self.cache_controller = CacheController(
        self, self.data_controller, self.pipeline_controller,
        self.configuration)
    self.root_controller = RootController(
        self, self.data_controller, self.configuration)
    # Register the routes
    self.register_routes()
    log.info("Routes have been registered")
    log.info("MendeleyCache has been initialized")
def test_update_profiles(self):
    """update_profiles shall insert new profiles and upsert existing ones.

    After re-submitting mendeley_id "id1" with a changed last name the row
    count must remain 2 and cache_profile_id must follow the new name.
    """
    sqlite_in_memory = SQLiteConfiguration("")
    data_controller = DataController(sqlite_in_memory)
    data_controller.run_schema()
    # The call shall not crash with empty input
    r = data_controller.crawl_data.update_profiles([])
    self.assertIsNone(r)
    profile1 = Profile("id1", "Hans", "Mustermann", "", "")
    profile2 = Profile("id2", "Max", "Mustermann", "", "")
    data_controller.crawl_data.update_profiles([profile1, profile2])
    # Check data count in the table
    cnt = data_controller.engine.execute("SELECT COUNT(*) FROM profile").fetchone()
    self.assertEqual(cnt[0], 2)
    # Then query rows
    rows = data_controller.engine.execute(
        "SELECT id, mendeley_id, cache_profile_id, first_name, last_name, display_name "
        "FROM profile "
    ).fetchall()
    # Check first row
    self.assertEqual(rows[0]["mendeley_id"], "id1")
    self.assertEqual(rows[0]["cache_profile_id"], generate_id("hansmustermann"))
    self.assertEqual(rows[0]['first_name'], "Hans")
    self.assertEqual(rows[0]['last_name'], "Mustermann")
    # Check second row
    self.assertEqual(rows[1]["mendeley_id"], "id2")
    self.assertEqual(rows[1]["cache_profile_id"], generate_id("maxmustermann"))
    self.assertEqual(rows[1]['first_name'], "Max")
    self.assertEqual(rows[1]['last_name'], "Mustermann")
    # Re-submit "id1" with a new last name: expect an update, not an insert
    profile1 = Profile("id1", "Hans", "Supermann", "", "")
    profile2 = Profile("id2", "Max", "Mustermann", "", "")
    data_controller.crawl_data.update_profiles([profile1, profile2])
    # Check data count in the table (still 2 -> upsert semantics)
    cnt = data_controller.engine.execute("SELECT COUNT(*) FROM profile").fetchone()
    self.assertEqual(cnt[0], 2)
    # Then query rows
    rows = data_controller.engine.execute(
        "SELECT id, mendeley_id, cache_profile_id, first_name, last_name, display_name "
        "FROM profile "
    ).fetchall()
    # Check first row: name and derived cache_profile_id were updated
    self.assertEqual(rows[0]["mendeley_id"], "id1")
    self.assertEqual(rows[0]["cache_profile_id"], generate_id("hanssupermann"))
    self.assertEqual(rows[0]['first_name'], "Hans")
    self.assertEqual(rows[0]['last_name'], "Supermann")
def test_get_documents_by_profile_ids_and_field_ids(self):
    """Document lookup shall not crash for empty or unknown id lists."""
    data_controller = DataController(SQLiteConfiguration(""))
    data_controller.run_schema()
    lookup = data_controller.api_data.get_documents_by_profile_ids_and_field_ids
    # Exercise every empty/non-empty combination against an empty database
    for profile_ids, field_ids in (
            ([], []),
            ([42, 43], []),
            ([], [42, 43]),
            ([42, 43], [44, 45])):
        lookup(profile_ids, field_ids)
def test_update_cache_profiles(self):
    """update_cache_profiles shall merge profiles that share a unified name.

    Two unified names produce exactly two cache_profile rows, and a later
    update_profiles call shall link cache_profile.profile_id correctly.
    """
    sqlite_in_memory = SQLiteConfiguration("")
    data_controller = DataController(sqlite_in_memory)
    data_controller.run_schema()

    # The call shall not crash with empty input
    r = data_controller.crawl_data.update_cache_profiles(dict())
    self.assertIsNone(r)

    profile1 = Profile("id1", "Hans", "Mustermann", "Longer Real Name", "")
    profile2 = Profile("id2", "Max", "Mustermann", "", "")
    profile3 = Profile("id3", "Hans", "Mustermann", "", "")
    unified_name_to_profiles = dict()
    unified_name_to_profiles["hansmustermann"] = [profile1, profile3]
    unified_name_to_profiles["maxmustermann"] = [profile2]
    data_controller.crawl_data.update_cache_profiles(unified_name_to_profiles)

    # Check data count in the table
    cnt = data_controller.engine.execute(
        "SELECT COUNT(*) FROM cache_profile").fetchone()
    self.assertEqual(cnt[0], 2)

    # Then query same title row (``cache_id`` avoids shadowing builtin ``id``)
    cache_id = generate_id('hansmustermann')
    row = data_controller.engine.execute(
        "SELECT id, profile_id, name "
        "FROM cache_profile "
        "WHERE id='%s'" % cache_id).fetchone()
    self.assertEqual(row["id"], cache_id)
    self.assertEqual(row["name"], "Hans Mustermann")

    # Now update profiles and check if the link is set correctly
    data_controller.crawl_data.update_profiles(
        [profile1, profile2, profile3])
    row = data_controller.engine.execute(
        "SELECT cp.name as name "
        "FROM cache_profile cp, profile p "
        "WHERE cp.profile_id = p.id "
        "AND cp.id='%s' " % cache_id).fetchone()
    self.assertEqual(row["name"], "Hans Mustermann")
def test_update_cache_profiles(self):
    """update_cache_profiles shall collapse same-named profiles into one row."""
    sqlite_in_memory = SQLiteConfiguration("")
    data_controller = DataController(sqlite_in_memory)
    data_controller.run_schema()

    # The call shall not crash with empty input
    r = data_controller.crawl_data.update_cache_profiles(dict())
    self.assertIsNone(r)

    profile1 = Profile("id1", "Hans", "Mustermann", "Longer Real Name", "")
    profile2 = Profile("id2", "Max", "Mustermann", "", "")
    profile3 = Profile("id3", "Hans", "Mustermann", "", "")
    unified_name_to_profiles = dict()
    unified_name_to_profiles["hansmustermann"] = [profile1, profile3]
    unified_name_to_profiles["maxmustermann"] = [profile2]
    data_controller.crawl_data.update_cache_profiles(unified_name_to_profiles)

    # Check data count in the table
    cnt = data_controller.engine.execute("SELECT COUNT(*) FROM cache_profile").fetchone()
    self.assertEqual(cnt[0], 2)

    # Then query same title row; ``cache_id`` avoids shadowing builtin ``id``
    cache_id = generate_id('hansmustermann')
    row = data_controller.engine.execute(
        "SELECT id, profile_id, name "
        "FROM cache_profile "
        "WHERE id='%s'" % cache_id
    ).fetchone()
    self.assertEqual(row["id"], cache_id)
    self.assertEqual(row["name"], "Hans Mustermann")

    # Now update profiles and check if the link is set correctly
    data_controller.crawl_data.update_profiles([profile1, profile2, profile3])
    row = data_controller.engine.execute(
        "SELECT cp.name as name "
        "FROM cache_profile cp, profile p "
        "WHERE cp.profile_id = p.id "
        "AND cp.id='%s' " % cache_id
    ).fetchone()
    self.assertEqual(row["name"], "Hans Mustermann")
def __init__(self, *args, **kwargs):
    """Bootstrap the MendeleyCache app.

    Loads configuration, asserts the database schema, selects the crawler
    (file fixtures vs. Mendeley SDK), builds the pipeline, and registers
    all routing controllers.
    """
    super(MendeleyCache, self).__init__(*args, **kwargs)

    # Read configuration
    self.configuration = ServiceConfiguration()
    self.configuration.load()
    log.info("Configuration has been loaded")

    # Create service controllers
    self.data_controller = DataController(self.configuration.database)
    self.data_controller.assert_schema()
    log.info("Schema has been checked")

    # Create crawler based on configuration
    self.crawler = None
    """:type : AbstractCrawler"""
    if not self.configuration.uses_mendeley:
        log.info("Pipeline uses FileCrawler")
        self.crawler = FileCrawler()
    else:
        # Lazy import: the SDK is only needed for the live crawler
        from mendeleycache.crawler.sdk_crawler import SDKCrawler
        # BUG FIX: dropped a pointless .format(...) call on a message with
        # no placeholders (it did nothing and risked leaking the secret).
        log.info("Pipeline uses SDKCrawler")
        self.crawler = SDKCrawler(
            app_id=self.configuration.crawler.app_id,
            app_secret=self.configuration.crawler.app_secret
        )

    # Create the pipeline
    self.crawl_controller = CrawlController(self.crawler, self.configuration.crawler.research_group)
    self.analysis_controller = AnalysisController()
    self.pipeline_controller = PipelineController(
        data_controller=self.data_controller,
        crawl_controller=self.crawl_controller,
        analysis_controller=self.analysis_controller,
    )
    log.info("Pipeline has been initialized")

    # Create the routing controllers
    self.fields_controller = FieldsController(self, self.data_controller)
    self.profiles_controller = ProfilesController(self, self.data_controller, self.configuration.cache)
    self.publications_controller = DocumentsController(self, self.data_controller)
    self.cache_controller = CacheController(
        self, self.data_controller, self.pipeline_controller, self.configuration
    )
    self.root_controller = RootController(self, self.data_controller, self.configuration)

    # Register the routes
    self.register_routes()
    log.info("Routes have been registered")
    log.info("MendeleyCache has been initialized")
def test_errors(self):
    """A syntactically invalid statement shall raise a DBAPIError."""
    sqlite_in_memory = SQLiteConfiguration("")
    ctrl = DataController(sqlite_in_memory)
    # assertRaises replaces the manual try/self.fail/except dance and
    # removes the unused exception variable.
    with self.assertRaises(DBAPIError):
        with ctrl.engine.connect() as conn:
            conn.execute("COMPLETELY WRONG COMMAND")
def test_execute(self):
    """Repeated pipeline runs on the same database shall be idempotent-safe."""
    data = DataController(SQLiteConfiguration(""))
    data.run_schema()

    pipeline = PipelineController(
        data_controller=data,
        crawl_controller=CrawlController(
            FileCrawler(), "d0b7f41f-ad37-3b47-ab70-9feac35557cc"),
        analysis_controller=AnalysisController())

    # Neither the clean run nor the two re-runs shall crash
    for _ in range(3):
        pipeline.execute()
def test_execute(self):
    """Executing the pipeline three times in a row shall not crash."""
    data_controller = DataController(SQLiteConfiguration(""))
    data_controller.run_schema()

    crawl_controller = CrawlController(
        FileCrawler(), "d0b7f41f-ad37-3b47-ab70-9feac35557cc")
    pipeline_controller = PipelineController(
        data_controller=data_controller,
        crawl_controller=crawl_controller,
        analysis_controller=AnalysisController())

    # Clean run shall not crash
    pipeline_controller.execute()
    # Second run shall not crash either
    pipeline_controller.execute()
    # Third run shall not crash either
    pipeline_controller.execute()
def test_update_documents(self):
    """update_documents shall insert new documents and upsert existing ones.

    The second batch re-submits mendeley_id "doc1" with a new title; the
    row count must remain 2 and cache_document_id must be recomputed from
    the new title.
    """
    sqlite_in_memory = SQLiteConfiguration("")
    data_controller = DataController(sqlite_in_memory)
    data_controller.run_schema()
    # The call shall not crash with empty input
    r = data_controller.crawl_data.update_documents([])
    self.assertIsNone(r)
    # First batch: one document with authors/tags, one without
    document1 = Document(core_id="doc1", core_profile_id="id1",
                         core_title="title1",
                         core_type="conference_proceedings",
                         core_created=datetime.now(),
                         core_last_modified=datetime.now(),
                         core_abstract="blabla", core_source="ACM xy",
                         core_year=2015,
                         core_authors=[("Hans", "Mustermann"),
                                       ("Nicht", "Existent")],
                         core_keywords=[], doc_website="", conf_website="",
                         conf_pages="", conf_month=0, conf_city="",
                         tags=["t ag- 1"])
    document2 = Document(core_id="doc2", core_profile_id="id2",
                         core_title="title2",
                         core_type="conference_proceedings",
                         core_created=datetime.now(),
                         core_last_modified=datetime.now(),
                         core_abstract="blabla2", core_source="ACM xyz",
                         core_year=2014, core_authors=[], core_keywords=[],
                         doc_website="", conf_website="", conf_pages="",
                         conf_month=0, conf_city="", tags=[])
    data_controller.crawl_data.update_documents([document1, document2])
    # Check data count in the table
    cnt = data_controller.engine.execute(
        "SELECT COUNT(*) FROM document").fetchone()
    self.assertEqual(cnt[0], 2)
    # Then query rows
    rows = data_controller.engine.execute(
        "SELECT id, mendeley_id, cache_document_id, owner_mendeley_id,"
        " title, doc_type, created, last_modified, abstract, source, pub_year "
        "FROM document ").fetchall()
    # Check first row
    self.assertEqual(rows[0]["mendeley_id"], "doc1")
    self.assertEqual(rows[0]["cache_document_id"], generate_id("title1"))
    self.assertEqual(rows[0]["owner_mendeley_id"], "id1")
    self.assertEqual(rows[0]['title'], "title1")
    self.assertEqual(rows[0]["doc_type"], "conference_proceedings")
    self.assertEqual(rows[0]['abstract'], "blabla")
    self.assertEqual(rows[0]["source"], "ACM xy")
    self.assertEqual(rows[0]["pub_year"], 2015)
    # Check second row
    self.assertEqual(rows[1]["mendeley_id"], "doc2")
    self.assertEqual(rows[1]["cache_document_id"], generate_id("title2"))
    self.assertEqual(rows[1]["owner_mendeley_id"], "id2")
    self.assertEqual(rows[1]['title'], "title2")
    self.assertEqual(rows[1]["doc_type"], "conference_proceedings")
    self.assertEqual(rows[1]['abstract'], "blabla2")
    self.assertEqual(rows[1]["source"], "ACM xyz")
    self.assertEqual(rows[1]["pub_year"], 2014)
    # Second batch: "doc1" re-submitted with changed title/abstract/source
    document1 = Document(core_id="doc1", core_profile_id="id1",
                         core_title="newtitle1",
                         core_type="conference_proceedings",
                         core_created=datetime.now(),
                         core_last_modified=datetime.now(),
                         core_abstract="blablaNew", core_source="ACM xyz1",
                         core_year=2015, core_authors=[], core_keywords=[],
                         doc_website="", conf_website="", conf_pages="",
                         conf_month=0, conf_city="", tags=[])
    document2 = Document(core_id="doc2", core_profile_id="id2",
                         core_title="title2",
                         core_type="conference_proceedings",
                         core_created=datetime.now(),
                         core_last_modified=datetime.now(),
                         core_abstract="blabla2", core_source="ACM xyz",
                         core_year=2014, core_authors=[], core_keywords=[],
                         doc_website="", conf_website="", conf_pages="",
                         conf_month=0, conf_city="", tags=[])
    data_controller.crawl_data.update_documents([document1, document2])
    # Check data count in the table (still 2 -> upsert semantics)
    cnt = data_controller.engine.execute(
        "SELECT COUNT(*) FROM document").fetchone()
    self.assertEqual(cnt[0], 2)
    # Then query rows
    rows = data_controller.engine.execute(
        "SELECT id, mendeley_id, cache_document_id, owner_mendeley_id,"
        " title, doc_type, created, last_modified, abstract, source, pub_year "
        "FROM document ").fetchall()
    # Check first row: title-derived cache_document_id followed the rename
    self.assertEqual(rows[0]["mendeley_id"], "doc1")
    self.assertEqual(rows[0]["cache_document_id"], generate_id("newtitle1"))
    self.assertEqual(rows[0]["owner_mendeley_id"], "id1")
    self.assertEqual(rows[0]['title'], "newtitle1")
    self.assertEqual(rows[0]["doc_type"], "conference_proceedings")
    self.assertEqual(rows[0]['abstract'], "blablaNew")
    self.assertEqual(rows[0]["source"], "ACM xyz1")
    self.assertEqual(rows[0]["pub_year"], 2015)
def test_assert_drop(self):
    """assert_schema shall (re)create the schema whenever it is incomplete."""
    ctrl = DataController(SQLiteConfiguration(""))

    # Fresh database: assert_schema initializes it
    ctrl.assert_schema()
    self.assertTrue(ctrl.is_initialized())

    # Dropping everything de-initializes; asserting restores
    ctrl.drop_all()
    self.assertFalse(ctrl.is_initialized())
    ctrl.assert_schema()
    self.assertTrue(ctrl.is_initialized())

    # A single missing table also counts as "not initialized"
    with ctrl.engine.begin() as conn:
        conn.execute("DROP TABLE cache_document")
    self.assertFalse(ctrl.is_initialized())
    ctrl.assert_schema()
    self.assertTrue(ctrl.is_initialized())
def test_is_initialized(self):
    """After run_schema the controller shall report itself initialized."""
    controller = DataController(SQLiteConfiguration(""))
    controller.run_schema()
    self.assertTrue(controller.is_initialized())
def test_run_schema(self):
    """run_schema shall create exactly the expected set of tables.

    The eight duplicated assertFalse/assertTrue pairs are collapsed into
    loops over a single table list (DRY: one place to add a new table).
    """
    sqlite_in_memory = SQLiteConfiguration("")
    ctrl = DataController(sqlite_in_memory)
    tables = (
        'profile',
        'document',
        'cache_profile',
        'cache_document',
        'cache_field',
        'cache_document_has_cache_field',
        'cache_profile_has_cache_document',
        'update_log',
    )
    # First check that none of the tables exists
    for table in tables:
        self.assertFalse(ctrl.table_exists(table))
    # Create schema
    ctrl.run_schema()
    # After schema creation all tables need to exist
    for table in tables:
        self.assertTrue(ctrl.table_exists(table))
def sample_pipeline(app_id=None, app_secret=None):
    """Run the full pipeline on an in-memory SQLite db and dump every table.

    Uses the FileCrawler fixtures unless both app_id and app_secret are
    given, in which case the live Mendeley SDK crawler is used.

    :param app_id: optional Mendeley application id
    :param app_secret: optional Mendeley application secret
    """
    from mendeleycache.crawler.sdk_crawler import SDKCrawler

    sqlite_in_memory = SQLiteConfiguration("")
    data_controller = DataController(sqlite_in_memory)
    data_controller.run_schema()

    if app_id is None and app_secret is None:
        crawler = FileCrawler()
    else:
        crawler = SDKCrawler(app_id=app_id, app_secret=app_secret)

    crawl_controller = CrawlController(crawler, "d0b7f41f-ad37-3b47-ab70-9feac35557cc")
    analysis_controller = AnalysisController()
    pipeline_controller = PipelineController(
        data_controller=data_controller,
        crawl_controller=crawl_controller,
        analysis_controller=analysis_controller
    )

    # Clean run shall not crash
    pipeline_controller.execute()

    def dump(header, query):
        # One place for the repeated query/print boilerplate (was 7 copies)
        rows = data_controller.engine.execute(query).fetchall()
        print()
        print(header)
        for row in rows:
            print(row)

    dump("Profiles:", "SELECT * FROM profile")
    dump("Cache profiles:", "SELECT * FROM cache_profile")
    dump("Documents:",
         "SELECT id, owner_mendeley_id, title, authors, tags FROM document")
    dump("Cache documents:", "SELECT * FROM cache_document")
    dump("Cache fields:", "SELECT * FROM cache_field")
    dump("LINK: Cache document -> Cache field:",
         "SELECT * FROM cache_document_has_cache_field")
    dump("LINK: Cache profile -> Cache document:",
         "SELECT * FROM cache_profile_has_cache_document")
    print()
def test_update_cache_documents(self):
    """update_cache_documents shall merge documents sharing a unified title.

    Two unified titles produce exactly two cache_document rows, and a later
    update_documents call shall link cache_document.document_id correctly.
    """
    sqlite_in_memory = SQLiteConfiguration("")
    data_controller = DataController(sqlite_in_memory)
    data_controller.run_schema()

    # The call shall not crash with empty input
    r = data_controller.crawl_data.update_cache_documents(dict())
    self.assertIsNone(r)

    # document1 and document3 share the unified title "sametitle1"
    document1 = Document(
        core_id="doc1", core_profile_id="id1", core_title="sametitle1",
        core_type="conference_proceedings", core_created=datetime.now(),
        core_last_modified=datetime.now(), core_abstract="Older Abtract",
        core_source="Older source", core_year=2015, core_authors=[],
        core_keywords=[], doc_website="", conf_website="", conf_pages="",
        conf_month=0, conf_city="", tags=[]
    )
    document2 = Document(
        core_id="doc2", core_profile_id="id2", core_title="title2",
        core_type="conference_proceedings", core_created=datetime.now(),
        core_last_modified=datetime.now(), core_abstract="blabla2",
        core_source="ACM xyz", core_year=2014, core_authors=[],
        core_keywords=[], doc_website="", conf_website="", conf_pages="",
        conf_month=0, conf_city="", tags=[]
    )
    document3 = Document(
        core_id="doc3", core_profile_id="id3", core_title="sametitle1",
        core_type="conference_proceedings", core_created=datetime.now(),
        core_last_modified=datetime.now(), core_abstract="Newer abstract",
        core_source="Newer source", core_year=2015, core_authors=[],
        core_keywords=[], doc_website="", conf_website="", conf_pages="",
        conf_month=0, conf_city="", tags=[]
    )
    unified_document_title_to_documents = dict()
    # BUG FIX: key was misspelled "samtetitle1"; it must match the unified
    # title "sametitle1" used by generate_id() below.
    unified_document_title_to_documents["sametitle1"] = [document1, document3]
    unified_document_title_to_documents["title2"] = [document2]

    # Trigger cache document update
    data_controller.crawl_data.update_cache_documents(unified_document_title_to_documents)

    # Check data count in the table
    cnt = data_controller.engine.execute("SELECT COUNT(*) FROM cache_document").fetchone()
    self.assertEqual(cnt[0], 2)

    # Then query sametitle row (renamed from ``id``: don't shadow the builtin)
    cache_id = generate_id('sametitle1')
    row = data_controller.engine.execute(
        "SELECT id, document_id, title "
        "FROM cache_document "
        "WHERE id='%s'" % cache_id
    ).fetchone()
    self.assertEqual(row["id"], cache_id)
    self.assertEqual(row["title"], "sametitle1")

    # Now update documents and check if the link is set correctly
    data_controller.crawl_data.update_documents([document1, document2, document3])
    row = data_controller.engine.execute(
        "SELECT cd.title as title "
        "FROM cache_document cd, document d "
        "WHERE cd.document_id = d.id "
        "AND cd.id='%s' " % cache_id
    ).fetchone()
    self.assertEqual(row["title"], "sametitle1")
# Create suites all = loader.discover(start_dir=project_root) # Run suites runner.run(all) elif command == "prepare": log.info("Preparing environment for gunicorn workers") # Read configuration configuration = ServiceConfiguration() configuration.load() log.info("Configuration has been loaded") # Create data controller and assert schema # That will remove the race conditions of the gunicorn worker if it's done on every startup data_controller = DataController(configuration.database) data_controller.assert_schema() # Pipeline runner elif command == "pipeline": config = ServiceConfiguration() config.load() data_controller = DataController(config.database) if not data_controller.is_initialized(): log.critical("Database is not initialized") exit() crawler = None if not config.uses_mendeley: log.info("Pipeline uses FileCrawler")
def test_update_documents(self):
    """update_documents shall insert new rows and upsert on re-submission.

    The second batch re-submits mendeley_id "doc1" with a new title; the
    row count must stay at 2 and cache_document_id must track the title.
    """
    sqlite_in_memory = SQLiteConfiguration("")
    data_controller = DataController(sqlite_in_memory)
    data_controller.run_schema()
    # The call shall not crash with empty input
    r = data_controller.crawl_data.update_documents([])
    self.assertIsNone(r)
    # First batch: one document with authors/tags, one bare
    document1 = Document(
        core_id="doc1", core_profile_id="id1", core_title="title1",
        core_type="conference_proceedings", core_created=datetime.now(),
        core_last_modified=datetime.now(), core_abstract="blabla",
        core_source="ACM xy", core_year=2015,
        core_authors=[("Hans", "Mustermann"), ("Nicht", "Existent")],
        core_keywords=[], doc_website="", conf_website="", conf_pages="",
        conf_month=0, conf_city="", tags=["t ag- 1"]
    )
    document2 = Document(
        core_id="doc2", core_profile_id="id2", core_title="title2",
        core_type="conference_proceedings", core_created=datetime.now(),
        core_last_modified=datetime.now(), core_abstract="blabla2",
        core_source="ACM xyz", core_year=2014, core_authors=[],
        core_keywords=[], doc_website="", conf_website="", conf_pages="",
        conf_month=0, conf_city="", tags=[]
    )
    data_controller.crawl_data.update_documents([document1, document2])
    # Check data count in the table
    cnt = data_controller.engine.execute("SELECT COUNT(*) FROM document").fetchone()
    self.assertEqual(cnt[0], 2)
    # Then query rows
    rows = data_controller.engine.execute(
        "SELECT id, mendeley_id, cache_document_id, owner_mendeley_id,"
        " title, doc_type, created, last_modified, abstract, source, pub_year "
        "FROM document "
    ).fetchall()
    # Check first row
    self.assertEqual(rows[0]["mendeley_id"], "doc1")
    self.assertEqual(rows[0]["cache_document_id"], generate_id("title1"))
    self.assertEqual(rows[0]["owner_mendeley_id"], "id1")
    self.assertEqual(rows[0]['title'], "title1")
    self.assertEqual(rows[0]["doc_type"], "conference_proceedings")
    self.assertEqual(rows[0]['abstract'], "blabla")
    self.assertEqual(rows[0]["source"], "ACM xy")
    self.assertEqual(rows[0]["pub_year"], 2015)
    # Check second row
    self.assertEqual(rows[1]["mendeley_id"], "doc2")
    self.assertEqual(rows[1]["cache_document_id"], generate_id("title2"))
    self.assertEqual(rows[1]["owner_mendeley_id"], "id2")
    self.assertEqual(rows[1]['title'], "title2")
    self.assertEqual(rows[1]["doc_type"], "conference_proceedings")
    self.assertEqual(rows[1]['abstract'], "blabla2")
    self.assertEqual(rows[1]["source"], "ACM xyz")
    self.assertEqual(rows[1]["pub_year"], 2014)
    # Second batch: "doc1" re-submitted with changed title/abstract/source
    document1 = Document(
        core_id="doc1", core_profile_id="id1", core_title="newtitle1",
        core_type="conference_proceedings", core_created=datetime.now(),
        core_last_modified=datetime.now(), core_abstract="blablaNew",
        core_source="ACM xyz1", core_year=2015, core_authors=[],
        core_keywords=[], doc_website="", conf_website="", conf_pages="",
        conf_month=0, conf_city="", tags=[]
    )
    document2 = Document(
        core_id="doc2", core_profile_id="id2", core_title="title2",
        core_type="conference_proceedings", core_created=datetime.now(),
        core_last_modified=datetime.now(), core_abstract="blabla2",
        core_source="ACM xyz", core_year=2014, core_authors=[],
        core_keywords=[], doc_website="", conf_website="", conf_pages="",
        conf_month=0, conf_city="", tags=[]
    )
    data_controller.crawl_data.update_documents([document1, document2])
    # Check data count in the table (still 2 -> upsert semantics)
    cnt = data_controller.engine.execute("SELECT COUNT(*) FROM document").fetchone()
    self.assertEqual(cnt[0], 2)
    # Then query rows
    rows = data_controller.engine.execute(
        "SELECT id, mendeley_id, cache_document_id, owner_mendeley_id,"
        " title, doc_type, created, last_modified, abstract, source, pub_year "
        "FROM document "
    ).fetchall()
    # Check first row: title-derived cache_document_id followed the rename
    self.assertEqual(rows[0]["mendeley_id"], "doc1")
    self.assertEqual(rows[0]["cache_document_id"], generate_id("newtitle1"))
    self.assertEqual(rows[0]["owner_mendeley_id"], "id1")
    self.assertEqual(rows[0]['title'], "newtitle1")
    self.assertEqual(rows[0]["doc_type"], "conference_proceedings")
    self.assertEqual(rows[0]['abstract'], "blablaNew")
    self.assertEqual(rows[0]["source"], "ACM xyz1")
    self.assertEqual(rows[0]["pub_year"], 2015)
def sample_pipeline(app_id=None, app_secret=None):
    """Run the full crawl/analysis pipeline and dump every resulting table.

    Uses the offline FileCrawler by default; when both app_id and
    app_secret are supplied, crawls live data through the SDKCrawler.
    """
    from mendeleycache.crawler.sdk_crawler import SDKCrawler

    sqlite_in_memory = SQLiteConfiguration("")
    data_controller = DataController(sqlite_in_memory)
    data_controller.run_schema()

    if app_id is None and app_secret is None:
        crawler = FileCrawler()
    else:
        crawler = SDKCrawler(app_id=app_id, app_secret=app_secret)

    crawl_controller = CrawlController(
        crawler, "d0b7f41f-ad37-3b47-ab70-9feac35557cc")
    analysis_controller = AnalysisController()
    pipeline_controller = PipelineController(
        data_controller=data_controller,
        crawl_controller=crawl_controller,
        analysis_controller=analysis_controller)

    # Clean run shall not crash
    pipeline_controller.execute()

    # Dump each table in a fixed order, one labelled section per table.
    sections = [
        ("Profiles:", "SELECT * FROM profile"),
        ("Cache profiles:", "SELECT * FROM cache_profile"),
        ("Documents:",
         "SELECT id, owner_mendeley_id, title, authors, tags FROM document"),
        ("Cache documents:", "SELECT * FROM cache_document"),
        ("Cache fields:", "SELECT * FROM cache_field"),
        ("LINK: Cache document -> Cache field:",
         "SELECT * FROM cache_document_has_cache_field"),
        ("LINK: Cache profile -> Cache document:",
         "SELECT * FROM cache_profile_has_cache_document"),
    ]
    for label, query in sections:
        rows = data_controller.engine.execute(query).fetchall()
        print()
        print(label)
        for row in rows:
            print(row)
    print()
def test_update_cache_documents(self):
    """update_cache_documents shall deduplicate documents that share a
    unified title into a single cache_document row, and update_documents
    shall then link the document rows back to that cache entry.

    Fixes: the unified-title dict key was misspelled "samtetitle1" while
    the documents and the later generate_id() lookup use "sametitle1";
    the key now matches the unified title under test. The local `id`
    was also renamed to avoid shadowing the builtin.
    """
    sqlite_in_memory = SQLiteConfiguration("")
    data_controller = DataController(sqlite_in_memory)
    data_controller.run_schema()

    # The call shall not crash with empty input
    r = data_controller.crawl_data.update_cache_documents(dict())
    self.assertIsNone(r)

    # document1 and document3 share the unified title "sametitle1";
    # document3 is the newer one.
    document1 = Document(
        core_id="doc1", core_profile_id="id1", core_title="sametitle1",
        core_type="conference_proceedings", core_created=datetime.now(),
        core_last_modified=datetime.now(), core_abstract="Older Abtract",
        core_source="Older source", core_year=2015, core_authors=[],
        core_keywords=[], doc_website="", conf_website="", conf_pages="",
        conf_month=0, conf_city="", tags=[])
    document2 = Document(
        core_id="doc2", core_profile_id="id2", core_title="title2",
        core_type="conference_proceedings", core_created=datetime.now(),
        core_last_modified=datetime.now(), core_abstract="blabla2",
        core_source="ACM xyz", core_year=2014, core_authors=[],
        core_keywords=[], doc_website="", conf_website="", conf_pages="",
        conf_month=0, conf_city="", tags=[])
    document3 = Document(
        core_id="doc3", core_profile_id="id3", core_title="sametitle1",
        core_type="conference_proceedings", core_created=datetime.now(),
        core_last_modified=datetime.now(), core_abstract="Newer abstract",
        core_source="Newer source", core_year=2015, core_authors=[],
        core_keywords=[], doc_website="", conf_website="", conf_pages="",
        conf_month=0, conf_city="", tags=[])

    unified_document_title_to_documents = dict()
    # BUGFIX: key was "samtetitle1" (typo) — must match the documents'
    # unified title so the generate_id('sametitle1') lookup below agrees.
    unified_document_title_to_documents["sametitle1"] = [
        document1, document3
    ]
    unified_document_title_to_documents["title2"] = [document2]

    # Trigger cache document update
    data_controller.crawl_data.update_cache_documents(
        unified_document_title_to_documents)

    # Check data count in the table: two unified titles -> two rows
    cnt = data_controller.engine.execute(
        "SELECT COUNT(*) FROM cache_document").fetchone()
    self.assertEqual(cnt[0], 2)

    # Then query the deduplicated "sametitle1" row
    cache_id = generate_id('sametitle1')
    row = data_controller.engine.execute("SELECT id, document_id, title "
                                         "FROM cache_document "
                                         "WHERE id='%s'" % cache_id).fetchone()
    self.assertEqual(row["id"], cache_id)
    self.assertEqual(row["title"], "sametitle1")

    # Now update documents and check if the link is set correctly
    data_controller.crawl_data.update_documents(
        [document1, document2, document3])
    row = data_controller.engine.execute(
        "SELECT cd.title as title "
        "FROM cache_document cd, document d "
        "WHERE cd.document_id = d.id "
        "AND cd.id='%s' " % cache_id).fetchone()
    self.assertEqual(row["title"], "sametitle1")