class TestScraper(TestCase):
    """Integration tests for scrape(): needs a live test DB and network access."""

    def setUp(self):
        """Bind the shared Session factory to a freshly recreated test schema."""
        db_host = os.environ.get('DB_HOST')
        db_url = 'postgresql://{user}:{passwd}@{db_host}/{db}'.format(
            user='******', passwd='tester', db_host=db_host, db='idetect_test')
        engine = create_engine(db_url)
        Session.configure(bind=engine)
        # Drop and recreate all tables so every test starts from a clean slate.
        Base.metadata.drop_all(engine)
        Base.metadata.create_all(engine)
        self.session = Session()

    def tearDown(self):
        """Discard any uncommitted work, then delete the rows the test created."""
        self.session.rollback()
        for gkg in self.session.query(Gkg).all():
            self.session.delete(gkg)
        self.session.commit()

    def test_scrape_html(self):
        """Scraping an HTML page yields cleaned, newline-free, searchable text."""
        gkg = Gkg(
            document_identifier="http://www.cnn.com/2013/08/23/us/hurricane-katrina-statistics-fast-facts/index.html")
        analysis = Analysis(gkg=gkg, status=Status.NEW)
        self.session.add(analysis)
        self.session.commit()
        scrape(analysis)
        content = analysis.content
        self.assertEqual("text", content.content_type)
        # assertIn/assertNotIn/assertIsNotNone report the offending value on
        # failure, unlike the opaque assertTrue(x in y) form.
        self.assertIn("Katrina", content.content_clean)
        self.assertIn("Louisiana", content.content_clean)
        self.assertNotIn("\n", content.content_clean)
        self.assertIsNotNone(content.content_ts)
        # The stored tsvector must be findable via full-text search.
        matches = (
            self.session.query(DocumentContent)
            .filter(DocumentContent.content_ts.match('Katrina & Louisiana')).all()
        )
        self.assertIn(content, matches)

    def test_scrape_pdf(self):
        """Scraping a PDF yields newline-free text tagged with the pdf type."""
        gkg = Gkg(
            document_identifier="https://www1.ncdc.noaa.gov/pub/data/extremeevents/specialreports/Hurricane-Katrina.pdf")
        analysis = Analysis(gkg=gkg, status=Status.NEW)
        self.session.add(analysis)
        self.session.commit()
        scrape(analysis)
        content = analysis.content
        self.assertEqual("pdf", content.content_type)
        self.assertIn("Katrina", content.content)
        self.assertIn("Louisiana", content.content)
        self.assertNotIn("\n", content.content)
class TestFactExtractor(TestCase):
    """Integration tests for extract_facts(): needs a live test DB."""

    def setUp(self):
        """Bind the shared Session factory to a fresh schema and load reference data."""
        db_host = os.environ.get('DB_HOST')
        db_url = 'postgresql://{user}:{passwd}@{db_host}/{db}'.format(
            user='******', passwd='tester', db_host=db_host, db='idetect_test')
        engine = create_engine(db_url)
        Session.configure(bind=engine)
        Base.metadata.drop_all(engine)
        Base.metadata.create_all(engine)
        self.session = Session()
        # Fact extraction depends on the country and term lookup tables.
        load_countries(self.session)
        load_terms(self.session)

    def tearDown(self):
        """Discard any uncommitted work, then delete the rows the test created."""
        self.session.rollback()
        for article in self.session.query(Gkg).all():
            self.session.delete(article)
        self.session.commit()

    def _extract_from(self, text):
        """Create an Analysis whose cleaned content is *text*, run fact
        extraction on it, and return the Analysis.

        Factors out the gkg/analysis/content/commit/extract boilerplate that
        was previously duplicated verbatim in every test method.
        """
        gkg = Gkg()
        analysis = Analysis(gkg=gkg, status=Status.NEW)
        self.session.add(analysis)
        content = DocumentContent(content_clean=text)
        self.session.add(content)
        self.session.commit()
        # Link the content after commit so content.id is populated.
        analysis.content_id = content.id
        self.session.commit()
        extract_facts(analysis)
        return analysis

    def test_extract_facts_simple(self):
        """Extracts simple facts when present and saves to DB"""
        analysis = self._extract_from(
            "It was early Saturday when a flash flood hit the area and washed away more than 500 houses"
        )
        self.assertEqual(1, len(analysis.facts))

    def test_extract_refugee_facts(self):
        """Extracts refugee-related facts with Refugee Term"""
        analysis = self._extract_from(
            "It was early Saturday when government troops entered the area and forced more than 20000 refugees to flee."
        )
        self.assertEqual(FactTerm.REFUGEE, analysis.facts[0].term)

    def test_extract_evicted_facts(self):
        """Extracts eviction-related facts with eviction Term"""
        analysis = self._extract_from(
            "2000 people have been evicted from their homes in Bosnia")
        self.assertEqual(FactTerm.EVICTED, analysis.facts[0].term)

    def test_extract_eviction_facts(self):
        """Extracts eviction-related facts with eviction Term"""
        analysis = self._extract_from(
            "ordered eviction for 2000 people from their homes in Bosnia")
        self.assertEqual(FactTerm.EVICTED, analysis.facts[0].term)

    def test_extract_forced_eviction_facts(self):
        """Extracts eviction-related facts with eviction Term"""
        analysis = self._extract_from(
            "ordered forced eviction for 2000 people from their homes in Bosnia"
        )
        self.assertEqual(FactTerm.EVICTED, analysis.facts[0].term)

    def test_extract_forcibly_evicted_facts(self):
        """Extracts eviction-related facts with eviction Term"""
        analysis = self._extract_from(
            "2000 people were forcibly evicted from their homes in Bosnia")
        self.assertEqual(FactTerm.EVICTED, analysis.facts[0].term)

    def test_extract_sacked_facts(self):
        """Extracts sacked-related facts with eviction Term"""
        analysis = self._extract_from(
            "last week 2000 people have been sacked from their homes in Nigeria"
        )
        self.assertEqual(FactTerm.SACKED, analysis.facts[0].term)

    def test_create_locations_with_names(self):
        """Creates locations for facts only with location names"""
        analysis = self._extract_from(
            "It was early Saturday when a flash flood hit large parts of London and Middlesex and washed away more than 500 houses"
        )
        facts = analysis.facts
        self.assertEqual(1, len(facts))
        fact = facts[0]
        self.assertEqual(2, len(fact.locations))
        loc_names = [loc.location_name for loc in fact.locations]
        self.assertIn('London', loc_names)
        self.assertIn('Middlesex', loc_names)
        # Neither place name resolves to a known country.
        self.assertEqual([None, None],
                         [loc.country for loc in fact.locations])

    def test_use_existing_location(self):
        """Uses existing locations when they exist"""
        # Pre-create the location so extraction should reuse the existing row
        # instead of inserting a duplicate.
        location = Location(location_name='Bosnia')
        self.session.add(location)
        analysis = self._extract_from(
            "It was early Saturday when a flash flood hit large parts of Bosnia and washed away more than 500 houses"
        )
        fact = analysis.facts[0]
        extracted_location = fact.locations[0]
        self.assertEqual(location.id, extracted_location.id)
class TestModel(TestCase):
    """Integration tests for the Analysis versioning model: needs a live test DB."""

    def setUp(self):
        """Bind the shared Session factory to a fresh schema and insert fixtures."""
        db_host = os.environ.get('DB_HOST')
        db_url = 'postgresql://{user}:{passwd}@{db_host}/{db}'.format(
            user='******', passwd='tester', db_host=db_host, db='idetect_test')
        engine = create_engine(db_url)
        Session.configure(bind=engine)
        # Drop and recreate all tables so every test starts from a clean slate.
        Base.metadata.drop_all(engine)
        Base.metadata.create_all(engine)
        self.session = Session()
        self.sample_data()

    def sample_data(self):
        """Insert the two Gkg fixture rows the tests below operate on."""
        gkg1 = Gkg(
            id=3771256,
            gkgrecordid="20170215174500-2503",
            date=20170215174500,
            document_identifier=
            "http://www.philstar.com/headlines/2017/02/16/1672746/yasay-harris-affirm-stronger-phl-us-ties"
        )
        self.session.add(gkg1)
        gkg2 = Gkg(
            id=3771257,
            gkgrecordid="20170215174500-1536",
            date=20170215174500,
            document_identifier=
            "http://wynkcountry.iheart.com/onair/cmt-cody-alan-54719/thomas-rhett-and-lauren-akins-are-15565244/"
        )
        self.session.add(gkg2)
        self.session.commit()

    def tearDown(self):
        """Discard any uncommitted work, then delete the fixture rows."""
        self.session.rollback()
        for gkg in self.session.query(Gkg).all():
            self.session.delete(gkg)
        self.session.commit()

    def test_status_update(self):
        """create_new_version raises NotLatestException when the row went stale."""
        gkg = self.session.query(Gkg).first()
        analysis = Analysis(gkg=gkg, status=Status.NEW)
        self.session.add(analysis)
        self.session.commit()
        analysis.create_new_version(Status.SCRAPING)
        self.assertEqual(analysis.status, Status.SCRAPING)
        # meanwhile, some other process changed the status of this...
        session2 = Session()
        try:
            other = session2.query(Analysis).get(analysis.gkg_id)
            other.create_new_version(Status.SCRAPING_FAILED)
        finally:
            session2.rollback()
        # Our in-memory copy is now behind the DB, so the next version bump
        # must be rejected.
        with self.assertRaises(NotLatestException):
            analysis.create_new_version(Status.SCRAPED)

    def test_version_lifecycle(self):
        """Each create_new_version snapshots the prior state into AnalysisHistory.

        The history row count and per-status counts are checked after every
        transition; content and facts must survive into the snapshots.
        """
        gkg = self.session.query(Gkg).first()
        analysis = Analysis(gkg=gkg, status=Status.NEW)
        self.session.add(analysis)
        self.session.commit()
        analysis.create_new_version(Status.SCRAPING)
        history = self.session.query(AnalysisHistory).filter(
            AnalysisHistory.gkg == gkg)
        # One snapshot: the NEW state that preceded SCRAPING.
        self.assertEqual(1, history.count())
        self.assertEqual(
            1, history.filter(AnalysisHistory.status == Status.NEW).count())
        content = DocumentContent(content_type="text/html",
                                  content="Lorem ipsum")
        analysis.content = content
        analysis.create_new_version(Status.SCRAPED)
        self.assertEqual(2, history.count())
        self.assertEqual(
            1, history.filter(AnalysisHistory.status == Status.NEW).count())
        self.assertEqual(
            1,
            history.filter(AnalysisHistory.status == Status.SCRAPING).count())
        analysis.create_new_version(Status.EXTRACTING)
        self.assertEqual(3, history.count())
        self.assertEqual(
            1, history.filter(AnalysisHistory.status == Status.NEW).count())
        self.assertEqual(
            1,
            history.filter(AnalysisHistory.status == Status.SCRAPING).count())
        self.assertEqual(
            1,
            history.filter(AnalysisHistory.status == Status.SCRAPED).count())
        # content is preserved
        scraped = history.filter(
            AnalysisHistory.status == Status.SCRAPED).one_or_none()
        self.assertEqual(analysis.content, scraped.content)
        fact = Fact(analysis_date=datetime.now())
        analysis.facts = [fact]
        analysis.create_new_version(Status.EXTRACTED)
        self.assertEqual(4, history.count())
        self.assertEqual(
            1, history.filter(AnalysisHistory.status == Status.NEW).count())
        self.assertEqual(
            1,
            history.filter(AnalysisHistory.status == Status.SCRAPING).count())
        self.assertEqual(
            1,
            history.filter(AnalysisHistory.status == Status.SCRAPED).count())
        self.assertEqual(
            1,
            history.filter(
                AnalysisHistory.status == Status.EXTRACTING).count())
        # content still preserved
        extracting = history.filter(
            AnalysisHistory.status == Status.EXTRACTING).one_or_none()
        self.assertEqual(analysis.content, extracting.content)
        analysis.create_new_version(Status.EDITING)
        analysis.content = DocumentContent(content_type="text/html",
                                           content="Lorem edited")
        analysis.create_new_version(Status.EDITED)
        self.assertEqual(6, history.count())
        self.assertEqual(
            1, history.filter(AnalysisHistory.status == Status.NEW).count())
        self.assertEqual(
            1,
            history.filter(AnalysisHistory.status == Status.SCRAPING).count())
        self.assertEqual(
            1,
            history.filter(AnalysisHistory.status == Status.SCRAPED).count())
        self.assertEqual(
            1,
            history.filter(
                AnalysisHistory.status == Status.EXTRACTING).count())
        self.assertEqual(
            1,
            history.filter(AnalysisHistory.status == Status.EXTRACTED).count())
        self.assertEqual(
            1,
            history.filter(AnalysisHistory.status == Status.EDITING).count())
        # content has changed, but reports are preserved
        extracted = history.filter(
            AnalysisHistory.status == Status.EXTRACTED).one_or_none()
        self.assertNotEqual(analysis.content.id, extracted.content.id)
        self.assertCountEqual([f.id for f in analysis.facts],
                              [f.id for f in extracted.facts])
        analysis.create_new_version(Status.EDITING)
        fact2 = Fact(analysis_date=datetime.now())
        analysis.facts.append(fact2)
        analysis.create_new_version(Status.EDITED)
        self.assertEqual(8, history.count())
        self.assertEqual(
            1, history.filter(AnalysisHistory.status == Status.NEW).count())
        self.assertEqual(
            1,
            history.filter(AnalysisHistory.status == Status.SCRAPING).count())
        self.assertEqual(
            1,
            history.filter(AnalysisHistory.status == Status.SCRAPED).count())
        self.assertEqual(
            1,
            history.filter(
                AnalysisHistory.status == Status.EXTRACTING).count())
        self.assertEqual(
            1,
            history.filter(AnalysisHistory.status == Status.EXTRACTED).count())
        # EDITING was entered twice, so two snapshots carry that status.
        self.assertEqual(
            2,
            history.filter(AnalysisHistory.status == Status.EDITING).count())
        self.assertEqual(
            1,
            history.filter(AnalysisHistory.status == Status.EDITED).count())
        # The earlier EDITED snapshot has only the first fact; the live
        # analysis has both.
        edited = history.filter(
            AnalysisHistory.status == Status.EDITED).one_or_none()
        self.assertCountEqual([f.id for f in analysis.facts],
                              [fact.id, fact2.id])
        self.assertCountEqual([f.id for f in edited.facts], [fact.id])

    def test_status_counts(self):
        """Analysis.status_counts reflects only each row's latest status."""
        gkgs = self.session.query(Gkg).all()[:2]
        analysis1 = Analysis(gkg=gkgs[0], status=Status.NEW)
        self.session.add(analysis1)
        self.session.commit()
        self.assertEqual(Analysis.status_counts(self.session),
                         {Status.NEW: 1})
        analysis1.create_new_version(Status.SCRAPING)
        self.assertEqual(Analysis.status_counts(self.session),
                         {Status.SCRAPING: 1})
        analysis2 = Analysis(gkg=gkgs[1], status=Status.NEW)
        self.session.add(analysis2)
        self.session.commit()
        self.assertEqual(Analysis.status_counts(self.session), {
            Status.NEW: 1,
            Status.SCRAPING: 1
        })
        analysis2.create_new_version(Status.SCRAPING)
        self.assertEqual(Analysis.status_counts(self.session),
                         {Status.SCRAPING: 2})
        analysis2.create_new_version(Status.SCRAPED)
        self.assertEqual(Analysis.status_counts(self.session), {
            Status.SCRAPED: 1,
            Status.SCRAPING: 1
        })

    def test_country_term(self):
        """A Location resolves to the same Country as any of its CountryTerms."""
        mmr = Country(iso3="MMR", preferred_term="Myanmar")
        myanmar = CountryTerm(term="Myanmar", country=mmr)
        burma = CountryTerm(term="Burma", country=mmr)
        yangon = Location(location_name="Yangon",
                          location_type=LocationType.CITY,
                          country=mmr,
                          latlong="16°51′N 96°11′E")
        self.assertEqual(yangon.country, myanmar.country)
        self.assertEqual(yangon.country, burma.country)