def dont_geotag_if_detail_exists(self, nominatim):
    """Geotagging is skipped when a fact already carries a location.

    `nominatim` is the patched geotag lookup; it must never be invoked
    because the fact below is pre-linked to an existing Location row.
    """
    gkg = Gkg(
        id=3771256,
        gkgrecordid="20170215174500-2503",
        date=20170215174500,
        document_identifier="http://www.philstar.com/headlines/2017/02/16/1672746/yasay-harris-affirm-stronger-phl-us-ties",
    )
    self.session.add(gkg)
    analysis = Analysis(gkg=gkg, status=Status.NEW)
    self.session.add(analysis)
    content = DocumentContent(
        content_clean="It was early Saturday when a flash flood hit large parts of India and Pakistan and washed away more than 500 houses",
    )
    self.session.add(content)
    self.session.commit()
    analysis.content_id = content.id
    self.session.commit()

    fact = Fact(unit='person', term='displaced')
    self.session.add(fact)
    self.session.commit()

    # Attach the pre-seeded India location so geotagging has nothing to do.
    india = (
        self.session.query(Location)
        .filter(Location.location_name == 'India')
        .one_or_none()
    )
    fact.locations.append(india)
    analysis.facts.append(fact)
    self.session.commit()

    process_locations(analysis)
    assert not nominatim.called
def test_create_duplicate_fact(self):
    """Creates duplicate fact if locations from multiple countries exist"""
    gkg = Gkg(
        id=3771256,
        gkgrecordid="20170215174500-2503",
        date=20170215174500,
        document_identifier="http://www.philstar.com/headlines/2017/02/16/1672746/yasay-harris-affirm-stronger-phl-us-ties",
    )
    self.session.add(gkg)
    analysis = Analysis(gkg=gkg, status=Status.NEW)
    self.session.add(analysis)
    self.session.commit()

    fact = Fact(unit='person', term='displaced')
    self.session.add(fact)
    self.session.commit()

    # One fact spanning two countries: India and Pakistan.
    for name in ('India', 'Pakistan'):
        location = (
            self.session.query(Location)
            .filter(Location.location_name == name)
            .one_or_none()
        )
        fact.locations.append(location)
    analysis.facts.append(fact)
    self.session.commit()

    self.assertEqual(1, len(analysis.facts))
    process_locations(analysis)

    # The fact is split: one per country, each with a single location.
    self.assertEqual(2, len(analysis.facts))
    fact_countries = [f.iso3 for f in analysis.facts]
    self.assertIn('IND', fact_countries)
    self.assertIn('PAK', fact_countries)
    self.assertEqual(1, len(analysis.facts[0].locations))
    self.assertEqual(1, len(analysis.facts[1].locations))
def sample_data(self):
    """Seed the session with two Gkg fixture rows and commit."""
    fixtures = [
        Gkg(
            id=3771256,
            gkgrecordid="20170215174500-2503",
            date=20170215174500,
            document_identifier="http://www.philstar.com/headlines/2017/02/16/1672746/yasay-harris-affirm-stronger-phl-us-ties",
        ),
        Gkg(
            id=3771257,
            gkgrecordid="20170215174500-1536",
            date=20170215174500,
            document_identifier="http://wynkcountry.iheart.com/onair/cmt-cody-alan-54719/thomas-rhett-and-lauren-akins-are-15565244/",
        ),
    ]
    for row in fixtures:
        self.session.add(row)
    self.session.commit()
def test_scrape_pdf(self):
    """Scraping a PDF URL stores flattened PDF text on the analysis."""
    gkg = Gkg(
        document_identifier="https://www1.ncdc.noaa.gov/pub/data/extremeevents/specialreports/Hurricane-Katrina.pdf",
    )
    analysis = Analysis(gkg=gkg, status=Status.NEW)
    self.session.add(analysis)
    self.session.commit()

    scrape(analysis)

    content = analysis.content
    self.assertEqual("pdf", content.content_type)
    self.assertIn("Katrina", content.content)
    self.assertIn("Louisiana", content.content)
    # Extracted PDF text should be collapsed onto a single line.
    self.assertNotIn("\n", content.content)
def create_new_analysis_from_url(session, url):
    """Create, persist, and return a NEW-status Analysis for *url*.

    A Gkg article row is created alongside it with the current local
    timestamp in GDELT's ``YYYYMMDDHHMMSS`` format and the source common
    name derived from the URL.

    :param session: SQLAlchemy session used to persist the new rows.
    :param url: document URL to analyse.
    :return: the committed Analysis instance.
    """
    scn = get_scn_from_url(url)
    # strftime is the idiomatic (and equivalent) form of the previous
    # hand-rolled '{:04d}{:02d}...' zero-padded field formatting.
    gkg_date = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
    article = Gkg(document_identifier=url, date=gkg_date,
                  source_common_name=scn)
    analysis = Analysis(gkg=article, status=Status.NEW, retrieval_attempts=0)
    session.add(analysis)
    session.commit()
    return analysis
def test_extract_eviction_facts(self):
    """Extracts eviction-related facts with eviction Term"""
    analysis = Analysis(gkg=Gkg(), status=Status.NEW)
    self.session.add(analysis)
    content = DocumentContent(
        content_clean="ordered eviction for 2000 people from their homes in Bosnia",
    )
    self.session.add(content)
    self.session.commit()
    analysis.content_id = content.id
    self.session.commit()

    extract_facts(analysis)

    self.assertEqual(FactTerm.EVICTED, analysis.facts[0].term)
def test_extract_refugee_facts(self):
    """Extracts refugee-related facts with Refugee Term"""
    analysis = Analysis(gkg=Gkg(), status=Status.NEW)
    self.session.add(analysis)
    content = DocumentContent(
        content_clean="It was early Saturday when government troops entered the area and forced more than 20000 refugees to flee.",
    )
    self.session.add(content)
    self.session.commit()
    analysis.content_id = content.id
    self.session.commit()

    extract_facts(analysis)

    self.assertEqual(FactTerm.REFUGEE, analysis.facts[0].term)
def test_extract_facts_simple(self):
    """Extracts simple facts when present and saves to DB"""
    analysis = Analysis(gkg=Gkg(), status=Status.NEW)
    self.session.add(analysis)
    content = DocumentContent(
        content_clean="It was early Saturday when a flash flood hit the area and washed away more than 500 houses",
    )
    self.session.add(content)
    self.session.commit()
    analysis.content_id = content.id
    self.session.commit()

    extract_facts(analysis)

    # Exactly one fact should be extracted from the single report.
    self.assertEqual(1, len(analysis.facts))
def test_extract_sacked_facts(self):
    """Extracts sacked-related facts with eviction Term"""
    analysis = Analysis(gkg=Gkg(), status=Status.NEW)
    self.session.add(analysis)
    content = DocumentContent(
        content_clean="last week 2000 people have been sacked from their homes in Nigeria",
    )
    self.session.add(content)
    self.session.commit()
    analysis.content_id = content.id
    self.session.commit()

    extract_facts(analysis)

    self.assertEqual(FactTerm.SACKED, analysis.facts[0].term)
def add_url():
    """Handle the add-URL form POST: persist the URL as a Gkg article.

    Flashes a success or failure message and redirects back to the index.
    """
    # BUG FIX: request.form['url'] raises KeyError when the field is
    # missing, which made the None-check below unreachable; .get() returns
    # None instead so the error branch can actually run.
    url = request.form.get('url')
    logger.info("Scraping by url: {url}".format(url=url))
    if url is None:
        flash(u'Something went wrong. Please try again.', 'danger')
        # BUG FIX: url_for('/') raised BuildError ('/' is a path, not an
        # endpoint name). Redirect to the path directly, matching the
        # success branch below.
        return redirect('/')
    article = Gkg(document_identifier=url)
    session = Session()
    try:
        session.add(article)
        session.commit()
        flash(u"{} was successfully added".format(url), 'success')
        return redirect('/')
    finally:
        # Always release the session, whether or not the commit succeeded.
        session.close()
def test_scrape_html(self):
    """Scraping an HTML URL stores cleaned text plus a full-text index."""
    gkg = Gkg(
        document_identifier="http://www.cnn.com/2013/08/23/us/hurricane-katrina-statistics-fast-facts/index.html",
    )
    analysis = Analysis(gkg=gkg, status=Status.NEW)
    self.session.add(analysis)
    self.session.commit()

    scrape(analysis)

    content = analysis.content
    self.assertEqual("text", content.content_type)
    self.assertIn("Katrina", content.content_clean)
    self.assertIn("Louisiana", content.content_clean)
    # Cleaned text is collapsed onto a single line.
    self.assertNotIn("\n", content.content_clean)
    self.assertTrue(content.content_ts is not None)

    # The tsvector column should make the row findable via full-text search.
    matches = (
        self.session.query(DocumentContent)
        .filter(DocumentContent.content_ts.match('Katrina & Louisiana'))
        .all()
    )
    self.assertIn(content, matches)
def test_use_existing_location(self):
    """Uses existing locations when they exist"""
    analysis = Analysis(gkg=Gkg(), status=Status.NEW)
    self.session.add(analysis)
    content = DocumentContent(
        content_clean="It was early Saturday when a flash flood hit large parts of Bosnia and washed away more than 500 houses",
    )
    self.session.add(content)
    # Pre-create the location the article mentions.
    existing = Location(location_name='Bosnia')
    self.session.add(existing)
    self.session.commit()
    analysis.content_id = content.id
    self.session.commit()

    extract_facts(analysis)

    # The extracted fact must reuse the pre-existing row, not create a new one.
    extracted = analysis.facts[0].locations[0]
    self.assertEqual(existing.id, extracted.id)
def test_create_locations_with_names(self):
    """Creates locations for facts only with location names"""
    analysis = Analysis(gkg=Gkg(), status=Status.NEW)
    self.session.add(analysis)
    content = DocumentContent(
        content_clean="It was early Saturday when a flash flood hit large parts of London and Middlesex and washed away more than 500 houses",
    )
    self.session.add(content)
    self.session.commit()
    analysis.content_id = content.id
    self.session.commit()

    extract_facts(analysis)

    self.assertEqual(1, len(analysis.facts))
    fact = analysis.facts[0]
    self.assertEqual(2, len(fact.locations))
    loc_names = [loc.location_name for loc in fact.locations]
    self.assertIn('London', loc_names)
    self.assertIn('Middlesex', loc_names)
    # Countries are left unresolved at extraction time (geotagging is later).
    self.assertEqual([None, None], [loc.country for loc in fact.locations])
def test_fail_if_geotagging_fails(self, nominatim):
    """Location processing should fail if geotagging fails"""
    # Make the patched geotag lookup blow up.
    nominatim.side_effect = GeotagException()
    gkg = Gkg(
        id=3771256,
        gkgrecordid="20170215174500-2503",
        date=20170215174500,
        document_identifier="http://www.philstar.com/headlines/2017/02/16/1672746/yasay-harris-affirm-stronger-phl-us-ties",
    )
    self.session.add(gkg)
    analysis = Analysis(gkg=gkg, status=Status.NEW)
    self.session.add(analysis)
    self.session.commit()

    fact = Fact(unit='person', term='displaced')
    self.session.add(fact)
    self.session.commit()

    # A location with no country forces a geotag lookup, which will raise.
    fact.locations.append(Location(location_name="Ruislip"))
    analysis.facts.append(fact)
    self.session.commit()

    with self.assertRaises(GeotagException):
        process_locations(analysis)