Exemple #1
0
 def dont_geotag_if_detail_exists(self, nominatim):
     """Check that process_locations leaves the geocoder uncalled.

     The fact is linked to the 'India' location row looked up through the
     session (presumably seeded by the test fixtures -- confirm), and
     ``nominatim`` must not have been called once processing is done.
     """
     db = self.session
     article_url = "http://www.philstar.com/headlines/2017/02/16/1672746/yasay-harris-affirm-stronger-phl-us-ties"
     record = Gkg(id=3771256,
                  gkgrecordid="20170215174500-2503",
                  date=20170215174500,
                  document_identifier=article_url)
     db.add(record)

     analysis = Analysis(gkg=record, status=Status.NEW)
     db.add(analysis)

     text = "It was early Saturday when a flash flood hit large parts of India and Pakistan and washed away more than 500 houses"
     document = DocumentContent(content_clean=text)
     db.add(document)
     db.commit()

     # Link the stored content to the analysis in a second commit.
     analysis.content_id = document.id
     db.commit()

     displaced = Fact(unit='person', term='displaced')
     db.add(displaced)
     db.commit()

     india = (db.query(Location)
              .filter(Location.location_name == 'India')
              .one_or_none())
     displaced.locations.append(india)
     analysis.facts.append(displaced)
     db.commit()

     process_locations(analysis)
     assert not nominatim.called
Exemple #2
0
 def test_create_duplicate_fact(self):
     """A fact spanning two countries is split into one fact per country.

     Starts with a single fact attached to both the 'India' and 'Pakistan'
     locations and expects two facts (IND and PAK), each with exactly one
     location, after process_locations has run.
     """
     db = self.session
     article_url = "http://www.philstar.com/headlines/2017/02/16/1672746/yasay-harris-affirm-stronger-phl-us-ties"
     record = Gkg(id=3771256,
                  gkgrecordid="20170215174500-2503",
                  date=20170215174500,
                  document_identifier=article_url)
     db.add(record)
     analysis = Analysis(gkg=record, status=Status.NEW)
     db.add(analysis)
     db.commit()

     displaced = Fact(unit='person', term='displaced')
     db.add(displaced)
     db.commit()

     india = (db.query(Location)
              .filter(Location.location_name == 'India')
              .one_or_none())
     pakistan = (db.query(Location)
                 .filter(Location.location_name == 'Pakistan')
                 .one_or_none())
     displaced.locations.append(india)
     displaced.locations.append(pakistan)
     analysis.facts.append(displaced)
     db.commit()

     # Before processing: one fact only.
     self.assertEqual(1, len(analysis.facts))
     process_locations(analysis)

     # After processing: one fact per country.
     self.assertEqual(2, len(analysis.facts))
     iso_codes = [item.iso3 for item in analysis.facts]
     for expected_iso in ('IND', 'PAK'):
         self.assertIn(expected_iso, iso_codes)
     for position in (0, 1):
         self.assertEqual(1, len(analysis.facts[position].locations))
Exemple #3
0
    def sample_data(self):
        """Insert two sample Gkg records into the session and commit them."""
        # (id, gkgrecordid, document_identifier); both rows share one date.
        rows = (
            (3771256,
             "20170215174500-2503",
             "http://www.philstar.com/headlines/2017/02/16/1672746/yasay-harris-affirm-stronger-phl-us-ties"),
            (3771257,
             "20170215174500-1536",
             "http://wynkcountry.iheart.com/onair/cmt-cody-alan-54719/thomas-rhett-and-lauren-akins-are-15565244/"),
        )
        for row_id, record_id, url in rows:
            self.session.add(Gkg(id=row_id,
                                 gkgrecordid=record_id,
                                 date=20170215174500,
                                 document_identifier=url))
        self.session.commit()
Exemple #4
0
 def test_scrape_pdf(self):
     """Scraping a PDF URL yields 'pdf' content containing the expected text.

     Runs ``scrape`` on a new analysis for the NOAA Hurricane Katrina
     report, then checks the content type, that known words occur in the
     content, and that no newline characters remain in it.
     """
     gkg = Gkg(
         document_identifier="https://www1.ncdc.noaa.gov/pub/data/extremeevents/specialreports/Hurricane-Katrina.pdf")
     analysis = Analysis(gkg=gkg, status=Status.NEW)
     self.session.add(analysis)
     self.session.commit()
     scrape(analysis)
     content = analysis.content
     self.assertEqual("pdf", content.content_type)
     # assertIn/assertNotIn show both operands on failure, which
     # assertTrue(x in y) does not.
     self.assertIn("Katrina", content.content)
     self.assertIn("Louisiana", content.content)
     self.assertNotIn("\n", content.content)
Exemple #5
0
def create_new_analysis_from_url(session, url):
    """Create, persist and return a NEW-status Analysis for ``url``.

    A Gkg record is built for the URL, dated with the current local time
    formatted as a 14-digit YYYYMMDDHHMMSS string, and wrapped in an
    Analysis with zero retrieval attempts. The analysis is added to
    ``session`` and committed before being returned.
    """
    source_name = get_scn_from_url(url)
    # The first six timetuple fields are year, month, day, hour, minute, second.
    timestamp_fields = datetime.datetime.now().timetuple()[:6]
    stamp = '{:04d}{:02d}{:02d}{:02d}{:02d}{:02d}'.format(*timestamp_fields)
    record = Gkg(document_identifier=url,
                 date=stamp,
                 source_common_name=source_name)
    new_analysis = Analysis(gkg=record, status=Status.NEW, retrieval_attempts=0)
    session.add(new_analysis)
    session.commit()
    return new_analysis
Exemple #6
0
 def test_extract_eviction_facts(self):
     """Eviction wording produces a first fact whose term is FactTerm.EVICTED."""
     db = self.session
     analysis = Analysis(gkg=Gkg(), status=Status.NEW)
     db.add(analysis)
     text = "ordered eviction for 2000 people from their homes in Bosnia"
     document = DocumentContent(content_clean=text)
     db.add(document)
     db.commit()

     # Attach the stored content to the analysis, then extract.
     analysis.content_id = document.id
     db.commit()
     extract_facts(analysis)

     first_fact = analysis.facts[0]
     self.assertEqual(FactTerm.EVICTED, first_fact.term)
Exemple #7
0
 def test_extract_refugee_facts(self):
     """Refugee wording produces a first fact whose term is FactTerm.REFUGEE."""
     db = self.session
     analysis = Analysis(gkg=Gkg(), status=Status.NEW)
     db.add(analysis)
     text = "It was early Saturday when government troops entered the area and forced more than 20000 refugees to flee."
     document = DocumentContent(content_clean=text)
     db.add(document)
     db.commit()

     # Attach the stored content to the analysis, then extract.
     analysis.content_id = document.id
     db.commit()
     extract_facts(analysis)

     first_fact = analysis.facts[0]
     self.assertEqual(FactTerm.REFUGEE, first_fact.term)
Exemple #8
0
 def test_extract_facts_simple(self):
     """A single reportable sentence results in exactly one fact on the analysis."""
     db = self.session
     analysis = Analysis(gkg=Gkg(), status=Status.NEW)
     db.add(analysis)
     text = "It was early Saturday when a flash flood hit the area and washed away more than 500 houses"
     document = DocumentContent(content_clean=text)
     db.add(document)
     db.commit()

     # Attach the stored content to the analysis, then extract.
     analysis.content_id = document.id
     db.commit()
     extract_facts(analysis)

     fact_count = len(analysis.facts)
     self.assertEqual(1, fact_count)
Exemple #9
0
 def test_extract_sacked_facts(self):
     """Sacked wording produces a first fact whose term is FactTerm.SACKED."""
     db = self.session
     analysis = Analysis(gkg=Gkg(), status=Status.NEW)
     db.add(analysis)
     text = "last week 2000 people have been sacked from their homes in Nigeria"
     document = DocumentContent(content_clean=text)
     db.add(document)
     db.commit()

     # Attach the stored content to the analysis, then extract.
     analysis.content_id = document.id
     db.commit()
     extract_facts(analysis)

     first_fact = analysis.facts[0]
     self.assertEqual(FactTerm.SACKED, first_fact.term)
Exemple #10
0
def add_url():
    """Store a submitted URL as a new Gkg record and redirect to '/'.

    Reads the ``url`` field from the submitted form. When the field is
    missing or empty, an error message is flashed and the user is sent
    back to '/'. Otherwise a Gkg with that document identifier is added
    and committed in a fresh session, a success message is flashed, and
    the user is redirected to '/'. The session is always closed.
    """
    # .get() rather than indexing: a missing field has to reach the guard
    # below instead of aborting the request before it (assuming a
    # Flask/Werkzeug form mapping -- confirm).
    url = request.form.get('url')
    if not url:
        flash(u'Something went wrong. Please try again.', 'danger')
        # url_for() takes an endpoint name, not a path, so redirect to the
        # same literal path the success branch uses.
        return redirect('/')
    logger.info("Scraping by url: {url}".format(url=url))
    article = Gkg(document_identifier=url)
    session = Session()
    try:
        session.add(article)
        session.commit()
        flash(u"{} was successfully added".format(url), 'success')
        return redirect('/')
    finally:
        session.close()
Exemple #11
0
 def test_scrape_html(self):
     """Scraping an HTML URL yields cleaned, searchable 'text' content.

     Runs ``scrape`` on a new analysis for a CNN Hurricane Katrina article,
     then checks the content type, that known words occur in the cleaned
     content, that it contains no newline characters, and that
     ``content_ts`` is set and matches a 'Katrina & Louisiana' query.
     """
     gkg = Gkg(
         document_identifier="http://www.cnn.com/2013/08/23/us/hurricane-katrina-statistics-fast-facts/index.html")
     analysis = Analysis(gkg=gkg, status=Status.NEW)
     self.session.add(analysis)
     self.session.commit()
     scrape(analysis)
     content = analysis.content
     self.assertEqual("text", content.content_type)
     # Dedicated assertions show the offending values on failure, which
     # assertTrue(<expression>) does not.
     self.assertIn("Katrina", content.content_clean)
     self.assertIn("Louisiana", content.content_clean)
     self.assertNotIn("\n", content.content_clean)
     self.assertIsNotNone(content.content_ts)
     matches = (
         self.session.query(DocumentContent)
             .filter(DocumentContent.content_ts.match('Katrina & Louisiana')).all()
     )
     self.assertIn(content, matches)
Exemple #12
0
 def test_use_existing_location(self):
     """Extraction reuses a stored Location instead of creating a new one.

     A 'Bosnia' location is saved before extraction; the first location of
     the first extracted fact must carry the same id.
     """
     db = self.session
     analysis = Analysis(gkg=Gkg(), status=Status.NEW)
     db.add(analysis)
     text = "It was early Saturday when a flash flood hit large parts of Bosnia and washed away more than 500 houses"
     document = DocumentContent(content_clean=text)
     db.add(document)
     bosnia = Location(location_name='Bosnia')
     db.add(bosnia)
     db.commit()

     # Attach the stored content to the analysis, then extract.
     analysis.content_id = document.id
     db.commit()
     extract_facts(analysis)

     found = analysis.facts[0].locations[0]
     self.assertEqual(bosnia.id, found.id)
Exemple #13
0
 def test_create_locations_with_names(self):
     """Extracted locations carry only a name, with no country set.

     Expects one fact with two locations, 'London' and 'Middlesex', whose
     ``country`` attributes are both None.
     """
     db = self.session
     analysis = Analysis(gkg=Gkg(), status=Status.NEW)
     db.add(analysis)
     text = "It was early Saturday when a flash flood hit large parts of London and Middlesex and washed away more than 500 houses"
     document = DocumentContent(content_clean=text)
     db.add(document)
     db.commit()

     # Attach the stored content to the analysis, then extract.
     analysis.content_id = document.id
     db.commit()
     extract_facts(analysis)

     extracted = analysis.facts
     self.assertEqual(1, len(extracted))
     only_fact = extracted[0]
     self.assertEqual(2, len(only_fact.locations))

     names = [place.location_name for place in only_fact.locations]
     for expected_name in ('London', 'Middlesex'):
         self.assertIn(expected_name, names)

     countries = [place.country for place in only_fact.locations]
     self.assertEqual([None, None], countries)
Exemple #14
0
 def test_fail_if_geotagging_fails(self, nominatim):
     """process_locations raises GeotagException when the geocoder does.

     ``nominatim`` is configured to raise GeotagException, and the fact is
     given a new 'Ruislip' location that was not looked up from the session.
     """
     nominatim.side_effect = GeotagException()
     db = self.session
     article_url = "http://www.philstar.com/headlines/2017/02/16/1672746/yasay-harris-affirm-stronger-phl-us-ties"
     record = Gkg(id=3771256,
                  gkgrecordid="20170215174500-2503",
                  date=20170215174500,
                  document_identifier=article_url)
     db.add(record)
     analysis = Analysis(gkg=record, status=Status.NEW)
     db.add(analysis)
     db.commit()

     displaced = Fact(unit='person', term='displaced')
     db.add(displaced)
     db.commit()

     ruislip = Location(location_name="Ruislip")
     displaced.locations.append(ruislip)
     analysis.facts.append(displaced)
     db.commit()

     # Callable form of assertRaises: invokes process_locations(analysis).
     self.assertRaises(GeotagException, process_locations, analysis)