Exemple #1
0
class TestScraper(TestCase):
    def setUp(self):
        db_host = os.environ.get('DB_HOST')
        db_url = 'postgresql://{user}:{passwd}@{db_host}/{db}'.format(
            user='******', passwd='tester', db_host=db_host, db='idetect_test')
        engine = create_engine(db_url)
        Session.configure(bind=engine)
        Base.metadata.drop_all(engine)
        Base.metadata.create_all(engine)
        self.session = Session()

    def tearDown(self):
        self.session.rollback()
        for gkg in self.session.query(Gkg).all():
            self.session.delete(gkg)
        self.session.commit()

    def test_scrape_html(self):
        gkg = Gkg(
            document_identifier="http://www.cnn.com/2013/08/23/us/hurricane-katrina-statistics-fast-facts/index.html")
        analysis = Analysis(gkg=gkg, status=Status.NEW)
        self.session.add(analysis)
        self.session.commit()
        scrape(analysis)
        content = analysis.content
        self.assertEqual("text", content.content_type)
        self.assertTrue("Katrina" in content.content_clean)
        self.assertTrue("Louisiana" in content.content_clean)
        self.assertTrue("\n" not in content.content_clean)
        self.assertTrue(content.content_ts is not None)
        matches = (
            self.session.query(DocumentContent)
                .filter(DocumentContent.content_ts.match('Katrina & Louisiana')).all()
        )
        self.assertIn(content, matches)

    def test_scrape_pdf(self):
        gkg = Gkg(
            document_identifier="https://www1.ncdc.noaa.gov/pub/data/extremeevents/specialreports/Hurricane-Katrina.pdf")
        analysis = Analysis(gkg=gkg, status=Status.NEW)
        self.session.add(analysis)
        self.session.commit()
        scrape(analysis)
        content = analysis.content
        self.assertEqual("pdf", content.content_type)
        self.assertTrue("Katrina" in content.content)
        self.assertTrue("Louisiana" in content.content)
        self.assertTrue("\n" not in content.content)
Exemple #2
0
class TestFactExtractor(TestCase):
    def setUp(self):
        db_host = os.environ.get('DB_HOST')
        db_url = 'postgresql://{user}:{passwd}@{db_host}/{db}'.format(
            user='******', passwd='tester', db_host=db_host, db='idetect_test')
        engine = create_engine(db_url)
        Session.configure(bind=engine)
        Base.metadata.drop_all(engine)
        Base.metadata.create_all(engine)
        self.session = Session()
        load_countries(self.session)
        load_terms(self.session)

    def tearDown(self):
        self.session.rollback()
        for article in self.session.query(Gkg).all():
            self.session.delete(article)
        self.session.commit()

    def test_extract_facts_simple(self):
        """Extracts simple facts when present and saves to DB"""
        gkg = Gkg()
        analysis = Analysis(gkg=gkg, status=Status.NEW)
        self.session.add(analysis)
        content = DocumentContent(
            content_clean=
            "It was early Saturday when a flash flood hit the area and washed away more than 500 houses"
        )
        self.session.add(content)
        self.session.commit()
        analysis.content_id = content.id
        self.session.commit()
        extract_facts(analysis)
        self.assertEqual(1, len(analysis.facts))

    def test_extract_refugee_facts(self):
        """Extracts refugee-related facts with Refugee Term"""
        gkg = Gkg()
        analysis = Analysis(gkg=gkg, status=Status.NEW)
        self.session.add(analysis)
        content = DocumentContent(
            content_clean=
            "It was early Saturday when government troops entered the area and forced more than 20000 refugees to flee."
        )
        self.session.add(content)
        self.session.commit()
        analysis.content_id = content.id
        self.session.commit()
        extract_facts(analysis)
        self.assertEqual(FactTerm.REFUGEE, analysis.facts[0].term)

    def test_extract_evicted_facts(self):
        """Extracts eviction-related facts with eviction Term"""
        gkg = Gkg()
        analysis = Analysis(gkg=gkg, status=Status.NEW)
        self.session.add(analysis)
        content = DocumentContent(
            content_clean=
            "2000 people have been evicted from their homes in Bosnia")
        self.session.add(content)
        self.session.commit()
        analysis.content_id = content.id
        self.session.commit()
        extract_facts(analysis)
        self.assertEqual(FactTerm.EVICTED, analysis.facts[0].term)

    def test_extract_eviction_facts(self):
        """Extracts eviction-related facts with eviction Term"""
        gkg = Gkg()
        analysis = Analysis(gkg=gkg, status=Status.NEW)
        self.session.add(analysis)
        content = DocumentContent(
            content_clean=
            "ordered eviction for 2000 people from their homes in Bosnia")
        self.session.add(content)
        self.session.commit()
        analysis.content_id = content.id
        self.session.commit()
        extract_facts(analysis)
        self.assertEqual(FactTerm.EVICTED, analysis.facts[0].term)

    def test_extract_forced_eviction_facts(self):
        """Extracts eviction-related facts with eviction Term"""
        gkg = Gkg()
        analysis = Analysis(gkg=gkg, status=Status.NEW)
        self.session.add(analysis)
        content = DocumentContent(
            content_clean=
            "ordered forced eviction for 2000 people from their homes in Bosnia"
        )
        self.session.add(content)
        self.session.commit()
        analysis.content_id = content.id
        self.session.commit()
        extract_facts(analysis)
        self.assertEqual(FactTerm.EVICTED, analysis.facts[0].term)

    def test_extract_forcibly_evicted_facts(self):
        """Extracts eviction-related facts with eviction Term"""
        gkg = Gkg()
        analysis = Analysis(gkg=gkg, status=Status.NEW)
        self.session.add(analysis)
        content = DocumentContent(
            content_clean=
            "2000 people were forcibly evicted from their homes in Bosnia")
        self.session.add(content)
        self.session.commit()
        analysis.content_id = content.id
        self.session.commit()
        extract_facts(analysis)
        self.assertEqual(FactTerm.EVICTED, analysis.facts[0].term)

    def test_extract_sacked_facts(self):
        """Extracts sacked-related facts with eviction Term"""
        gkg = Gkg()
        analysis = Analysis(gkg=gkg, status=Status.NEW)
        self.session.add(analysis)
        content = DocumentContent(
            content_clean=
            "last week 2000 people have been sacked from their homes in Nigeria"
        )
        self.session.add(content)
        self.session.commit()
        analysis.content_id = content.id
        self.session.commit()
        extract_facts(analysis)
        self.assertEqual(FactTerm.SACKED, analysis.facts[0].term)

    def test_create_locations_with_names(self):
        """Creates locations for facts only with location names"""
        gkg = Gkg()
        analysis = Analysis(gkg=gkg, status=Status.NEW)
        self.session.add(analysis)
        content = DocumentContent(
            content_clean=
            "It was early Saturday when a flash flood hit large parts of London and Middlesex and washed away more than 500 houses"
        )
        self.session.add(content)
        self.session.commit()
        analysis.content_id = content.id
        self.session.commit()
        extract_facts(analysis)
        facts = analysis.facts
        self.assertEqual(1, len(facts))
        fact = facts[0]
        self.assertEqual(2, len(fact.locations))
        loc_names = [loc.location_name for loc in fact.locations]
        self.assertIn('London', loc_names)
        self.assertIn('Middlesex', loc_names)
        self.assertEqual([None, None], [loc.country for loc in fact.locations])

    def test_use_existing_location(self):
        """Uses existing locations when they exist"""
        gkg = Gkg()
        analysis = Analysis(gkg=gkg, status=Status.NEW)
        self.session.add(analysis)
        content = DocumentContent(
            content_clean=
            "It was early Saturday when a flash flood hit large parts of Bosnia and washed away more than 500 houses"
        )
        self.session.add(content)
        location = Location(location_name='Bosnia')
        self.session.add(location)
        self.session.commit()
        analysis.content_id = content.id
        self.session.commit()
        extract_facts(analysis)
        fact = analysis.facts[0]
        extracted_location = fact.locations[0]
        self.assertEqual(location.id, extracted_location.id)
Exemple #3
0
class TestModel(TestCase):
    def setUp(self):
        db_host = os.environ.get('DB_HOST')
        db_url = 'postgresql://{user}:{passwd}@{db_host}/{db}'.format(
            user='******', passwd='tester', db_host=db_host, db='idetect_test')
        engine = create_engine(db_url)
        Session.configure(bind=engine)
        Base.metadata.drop_all(engine)
        Base.metadata.create_all(engine)
        self.session = Session()
        self.sample_data()

    def sample_data(self):
        gkg1 = Gkg(
            id=3771256,
            gkgrecordid="20170215174500-2503",
            date=20170215174500,
            document_identifier=
            "http://www.philstar.com/headlines/2017/02/16/1672746/yasay-harris-affirm-stronger-phl-us-ties"
        )
        self.session.add(gkg1)

        gkg2 = Gkg(
            id=3771257,
            gkgrecordid="20170215174500-1536",
            date=20170215174500,
            document_identifier=
            "http://wynkcountry.iheart.com/onair/cmt-cody-alan-54719/thomas-rhett-and-lauren-akins-are-15565244/"
        )
        self.session.add(gkg2)
        self.session.commit()

    def tearDown(self):
        self.session.rollback()
        for gkg in self.session.query(Gkg).all():
            self.session.delete(gkg)
        self.session.commit()

    def test_status_update(self):
        gkg = self.session.query(Gkg).first()
        analysis = Analysis(gkg=gkg, status=Status.NEW)
        self.session.add(analysis)
        self.session.commit()

        analysis.create_new_version(Status.SCRAPING)
        self.assertEqual(analysis.status, Status.SCRAPING)

        # meanwhile, some other process changed the status of this...
        session2 = Session()
        try:
            other = session2.query(Analysis).get(analysis.gkg_id)
            other.create_new_version(Status.SCRAPING_FAILED)
        finally:
            session2.rollback()

        with self.assertRaises(NotLatestException):
            analysis.create_new_version(Status.SCRAPED)

    def test_version_lifecycle(self):
        gkg = self.session.query(Gkg).first()
        analysis = Analysis(gkg=gkg, status=Status.NEW)
        self.session.add(analysis)
        self.session.commit()

        analysis.create_new_version(Status.SCRAPING)

        history = self.session.query(AnalysisHistory).filter(
            AnalysisHistory.gkg == gkg)
        self.assertEqual(1, history.count())
        self.assertEqual(
            1,
            history.filter(AnalysisHistory.status == Status.NEW).count())

        content = DocumentContent(content_type="text/html",
                                  content="Lorem ipsum")
        analysis.content = content
        analysis.create_new_version(Status.SCRAPED)

        self.assertEqual(2, history.count())
        self.assertEqual(
            1,
            history.filter(AnalysisHistory.status == Status.NEW).count())
        self.assertEqual(
            1,
            history.filter(AnalysisHistory.status == Status.SCRAPING).count())

        analysis.create_new_version(Status.EXTRACTING)

        self.assertEqual(3, history.count())
        self.assertEqual(
            1,
            history.filter(AnalysisHistory.status == Status.NEW).count())
        self.assertEqual(
            1,
            history.filter(AnalysisHistory.status == Status.SCRAPING).count())
        self.assertEqual(
            1,
            history.filter(AnalysisHistory.status == Status.SCRAPED).count())

        # content is preserved
        scraped = history.filter(
            AnalysisHistory.status == Status.SCRAPED).one_or_none()
        self.assertEqual(analysis.content, scraped.content)

        fact = Fact(analysis_date=datetime.now())
        analysis.facts = [fact]
        analysis.create_new_version(Status.EXTRACTED)

        self.assertEqual(4, history.count())
        self.assertEqual(
            1,
            history.filter(AnalysisHistory.status == Status.NEW).count())
        self.assertEqual(
            1,
            history.filter(AnalysisHistory.status == Status.SCRAPING).count())
        self.assertEqual(
            1,
            history.filter(AnalysisHistory.status == Status.SCRAPED).count())
        self.assertEqual(
            1,
            history.filter(
                AnalysisHistory.status == Status.EXTRACTING).count())

        # content still preserved
        extracting = history.filter(
            AnalysisHistory.status == Status.EXTRACTING).one_or_none()
        self.assertEqual(analysis.content, extracting.content)

        analysis.create_new_version(Status.EDITING)
        analysis.content = DocumentContent(content_type="text/html",
                                           content="Lorem edited")
        analysis.create_new_version(Status.EDITED)

        self.assertEqual(6, history.count())
        self.assertEqual(
            1,
            history.filter(AnalysisHistory.status == Status.NEW).count())
        self.assertEqual(
            1,
            history.filter(AnalysisHistory.status == Status.SCRAPING).count())
        self.assertEqual(
            1,
            history.filter(AnalysisHistory.status == Status.SCRAPED).count())
        self.assertEqual(
            1,
            history.filter(
                AnalysisHistory.status == Status.EXTRACTING).count())
        self.assertEqual(
            1,
            history.filter(AnalysisHistory.status == Status.EXTRACTED).count())
        self.assertEqual(
            1,
            history.filter(AnalysisHistory.status == Status.EDITING).count())

        # content has changed, but reports are preserved
        extracted = history.filter(
            AnalysisHistory.status == Status.EXTRACTED).one_or_none()
        self.assertNotEqual(analysis.content.id, extracted.content.id)
        self.assertCountEqual([f.id for f in analysis.facts],
                              [f.id for f in extracted.facts])

        analysis.create_new_version(Status.EDITING)
        fact2 = Fact(analysis_date=datetime.now())
        analysis.facts.append(fact2)
        analysis.create_new_version(Status.EDITED)

        self.assertEqual(8, history.count())
        self.assertEqual(
            1,
            history.filter(AnalysisHistory.status == Status.NEW).count())
        self.assertEqual(
            1,
            history.filter(AnalysisHistory.status == Status.SCRAPING).count())
        self.assertEqual(
            1,
            history.filter(AnalysisHistory.status == Status.SCRAPED).count())
        self.assertEqual(
            1,
            history.filter(
                AnalysisHistory.status == Status.EXTRACTING).count())
        self.assertEqual(
            1,
            history.filter(AnalysisHistory.status == Status.EXTRACTED).count())
        self.assertEqual(
            2,
            history.filter(AnalysisHistory.status == Status.EDITING).count())
        self.assertEqual(
            1,
            history.filter(AnalysisHistory.status == Status.EDITED).count())

        edited = history.filter(
            AnalysisHistory.status == Status.EDITED).one_or_none()
        self.assertCountEqual([f.id for f in analysis.facts],
                              [fact.id, fact2.id])
        self.assertCountEqual([f.id for f in edited.facts], [fact.id])

    def test_status_counts(self):
        gkgs = self.session.query(Gkg).all()[:2]
        analysis1 = Analysis(gkg=gkgs[0], status=Status.NEW)
        self.session.add(analysis1)
        self.session.commit()

        self.assertEqual(Analysis.status_counts(self.session), {Status.NEW: 1})

        analysis1.create_new_version(Status.SCRAPING)

        self.assertEqual(Analysis.status_counts(self.session),
                         {Status.SCRAPING: 1})

        analysis2 = Analysis(gkg=gkgs[1], status=Status.NEW)
        self.session.add(analysis2)
        self.session.commit()

        self.assertEqual(Analysis.status_counts(self.session), {
            Status.NEW: 1,
            Status.SCRAPING: 1
        })

        analysis2.create_new_version(Status.SCRAPING)

        self.assertEqual(Analysis.status_counts(self.session),
                         {Status.SCRAPING: 2})

        analysis2.create_new_version(Status.SCRAPED)

        self.assertEqual(Analysis.status_counts(self.session), {
            Status.SCRAPED: 1,
            Status.SCRAPING: 1
        })

    def test_country_term(self):
        mmr = Country(iso3="MMR", preferred_term="Myanmar")
        myanmar = CountryTerm(term="Myanmar", country=mmr)
        burma = CountryTerm(term="Burma", country=mmr)
        yangon = Location(location_name="Yangon",
                          location_type=LocationType.CITY,
                          country=mmr,
                          latlong="16°51′N 96°11′E")

        self.assertEqual(yangon.country, myanmar.country)
        self.assertEqual(yangon.country, burma.country)