Ejemplo n.º 1
0
    def work(self):
        """
        Create an Analysis in Status.NEW for Gkg rows that do not have one yet.

        Opens a fresh session, selects up to 1000 Gkg rows (ordered by
        ``Gkg.date``) for which no Analysis exists, requesting a FOR UPDATE
        lock on them, and adds one Analysis per selected row.

        Returns:
            True iff some Analyses were created, False when there was no
            work to be done.
        """
        # start a new session for each job
        session = Session()
        try:
            # Get a batch of Gkgs
            # ... for which no Analysis exists
            # ... and lock them for updates
            # ... sort by Gkg.date
            # ... pick the first 1000
            gkgs = session.query(Gkg) \
                .filter(~session.query(Analysis).filter(Gkg.id == Analysis.gkg_id).exists()) \
                .with_for_update() \
                .order_by(Gkg.date) \
                .limit(1000).all()
            if not gkgs:
                return False  # no work to be done
            analyses = [Analysis(gkg=gkg, status=Status.NEW) for gkg in gkgs]
            session.add_all(analyses)
            # Commit once for the whole batch: committing inside the loop would
            # end the transaction after the first row and give up the
            # FOR UPDATE lock while the rest of the batch is still unprocessed.
            session.commit()
            for analysis in analyses:
                logger.info(
                    "Worker {} created Analysis {} in status {}".format(
                        os.getpid(), analysis.gkg_id, analysis.status))
        finally:
            # make sure to release a FOR UPDATE lock, if we got one
            session.rollback()
            session.close()

        return True
Ejemplo n.º 2
0
def add_url():
    """Store the URL submitted in the ``url`` form field as a new Gkg.

    When the field is missing or empty, an error message is flashed and no
    record is created. Otherwise a Gkg with ``document_identifier`` set to
    the URL is committed and a success message is flashed.

    Returns:
        A redirect to '/' in both cases.
    """
    # .get() yields None for a missing field, so the check below can actually
    # run; indexing the form with ['url'] raises before reaching it.
    url = request.form.get('url')
    if not url:
        flash(u'Something went wrong. Please try again.', 'danger')
        # Redirect by path, as the success branch does: url_for() expects an
        # endpoint name, not a URL path.
        return redirect('/')
    logger.info("Scraping by url: {url}".format(url=url))
    article = Gkg(document_identifier=url)
    session = Session()
    try:
        session.add(article)
        session.commit()
        flash(u"{} was successfully added".format(url), 'success')
        return redirect('/')
    finally:
        session.close()
Ejemplo n.º 3
0
class TestScraper(TestCase):
    """Tests for ``scrape`` that run against the ``idetect_test`` database.

    NOTE(review): the documents are identified by live URLs, so ``scrape``
    presumably needs network access -- confirm before running offline.
    """

    def setUp(self):
        """Recreate the schema on the test database and open a session."""
        db_host = os.environ.get('DB_HOST')
        db_url = 'postgresql://{user}:{passwd}@{db_host}/{db}'.format(
            user='******', passwd='tester', db_host=db_host, db='idetect_test')
        engine = create_engine(db_url)
        # Registered cleanups run after tearDown (last registered first) and
        # also when a later setUp step raises, so connections are released.
        self.addCleanup(engine.dispose)
        Session.configure(bind=engine)
        Base.metadata.drop_all(engine)
        Base.metadata.create_all(engine)
        self.session = Session()
        self.addCleanup(self.session.close)

    def tearDown(self):
        """Discard pending changes and delete every Gkg row."""
        self.session.rollback()
        for gkg in self.session.query(Gkg).all():
            self.session.delete(gkg)
        self.session.commit()

    def test_scrape_html(self):
        """Scraping an HTML page stores cleaned, searchable text content."""
        gkg = Gkg(
            document_identifier="http://www.cnn.com/2013/08/23/us/hurricane-katrina-statistics-fast-facts/index.html")
        analysis = Analysis(gkg=gkg, status=Status.NEW)
        self.session.add(analysis)
        self.session.commit()
        scrape(analysis)
        content = analysis.content
        self.assertEqual("text", content.content_type)
        self.assertIn("Katrina", content.content_clean)
        self.assertIn("Louisiana", content.content_clean)
        self.assertNotIn("\n", content.content_clean)
        self.assertIsNotNone(content.content_ts)
        # the stored content must be found by a match query on content_ts
        matches = (
            self.session.query(DocumentContent)
                .filter(DocumentContent.content_ts.match('Katrina & Louisiana')).all()
        )
        self.assertIn(content, matches)

    def test_scrape_pdf(self):
        """Scraping a PDF stores its text without newlines."""
        gkg = Gkg(
            document_identifier="https://www1.ncdc.noaa.gov/pub/data/extremeevents/specialreports/Hurricane-Katrina.pdf")
        analysis = Analysis(gkg=gkg, status=Status.NEW)
        self.session.add(analysis)
        self.session.commit()
        scrape(analysis)
        content = analysis.content
        self.assertEqual("pdf", content.content_type)
        self.assertIn("Katrina", content.content)
        self.assertIn("Louisiana", content.content)
        self.assertNotIn("\n", content.content)
Ejemplo n.º 4
0
class TestGeoTagger(TestCase):
    """Tests for ``get_geo_info`` and ``process_locations``.

    Runs against the ``idetect_test`` database with countries preloaded.
    NOTE(review): the ``get_geo_info`` tests presumably query a live
    geocoding service -- confirm before running offline.
    """

    def setUp(self):
        """Recreate the schema, open a session and load the countries."""
        db_host = os.environ.get('DB_HOST')
        db_url = 'postgresql://{user}:{passwd}@{db_host}/{db}'.format(
            user='******', passwd='tester', db_host=db_host, db='idetect_test')
        engine = create_engine(db_url)
        # Registered cleanups run after tearDown (last registered first) and
        # also when a later setUp step raises, so connections are released.
        self.addCleanup(engine.dispose)
        Session.configure(bind=engine)
        Base.metadata.drop_all(engine)
        Base.metadata.create_all(engine)
        self.session = Session()
        self.addCleanup(self.session.close)
        load_countries(self.session)

    def tearDown(self):
        """Discard any uncommitted changes."""
        self.session.rollback()

    def _add_analysis(self):
        """Commit the sample Gkg with a NEW Analysis and return the Analysis."""
        gkg = Gkg(
            id=3771256,
            gkgrecordid="20170215174500-2503",
            date=20170215174500,
            document_identifier=
            "http://www.philstar.com/headlines/2017/02/16/1672746/yasay-harris-affirm-stronger-phl-us-ties"
        )
        self.session.add(gkg)
        analysis = Analysis(gkg=gkg, status=Status.NEW)
        self.session.add(analysis)
        self.session.commit()
        return analysis

    def _add_fact(self, analysis, locations):
        """Commit a 'displaced person' Fact with ``locations`` on ``analysis``."""
        fact = Fact(unit='person', term='displaced')
        self.session.add(fact)
        self.session.commit()
        for location in locations:
            fact.locations.append(location)
        analysis.facts.append(fact)
        self.session.commit()
        return fact

    def _location(self, name):
        """Return the stored Location called ``name``, or None if absent."""
        return self.session.query(Location).filter(
            Location.location_name == name).one_or_none()

    def test_sets_no_results_flag(self):
        """Sets no-results flag if nothing found"""
        results = get_geo_info("xghijdshfkljdes")
        self.assertEqual(results['flag'], "no-results")

    def test_returns_detail_for_places(self):
        """Returns sufficient level of detail for results"""
        results = get_geo_info("Paris")
        self.assertNotEqual(results['country_code'], '')
        self.assertNotEqual(results['coordinates'], '')
        self.assertNotEqual(results['type'], '')

    def test_accuracy(self):
        """Returns the expected country code and coordinates for Beijing"""
        results = get_geo_info("Beijing")
        self.assertEqual(results['country_code'], 'CHN')
        self.assertEqual(results['coordinates'], "39.9059631,116.391248")

    def test_country_code(self):
        """Returns the three-letter country code for places in different countries"""
        results = get_geo_info("Bidibidi")
        self.assertEqual(results['country_code'], 'UGA')
        results = get_geo_info("Marrakech")
        self.assertEqual(results['country_code'], 'MAR')
        results = get_geo_info("Fairfax County")
        self.assertEqual(results['country_code'], 'USA')

    def test_location_types(self):
        """Correctly distinguishes between Countries, Cities and Subdivisions"""
        results = get_geo_info("London")
        self.assertEqual(results['type'], LocationType.CITY)
        results = get_geo_info("India")
        self.assertEqual(results['type'], LocationType.COUNTRY)
        results = get_geo_info("Alaska")
        self.assertEqual(results['type'], LocationType.SUBDIVISION)

    # Geotagging must not run if location detail already exists.
    # NOTE(review): the method name lacks the ``test`` prefix, so the default
    # unittest loader does not collect it -- confirm whether it was disabled
    # on purpose before renaming it.
    @mock.patch('idetect.geotagger.nominatim_coordinates')
    def dont_geotag_if_detail_exists(self, nominatim):
        """Expects no nominatim lookup when the fact's location is already stored"""
        analysis = self._add_analysis()
        content = DocumentContent(
            content_clean=
            "It was early Saturday when a flash flood hit large parts of India and Pakistan and washed away more than 500 houses"
        )
        self.session.add(content)
        self.session.commit()
        analysis.content_id = content.id
        self.session.commit()
        self._add_fact(analysis, [self._location('India')])
        process_locations(analysis)
        # assertFalse instead of a bare assert, which is stripped under -O
        self.assertFalse(nominatim.called)

    def test_create_duplicate_fact(self):
        """Creates duplicate fact if locations from multiple countries exist"""
        analysis = self._add_analysis()
        self._add_fact(
            analysis,
            [self._location('India'), self._location('Pakistan')])
        self.assertEqual(1, len(analysis.facts))
        process_locations(analysis)
        self.assertEqual(2, len(analysis.facts))
        fact_countries = [f.iso3 for f in analysis.facts]
        self.assertIn('IND', fact_countries)
        self.assertIn('PAK', fact_countries)
        self.assertEqual(1, len(analysis.facts[0].locations))
        self.assertEqual(1, len(analysis.facts[1].locations))

    @mock.patch('idetect.geotagger.nominatim_coordinates')
    def test_fail_if_geotagging_fails(self, nominatim):
        """Location processing should fail if geotagging fails"""
        nominatim.side_effect = GeotagException()
        analysis = self._add_analysis()
        # a brand-new Location, so it has to be geotagged
        self._add_fact(analysis, [Location(location_name="Ruislip")])
        with self.assertRaises(GeotagException):
            process_locations(analysis)
Ejemplo n.º 5
0
class TestFactExtractor(TestCase):
    """Tests for ``extract_facts`` against the ``idetect_test`` database.

    Countries and terms are loaded before every test.
    """

    def setUp(self):
        """Recreate the schema, open a session and load reference data."""
        db_host = os.environ.get('DB_HOST')
        db_url = 'postgresql://{user}:{passwd}@{db_host}/{db}'.format(
            user='******', passwd='tester', db_host=db_host, db='idetect_test')
        engine = create_engine(db_url)
        # Registered cleanups run after tearDown (last registered first) and
        # also when a later setUp step raises, so connections are released.
        self.addCleanup(engine.dispose)
        Session.configure(bind=engine)
        Base.metadata.drop_all(engine)
        Base.metadata.create_all(engine)
        self.session = Session()
        self.addCleanup(self.session.close)
        load_countries(self.session)
        load_terms(self.session)

    def tearDown(self):
        """Discard pending changes and delete every Gkg row."""
        self.session.rollback()
        for article in self.session.query(Gkg).all():
            self.session.delete(article)
        self.session.commit()

    def _extract_from(self, text):
        """Store ``text`` as the clean content of a new Analysis and extract facts.

        Returns:
            The Analysis after ``extract_facts`` has been run on it.
        """
        gkg = Gkg()
        analysis = Analysis(gkg=gkg, status=Status.NEW)
        self.session.add(analysis)
        content = DocumentContent(content_clean=text)
        self.session.add(content)
        # commit first so that content.id is available for the link below
        self.session.commit()
        analysis.content_id = content.id
        self.session.commit()
        extract_facts(analysis)
        return analysis

    def test_extract_facts_simple(self):
        """Extracts simple facts when present and saves to DB"""
        analysis = self._extract_from(
            "It was early Saturday when a flash flood hit the area and washed away more than 500 houses"
        )
        self.assertEqual(1, len(analysis.facts))

    def test_extract_refugee_facts(self):
        """Extracts refugee-related facts with Refugee Term"""
        analysis = self._extract_from(
            "It was early Saturday when government troops entered the area and forced more than 20000 refugees to flee."
        )
        self.assertEqual(FactTerm.REFUGEE, analysis.facts[0].term)

    def test_extract_evicted_facts(self):
        """Extracts eviction-related facts with eviction Term"""
        analysis = self._extract_from(
            "2000 people have been evicted from their homes in Bosnia")
        self.assertEqual(FactTerm.EVICTED, analysis.facts[0].term)

    def test_extract_eviction_facts(self):
        """Extracts eviction-related facts with eviction Term"""
        analysis = self._extract_from(
            "ordered eviction for 2000 people from their homes in Bosnia")
        self.assertEqual(FactTerm.EVICTED, analysis.facts[0].term)

    def test_extract_forced_eviction_facts(self):
        """Extracts eviction-related facts with eviction Term"""
        analysis = self._extract_from(
            "ordered forced eviction for 2000 people from their homes in Bosnia"
        )
        self.assertEqual(FactTerm.EVICTED, analysis.facts[0].term)

    def test_extract_forcibly_evicted_facts(self):
        """Extracts eviction-related facts with eviction Term"""
        analysis = self._extract_from(
            "2000 people were forcibly evicted from their homes in Bosnia")
        self.assertEqual(FactTerm.EVICTED, analysis.facts[0].term)

    def test_extract_sacked_facts(self):
        """Extracts sacked-related facts with sacked Term"""
        analysis = self._extract_from(
            "last week 2000 people have been sacked from their homes in Nigeria"
        )
        self.assertEqual(FactTerm.SACKED, analysis.facts[0].term)

    def test_create_locations_with_names(self):
        """Creates locations for facts only with location names"""
        analysis = self._extract_from(
            "It was early Saturday when a flash flood hit large parts of London and Middlesex and washed away more than 500 houses"
        )
        facts = analysis.facts
        self.assertEqual(1, len(facts))
        fact = facts[0]
        self.assertEqual(2, len(fact.locations))
        loc_names = [loc.location_name for loc in fact.locations]
        self.assertIn('London', loc_names)
        self.assertIn('Middlesex', loc_names)
        # the new locations carry a name only, no country yet
        self.assertEqual([None, None], [loc.country for loc in fact.locations])

    def test_use_existing_location(self):
        """Uses existing locations when they exist"""
        location = Location(location_name='Bosnia')
        self.session.add(location)
        self.session.commit()
        analysis = self._extract_from(
            "It was early Saturday when a flash flood hit large parts of Bosnia and washed away more than 500 houses"
        )
        fact = analysis.facts[0]
        extracted_location = fact.locations[0]
        self.assertEqual(location.id, extracted_location.id)
Ejemplo n.º 6
0
class TestModel(TestCase):
    """Tests for Analysis versioning, status counts and country terms.

    Runs against the ``idetect_test`` database with two sample Gkg rows.
    """

    def setUp(self):
        """Recreate the schema, open a session and insert the sample Gkgs."""
        db_host = os.environ.get('DB_HOST')
        db_url = 'postgresql://{user}:{passwd}@{db_host}/{db}'.format(
            user='******', passwd='tester', db_host=db_host, db='idetect_test')
        engine = create_engine(db_url)
        # Registered cleanups run after tearDown (last registered first) and
        # also when a later setUp step raises, so connections are released.
        self.addCleanup(engine.dispose)
        Session.configure(bind=engine)
        Base.metadata.drop_all(engine)
        Base.metadata.create_all(engine)
        self.session = Session()
        self.addCleanup(self.session.close)
        self.sample_data()

    def sample_data(self):
        """Insert and commit two Gkg rows used by the tests."""
        gkg1 = Gkg(
            id=3771256,
            gkgrecordid="20170215174500-2503",
            date=20170215174500,
            document_identifier=
            "http://www.philstar.com/headlines/2017/02/16/1672746/yasay-harris-affirm-stronger-phl-us-ties"
        )
        self.session.add(gkg1)

        gkg2 = Gkg(
            id=3771257,
            gkgrecordid="20170215174500-1536",
            date=20170215174500,
            document_identifier=
            "http://wynkcountry.iheart.com/onair/cmt-cody-alan-54719/thomas-rhett-and-lauren-akins-are-15565244/"
        )
        self.session.add(gkg2)
        self.session.commit()

    def tearDown(self):
        """Discard pending changes and delete every Gkg row."""
        self.session.rollback()
        for gkg in self.session.query(Gkg).all():
            self.session.delete(gkg)
        self.session.commit()

    def _assert_history_counts(self, history, total, per_status):
        """Assert the size of ``history`` and its number of rows per status.

        Args:
            history: AnalysisHistory query to inspect.
            total: expected number of rows in ``history``.
            per_status: mapping of Status to the expected number of rows
                having that status.
        """
        self.assertEqual(total, history.count())
        for status, expected in per_status.items():
            self.assertEqual(
                expected,
                history.filter(AnalysisHistory.status == status).count())

    def test_status_update(self):
        """Versioning a stale Analysis is expected to raise NotLatestException."""
        gkg = self.session.query(Gkg).first()
        analysis = Analysis(gkg=gkg, status=Status.NEW)
        self.session.add(analysis)
        self.session.commit()

        analysis.create_new_version(Status.SCRAPING)
        self.assertEqual(analysis.status, Status.SCRAPING)

        # meanwhile, some other process changed the status of this...
        session2 = Session()
        try:
            other = session2.query(Analysis).get(analysis.gkg_id)
            other.create_new_version(Status.SCRAPING_FAILED)
        finally:
            session2.rollback()
            # release the second session's connection as well
            session2.close()

        with self.assertRaises(NotLatestException):
            analysis.create_new_version(Status.SCRAPED)

    def test_version_lifecycle(self):
        """Each new version leaves the previous state in AnalysisHistory."""
        gkg = self.session.query(Gkg).first()
        analysis = Analysis(gkg=gkg, status=Status.NEW)
        self.session.add(analysis)
        self.session.commit()

        analysis.create_new_version(Status.SCRAPING)

        history = self.session.query(AnalysisHistory).filter(
            AnalysisHistory.gkg == gkg)
        self._assert_history_counts(history, 1, {Status.NEW: 1})

        content = DocumentContent(content_type="text/html",
                                  content="Lorem ipsum")
        analysis.content = content
        analysis.create_new_version(Status.SCRAPED)

        self._assert_history_counts(history, 2, {
            Status.NEW: 1,
            Status.SCRAPING: 1,
        })

        analysis.create_new_version(Status.EXTRACTING)

        self._assert_history_counts(history, 3, {
            Status.NEW: 1,
            Status.SCRAPING: 1,
            Status.SCRAPED: 1,
        })

        # content is preserved
        scraped = history.filter(
            AnalysisHistory.status == Status.SCRAPED).one_or_none()
        self.assertEqual(analysis.content, scraped.content)

        fact = Fact(analysis_date=datetime.now())
        analysis.facts = [fact]
        analysis.create_new_version(Status.EXTRACTED)

        self._assert_history_counts(history, 4, {
            Status.NEW: 1,
            Status.SCRAPING: 1,
            Status.SCRAPED: 1,
            Status.EXTRACTING: 1,
        })

        # content still preserved
        extracting = history.filter(
            AnalysisHistory.status == Status.EXTRACTING).one_or_none()
        self.assertEqual(analysis.content, extracting.content)

        analysis.create_new_version(Status.EDITING)
        analysis.content = DocumentContent(content_type="text/html",
                                           content="Lorem edited")
        analysis.create_new_version(Status.EDITED)

        self._assert_history_counts(history, 6, {
            Status.NEW: 1,
            Status.SCRAPING: 1,
            Status.SCRAPED: 1,
            Status.EXTRACTING: 1,
            Status.EXTRACTED: 1,
            Status.EDITING: 1,
        })

        # content has changed, but facts are preserved
        extracted = history.filter(
            AnalysisHistory.status == Status.EXTRACTED).one_or_none()
        self.assertNotEqual(analysis.content.id, extracted.content.id)
        self.assertCountEqual([f.id for f in analysis.facts],
                              [f.id for f in extracted.facts])

        analysis.create_new_version(Status.EDITING)
        fact2 = Fact(analysis_date=datetime.now())
        analysis.facts.append(fact2)
        analysis.create_new_version(Status.EDITED)

        self._assert_history_counts(history, 8, {
            Status.NEW: 1,
            Status.SCRAPING: 1,
            Status.SCRAPED: 1,
            Status.EXTRACTING: 1,
            Status.EXTRACTED: 1,
            Status.EDITING: 2,
            Status.EDITED: 1,
        })

        # the historic EDITED version keeps only the first fact
        edited = history.filter(
            AnalysisHistory.status == Status.EDITED).one_or_none()
        self.assertCountEqual([f.id for f in analysis.facts],
                              [fact.id, fact2.id])
        self.assertCountEqual([f.id for f in edited.facts], [fact.id])

    def test_status_counts(self):
        """status_counts reports the number of Analyses per current status."""
        gkgs = self.session.query(Gkg).all()[:2]
        analysis1 = Analysis(gkg=gkgs[0], status=Status.NEW)
        self.session.add(analysis1)
        self.session.commit()

        self.assertEqual(Analysis.status_counts(self.session), {Status.NEW: 1})

        analysis1.create_new_version(Status.SCRAPING)

        self.assertEqual(Analysis.status_counts(self.session),
                         {Status.SCRAPING: 1})

        analysis2 = Analysis(gkg=gkgs[1], status=Status.NEW)
        self.session.add(analysis2)
        self.session.commit()

        self.assertEqual(Analysis.status_counts(self.session), {
            Status.NEW: 1,
            Status.SCRAPING: 1
        })

        analysis2.create_new_version(Status.SCRAPING)

        self.assertEqual(Analysis.status_counts(self.session),
                         {Status.SCRAPING: 2})

        analysis2.create_new_version(Status.SCRAPED)

        self.assertEqual(Analysis.status_counts(self.session), {
            Status.SCRAPED: 1,
            Status.SCRAPING: 1
        })

    def test_country_term(self):
        """Terms and locations created for one Country share that Country."""
        mmr = Country(iso3="MMR", preferred_term="Myanmar")
        myanmar = CountryTerm(term="Myanmar", country=mmr)
        burma = CountryTerm(term="Burma", country=mmr)
        yangon = Location(location_name="Yangon",
                          location_type=LocationType.CITY,
                          country=mmr,
                          latlong="16°51′N 96°11′E")

        self.assertEqual(yangon.country, myanmar.country)
        self.assertEqual(yangon.country, burma.country)
Ejemplo n.º 7
0
engine = create_engine(db_url())
Session.configure(bind=engine)

# create the DB schema, if it doesn't already exist
Base.metadata.create_all(engine)

if __name__ == "__main__":
    # Import every row of the input CSV whose document identifier starts with
    # 'http' as a web Document, committing row by row so that one bad row does
    # not prevent the others from being stored.
    session = Session()
    try:
        with open('/home/idetect/data/input_urls.csv',
                  newline='',
                  encoding='utf-8') as f:
            processed = 0
            for row in csv.reader(f):
                (url_id, gkgrecordid, date, source_common_name,
                 document_identifier, locations, v2_counts, v2_themes) = row
                if not document_identifier.startswith('http'):
                    continue
                try:
                    article = Document(legacy_id=url_id,
                                       url=document_identifier,
                                       name="New Document",
                                       type=DocumentType.WEB)
                    session.add(article)
                    session.commit()
                except Exception as exc:
                    # Roll back so the session is usable for the next row;
                    # without it every later commit fails as well. Report the
                    # row instead of swallowing the error silently.
                    session.rollback()
                    print("Skipping {}: {}".format(document_identifier, exc))
                # progress counts attempted rows, including failed ones
                processed += 1
                if processed % 10 == 0:
                    print("{} {}".format(processed, document_identifier))
    finally:
        session.close()