def work(self):
    """
    Look for Documents in the given session for which no Analysis exists,
    and create an Analysis with Status.NEW for each.
    Return True iff some Analyses were created.
    """
    # start a new session for each job
    session = Session()
    try:
        # Get Documents
        # ... for which no Analysis exists
        # ... and lock them for updates
        # ... sorted by created date
        # ... picking the oldest (up to 1000)
        gkgs = session.query(Gkg) \
            .filter(~session.query(Analysis).filter(Gkg.id == Analysis.gkg_id).exists()) \
            .with_for_update() \
            .order_by(Gkg.date) \
            .limit(1000).all()
        if len(gkgs) == 0:
            return False  # no work to be done
        for gkg in gkgs:
            analysis = Analysis(gkg=gkg, status=Status.NEW)
            session.add(analysis)
            session.commit()
            logger.info(
                "Worker {} created Analysis {} in status {}".format(
                    os.getpid(), analysis.gkg_id, analysis.status))
    finally:
        # make sure to release a FOR UPDATE lock, if we got one
        if session is not None:
            session.rollback()
            session.close()
    return True
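# Usage sketch (hypothetical, not part of the original module): work() returns True
# when it created new Analyses and False when there was nothing to claim, so a
# simple driver can poll it and back off when idle. Assumes `worker` is an instance
# exposing the work() method above; the real scheduling code is not shown here.
import time

def run_polling_loop(worker, idle_sleep_seconds=5):
    """Call worker.work() repeatedly, sleeping briefly whenever no work was found."""
    while True:
        if not worker.work():
            time.sleep(idle_sleep_seconds)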
class TestScraper(TestCase):
    def setUp(self):
        db_host = os.environ.get('DB_HOST')
        db_url = 'postgresql://{user}:{passwd}@{db_host}/{db}'.format(
            user='******', passwd='tester', db_host=db_host, db='idetect_test')
        engine = create_engine(db_url)
        Session.configure(bind=engine)
        Base.metadata.drop_all(engine)
        Base.metadata.create_all(engine)
        self.session = Session()

    def tearDown(self):
        self.session.rollback()
        for gkg in self.session.query(Gkg).all():
            self.session.delete(gkg)
        self.session.commit()

    def test_scrape_html(self):
        gkg = Gkg(
            document_identifier="http://www.cnn.com/2013/08/23/us/hurricane-katrina-statistics-fast-facts/index.html")
        analysis = Analysis(gkg=gkg, status=Status.NEW)
        self.session.add(analysis)
        self.session.commit()
        scrape(analysis)
        content = analysis.content
        self.assertEqual("text", content.content_type)
        self.assertTrue("Katrina" in content.content_clean)
        self.assertTrue("Louisiana" in content.content_clean)
        self.assertTrue("\n" not in content.content_clean)
        self.assertTrue(content.content_ts is not None)
        matches = (
            self.session.query(DocumentContent)
            .filter(DocumentContent.content_ts.match('Katrina & Louisiana')).all()
        )
        self.assertIn(content, matches)

    def test_scrape_pdf(self):
        gkg = Gkg(
            document_identifier="https://www1.ncdc.noaa.gov/pub/data/extremeevents/specialreports/Hurricane-Katrina.pdf")
        analysis = Analysis(gkg=gkg, status=Status.NEW)
        self.session.add(analysis)
        self.session.commit()
        scrape(analysis)
        content = analysis.content
        self.assertEqual("pdf", content.content_type)
        self.assertTrue("Katrina" in content.content)
        self.assertTrue("Louisiana" in content.content)
        self.assertTrue("\n" not in content.content)
def work(self):
    """
    Look for analyses in the given session and run function on them
    if any are found, managing status appropriately.
    Return True iff some Analyses were processed (successfully or not)
    """
    # start a new session for each job
    session = Session()
    try:
        # Get an analysis
        # ... and lock it for updates
        # ... that meets the conditions specified in the filter function
        # ... sort by updated date
        # ... pick the first (oldest)
        analysis = self.filter_function(session.query(Analysis)) \
            .with_for_update() \
            .order_by(Analysis.updated) \
            .first()
        if analysis is None:
            return False  # no work to be done
        analysis_status = analysis.status
        analysis.create_new_version(self.working_status)
        logger.info("Worker {} claimed Analysis {} in status {}".format(
            os.getpid(), analysis.gkg_id, analysis_status))
    finally:
        # make sure to release a FOR UPDATE lock, if we got one
        session.rollback()

    start = time.time()
    try:
        # set a timeout so if this worker stalls, we recover
        signal.alarm(self.timeout_seconds)
        # actually run the work function on this analysis
        self.function(analysis)
        delta = time.time() - start
        logger.info("Worker {} processed Analysis {} {} -> {} {}s".format(
            os.getpid(), analysis.gkg_id, analysis_status,
            self.success_status, delta))
        analysis.error_msg = None
        analysis.processing_time = delta
        analysis.create_new_version(self.success_status)
    except Exception as e:
        delta = time.time() - start
        logger.warning(
            "Worker {} failed to process Analysis {} {} -> {}".format(
                os.getpid(), analysis.gkg_id, analysis_status,
                self.failure_status),
            exc_info=e)
        analysis.error_msg = str(e)
        analysis.processing_time = delta
        analysis.create_new_version(self.failure_status)
        session.commit()
    finally:
        # clear the timeout
        signal.alarm(0)
        if session is not None:
            session.rollback()
            session.close()
    return True
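# Note on the timeout above (illustrative sketch, not taken from the original file):
# signal.alarm() only interrupts a stalled work function if a SIGALRM handler has
# been installed that raises, which the except branch then records as a failure.
# A minimal handler registration might look like this, assuming TimeoutError is an
# acceptable exception type:
import signal

def _alarm_handler(signum, frame):
    raise TimeoutError("worker exceeded its timeout")

signal.signal(signal.SIGALRM, _alarm_handler)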
def homepage():
    session = Session()
    try:
        articles = session.query(Analysis).order_by(desc(
            Analysis.updated)).limit(10).all()
        counts = Analysis.status_counts(session)
        cat_counts = Analysis.category_counts(session)
        return render_template('index.html',
                               articles=articles,
                               counts=counts,
                               cat_counts=cat_counts)
    finally:
        session.close()
def article(doc_id):
    session = Session()
    try:
        analysis = session.query(Analysis) \
            .filter(Analysis.gkg_id == doc_id).one()
        coords = {
            tuple(l.latlong.split(","))
            for f in analysis.facts
            for l in f.locations if l.latlong is not None
        }
        return render_template('article.html',
                               article=analysis,
                               coords=list(coords))
    finally:
        session.close()
def search_url():
    url = request.args.get('url')
    if url is None:
        return json.dumps({'success': False}), 422, {
            'ContentType': 'application/json'
        }
    session = Session()
    try:
        gkg = session.query(Gkg).filter(
            Gkg.document_identifier.like("%" + url + "%")).order_by(
                Gkg.date.desc()).first()
        if gkg:
            resp = jsonify({'doc_id': gkg.id})
            resp.status_code = 200
            return resp
        else:
            return json.dumps({'success': False}), 422, {
                'ContentType': 'application/json'
            }
    finally:
        session.close()
class TestSyriaYear(TestCase):
    syria_location_ids = [
        127, 270, 281, 284, 307, 332, 372, 412, 429, 431, 531, 591, 612, 618,
        644, 671, 764, 807, 905, 958, 996, 1018, 1019, 1188, 1190, 1212, 1352,
        1357, 1524, 1678, 1898, 1981, 1990, 2058, 2060, 2272, 2378, 2735, 2933,
        3262, 3323, 3327, 3372, 3391, 3404, 3660, 3708, 3725, 3834, 3915, 3924,
        4069, 4172, 4399, 4509, 4648, 4824, 4890, 5017, 5285, 5833, 6053, 6070,
        6270, 6760, 6832, 7121, 7122, 7151, 7222, 7244, 7248, 7641, 7723, 7749,
        7757, 7827, 7919, 7970, 8078, 8107, 8131, 8166, 8176, 8210, 8222, 8240,
        8254, 8367, 8442, 8659, 8660, 8730, 8788, 8793, 8941, 9045, 9167, 9285,
        9370, 9531, 9606, 9775, 9909, 9913, 9916, 9917, 9933, 10136, 10312,
        10464, 10532, 10795, 10971, 11052, 11076, 11174, 11194, 11216, 11250,
        11311, 11501, 11703, 11727, 11916, 11933, 12242, 12387, 12990, 13126,
        13130, 13142, 13171, 13348, 13531, 13659, 13722, 14225, 14718, 14732,
        14737, 14917, 14930, 14988, 15215, 15257, 15984, 15993, 16188, 17034,
        17090, 17373, 17404, 17873, 18019, 18131, 18267, 18396, 18403, 18578,
        19550, 19641, 19721, 20180, 21339, 21894, 22003, 22022, 22162, 22201,
        22850, 23189, 23414, 23532, 23875, 24851, 25171, 25415, 25894, 25927,
        26024, 26283, 26458, 26545, 26909, 27027, 27393, 27507, 28185, 28626,
        28628, 29703, 29704, 29754, 29942, 30210, 30286, 30302, 30442, 30993,
        31492, 31743
    ]
    start_date = '2017-01-01'
    plus_1_yr = '2018-01-01'
    plus_6_mo = '2017-07-01'
    plus_3_mo = '2017-04-01'
    plus_1_mo = '2017-02-01'

    def setUp(self):
        logger.debug("setUp")
        worker_logger = logging.getLogger("idetect.worker")
        worker_logger.setLevel(logging.INFO)
        logger.debug("Connecting to DB")
        db_host = os.environ.get('DB_HOST')
        db_port = os.environ.get('DB_PORT', 5432)
        db_user = os.environ.get('DB_USER', 'tester')
        db_pass = os.environ.get('DB_PASSWORD', 'tester')
        db_url = 'postgresql://{user}:{passwd}@{db_host}:{db_port}/{db}'.format(
            user=db_user, passwd=db_pass, db_host=db_host, db_port=db_port,
            db='idetect')
        self.engine = create_engine(db_url, echo=False)
        Session.configure(bind=self.engine)
        self.session = Session()
        self.session.query(FactApi).count()
        logger.debug("setUp complete")

    def tearDown(self):
        logger.debug("tearDown")
        self.session.rollback()
        logger.debug("sessions rolled back")
        logger.debug("tearDown complete")

    def test_categories(self):
        syr_year_by_category = add_filters(
            self.session.query(func.count(FactApi.fact), FactApi.category),
            fromdate=self.start_date,
            todate=self.plus_1_yr,
            location_ids=self.syria_location_ids).group_by(FactApi.category)
        t0 = time.time()
        result = {
            category: count
            for count, category in syr_year_by_category.all()
        }
        t1 = time.time()
        # print(result)
        self.assertEqual(set(result.keys()),
                         {'Conflict', 'Disaster', 'Other'})
        # print(explain_text(self.session, syr_year_by_category))
        print(t1 - t0)
        self.assertLess(t1 - t0, 1.0)

    def test_filter_counts(self):
        f_c = get_filter_counts(self.session,
                                fromdate=self.start_date,
                                todate=self.plus_1_yr,
                                location_ids=self.syria_location_ids)
        # print(f_c)
        self.assertGreater(len(f_c), 1000)

    def test_filter_counts_speed(self):
        for end_date in (self.plus_1_mo, self.plus_3_mo, self.plus_6_mo):
            # Adding this usually fails: , self.plus_1_yr):
            t0 = time.time()
            f_c = get_filter_counts(self.session,
                                    fromdate=self.start_date,
                                    todate=end_date,
                                    location_ids=self.syria_location_ids)
            t1 = time.time()
            print('{} - {}: {}s'.format(self.start_date, end_date, t1 - t0))
            self.assertLess(
                t1 - t0, 1.0,
                'Calculating filter counts {} - {} took too long'.format(
                    self.start_date, end_date))

    def test_timeline(self):
        t0 = time.time()
        counts = get_timeline_counts(self.session,
                                     fromdate=self.start_date,
                                     todate=self.plus_1_yr,
                                     location_ids=self.syria_location_ids)
        t1 = time.time()
        days = {t['gdelt_day'] for t in counts}
        self.assertGreater(len(days), 180)
        categories = {t['category'] for t in counts}
        self.assertEqual(categories, {'Conflict', 'Disaster', 'Other'})
        print(t1 - t0)
        self.assertLess(
            t1 - t0, 1.0,
            'Calculating timeline counts {} - {} took too long'.format(
                self.start_date, self.plus_1_yr))

    def test_histogram(self):
        t0 = time.time()
        counts = get_histogram_counts(self.session,
                                      fromdate=self.start_date,
                                      todate=self.plus_1_yr,
                                      location_ids=self.syria_location_ids)
        t1 = time.time()
        print(len(counts))
        figures = {
            t['specific_reported_figure']
            for t in counts if t['specific_reported_figure']
        }
        self.assertLess(min(figures), 10)
        self.assertGreater(max(figures), 1000000)
        units = {t['unit'] for t in counts}
        self.assertEqual(units, {'Household', 'Person'})
        print(t1 - t0)
        self.assertLess(
            t1 - t0, 1.0,
            'Calculating histogram counts {} - {} took too long'.format(
                self.start_date, self.plus_1_yr))

    def test_wordcloud(self):
        t0 = time.time()
        terms = get_wordcloud(self.session,
                              self.engine,
                              fromdate=self.start_date,
                              todate=self.plus_1_yr,
                              location_ids=self.syria_location_ids)
        t1 = time.time()
        print(t1 - t0)
        print(len(terms))
        print(tabulate(terms))
        self.assertLess(
            t1 - t0, 5.0,
            'Calculating wordcloud {} - {} took too long'.format(
                self.start_date, self.plus_1_yr))

    def test_none_location(self):
        # TODO this isn't about Syria, move it somewhere else
        counts = get_filter_counts(self.session, location_ids=['NULL'])
        self.assertGreater(len(counts), 1000)
        self.assertEqual(
            counts, get_filter_counts(self.session, location_ids=['null']))
        self.assertEqual(
            counts, get_filter_counts(self.session, location_ids=[None]))
        counts2 = get_filter_counts(self.session, location_ids=['NULL', 1])
        self.assertGreater(len(counts2), len(counts))

    def test_none_specific_reported_figure_1(self):
        # TODO this isn't about Syria, move it somewhere else
        counts = get_filter_counts(self.session,
                                   specific_reported_figures=['NULL'])
        srf_counts = [
            c for c in counts
            if c['filter_type'] == 'specific_reported_figure'
        ]
        self.assertEqual(len(srf_counts), 1)
        self.assertTrue([c for c in srf_counts if c['value'] is None])
        self.assertEqual(
            counts,
            get_filter_counts(self.session,
                              specific_reported_figures=['null']))
        self.assertEqual(
            counts,
            get_filter_counts(self.session,
                              specific_reported_figures=[None]))

    def test_none_specific_reported_figure_2(self):
        # TODO this isn't about Syria, move it somewhere else
        counts = get_filter_counts(self.session,
                                   specific_reported_figures=['NULL', 1])
        srf_counts = [
            c for c in counts
            if c['filter_type'] == 'specific_reported_figure'
        ]
        self.assertEqual(len(srf_counts), 2)
        self.assertTrue([c for c in srf_counts if c['value'] is None])
        self.assertTrue([c for c in srf_counts if c['value'] == 1])

    def test_none_specific_reported_figure_3(self):
        # TODO this isn't about Syria, move it somewhere else
        counts = get_filter_counts(self.session,
                                   specific_reported_figures=['NULL', 1, 1000])
        srf_counts = [
            c for c in counts
            if c['filter_type'] == 'specific_reported_figure'
        ]
        self.assertGreater(len(srf_counts), 2)
        self.assertTrue([c for c in srf_counts if c['value'] is None])
        self.assertTrue([c for c in srf_counts if c['value'] == 1])

    def test_specific_reported_figure(self):
        # TODO this isn't about Syria, move it somewhere else
        counts = get_filter_counts(self.session,
                                   specific_reported_figures=[1, 1000])
        srf_counts = [
            c for c in counts
            if c['filter_type'] == 'specific_reported_figure'
        ]
        self.assertGreater(len(srf_counts), 2)
        self.assertFalse([c for c in srf_counts if c['value'] is None])
        self.assertTrue([c for c in srf_counts if c['value'] == 1])

    def test_filter_ts(self):
        t0 = time.time()
        query = add_filters(self.session.query(FactApi.content_id,
                                               DocumentContent.content_clean),
                            fromdate=self.start_date,
                            todate=self.plus_1_yr,
                            location_ids=self.syria_location_ids,
                            ts='Jordan').order_by(FactApi.gdelt_day).limit(32)
        results = query.all()
        t1 = time.time()
        print(t1 - t0)
        for id, content_clean in results:
            self.assertTrue('jordan' in content_clean.lower())

    @skip("Too slow in practice")
    def test_filter_ts_exhaustive(self):
        # make sure that the query found everything that it was supposed to
        t0 = time.time()
        query = add_filters(self.session.query(FactApi.content_id,
                                               DocumentContent.content_clean,
                                               FactApi.gdelt_day),
                            fromdate=self.start_date,
                            todate=self.plus_1_yr,
                            location_ids=self.syria_location_ids,
                            ts='Jordan').order_by(FactApi.gdelt_day).limit(32)
        results = query.all()
        t1 = time.time()
        print(t1 - t0)
        matched = set()
        max_day = date.min
        for id, content_clean, gdelt_day in results:
            self.assertTrue('jordan' in content_clean.lower())
            matched.add(id)
            max_day = max(max_day, gdelt_day)
        t2 = time.time()
        query = add_filters(self.session.query(FactApi.content_id,
                                               DocumentContent.content_clean),
                            fromdate=self.start_date,
                            todate=self.plus_1_yr,
                            location_ids=self.syria_location_ids).filter(
                                FactApi.gdelt_day <= max_day)
        results = query.all()
        t3 = time.time()
        print(t3 - t2)
        print(len(results))
        for id, content_clean in results:
            self.assertEqual(id in matched, 'jordan' in content_clean.lower())

    def test_urllist(self):
        t0 = time.time()
        result1 = get_urllist(self.session,
                              fromdate=self.start_date,
                              todate=self.plus_1_yr,
                              location_ids=self.syria_location_ids)
        t1 = time.time()
        print(t1 - t0)
        self.assertEqual(32, len(list(result1)))
        t2 = time.time()
        result2 = get_urllist(self.session,
                              offset=32,
                              limit=100,
                              fromdate=self.start_date,
                              todate=self.plus_1_yr,
                              location_ids=self.syria_location_ids)
        t3 = time.time()
        print(t3 - t2)
        self.assertEqual(100, len(list(result2)))
        for r1 in result1:
            # print(r1)
            for r2 in result2:
                self.assertLessEqual(
                    (r1['gdelt_day'], r1['gkg_id']),
                    (r2['gdelt_day'], r2['gkg_id']),
                )
        self.assertIn('display_color', result1[0])

    def test_urllist_unique_fact_id(self):
        t0 = time.time()
        result1 = get_urllist(self.session,
                              fromdate=self.start_date,
                              todate=self.plus_1_yr,
                              limit=1000000,
                              location_ids=self.syria_location_ids)
        t1 = time.time()
        print(t1 - t0)
        ids = [f['fact_id'] for f in result1]
        self.assertGreater(len(ids), 1000)
        self.assertEqual(len(ids), len(set(ids)))

    def test_urllist_ts(self):
        t0 = time.time()
        result1 = get_urllist(self.session,
                              fromdate=self.start_date,
                              todate=self.plus_1_yr,
                              location_ids=self.syria_location_ids,
                              ts='Jordan')
        t1 = time.time()
        print(t1 - t0)
        self.assertEqual(32, len(list(result1)))
        t2 = time.time()
        result2 = get_urllist(self.session,
                              offset=32,
                              limit=100,
                              fromdate=self.start_date,
                              todate=self.plus_1_yr,
                              location_ids=self.syria_location_ids,
                              ts='Jordan')
        t3 = time.time()
        print(t3 - t2)
        self.assertEqual(100, len(list(result2)))
        for r1 in result1:
            # print(r1)
            for r2 in result2:
                self.assertLessEqual(
                    (r1['gdelt_day'], r1['gkg_id']),
                    (r2['gdelt_day'], r2['gkg_id']),
                )

    def test_count_ts(self):
        t0 = time.time()
        c = get_count(self.session,
                      fromdate=self.start_date,
                      todate=self.plus_1_yr,
                      location_ids=self.syria_location_ids,
                      ts='Jordan')
        t1 = time.time()
        print(t1 - t0)
        self.assertGreater(c, 5000)
        self.assertLess(c, 10000)

    def test_urllist_grouped(self):
        t0 = time.time()
        result = get_urllist_grouped(self.session,
                                     fromdate=self.start_date,
                                     todate=self.plus_1_yr,
                                     location_ids=self.syria_location_ids,
                                     limit=100)
        t1 = time.time()
        print(t1 - t0)
        self.assertEqual(100, len(result))
        for entry in result:
            self.assertEqual(entry['nfacts'], len(entry['entry']))
            for fact in entry['entry']:
                self.assertEqual(entry['specific_reported_figure'],
                                 fact['specific_reported_figure'])
                self.assertEqual(entry['unit'], fact['unit'])
                self.assertEqual(entry['term'], fact['term'])
                self.assertGreater(len(fact['content_clean']), 0)
            fact_ids = [f['fact'] for f in entry['entry']]
            self.assertEqual(len(fact_ids), len(set(fact_ids)),
                             "fact repeated")
class TestGeoTagger(TestCase):
    def setUp(self):
        db_host = os.environ.get('DB_HOST')
        db_url = 'postgresql://{user}:{passwd}@{db_host}/{db}'.format(
            user='******', passwd='tester', db_host=db_host, db='idetect_test')
        engine = create_engine(db_url)
        Session.configure(bind=engine)
        Base.metadata.drop_all(engine)
        Base.metadata.create_all(engine)
        self.session = Session()
        load_countries(self.session)

    def tearDown(self):
        self.session.rollback()

    def test_sets_no_results_flag(self):
        """Sets no-results flag if nothing found"""
        results = get_geo_info("xghijdshfkljdes")
        self.assertEqual(results['flag'], "no-results")

    def test_returns_detail_for_places(self):
        """Returns sufficient level of detail for results"""
        results = get_geo_info("Paris")
        self.assertNotEqual(results['country_code'], '')
        self.assertNotEqual(results['coordinates'], '')
        self.assertNotEqual(results['type'], '')

    def test_accuracy(self):
        """Returns sufficient level of detail for results"""
        results = get_geo_info("Beijing")
        self.assertEqual(results['country_code'], 'CHN')
        self.assertEqual(results['coordinates'], "39.9059631,116.391248")

    def test_country_code(self):
        """Returns sufficient level of detail for results"""
        results = get_geo_info("Bidibidi")
        self.assertEqual(results['country_code'], 'UGA')
        results = get_geo_info("Marrakech")
        self.assertEqual(results['country_code'], 'MAR')
        results = get_geo_info("Fairfax County")
        self.assertEqual(results['country_code'], 'USA')

    def test_location_types(self):
        """Correctly distinguishes between Countries, Cities and Subdivisions"""
        results = get_geo_info("London")
        self.assertEqual(results['type'], LocationType.CITY)
        results = get_geo_info("India")
        self.assertEqual(results['type'], LocationType.COUNTRY)
        results = get_geo_info("Alaska")
        self.assertEqual(results['type'], LocationType.SUBDIVISION)

    # DON'T RUN geotagging if detail already exists
    @mock.patch('idetect.geotagger.nominatim_coordinates')
    def dont_geotag_if_detail_exists(self, nominatim):
        gkg = Gkg(
            id=3771256,
            gkgrecordid="20170215174500-2503",
            date=20170215174500,
            document_identifier="http://www.philstar.com/headlines/2017/02/16/1672746/yasay-harris-affirm-stronger-phl-us-ties"
        )
        self.session.add(gkg)
        analysis = Analysis(gkg=gkg, status=Status.NEW)
        self.session.add(analysis)
        content = DocumentContent(
            content_clean="It was early Saturday when a flash flood hit large parts of India and Pakistan and washed away more than 500 houses"
        )
        self.session.add(content)
        self.session.commit()
        analysis.content_id = content.id
        self.session.commit()
        fact = Fact(unit='person', term='displaced')
        self.session.add(fact)
        self.session.commit()
        loc1 = self.session.query(Location).filter(
            Location.location_name == 'India').one_or_none()
        fact.locations.append(loc1)
        analysis.facts.append(fact)
        self.session.commit()
        process_locations(analysis)
        assert not nominatim.called

    def test_create_duplicate_fact(self):
        """Creates duplicate fact if locations from multiple countries exist"""
        gkg = Gkg(
            id=3771256,
            gkgrecordid="20170215174500-2503",
            date=20170215174500,
            document_identifier="http://www.philstar.com/headlines/2017/02/16/1672746/yasay-harris-affirm-stronger-phl-us-ties"
        )
        self.session.add(gkg)
        analysis = Analysis(gkg=gkg, status=Status.NEW)
        self.session.add(analysis)
        self.session.commit()
        fact = Fact(unit='person', term='displaced')
        self.session.add(fact)
        self.session.commit()
        loc1 = self.session.query(Location).filter(
            Location.location_name == 'India').one_or_none()
        loc2 = self.session.query(Location).filter(
            Location.location_name == 'Pakistan').one_or_none()
        fact.locations.append(loc1)
        fact.locations.append(loc2)
        analysis.facts.append(fact)
        self.session.commit()
        self.assertEqual(1, len(analysis.facts))
        process_locations(analysis)
        self.assertEqual(2, len(analysis.facts))
        fact_countries = [f.iso3 for f in analysis.facts]
        self.assertIn('IND', fact_countries)
        self.assertIn('PAK', fact_countries)
        self.assertEqual(1, len(analysis.facts[0].locations))
        self.assertEqual(1, len(analysis.facts[1].locations))

    @mock.patch('idetect.geotagger.nominatim_coordinates')
    def test_fail_if_geotagging_fails(self, nominatim):
        """Location processing should fail if geotagging fails"""
        nominatim.side_effect = GeotagException()
        gkg = Gkg(
            id=3771256,
            gkgrecordid="20170215174500-2503",
            date=20170215174500,
            document_identifier="http://www.philstar.com/headlines/2017/02/16/1672746/yasay-harris-affirm-stronger-phl-us-ties"
        )
        self.session.add(gkg)
        analysis = Analysis(gkg=gkg, status=Status.NEW)
        self.session.add(analysis)
        self.session.commit()
        fact = Fact(unit='person', term='displaced')
        self.session.add(fact)
        self.session.commit()
        loc1 = Location(location_name="Ruislip")
        fact.locations.append(loc1)
        analysis.facts.append(fact)
        self.session.commit()
        with self.assertRaises(GeotagException):
            process_locations(analysis)
class TestFactExtractor(TestCase):
    def setUp(self):
        db_host = os.environ.get('DB_HOST')
        db_url = 'postgresql://{user}:{passwd}@{db_host}/{db}'.format(
            user='******', passwd='tester', db_host=db_host, db='idetect_test')
        engine = create_engine(db_url)
        Session.configure(bind=engine)
        Base.metadata.drop_all(engine)
        Base.metadata.create_all(engine)
        self.session = Session()
        load_countries(self.session)
        load_terms(self.session)

    def tearDown(self):
        self.session.rollback()
        for article in self.session.query(Gkg).all():
            self.session.delete(article)
        self.session.commit()

    def test_extract_facts_simple(self):
        """Extracts simple facts when present and saves to DB"""
        gkg = Gkg()
        analysis = Analysis(gkg=gkg, status=Status.NEW)
        self.session.add(analysis)
        content = DocumentContent(
            content_clean="It was early Saturday when a flash flood hit the area and washed away more than 500 houses"
        )
        self.session.add(content)
        self.session.commit()
        analysis.content_id = content.id
        self.session.commit()
        extract_facts(analysis)
        self.assertEqual(1, len(analysis.facts))

    def test_extract_refugee_facts(self):
        """Extracts refugee-related facts with Refugee Term"""
        gkg = Gkg()
        analysis = Analysis(gkg=gkg, status=Status.NEW)
        self.session.add(analysis)
        content = DocumentContent(
            content_clean="It was early Saturday when government troops entered the area and forced more than 20000 refugees to flee."
        )
        self.session.add(content)
        self.session.commit()
        analysis.content_id = content.id
        self.session.commit()
        extract_facts(analysis)
        self.assertEqual(FactTerm.REFUGEE, analysis.facts[0].term)

    def test_extract_evicted_facts(self):
        """Extracts eviction-related facts with eviction Term"""
        gkg = Gkg()
        analysis = Analysis(gkg=gkg, status=Status.NEW)
        self.session.add(analysis)
        content = DocumentContent(
            content_clean="2000 people have been evicted from their homes in Bosnia")
        self.session.add(content)
        self.session.commit()
        analysis.content_id = content.id
        self.session.commit()
        extract_facts(analysis)
        self.assertEqual(FactTerm.EVICTED, analysis.facts[0].term)

    def test_extract_eviction_facts(self):
        """Extracts eviction-related facts with eviction Term"""
        gkg = Gkg()
        analysis = Analysis(gkg=gkg, status=Status.NEW)
        self.session.add(analysis)
        content = DocumentContent(
            content_clean="ordered eviction for 2000 people from their homes in Bosnia")
        self.session.add(content)
        self.session.commit()
        analysis.content_id = content.id
        self.session.commit()
        extract_facts(analysis)
        self.assertEqual(FactTerm.EVICTED, analysis.facts[0].term)

    def test_extract_forced_eviction_facts(self):
        """Extracts eviction-related facts with eviction Term"""
        gkg = Gkg()
        analysis = Analysis(gkg=gkg, status=Status.NEW)
        self.session.add(analysis)
        content = DocumentContent(
            content_clean="ordered forced eviction for 2000 people from their homes in Bosnia"
        )
        self.session.add(content)
        self.session.commit()
        analysis.content_id = content.id
        self.session.commit()
        extract_facts(analysis)
        self.assertEqual(FactTerm.EVICTED, analysis.facts[0].term)

    def test_extract_forcibly_evicted_facts(self):
        """Extracts eviction-related facts with eviction Term"""
        gkg = Gkg()
        analysis = Analysis(gkg=gkg, status=Status.NEW)
        self.session.add(analysis)
        content = DocumentContent(
            content_clean="2000 people were forcibly evicted from their homes in Bosnia")
        self.session.add(content)
        self.session.commit()
        analysis.content_id = content.id
        self.session.commit()
        extract_facts(analysis)
        self.assertEqual(FactTerm.EVICTED, analysis.facts[0].term)

    def test_extract_sacked_facts(self):
        """Extracts sacked-related facts with sacked Term"""
        gkg = Gkg()
        analysis = Analysis(gkg=gkg, status=Status.NEW)
        self.session.add(analysis)
        content = DocumentContent(
            content_clean="last week 2000 people have been sacked from their homes in Nigeria"
        )
        self.session.add(content)
        self.session.commit()
        analysis.content_id = content.id
        self.session.commit()
        extract_facts(analysis)
        self.assertEqual(FactTerm.SACKED, analysis.facts[0].term)

    def test_create_locations_with_names(self):
        """Creates locations for facts only with location names"""
        gkg = Gkg()
        analysis = Analysis(gkg=gkg, status=Status.NEW)
        self.session.add(analysis)
        content = DocumentContent(
            content_clean="It was early Saturday when a flash flood hit large parts of London and Middlesex and washed away more than 500 houses"
        )
        self.session.add(content)
        self.session.commit()
        analysis.content_id = content.id
        self.session.commit()
        extract_facts(analysis)
        facts = analysis.facts
        self.assertEqual(1, len(facts))
        fact = facts[0]
        self.assertEqual(2, len(fact.locations))
        loc_names = [loc.location_name for loc in fact.locations]
        self.assertIn('London', loc_names)
        self.assertIn('Middlesex', loc_names)
        self.assertEqual([None, None],
                         [loc.country for loc in fact.locations])

    def test_use_existing_location(self):
        """Uses existing locations when they exist"""
        gkg = Gkg()
        analysis = Analysis(gkg=gkg, status=Status.NEW)
        self.session.add(analysis)
        content = DocumentContent(
            content_clean="It was early Saturday when a flash flood hit large parts of Bosnia and washed away more than 500 houses"
        )
        self.session.add(content)
        location = Location(location_name='Bosnia')
        self.session.add(location)
        self.session.commit()
        analysis.content_id = content.id
        self.session.commit()
        extract_facts(analysis)
        fact = analysis.facts[0]
        extracted_location = fact.locations[0]
        self.assertEqual(location.id, extracted_location.id)
import string

import numpy as np
import pandas as pd

from idetect.nlp_models.category import *
from idetect.nlp_models.relevance import *
from idetect.nlp_models.base_model import CustomSklLsiModel

# NOTE: this script also relies on create_engine, db_url, Session, Base, Country,
# FactKeyword, load_countries and load_terms being importable from their usual
# modules; those imports are not shown here.

if __name__ == "__main__":
    # Create the Database
    engine = create_engine(db_url())
    Session.configure(bind=engine)
    Base.metadata.create_all(engine)
    session = Session()

    # Load the Countries data if necessary
    countries = session.query(Country).all()
    if len(countries) == 0:
        load_countries(session)

    # Load the Keywords if necessary
    keywords = session.query(FactKeyword).all()
    if len(keywords) == 0:
        load_terms(session)

    session.close()

    # Load the Classifier models once to ensure they are downloaded
    CategoryModel()
    RelevanceModel()
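# For reference, db_url() is assumed to assemble a PostgreSQL connection string
# from environment variables in the same way the test fixtures in this repository
# do. A minimal sketch of such a helper (hypothetical name and defaults):
import os

def db_url_from_env():
    return 'postgresql://{user}:{passwd}@{db_host}:{db_port}/{db}'.format(
        user=os.environ.get('DB_USER', 'tester'),
        passwd=os.environ.get('DB_PASSWORD', 'tester'),
        db_host=os.environ.get('DB_HOST'),
        db_port=os.environ.get('DB_PORT', 5432),
        db='idetect')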
class TestManager(TestCase):
    start_date = '2017-01-01'
    plus_1_yr = '2018-01-01'
    plus_6_mo = '2017-07-01'
    plus_3_mo = '2017-04-01'
    plus_1_mo = '2017-02-01'

    def setUp(self):
        logger.debug("setUp")
        worker_logger = logging.getLogger("idetect.worker")
        worker_logger.setLevel(logging.INFO)
        logger.debug("Connecting to DB")
        db_host = os.environ.get('DB_HOST')
        db_port = os.environ.get('DB_PORT', 5432)
        db_user = os.environ.get('DB_USER', 'tester')
        db_pass = os.environ.get('DB_PASSWORD', 'tester')
        db_url = 'postgresql://{user}:{passwd}@{db_host}:{db_port}/{db}'.format(
            user=db_user, passwd=db_pass, db_host=db_host, db_port=db_port,
            db='idetect')
        self.engine = create_engine(db_url, echo=False)
        Session.configure(bind=self.engine)
        self.session = Session()
        self.session.query(FactApi).count()
        logger.debug("setUp complete")

    def tearDown(self):
        logger.debug("tearDown")
        self.session.rollback()
        logger.debug("sessions rolled back")
        logger.debug("tearDown complete")

    def test_timeline(self):
        t0 = time.time()
        counts = get_timeline_counts(self.session)
        t1 = time.time()
        days = {t['gdelt_day'] for t in counts}
        self.assertGreater(len(days), 180)
        categories = {t['category'] for t in counts}
        self.assertEqual(categories, {'Conflict', 'Disaster', 'Other'})
        print(t1 - t0)
        self.assertLess(
            t1 - t0, 1.0,
            'Calculating timeline counts {} - {} took too long'.format(
                self.start_date, self.plus_1_yr))

    def test_histogram(self):
        t0 = time.time()
        counts = get_histogram_counts(self.session)
        t1 = time.time()
        print(len(counts))
        figures = {
            t['specific_reported_figure']
            for t in counts if t['specific_reported_figure']
        }
        self.assertLess(min(figures), 10)
        self.assertGreater(max(figures), 1000000)
        units = {t['unit'] for t in counts}
        self.assertEqual(units, {'Household', 'Person'})
        print(t1 - t0)
        self.assertLess(
            t1 - t0, 1.0,
            'Calculating histogram counts {} - {} took too long'.format(
                self.start_date, self.plus_1_yr))

    def test_wordcloud(self):
        t0 = time.time()
        terms = get_wordcloud(self.session, self.engine)
        t1 = time.time()
        print(t1 - t0)
        print(len(terms))
        print(tabulate(terms))
        self.assertLess(
            t1 - t0, 5.0,
            'Calculating wordcloud {} - {} took too long'.format(
                self.start_date, self.plus_1_yr))

    def test_timeline_year(self):
        t0 = time.time()
        counts = get_timeline_counts(self.session,
                                     fromdate=self.start_date,
                                     todate=self.plus_1_yr)
        t1 = time.time()
        days = {t['gdelt_day'] for t in counts}
        self.assertGreater(len(days), 180)
        categories = {t['category'] for t in counts}
        self.assertEqual(categories, {'Conflict', 'Disaster', 'Other'})
        print(t1 - t0)
        self.assertLess(
            t1 - t0, 1.0,
            'Calculating timeline counts {} - {} took too long'.format(
                self.start_date, self.plus_1_yr))

    def test_histogram_year(self):
        t0 = time.time()
        counts = get_histogram_counts(self.session,
                                      fromdate=self.start_date,
                                      todate=self.plus_1_yr)
        t1 = time.time()
        print(len(counts))
        figures = {
            t['specific_reported_figure']
            for t in counts if t['specific_reported_figure']
        }
        self.assertLess(min(figures), 10)
        self.assertGreater(max(figures), 1000000)
        units = {t['unit'] for t in counts}
        self.assertEqual(units, {'Household', 'Person'})
        print(t1 - t0)
        self.assertLess(
            t1 - t0, 1.0,
            'Calculating histogram counts {} - {} took too long'.format(
                self.start_date, self.plus_1_yr))

    def test_wordcloud_year(self):
        t0 = time.time()
        terms = get_wordcloud(self.session,
                              self.engine,
                              fromdate=self.start_date,
                              todate=self.plus_1_yr)
        t1 = time.time()
        print(t1 - t0)
        print(len(terms))
        print(tabulate(terms))
        self.assertLess(
            t1 - t0, 5.0,
            'Calculating wordcloud {} - {} took too long'.format(
                self.start_date, self.plus_1_yr))

    def test_map_week(self):
        print("hello")
        t0 = time.time()
        entries = get_map_week(self.session)
        t1 = time.time()
        print(t1 - t0)
        # print(json.dumps(entries, indent=2))
        self.assertEqual(len(entries), 1)
        self.assertIsNotNone(entries[0].get('entries'))
class TestModel(TestCase):
    def setUp(self):
        db_host = os.environ.get('DB_HOST')
        db_url = 'postgresql://{user}:{passwd}@{db_host}/{db}'.format(
            user='******', passwd='tester', db_host=db_host, db='idetect_test')
        engine = create_engine(db_url)
        Session.configure(bind=engine)
        Base.metadata.drop_all(engine)
        Base.metadata.create_all(engine)
        self.session = Session()
        self.sample_data()

    def sample_data(self):
        gkg1 = Gkg(
            id=3771256,
            gkgrecordid="20170215174500-2503",
            date=20170215174500,
            document_identifier="http://www.philstar.com/headlines/2017/02/16/1672746/yasay-harris-affirm-stronger-phl-us-ties"
        )
        self.session.add(gkg1)
        gkg2 = Gkg(
            id=3771257,
            gkgrecordid="20170215174500-1536",
            date=20170215174500,
            document_identifier="http://wynkcountry.iheart.com/onair/cmt-cody-alan-54719/thomas-rhett-and-lauren-akins-are-15565244/"
        )
        self.session.add(gkg2)
        self.session.commit()

    def tearDown(self):
        self.session.rollback()
        for gkg in self.session.query(Gkg).all():
            self.session.delete(gkg)
        self.session.commit()

    def test_status_update(self):
        gkg = self.session.query(Gkg).first()
        analysis = Analysis(gkg=gkg, status=Status.NEW)
        self.session.add(analysis)
        self.session.commit()
        analysis.create_new_version(Status.SCRAPING)
        self.assertEqual(analysis.status, Status.SCRAPING)
        # meanwhile, some other process changed the status of this...
        session2 = Session()
        try:
            other = session2.query(Analysis).get(analysis.gkg_id)
            other.create_new_version(Status.SCRAPING_FAILED)
        finally:
            session2.rollback()
        with self.assertRaises(NotLatestException):
            analysis.create_new_version(Status.SCRAPED)

    def test_version_lifecycle(self):
        gkg = self.session.query(Gkg).first()
        analysis = Analysis(gkg=gkg, status=Status.NEW)
        self.session.add(analysis)
        self.session.commit()
        analysis.create_new_version(Status.SCRAPING)
        history = self.session.query(AnalysisHistory).filter(
            AnalysisHistory.gkg == gkg)
        self.assertEqual(1, history.count())
        self.assertEqual(
            1, history.filter(AnalysisHistory.status == Status.NEW).count())

        content = DocumentContent(content_type="text/html",
                                  content="Lorem ipsum")
        analysis.content = content
        analysis.create_new_version(Status.SCRAPED)
        self.assertEqual(2, history.count())
        self.assertEqual(
            1, history.filter(AnalysisHistory.status == Status.NEW).count())
        self.assertEqual(
            1,
            history.filter(AnalysisHistory.status == Status.SCRAPING).count())

        analysis.create_new_version(Status.EXTRACTING)
        self.assertEqual(3, history.count())
        self.assertEqual(
            1, history.filter(AnalysisHistory.status == Status.NEW).count())
        self.assertEqual(
            1,
            history.filter(AnalysisHistory.status == Status.SCRAPING).count())
        self.assertEqual(
            1,
            history.filter(AnalysisHistory.status == Status.SCRAPED).count())

        # content is preserved
        scraped = history.filter(
            AnalysisHistory.status == Status.SCRAPED).one_or_none()
        self.assertEqual(analysis.content, scraped.content)

        fact = Fact(analysis_date=datetime.now())
        analysis.facts = [fact]
        analysis.create_new_version(Status.EXTRACTED)
        self.assertEqual(4, history.count())
        self.assertEqual(
            1, history.filter(AnalysisHistory.status == Status.NEW).count())
        self.assertEqual(
            1,
            history.filter(AnalysisHistory.status == Status.SCRAPING).count())
        self.assertEqual(
            1,
            history.filter(AnalysisHistory.status == Status.SCRAPED).count())
        self.assertEqual(
            1,
            history.filter(
                AnalysisHistory.status == Status.EXTRACTING).count())

        # content still preserved
        extracting = history.filter(
            AnalysisHistory.status == Status.EXTRACTING).one_or_none()
        self.assertEqual(analysis.content, extracting.content)

        analysis.create_new_version(Status.EDITING)
        analysis.content = DocumentContent(content_type="text/html",
                                           content="Lorem edited")
        analysis.create_new_version(Status.EDITED)
        self.assertEqual(6, history.count())
        self.assertEqual(
            1, history.filter(AnalysisHistory.status == Status.NEW).count())
        self.assertEqual(
            1,
            history.filter(AnalysisHistory.status == Status.SCRAPING).count())
        self.assertEqual(
            1,
            history.filter(AnalysisHistory.status == Status.SCRAPED).count())
        self.assertEqual(
            1,
            history.filter(
                AnalysisHistory.status == Status.EXTRACTING).count())
        self.assertEqual(
            1,
            history.filter(AnalysisHistory.status == Status.EXTRACTED).count())
        self.assertEqual(
            1,
            history.filter(AnalysisHistory.status == Status.EDITING).count())

        # content has changed, but reports are preserved
        extracted = history.filter(
            AnalysisHistory.status == Status.EXTRACTED).one_or_none()
        self.assertNotEqual(analysis.content.id, extracted.content.id)
        self.assertCountEqual([f.id for f in analysis.facts],
                              [f.id for f in extracted.facts])

        analysis.create_new_version(Status.EDITING)
        fact2 = Fact(analysis_date=datetime.now())
        analysis.facts.append(fact2)
        analysis.create_new_version(Status.EDITED)
        self.assertEqual(8, history.count())
        self.assertEqual(
            1, history.filter(AnalysisHistory.status == Status.NEW).count())
        self.assertEqual(
            1,
            history.filter(AnalysisHistory.status == Status.SCRAPING).count())
        self.assertEqual(
            1,
            history.filter(AnalysisHistory.status == Status.SCRAPED).count())
        self.assertEqual(
            1,
            history.filter(
                AnalysisHistory.status == Status.EXTRACTING).count())
        self.assertEqual(
            1,
            history.filter(AnalysisHistory.status == Status.EXTRACTED).count())
        self.assertEqual(
            2,
            history.filter(AnalysisHistory.status == Status.EDITING).count())
        self.assertEqual(
            1,
            history.filter(AnalysisHistory.status == Status.EDITED).count())

        edited = history.filter(
            AnalysisHistory.status == Status.EDITED).one_or_none()
        self.assertCountEqual([f.id for f in analysis.facts],
                              [fact.id, fact2.id])
        self.assertCountEqual([f.id for f in edited.facts], [fact.id])

    def test_status_counts(self):
        gkgs = self.session.query(Gkg).all()[:2]
        analysis1 = Analysis(gkg=gkgs[0], status=Status.NEW)
        self.session.add(analysis1)
        self.session.commit()
        self.assertEqual(Analysis.status_counts(self.session),
                         {Status.NEW: 1})

        analysis1.create_new_version(Status.SCRAPING)
        self.assertEqual(Analysis.status_counts(self.session),
                         {Status.SCRAPING: 1})

        analysis2 = Analysis(gkg=gkgs[1], status=Status.NEW)
        self.session.add(analysis2)
        self.session.commit()
        self.assertEqual(Analysis.status_counts(self.session), {
            Status.NEW: 1,
            Status.SCRAPING: 1
        })

        analysis2.create_new_version(Status.SCRAPING)
        self.assertEqual(Analysis.status_counts(self.session),
                         {Status.SCRAPING: 2})

        analysis2.create_new_version(Status.SCRAPED)
        self.assertEqual(Analysis.status_counts(self.session), {
            Status.SCRAPED: 1,
            Status.SCRAPING: 1
        })

    def test_country_term(self):
        mmr = Country(iso3="MMR", preferred_term="Myanmar")
        myanmar = CountryTerm(term="Myanmar", country=mmr)
        burma = CountryTerm(term="Burma", country=mmr)
        yangon = Location(location_name="Yangon",
                          location_type=LocationType.CITY,
                          country=mmr,
                          latlong="16°51′N 96°11′E")
        self.assertEqual(yangon.country, myanmar.country)
        self.assertEqual(yangon.country, burma.country)
def analyse_url():
    session = Session()
    status = None
    gkg_id = None
    try:
        url = request.get_json(silent=True)['url'] or request.form['url']
    except Exception as e:
        return json.dumps({
            'success': False,
            'Exception': str(e),
            'status': 'missing or null url parameter'
        }), 422, {
            'ContentType': 'application/json'
        }
    if url is None:
        return json.dumps({
            'success': False,
            'status': 'null url parameter'
        }), 422, {
            'ContentType': 'application/json'
        }
    gkg = session.query(Gkg.id).filter(
        Gkg.document_identifier.like("%" + url + "%")).order_by(
            Gkg.date.asc()).first()
    if gkg:
        gkg_id = gkg.id
        status = 'url already in IDETECT DB'
    else:
        analysis = create_new_analysis_from_url(session, url)
        gkg_id = analysis.gkg_id
        status = 'url added to IDETECT DB'
        try:
            work(session, analysis, Status.SCRAPING, Status.SCRAPED,
                 Status.SCRAPING_FAILED, scrape)
            # TODO add classification, missing modules
            # work(session,analysis,Status.CLASSIFYING,Status.CLASSIFIED,Status.CLASSIFYING_FAILED,lambda article: classify(article, get_c_m(), get_r_m()))
            work(session, analysis, Status.EXTRACTING, Status.EXTRACTED,
                 Status.EXTRACTING_FAILED, extract_facts)
            work(session, analysis, Status.GEOTAGGING, Status.GEOTAGGED,
                 Status.GEOTAGGING_FAILED, process_locations)
        except Exception as e:
            return json.dumps({
                'success': False,
                'Exception': str(e)
            }), 422, {
                'ContentType': 'application/json'
            }
        finally:
            session.close()
    try:
        document = get_document(session, gkg_id)
        entries = get_facts_for_document(session, gkg_id)
        resp = jsonify({
            'document': document,
            'facts': entries,
            'status': status
        })
        resp.status_code = 200
        return resp
    finally:
        session.close()
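# Client-side sketch (the route path is hypothetical; the Flask route decorator is
# not shown above): the view expects a JSON body or form field named 'url' and
# responds with the document, its facts and a status string, or a JSON error body
# with HTTP 422.
import requests

def analyse(base_url, article_url):
    resp = requests.post(base_url + '/analyse_url', json={'url': article_url})
    resp.raise_for_status()
    return resp.json()  # {'document': ..., 'facts': [...], 'status': ...}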