Example #1
0
    def work(self):
        """
        Look for Gkg documents for which no Analysis exists yet and create an
        Analysis with Status.NEW for each of them (at most 1000 per call,
        ordered by Gkg.date).

        Returns True iff some Analyses were created.
        """
        # start a new session for each job
        session = Session()
        try:
            # Get a batch of Gkg rows
            # ... for which no Analysis exists
            # ... and lock them for updates
            # ... sorted by date
            # ... limited to the first 1000
            gkgs = session.query(Gkg) \
                .filter(~session.query(Analysis).filter(Gkg.id == Analysis.gkg_id).exists()) \
                .with_for_update() \
                .order_by(Gkg.date) \
                .limit(1000).all()
            if not gkgs:
                return False  # no work to be done

            analyses = [Analysis(gkg=gkg, status=Status.NEW) for gkg in gkgs]
            session.add_all(analyses)
            # Commit once for the whole batch. Committing inside the loop would
            # end the transaction after the first insert and thereby release
            # the FOR UPDATE lock on every remaining row of the batch.
            session.commit()

            pid = os.getpid()
            for analysis in analyses:
                logger.info(
                    "Worker {} created Analysis {} in status {}".format(
                        pid, analysis.gkg_id, analysis.status))
        finally:
            # make sure to release a FOR UPDATE lock, if we got one
            session.rollback()
            session.close()

        return True
Example #2
0
class TestScraper(TestCase):
    """Tests for ``scrape`` run against the ``idetect_test`` database.

    NOTE(review): the tests pass real URLs to ``scrape`` and assert on the
    resulting content, so they presumably need network access -- confirm.
    """

    def setUp(self):
        """Bind Session to the test database and recreate the schema."""
        db_host = os.environ.get('DB_HOST')
        db_url = 'postgresql://{user}:{passwd}@{db_host}/{db}'.format(
            user='******', passwd='tester', db_host=db_host, db='idetect_test')
        engine = create_engine(db_url)
        Session.configure(bind=engine)
        # start every test from an empty schema
        Base.metadata.drop_all(engine)
        Base.metadata.create_all(engine)
        self.session = Session()

    def tearDown(self):
        """Discard pending changes, delete all Gkg rows and close the session."""
        self.session.rollback()
        for gkg in self.session.query(Gkg).all():
            self.session.delete(gkg)
        self.session.commit()
        # release the connection instead of leaving it to garbage collection
        self.session.close()

    def test_scrape_html(self):
        """Scraping an HTML page stores cleaned text and a searchable content_ts."""
        gkg = Gkg(
            document_identifier="http://www.cnn.com/2013/08/23/us/hurricane-katrina-statistics-fast-facts/index.html")
        analysis = Analysis(gkg=gkg, status=Status.NEW)
        self.session.add(analysis)
        self.session.commit()
        scrape(analysis)
        content = analysis.content
        self.assertEqual("text", content.content_type)
        self.assertIn("Katrina", content.content_clean)
        self.assertIn("Louisiana", content.content_clean)
        self.assertNotIn("\n", content.content_clean)
        self.assertIsNotNone(content.content_ts)
        # the stored content must be found by a text-search match on content_ts
        matches = (
            self.session.query(DocumentContent)
                .filter(DocumentContent.content_ts.match('Katrina & Louisiana')).all()
        )
        self.assertIn(content, matches)

    def test_scrape_pdf(self):
        """Scraping a PDF stores its text, without newlines, as content."""
        gkg = Gkg(
            document_identifier="https://www1.ncdc.noaa.gov/pub/data/extremeevents/specialreports/Hurricane-Katrina.pdf")
        analysis = Analysis(gkg=gkg, status=Status.NEW)
        self.session.add(analysis)
        self.session.commit()
        scrape(analysis)
        content = analysis.content
        self.assertEqual("pdf", content.content_type)
        self.assertIn("Katrina", content.content)
        self.assertIn("Louisiana", content.content)
        self.assertNotIn("\n", content.content)
Example #3
0
    def work(self):
        """
        Claim one Analysis selected by self.filter_function (the one with the
        smallest Analysis.updated value), run self.function on it and move it
        to self.success_status or self.failure_status accordingly.

        Returns True iff an Analysis was processed (successfully or not) and
        False when there was no work to be done.
        """
        # start a new session for each job
        session = Session()
        try:
            try:
                # Get an analysis
                # ... and lock it for updates
                # ... that meets the conditions specified in the filter function
                # ... sort by updated date
                # ... pick the first (oldest)
                analysis = self.filter_function(session.query(Analysis)) \
                    .with_for_update() \
                    .order_by(Analysis.updated) \
                    .first()
                if analysis is None:
                    return False  # no work to be done
                analysis_status = analysis.status
                analysis.create_new_version(self.working_status)
                logger.info("Worker {} claimed Analysis {} in status {}".format(
                    os.getpid(), analysis.gkg_id, analysis_status))
            finally:
                # make sure to release a FOR UPDATE lock, if we got one
                session.rollback()

            start = time.time()
            try:
                # set a timeout so if this worker stalls, we recover
                signal.alarm(self.timeout_seconds)
                # actually run the work function on this analysis
                self.function(analysis)
                delta = time.time() - start
                logger.info("Worker {} processed Analysis {} {} -> {} {}s".format(
                    os.getpid(), analysis.gkg_id, analysis_status,
                    self.success_status, delta))
                analysis.error_msg = None
                analysis.processing_time = delta
                analysis.create_new_version(self.success_status)
            except Exception as e:
                # record the failure on the analysis rather than propagating it
                delta = time.time() - start
                logger.warning(
                    "Worker {} failed to process Analysis {} {} -> {}".format(
                        os.getpid(), analysis.gkg_id, analysis_status,
                        self.failure_status),
                    exc_info=e)
                analysis.error_msg = str(e)
                analysis.processing_time = delta
                analysis.create_new_version(self.failure_status)
                session.commit()
            finally:
                # clear the timeout
                signal.alarm(0)
                session.rollback()
            return True
        finally:
            # Always close the session, including on the early "no work"
            # return and when claiming the analysis raises.
            session.close()
Example #4
0
def homepage():
    """Render index.html with the ten most recently updated analyses and the
    status and category counts reported by Analysis."""
    session = Session()
    try:
        newest_first = desc(Analysis.updated)
        recent = session.query(Analysis).order_by(newest_first).limit(10).all()
        context = {
            'articles': recent,
            'counts': Analysis.status_counts(session),
            'cat_counts': Analysis.category_counts(session),
        }
        return render_template('index.html', **context)
    finally:
        # release the session whether or not rendering succeeded
        session.close()
Example #5
0
def article(doc_id):
    """Render article.html for the Analysis whose gkg_id equals doc_id.

    The template also receives the distinct coordinate tuples of all
    locations attached to the analysis' facts. ``Query.one()`` raises when
    there is not exactly one matching Analysis.
    """
    session = Session()
    try:
        lookup = session.query(Analysis).filter(Analysis.gkg_id == doc_id)
        analysis = lookup.one()

        # collect each "lat,long" string once, split into its components
        coords = set()
        for fact in analysis.facts:
            for location in fact.locations:
                if location.latlong is None:
                    continue
                coords.add(tuple(location.latlong.split(",")))

        return render_template('article.html',
                               article=analysis,
                               coords=list(coords))
    finally:
        session.close()
Example #6
0
    def test_status_update(self):
        """create_new_version raises NotLatestException on a stale Analysis.

        A second session moves the same Analysis to another status; the
        original, now outdated, object must then refuse to create a version.
        """
        gkg = self.session.query(Gkg).first()
        analysis = Analysis(gkg=gkg, status=Status.NEW)
        self.session.add(analysis)
        self.session.commit()

        analysis.create_new_version(Status.SCRAPING)
        self.assertEqual(analysis.status, Status.SCRAPING)

        # meanwhile, some other process changed the status of this...
        session2 = Session()
        try:
            other = session2.query(Analysis).get(analysis.gkg_id)
            other.create_new_version(Status.SCRAPING_FAILED)
        finally:
            session2.rollback()
            # close the extra session so it does not outlive the test
            session2.close()

        with self.assertRaises(NotLatestException):
            analysis.create_new_version(Status.SCRAPED)
Example #7
0
def search_url():
    """Look up the most recent Gkg whose document_identifier contains ``url``.

    ``url`` is read from the request's query string. Responds with
    ``{"doc_id": <id>}`` and status 200 when a match is found, otherwise with
    ``{"success": false}`` and status 422 (also used when ``url`` is missing
    or empty).
    """
    # Single definition of the failure response; the header name must be
    # 'Content-Type' for the JSON content type to take effect.
    failure = (json.dumps({'success': False}), 422,
               {'Content-Type': 'application/json'})

    url = request.args.get('url')
    if not url:
        # an empty pattern would match every document
        return failure

    # Escape LIKE metacharacters so that '%' and '_' in the user supplied URL
    # are matched literally instead of acting as wildcards.
    escaped = (url.replace('\\', '\\\\')
               .replace('%', '\\%')
               .replace('_', '\\_'))

    session = Session()
    try:
        gkg = session.query(Gkg).filter(
            Gkg.document_identifier.like('%' + escaped + '%', escape='\\')
        ).order_by(Gkg.date.desc()).first()
        if gkg is None:
            return failure
        resp = jsonify({'doc_id': gkg.id})
        resp.status_code = 200
        return resp
    finally:
        session.close()
Example #8
0
class TestSyriaYear(TestCase):
    """Query and timing checks for the ``syria_location_ids`` over 2017.

    The tests connect to the ``idetect`` database configured through the
    DB_HOST / DB_PORT / DB_USER / DB_PASSWORD environment variables and assert
    both on the data found there and on the wall-clock time of the queries.
    """

    syria_location_ids = [
        127, 270, 281, 284, 307, 332, 372, 412, 429, 431, 531, 591, 612, 618,
        644, 671, 764, 807, 905, 958, 996, 1018, 1019, 1188, 1190, 1212, 1352,
        1357, 1524, 1678, 1898, 1981, 1990, 2058, 2060, 2272, 2378, 2735, 2933,
        3262, 3323, 3327, 3372, 3391, 3404, 3660, 3708, 3725, 3834, 3915, 3924,
        4069, 4172, 4399, 4509, 4648, 4824, 4890, 5017, 5285, 5833, 6053, 6070,
        6270, 6760, 6832, 7121, 7122, 7151, 7222, 7244, 7248, 7641, 7723, 7749,
        7757, 7827, 7919, 7970, 8078, 8107, 8131, 8166, 8176, 8210, 8222, 8240,
        8254, 8367, 8442, 8659, 8660, 8730, 8788, 8793, 8941, 9045, 9167, 9285,
        9370, 9531, 9606, 9775, 9909, 9913, 9916, 9917, 9933, 10136, 10312,
        10464, 10532, 10795, 10971, 11052, 11076, 11174, 11194, 11216, 11250,
        11311, 11501, 11703, 11727, 11916, 11933, 12242, 12387, 12990, 13126,
        13130, 13142, 13171, 13348, 13531, 13659, 13722, 14225, 14718, 14732,
        14737, 14917, 14930, 14988, 15215, 15257, 15984, 15993, 16188, 17034,
        17090, 17373, 17404, 17873, 18019, 18131, 18267, 18396, 18403, 18578,
        19550, 19641, 19721, 20180, 21339, 21894, 22003, 22022, 22162, 22201,
        22850, 23189, 23414, 23532, 23875, 24851, 25171, 25415, 25894, 25927,
        26024, 26283, 26458, 26545, 26909, 27027, 27393, 27507, 28185, 28626,
        28628, 29703, 29704, 29754, 29942, 30210, 30286, 30302, 30442, 30993,
        31492, 31743
    ]

    # date window boundaries passed as fromdate / todate to the queries
    start_date = '2017-01-01'
    plus_1_yr = '2018-01-01'
    plus_6_mo = '2017-07-01'
    plus_3_mo = '2017-04-01'
    plus_1_mo = '2017-02-01'

    def setUp(self):
        """Connect to the database and issue one query before the timed tests."""
        logger.debug("setUp")
        worker_logger = logging.getLogger("idetect.worker")
        worker_logger.setLevel(logging.INFO)

        logger.debug("Connecting to DB")
        db_host = os.environ.get('DB_HOST')
        db_port = os.environ.get('DB_PORT', 5432)
        db_user = os.environ.get('DB_USER', 'tester')
        db_pass = os.environ.get('DB_PASSWORD', 'tester')

        db_url = 'postgresql://{user}:{passwd}@{db_host}:{db_port}/{db}'.format(
            user=db_user,
            passwd=db_pass,
            db_host=db_host,
            db_port=db_port,
            db='idetect')
        self.engine = create_engine(db_url, echo=False)
        Session.configure(bind=self.engine)
        self.session = Session()
        # run a first query here so the connection is established before any
        # test starts its timer
        self.session.query(FactApi).count()
        logger.debug("setUp complete")

    def tearDown(self):
        """Roll back, close the session and dispose of this test's engine."""
        logger.debug("tearDown")
        self.session.rollback()
        logger.debug("sessions rolled back")
        # every setUp creates a fresh engine, so release this one's connections
        self.session.close()
        self.engine.dispose()
        logger.debug("tearDown complete")

    def test_categories(self):
        """Fact counts grouped by category cover all three categories, fast."""
        syr_year_by_category = add_filters(
            self.session.query(func.count(FactApi.fact), FactApi.category),
            fromdate=self.start_date,
            todate=self.plus_1_yr,
            location_ids=self.syria_location_ids).group_by(FactApi.category)

        t0 = time.time()
        result = {
            category: count
            for count, category in syr_year_by_category.all()
        }
        t1 = time.time()
        # print(result)
        self.assertEqual(set(result.keys()), {'Conflict', 'Disaster', 'Other'})
        # print(explain_text(self.session, syr_year_by_category))
        print(t1 - t0)
        self.assertLess(t1 - t0, 1.0)

    def test_filter_counts(self):
        """The full-year filter counts contain more than 1000 entries."""
        f_c = get_filter_counts(self.session,
                                fromdate=self.start_date,
                                todate=self.plus_1_yr,
                                location_ids=self.syria_location_ids)
        # print(f_c)
        self.assertGreater(len(f_c), 1000)

    def test_filter_counts_speed(self):
        """Filter counts for 1, 3 and 6 month windows each take under 1s."""
        for end_date in (self.plus_1_mo, self.plus_3_mo, self.plus_6_mo):
            # Adding this usually fails: , self.plus_1_yr):
            t0 = time.time()
            f_c = get_filter_counts(self.session,
                                    fromdate=self.start_date,
                                    todate=end_date,
                                    location_ids=self.syria_location_ids)
            t1 = time.time()
            print('{} - {}: {}s'.format(self.start_date, end_date, t1 - t0))
            self.assertLess(
                t1 - t0, 1.0,
                'Calculating filter counts {} - {} took too long'.format(
                    self.start_date, end_date))

    def test_timeline(self):
        """Timeline counts span more than 180 days and all three categories."""
        t0 = time.time()
        counts = get_timeline_counts(self.session,
                                     fromdate=self.start_date,
                                     todate=self.plus_1_yr,
                                     location_ids=self.syria_location_ids)
        t1 = time.time()

        days = {t['gdelt_day'] for t in counts}
        self.assertGreater(len(days), 180)

        categories = {t['category'] for t in counts}
        self.assertEqual(categories, {'Conflict', 'Disaster', 'Other'})

        print(t1 - t0)
        self.assertLess(
            t1 - t0, 1.0,
            'Calculating timeline counts {} - {} took too long'.format(
                self.start_date, self.plus_1_yr))

    def test_histogram(self):
        """Histogram counts cover a wide figure range and both units."""
        t0 = time.time()
        counts = get_histogram_counts(self.session,
                                      fromdate=self.start_date,
                                      todate=self.plus_1_yr,
                                      location_ids=self.syria_location_ids)
        t1 = time.time()
        print(len(counts))

        # ignore entries without a (truthy) reported figure
        figures = {
            t['specific_reported_figure']
            for t in counts if t['specific_reported_figure']
        }
        self.assertLess(min(figures), 10)
        self.assertGreater(max(figures), 1000000)

        units = {t['unit'] for t in counts}
        self.assertEqual(units, {'Household', 'Person'})

        print(t1 - t0)
        self.assertLess(
            t1 - t0, 1.0,
            'Calculating histogram counts {} - {} took too long'.format(
                self.start_date, self.plus_1_yr))

    def test_wordcloud(self):
        """The wordcloud for the full year is calculated in under 5s."""
        t0 = time.time()
        terms = get_wordcloud(self.session,
                              self.engine,
                              fromdate=self.start_date,
                              todate=self.plus_1_yr,
                              location_ids=self.syria_location_ids)
        t1 = time.time()
        print(t1 - t0)
        print(len(terms))
        print(tabulate(terms))
        self.assertLess(
            t1 - t0, 5.0, 'Calculating wordcloud {} - {} took too long'.format(
                self.start_date, self.plus_1_yr))

    def test_none_location(self):
        """'NULL', 'null' and None are interchangeable as a location id."""
        # TODO this isn't about Syria, move it somewhere else
        counts = get_filter_counts(self.session, location_ids=['NULL'])
        self.assertGreater(len(counts), 1000)

        self.assertEqual(
            counts, get_filter_counts(self.session, location_ids=['null']))
        self.assertEqual(counts,
                         get_filter_counts(self.session, location_ids=[None]))

        counts2 = get_filter_counts(self.session, location_ids=['NULL', 1])
        self.assertGreater(len(counts2), len(counts))

    def test_none_specific_reported_figure_1(self):
        """Filtering on a NULL figure yields exactly one figure count."""
        # TODO this isn't about Syria, move it somewhere else
        counts = get_filter_counts(self.session,
                                   specific_reported_figures=['NULL'])
        srf_counts = [
            c for c in counts if c['filter_type'] == 'specific_reported_figure'
        ]
        self.assertEqual(len(srf_counts), 1)
        self.assertTrue(any(c['value'] is None for c in srf_counts))

        self.assertEqual(
            counts,
            get_filter_counts(self.session,
                              specific_reported_figures=['null']))
        self.assertEqual(
            counts,
            get_filter_counts(self.session, specific_reported_figures=[None]))

    def test_none_specific_reported_figure_2(self):
        """Filtering on NULL and 1 yields one count for each of them."""
        # TODO this isn't about Syria, move it somewhere else
        counts = get_filter_counts(self.session,
                                   specific_reported_figures=['NULL', 1])
        srf_counts = [
            c for c in counts if c['filter_type'] == 'specific_reported_figure'
        ]
        self.assertEqual(len(srf_counts), 2)
        self.assertTrue(any(c['value'] is None for c in srf_counts))
        self.assertTrue(any(c['value'] == 1 for c in srf_counts))

    def test_none_specific_reported_figure_3(self):
        """Filtering on NULL, 1 and 1000 yields more than two figure counts."""
        # TODO this isn't about Syria, move it somewhere else
        counts = get_filter_counts(self.session,
                                   specific_reported_figures=['NULL', 1, 1000])
        srf_counts = [
            c for c in counts if c['filter_type'] == 'specific_reported_figure'
        ]
        self.assertGreater(len(srf_counts), 2)
        self.assertTrue(any(c['value'] is None for c in srf_counts))
        self.assertTrue(any(c['value'] == 1 for c in srf_counts))

    def test_specific_reported_figure(self):
        """Filtering on 1 and 1000 yields no count for a NULL figure."""
        # TODO this isn't about Syria, move it somewhere else
        counts = get_filter_counts(self.session,
                                   specific_reported_figures=[1, 1000])
        srf_counts = [
            c for c in counts if c['filter_type'] == 'specific_reported_figure'
        ]
        self.assertGreater(len(srf_counts), 2)
        self.assertFalse(any(c['value'] is None for c in srf_counts))
        self.assertTrue(any(c['value'] == 1 for c in srf_counts))

    def test_filter_ts(self):
        """Every row returned for ts='Jordan' mentions Jordan in its content."""
        t0 = time.time()
        query = add_filters(self.session.query(FactApi.content_id,
                                               DocumentContent.content_clean),
                            fromdate=self.start_date,
                            todate=self.plus_1_yr,
                            location_ids=self.syria_location_ids,
                            ts='Jordan').order_by(FactApi.gdelt_day).limit(32)
        results = query.all()
        t1 = time.time()
        print(t1 - t0)
        for _, content_clean in results:
            self.assertIn('jordan', content_clean.lower())

    @skip("Too slow in practice")
    def test_filter_ts_exhaustive(self):
        """The ts filter finds exactly the rows whose content mentions Jordan."""
        # make sure that the query found everything that it was supposed to
        t0 = time.time()
        query = add_filters(self.session.query(FactApi.content_id,
                                               DocumentContent.content_clean,
                                               FactApi.gdelt_day),
                            fromdate=self.start_date,
                            todate=self.plus_1_yr,
                            location_ids=self.syria_location_ids,
                            ts='Jordan').order_by(FactApi.gdelt_day).limit(32)
        results = query.all()
        t1 = time.time()
        print(t1 - t0)
        matched = set()
        max_day = date.min
        for content_id, content_clean, gdelt_day in results:
            self.assertIn('jordan', content_clean.lower())
            matched.add(content_id)
            max_day = max(max_day, gdelt_day)

        # re-run without the ts filter, up to the last day seen above, and
        # compare against a plain substring check
        t2 = time.time()
        query = add_filters(self.session.query(FactApi.content_id,
                                               DocumentContent.content_clean),
                            fromdate=self.start_date,
                            todate=self.plus_1_yr,
                            location_ids=self.syria_location_ids).filter(
                                FactApi.gdelt_day <= max_day)
        results = query.all()
        t3 = time.time()
        print(t3 - t2)
        print(len(results))
        for content_id, content_clean in results:
            self.assertEqual(content_id in matched,
                             'jordan' in content_clean.lower())

    def test_urllist(self):
        """The default page has 32 entries and sorts before the next page."""
        t0 = time.time()
        result1 = get_urllist(self.session,
                              fromdate=self.start_date,
                              todate=self.plus_1_yr,
                              location_ids=self.syria_location_ids)
        t1 = time.time()
        print(t1 - t0)
        self.assertEqual(32, len(list(result1)))
        t2 = time.time()
        result2 = get_urllist(self.session,
                              offset=32,
                              limit=100,
                              fromdate=self.start_date,
                              todate=self.plus_1_yr,
                              location_ids=self.syria_location_ids)
        t3 = time.time()
        print(t3 - t2)
        self.assertEqual(100, len(list(result2)))
        # every entry of the first page must sort at or before every entry of
        # the second page on (gdelt_day, gkg_id)
        for r1 in result1:
            # print(r1)
            for r2 in result2:
                self.assertLessEqual(
                    (r1['gdelt_day'], r1['gkg_id']),
                    (r2['gdelt_day'], r2['gkg_id']),
                )
        self.assertIn('display_color', result1[0])

    def test_urllist_unique_fact_id(self):
        """No fact_id is repeated when the whole url list is fetched."""
        t0 = time.time()
        result1 = get_urllist(self.session,
                              fromdate=self.start_date,
                              todate=self.plus_1_yr,
                              limit=1000000,
                              location_ids=self.syria_location_ids)
        t1 = time.time()
        print(t1 - t0)
        ids = [f['fact_id'] for f in result1]
        self.assertGreater(len(ids), 1000)
        self.assertEqual(len(ids), len(set(ids)))

    def test_urllist_ts(self):
        """Paging keeps its ordering when the ts='Jordan' filter is applied."""
        t0 = time.time()
        result1 = get_urllist(self.session,
                              fromdate=self.start_date,
                              todate=self.plus_1_yr,
                              location_ids=self.syria_location_ids,
                              ts='Jordan')
        t1 = time.time()
        print(t1 - t0)
        self.assertEqual(32, len(list(result1)))
        t2 = time.time()
        result2 = get_urllist(self.session,
                              offset=32,
                              limit=100,
                              fromdate=self.start_date,
                              todate=self.plus_1_yr,
                              location_ids=self.syria_location_ids,
                              ts='Jordan')
        t3 = time.time()
        print(t3 - t2)
        self.assertEqual(100, len(list(result2)))
        for r1 in result1:
            # print(r1)
            for r2 in result2:
                self.assertLessEqual(
                    (r1['gdelt_day'], r1['gkg_id']),
                    (r2['gdelt_day'], r2['gkg_id']),
                )

    def test_count_ts(self):
        """The count for ts='Jordan' lies between 5000 and 10000."""
        t0 = time.time()
        c = get_count(self.session,
                      fromdate=self.start_date,
                      todate=self.plus_1_yr,
                      location_ids=self.syria_location_ids,
                      ts='Jordan')
        t1 = time.time()
        print(t1 - t0)
        self.assertGreater(c, 5000)
        self.assertLess(c, 10000)

    def test_urllist_grouped(self):
        """Grouped entries agree with their facts and repeat no fact."""
        t0 = time.time()
        result = get_urllist_grouped(self.session,
                                     fromdate=self.start_date,
                                     todate=self.plus_1_yr,
                                     location_ids=self.syria_location_ids,
                                     limit=100)
        t1 = time.time()
        print(t1 - t0)
        self.assertEqual(100, len(result))
        for entry in result:
            self.assertEqual(entry['nfacts'], len(entry['entry']))
            for fact in entry['entry']:
                self.assertEqual(entry['specific_reported_figure'],
                                 fact['specific_reported_figure'])
                self.assertEqual(entry['unit'], fact['unit'])
                self.assertEqual(entry['term'], fact['term'])
                self.assertGreater(len(fact['content_clean']), 0)
            fact_ids = [f['fact'] for f in entry['entry']]
            self.assertEqual(len(fact_ids), len(set(fact_ids)),
                             "fact repeated")
Example #9
0
class TestGeoTagger(TestCase):
    """Tests for ``get_geo_info`` and ``process_locations``.

    Every test starts from a freshly created schema in ``idetect_test`` with
    the countries loaded through ``load_countries``.
    """

    def setUp(self):
        """Recreate the schema in the test database and load the countries."""
        db_host = os.environ.get('DB_HOST')
        db_url = 'postgresql://{user}:{passwd}@{db_host}/{db}'.format(
            user='******', passwd='tester', db_host=db_host, db='idetect_test')
        engine = create_engine(db_url)
        Session.configure(bind=engine)
        Base.metadata.drop_all(engine)
        Base.metadata.create_all(engine)
        self.session = Session()
        load_countries(self.session)

    def tearDown(self):
        """Discard pending changes and close the session."""
        self.session.rollback()
        # release the connection instead of leaving it to garbage collection
        self.session.close()

    def test_sets_no_results_flag(self):
        """Sets no-results flag if nothing found"""
        results = get_geo_info("xghijdshfkljdes")
        self.assertEqual(results['flag'], "no-results")

    def test_returns_detail_for_places(self):
        """Returns sufficient level of detail for results"""
        results = get_geo_info("Paris")
        self.assertNotEqual(results['country_code'], '')
        self.assertNotEqual(results['coordinates'], '')
        self.assertNotEqual(results['type'], '')

    def test_accuracy(self):
        """Returns the expected country code and coordinates for Beijing"""
        results = get_geo_info("Beijing")
        self.assertEqual(results['country_code'], 'CHN')
        self.assertEqual(results['coordinates'], "39.9059631,116.391248")

    def test_country_code(self):
        """Returns the expected three-letter country code for each place"""
        results = get_geo_info("Bidibidi")
        self.assertEqual(results['country_code'], 'UGA')
        results = get_geo_info("Marrakech")
        self.assertEqual(results['country_code'], 'MAR')
        results = get_geo_info("Fairfax County")
        self.assertEqual(results['country_code'], 'USA')

    def test_location_types(self):
        """Correctly distinguishes between Countries, Cities and Subdivisions"""
        results = get_geo_info("London")
        self.assertEqual(results['type'], LocationType.CITY)
        results = get_geo_info("India")
        self.assertEqual(results['type'], LocationType.COUNTRY)
        results = get_geo_info("Alaska")
        self.assertEqual(results['type'], LocationType.SUBDIVISION)

    # DONT RUN geotagging if detail already exists
    # NOTE(review): without a ``test_`` prefix unittest does not collect this
    # method -- confirm whether it is disabled on purpose.
    @mock.patch('idetect.geotagger.nominatim_coordinates')
    def dont_geotag_if_detail_exists(self, nominatim):
        """Location processing must not call nominatim for a known location"""
        gkg = Gkg(
            id=3771256,
            gkgrecordid="20170215174500-2503",
            date=20170215174500,
            document_identifier=
            "http://www.philstar.com/headlines/2017/02/16/1672746/yasay-harris-affirm-stronger-phl-us-ties"
        )
        self.session.add(gkg)
        analysis = Analysis(gkg=gkg, status=Status.NEW)
        self.session.add(analysis)
        content = DocumentContent(
            content_clean=
            "It was early Saturday when a flash flood hit large parts of India and Pakistan and washed away more than 500 houses"
        )
        self.session.add(content)
        self.session.commit()
        analysis.content_id = content.id
        self.session.commit()
        fact = Fact(unit='person', term='displaced')
        self.session.add(fact)
        self.session.commit()
        loc1 = self.session.query(Location).filter(
            Location.location_name == 'India').one_or_none()
        fact.locations.append(loc1)
        analysis.facts.append(fact)
        self.session.commit()
        process_locations(analysis)
        self.assertFalse(nominatim.called)

    def test_create_duplicate_fact(self):
        """Creates duplicate fact if locations from multiple countries exist"""
        gkg = Gkg(
            id=3771256,
            gkgrecordid="20170215174500-2503",
            date=20170215174500,
            document_identifier=
            "http://www.philstar.com/headlines/2017/02/16/1672746/yasay-harris-affirm-stronger-phl-us-ties"
        )
        self.session.add(gkg)
        analysis = Analysis(gkg=gkg, status=Status.NEW)
        self.session.add(analysis)
        self.session.commit()
        fact = Fact(unit='person', term='displaced')
        self.session.add(fact)
        self.session.commit()
        loc1 = self.session.query(Location).filter(
            Location.location_name == 'India').one_or_none()
        loc2 = self.session.query(Location).filter(
            Location.location_name == 'Pakistan').one_or_none()
        fact.locations.append(loc1)
        fact.locations.append(loc2)
        analysis.facts.append(fact)
        self.session.commit()
        self.assertEqual(1, len(analysis.facts))
        process_locations(analysis)
        # one fact per country, each keeping only its own location
        self.assertEqual(2, len(analysis.facts))
        fact_countries = [f.iso3 for f in analysis.facts]
        self.assertIn('IND', fact_countries)
        self.assertIn('PAK', fact_countries)
        self.assertEqual(1, len(analysis.facts[0].locations))
        self.assertEqual(1, len(analysis.facts[1].locations))

    @mock.patch('idetect.geotagger.nominatim_coordinates')
    def test_fail_if_geotagging_fails(self, nominatim):
        """Location processing should fail if geotagging fails"""
        nominatim.side_effect = GeotagException()
        gkg = Gkg(
            id=3771256,
            gkgrecordid="20170215174500-2503",
            date=20170215174500,
            document_identifier=
            "http://www.philstar.com/headlines/2017/02/16/1672746/yasay-harris-affirm-stronger-phl-us-ties"
        )
        self.session.add(gkg)
        analysis = Analysis(gkg=gkg, status=Status.NEW)
        self.session.add(analysis)
        self.session.commit()
        fact = Fact(unit='person', term='displaced')
        self.session.add(fact)
        self.session.commit()
        loc1 = Location(location_name="Ruislip")
        fact.locations.append(loc1)
        analysis.facts.append(fact)
        self.session.commit()
        with self.assertRaises(GeotagException):
            process_locations(analysis)
Example #10
0
class TestFactExtractor(TestCase):
    """Database-backed tests for ``extract_facts``.

    Every test stores an Analysis that points at a DocumentContent holding a
    short ``content_clean`` text, runs ``extract_facts`` on it and then
    inspects ``analysis.facts``.
    """

    def setUp(self):
        """Recreate all tables in the test database and load reference data."""
        db_host = os.environ.get('DB_HOST')
        db_url = 'postgresql://{user}:{passwd}@{db_host}/{db}'.format(
            user='******', passwd='tester', db_host=db_host, db='idetect_test')
        engine = create_engine(db_url)
        Session.configure(bind=engine)
        # Start every test from an empty schema.
        Base.metadata.drop_all(engine)
        Base.metadata.create_all(engine)
        self.session = Session()
        load_countries(self.session)
        load_terms(self.session)

    def tearDown(self):
        """Discard pending changes, then delete every Gkg row."""
        self.session.rollback()
        for article in self.session.query(Gkg).all():
            self.session.delete(article)
        self.session.commit()

    def _extract_from(self, text):
        """Persist an Analysis whose cleaned content is ``text`` and extract facts.

        Returns the Analysis after ``extract_facts`` has been run on it.
        """
        gkg = Gkg()
        analysis = Analysis(gkg=gkg, status=Status.NEW)
        self.session.add(analysis)
        content = DocumentContent(content_clean=text)
        self.session.add(content)
        # The content id is only read after the first commit.
        self.session.commit()
        analysis.content_id = content.id
        self.session.commit()
        extract_facts(analysis)
        return analysis

    def _assert_first_fact_term(self, text, expected_term):
        """Assert that the first fact extracted from ``text`` has ``expected_term``."""
        analysis = self._extract_from(text)
        # Fail with a readable message instead of an IndexError when nothing
        # was extracted.
        self.assertTrue(analysis.facts, "no facts were extracted")
        self.assertEqual(expected_term, analysis.facts[0].term)

    def test_extract_facts_simple(self):
        """Extracts simple facts when present and saves to DB"""
        analysis = self._extract_from(
            "It was early Saturday when a flash flood hit the area and washed away more than 500 houses"
        )
        self.assertEqual(1, len(analysis.facts))

    def test_extract_refugee_facts(self):
        """Extracts refugee-related facts with Refugee Term"""
        self._assert_first_fact_term(
            "It was early Saturday when government troops entered the area and forced more than 20000 refugees to flee.",
            FactTerm.REFUGEE)

    def test_extract_evicted_facts(self):
        """Extracts 'have been evicted' facts with the Evicted Term"""
        self._assert_first_fact_term(
            "2000 people have been evicted from their homes in Bosnia",
            FactTerm.EVICTED)

    def test_extract_eviction_facts(self):
        """Extracts 'ordered eviction' facts with the Evicted Term"""
        self._assert_first_fact_term(
            "ordered eviction for 2000 people from their homes in Bosnia",
            FactTerm.EVICTED)

    def test_extract_forced_eviction_facts(self):
        """Extracts 'forced eviction' facts with the Evicted Term"""
        self._assert_first_fact_term(
            "ordered forced eviction for 2000 people from their homes in Bosnia",
            FactTerm.EVICTED)

    def test_extract_forcibly_evicted_facts(self):
        """Extracts 'forcibly evicted' facts with the Evicted Term"""
        self._assert_first_fact_term(
            "2000 people were forcibly evicted from their homes in Bosnia",
            FactTerm.EVICTED)

    def test_extract_sacked_facts(self):
        """Extracts 'have been sacked' facts with the Sacked Term"""
        self._assert_first_fact_term(
            "last week 2000 people have been sacked from their homes in Nigeria",
            FactTerm.SACKED)

    def test_create_locations_with_names(self):
        """Creates locations for facts only with location names"""
        analysis = self._extract_from(
            "It was early Saturday when a flash flood hit large parts of London and Middlesex and washed away more than 500 houses"
        )
        facts = analysis.facts
        self.assertEqual(1, len(facts))
        fact = facts[0]
        self.assertEqual(2, len(fact.locations))
        loc_names = [loc.location_name for loc in fact.locations]
        self.assertIn('London', loc_names)
        self.assertIn('Middlesex', loc_names)
        # Extraction alone is expected to leave the country unset.
        self.assertEqual([None, None], [loc.country for loc in fact.locations])

    def test_use_existing_location(self):
        """Uses existing locations when they exist"""
        # Added before the helper's first commit, so the row exists by the
        # time extract_facts runs.
        location = Location(location_name='Bosnia')
        self.session.add(location)
        analysis = self._extract_from(
            "It was early Saturday when a flash flood hit large parts of Bosnia and washed away more than 500 houses"
        )
        self.assertTrue(analysis.facts, "no facts were extracted")
        fact = analysis.facts[0]
        self.assertTrue(fact.locations, "extracted fact has no locations")
        extracted_location = fact.locations[0]
        self.assertEqual(location.id, extracted_location.id)
Example #11
0
import string
import numpy as np
import pandas as pd
from idetect.nlp_models.category import *
from idetect.nlp_models.relevance import *
from idetect.nlp_models.base_model import CustomSklLsiModel

if __name__ == "__main__":

    # Create the database tables
    engine = create_engine(db_url())
    Session.configure(bind=engine)
    Base.metadata.create_all(engine)

    session = Session()
    try:
        # Load the Countries data if necessary. Fetching a single row is
        # enough to tell whether the table is empty.
        if session.query(Country).first() is None:
            load_countries(session)

        # Load the Keywords if necessary
        if session.query(FactKeyword).first() is None:
            load_terms(session)
    finally:
        # Close the session even when one of the loaders raises.
        session.close()

    # Load the Classifier models once to ensure they are downloaded
    CategoryModel()
    RelevanceModel()
Example #12
0
class TestManager(TestCase):
    """Sanity and timing tests for the API query helpers.

    The tests run against the 'idetect' database configured through the
    DB_* environment variables; setUp does not create or reset any tables,
    so the assertions depend on the data already stored there.
    """

    start_date = '2017-01-01'
    plus_1_yr = '2018-01-01'
    plus_6_mo = '2017-07-01'
    plus_3_mo = '2017-04-01'
    plus_1_mo = '2017-02-01'

    def setUp(self):
        """Connect to the database and open a session."""
        logger.debug("setUp")
        worker_logger = logging.getLogger("idetect.worker")
        worker_logger.setLevel(logging.INFO)

        logger.debug("Connecting to DB")
        db_host = os.environ.get('DB_HOST')
        db_port = os.environ.get('DB_PORT', 5432)
        db_user = os.environ.get('DB_USER', 'tester')
        db_pass = os.environ.get('DB_PASSWORD', 'tester')

        db_url = 'postgresql://{user}:{passwd}@{db_host}:{db_port}/{db}'.format(
            user=db_user,
            passwd=db_pass,
            db_host=db_host,
            db_port=db_port,
            db='idetect')
        self.engine = create_engine(db_url, echo=False)
        Session.configure(bind=self.engine)
        self.session = Session()
        # NOTE(review): presumably issued so connection set-up is not counted
        # in the timed queries below -- confirm.
        self.session.query(FactApi).count()
        logger.debug("setUp complete")

    def tearDown(self):
        """Roll back whatever the test left pending on the session."""
        logger.debug("tearDown")
        self.session.rollback()
        logger.debug("sessions rolled back")
        logger.debug("tearDown complete")

    @staticmethod
    def _timed(func, *args, **kwargs):
        """Call ``func`` and return ``(result, elapsed_seconds)``.

        Uses ``time.perf_counter`` so the measurement cannot be skewed by
        wall-clock adjustments.
        """
        started = time.perf_counter()
        result = func(*args, **kwargs)
        return result, time.perf_counter() - started

    def _assert_within(self, elapsed, limit, what):
        """Print ``elapsed`` and assert that it is below ``limit`` seconds."""
        print(elapsed)
        self.assertLess(
            elapsed, limit,
            'Calculating {} {} - {} took too long'.format(
                what, self.start_date, self.plus_1_yr))

    def _check_timeline(self, counts, elapsed):
        """Shared assertions for timeline results."""
        days = {t['gdelt_day'] for t in counts}
        self.assertGreater(len(days), 180)

        categories = {t['category'] for t in counts}
        self.assertEqual(categories, {'Conflict', 'Disaster', 'Other'})

        self._assert_within(elapsed, 1.0, 'timeline counts')

    def _check_histogram(self, counts, elapsed):
        """Shared assertions for histogram results."""
        print(len(counts))

        figures = {
            t['specific_reported_figure']
            for t in counts if t['specific_reported_figure']
        }
        # min()/max() raise ValueError on an empty set; fail readably instead.
        self.assertTrue(figures, 'no specific_reported_figure values returned')
        self.assertLess(min(figures), 10)
        self.assertGreater(max(figures), 1000000)

        units = {t['unit'] for t in counts}
        self.assertEqual(units, {'Household', 'Person'})

        self._assert_within(elapsed, 1.0, 'histogram counts')

    def _check_wordcloud(self, terms, elapsed):
        """Shared output and timing assertion for wordcloud results."""
        print(elapsed)
        print(len(terms))
        print(tabulate(terms))
        self.assertLess(
            elapsed, 5.0, 'Calculating wordcloud {} - {} took too long'.format(
                self.start_date, self.plus_1_yr))

    def test_timeline(self):
        """Timeline counts without a date filter."""
        counts, elapsed = self._timed(get_timeline_counts, self.session)
        self._check_timeline(counts, elapsed)

    def test_histogram(self):
        """Histogram counts without a date filter."""
        counts, elapsed = self._timed(get_histogram_counts, self.session)
        self._check_histogram(counts, elapsed)

    def test_wordcloud(self):
        """Wordcloud terms without a date filter."""
        terms, elapsed = self._timed(get_wordcloud, self.session, self.engine)
        self._check_wordcloud(terms, elapsed)

    def test_timeline_year(self):
        """Timeline counts restricted to start_date .. plus_1_yr."""
        counts, elapsed = self._timed(get_timeline_counts,
                                      self.session,
                                      fromdate=self.start_date,
                                      todate=self.plus_1_yr)
        self._check_timeline(counts, elapsed)

    def test_histogram_year(self):
        """Histogram counts restricted to start_date .. plus_1_yr."""
        counts, elapsed = self._timed(get_histogram_counts,
                                      self.session,
                                      fromdate=self.start_date,
                                      todate=self.plus_1_yr)
        self._check_histogram(counts, elapsed)

    def test_wordcloud_year(self):
        """Wordcloud terms restricted to start_date .. plus_1_yr."""
        terms, elapsed = self._timed(get_wordcloud,
                                     self.session,
                                     self.engine,
                                     fromdate=self.start_date,
                                     todate=self.plus_1_yr)
        self._check_wordcloud(terms, elapsed)

    def test_map_week(self):
        """get_map_week returns one item that carries an 'entries' value."""
        entries, elapsed = self._timed(get_map_week, self.session)
        print(elapsed)
        self.assertEqual(len(entries), 1)
        self.assertIsNotNone(entries[0].get('entries'))
Example #13
0
class TestModel(TestCase):
    """Database-backed tests for Analysis versioning, status counts and country terms."""

    def setUp(self):
        """Recreate all tables in the test database and insert two Gkg rows."""
        db_host = os.environ.get('DB_HOST')
        db_url = 'postgresql://{user}:{passwd}@{db_host}/{db}'.format(
            user='******', passwd='tester', db_host=db_host, db='idetect_test')
        engine = create_engine(db_url)
        Session.configure(bind=engine)
        # Start every test from an empty schema.
        Base.metadata.drop_all(engine)
        Base.metadata.create_all(engine)
        self.session = Session()
        self.sample_data()

    def sample_data(self):
        """Insert and commit the two Gkg records the tests work with."""
        gkg1 = Gkg(
            id=3771256,
            gkgrecordid="20170215174500-2503",
            date=20170215174500,
            document_identifier=
            "http://www.philstar.com/headlines/2017/02/16/1672746/yasay-harris-affirm-stronger-phl-us-ties"
        )
        self.session.add(gkg1)

        gkg2 = Gkg(
            id=3771257,
            gkgrecordid="20170215174500-1536",
            date=20170215174500,
            document_identifier=
            "http://wynkcountry.iheart.com/onair/cmt-cody-alan-54719/thomas-rhett-and-lauren-akins-are-15565244/"
        )
        self.session.add(gkg2)
        self.session.commit()

    def tearDown(self):
        """Discard pending changes, then delete every Gkg row."""
        self.session.rollback()
        for gkg in self.session.query(Gkg).all():
            self.session.delete(gkg)
        self.session.commit()

    def _assert_history(self, history, expected):
        """Assert that ``history`` holds exactly the per-status counts in ``expected``.

        ``expected`` maps a Status to the number of AnalysisHistory rows that
        must carry it. The total row count is checked first and must equal
        the sum of the expected counts; each status is then checked on its own.
        """
        self.assertEqual(sum(expected.values()), history.count())
        for status, count in expected.items():
            self.assertEqual(
                count,
                history.filter(AnalysisHistory.status == status).count())

    def test_status_update(self):
        """A stale Analysis must not be allowed to create a new version."""
        gkg = self.session.query(Gkg).first()
        analysis = Analysis(gkg=gkg, status=Status.NEW)
        self.session.add(analysis)
        self.session.commit()

        analysis.create_new_version(Status.SCRAPING)
        self.assertEqual(analysis.status, Status.SCRAPING)

        # meanwhile, some other process changed the status of this...
        session2 = Session()
        try:
            other = session2.query(Analysis).get(analysis.gkg_id)
            other.create_new_version(Status.SCRAPING_FAILED)
        finally:
            session2.rollback()
            # Close the second session as well, so it is not left open for
            # the rest of the test run.
            session2.close()

        with self.assertRaises(NotLatestException):
            analysis.create_new_version(Status.SCRAPED)

    def test_version_lifecycle(self):
        """Each create_new_version call archives the previous state in the history."""
        gkg = self.session.query(Gkg).first()
        analysis = Analysis(gkg=gkg, status=Status.NEW)
        self.session.add(analysis)
        self.session.commit()

        analysis.create_new_version(Status.SCRAPING)

        history = self.session.query(AnalysisHistory).filter(
            AnalysisHistory.gkg == gkg)
        self._assert_history(history, {Status.NEW: 1})

        content = DocumentContent(content_type="text/html",
                                  content="Lorem ipsum")
        analysis.content = content
        analysis.create_new_version(Status.SCRAPED)

        self._assert_history(history, {
            Status.NEW: 1,
            Status.SCRAPING: 1,
        })

        analysis.create_new_version(Status.EXTRACTING)

        self._assert_history(history, {
            Status.NEW: 1,
            Status.SCRAPING: 1,
            Status.SCRAPED: 1,
        })

        # content is preserved
        scraped = history.filter(
            AnalysisHistory.status == Status.SCRAPED).one_or_none()
        self.assertEqual(analysis.content, scraped.content)

        fact = Fact(analysis_date=datetime.now())
        analysis.facts = [fact]
        analysis.create_new_version(Status.EXTRACTED)

        self._assert_history(history, {
            Status.NEW: 1,
            Status.SCRAPING: 1,
            Status.SCRAPED: 1,
            Status.EXTRACTING: 1,
        })

        # content still preserved
        extracting = history.filter(
            AnalysisHistory.status == Status.EXTRACTING).one_or_none()
        self.assertEqual(analysis.content, extracting.content)

        analysis.create_new_version(Status.EDITING)
        analysis.content = DocumentContent(content_type="text/html",
                                           content="Lorem edited")
        analysis.create_new_version(Status.EDITED)

        self._assert_history(history, {
            Status.NEW: 1,
            Status.SCRAPING: 1,
            Status.SCRAPED: 1,
            Status.EXTRACTING: 1,
            Status.EXTRACTED: 1,
            Status.EDITING: 1,
        })

        # content has changed, but reports are preserved
        extracted = history.filter(
            AnalysisHistory.status == Status.EXTRACTED).one_or_none()
        self.assertNotEqual(analysis.content.id, extracted.content.id)
        self.assertCountEqual([f.id for f in analysis.facts],
                              [f.id for f in extracted.facts])

        analysis.create_new_version(Status.EDITING)
        fact2 = Fact(analysis_date=datetime.now())
        analysis.facts.append(fact2)
        analysis.create_new_version(Status.EDITED)

        self._assert_history(history, {
            Status.NEW: 1,
            Status.SCRAPING: 1,
            Status.SCRAPED: 1,
            Status.EXTRACTING: 1,
            Status.EXTRACTED: 1,
            Status.EDITING: 2,
            Status.EDITED: 1,
        })

        # The archived EDITED version still has only the first fact, while
        # the live analysis has both.
        edited = history.filter(
            AnalysisHistory.status == Status.EDITED).one_or_none()
        self.assertCountEqual([f.id for f in analysis.facts],
                              [fact.id, fact2.id])
        self.assertCountEqual([f.id for f in edited.facts], [fact.id])

    def test_status_counts(self):
        """Analysis.status_counts reflects the current status of each Analysis."""
        gkgs = self.session.query(Gkg).all()[:2]
        analysis1 = Analysis(gkg=gkgs[0], status=Status.NEW)
        self.session.add(analysis1)
        self.session.commit()

        self.assertEqual(Analysis.status_counts(self.session), {Status.NEW: 1})

        analysis1.create_new_version(Status.SCRAPING)

        self.assertEqual(Analysis.status_counts(self.session),
                         {Status.SCRAPING: 1})

        analysis2 = Analysis(gkg=gkgs[1], status=Status.NEW)
        self.session.add(analysis2)
        self.session.commit()

        self.assertEqual(Analysis.status_counts(self.session), {
            Status.NEW: 1,
            Status.SCRAPING: 1
        })

        analysis2.create_new_version(Status.SCRAPING)

        self.assertEqual(Analysis.status_counts(self.session),
                         {Status.SCRAPING: 2})

        analysis2.create_new_version(Status.SCRAPED)

        self.assertEqual(Analysis.status_counts(self.session), {
            Status.SCRAPED: 1,
            Status.SCRAPING: 1
        })

    def test_country_term(self):
        """Terms and locations built for one Country all refer to that Country."""
        mmr = Country(iso3="MMR", preferred_term="Myanmar")
        myanmar = CountryTerm(term="Myanmar", country=mmr)
        burma = CountryTerm(term="Burma", country=mmr)
        yangon = Location(location_name="Yangon",
                          location_type=LocationType.CITY,
                          country=mmr,
                          latlong="16°51′N 96°11′E")

        self.assertEqual(yangon.country, myanmar.country)
        self.assertEqual(yangon.country, burma.country)
Example #14
0
def _json_error(payload, status_code=422):
    """Build a (body, status, headers) error response with a JSON body."""
    return json.dumps(payload), status_code, {
        'Content-Type': 'application/json'
    }


def analyse_url():
    """Return the stored document and facts for a URL, analysing it first if needed.

    The URL is read from the ``url`` key of the JSON body or, failing that,
    from the ``url`` form field. If a Gkg whose document_identifier contains
    the URL already exists, the oldest match is used; otherwise a new
    analysis is created and run through scraping, fact extraction and
    geotagging.

    Returns:
        A 200 JSON response with ``document``, ``facts`` and ``status`` on
        success, or a 422 JSON error response when the URL is missing or
        empty, or when one of the processing steps raises.
    """
    try:
        # get_json(silent=True) yields None for non-JSON requests; fall back
        # to an empty mapping so the form field is still consulted.
        payload = request.get_json(silent=True) or {}
        url = payload.get('url') or request.form.get('url')
    except Exception as e:
        return _json_error({
            'success': False,
            'Exception': str(e),
            'status': 'missing or null url parameter'
        })
    if not url:
        # Also rejects '', which would otherwise turn into LIKE '%%' below
        # and match every stored document.
        return _json_error({
            'success': False,
            'status': 'null url parameter'
        })

    # Open the session only once the request is known to be valid, and close
    # it exactly once on every path out of the function.
    session = Session()
    try:
        # NOTE(review): '%' and '_' inside url act as LIKE wildcards here;
        # consider escaping them -- confirm the intended matching.
        gkg = session.query(Gkg.id).filter(
            Gkg.document_identifier.like("%" + url + "%")).order_by(
                Gkg.date.asc()).first()
        if gkg:
            gkg_id = gkg.id
            status = 'url already in IDETECT DB'
        else:
            analysis = create_new_analysis_from_url(session, url)
            gkg_id = analysis.gkg_id
            status = 'url added to IDETECT DB'
            try:
                work(session, analysis, Status.SCRAPING, Status.SCRAPED,
                     Status.SCRAPING_FAILED, scrape)
                # TODO add classification, missing modules
                # work(session,analysis,Status.CLASSIFYING,Status.CLASSIFIED,Status.CLASSIFYING_FAILED,lambda article: classify(article, get_c_m(), get_r_m()))
                work(session, analysis, Status.EXTRACTING, Status.EXTRACTED,
                     Status.EXTRACTING_FAILED, extract_facts)
                work(session, analysis, Status.GEOTAGGING, Status.GEOTAGGED,
                     Status.GEOTAGGING_FAILED, process_locations)
            except Exception as e:
                return _json_error({
                    'success': False,
                    'Exception': str(e)
                })

        document = get_document(session, gkg_id)
        entries = get_facts_for_document(session, gkg_id)
        resp = jsonify({
            'document': document,
            'facts': entries,
            'status': status
        })
        resp.status_code = 200
        return resp
    finally:
        session.close()