Python Session.query Examples

Programming Language: Python

Namespace/Package Name: idetect.model

Class/Type: Session

Method/Function: query

Examples at hotexamples.com: 14

Python Session.query - 14 examples found. These are the top rated real world Python examples of idetect.model.Session.query extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

Session(16)

query(14)

close(13)

rollback(9)

commit(8)

add(7)

configure(3)

delete(3)

Example #1

Show file

    def work(self):
        """
        Create an Analysis in Status.NEW for Gkg records that have none yet.

        A new Session is opened for each call. Up to 1000 Gkg rows for which
        no Analysis exists are selected, oldest Gkg.date first, and locked
        FOR UPDATE; one Analysis is added and committed per row.

        Returns:
            True iff at least one Gkg without an Analysis was found, False if
            there was no work to do.
        """
        # start a new session for each job
        session = Session()
        try:
            # Get a batch of Gkgs
            # ... for which no Analysis exists
            # ... and lock them for updates
            # ... sort by date (oldest first)
            # ... take at most 1000
            gkgs = session.query(Gkg) \
                .filter(~session.query(Analysis).filter(Gkg.id == Analysis.gkg_id).exists()) \
                .with_for_update() \
                .order_by(Gkg.date) \
                .limit(1000).all()
            if not gkgs:
                return False  # no work to be done
            for gkg in gkgs:
                analysis = Analysis(gkg=gkg, status=Status.NEW)
                session.add(analysis)
                # NOTE(review): committing per row ends the transaction, which
                # presumably releases the FOR UPDATE lock on the rest of the
                # batch -- confirm whether a single commit after the loop is
                # what is wanted here.
                session.commit()
                logger.info(
                    "Worker {} created Analysis {} in status {}".format(
                        os.getpid(), analysis.gkg_id, analysis.status))
        finally:
            # make sure to release a FOR UPDATE lock, if we got one, and
            # discard anything that was not committed
            session.rollback()
            session.close()

        return True

Example #2

Show file

File: test_scraper.py Project: idmc-labs/idetect

class TestScraper(TestCase):
    """
    Tests for ``scrape`` run against the ``idetect_test`` PostgreSQL database.

    NOTE(review): ``scrape`` is given Analyses whose Gkg points at public
    URLs, so these tests presumably need network access -- confirm.
    """

    def setUp(self):
        """Recreate all tables on the DB_HOST database and open a session."""
        db_host = os.environ.get('DB_HOST')
        # NOTE(review): the user value looks redacted in this listing --
        # confirm the real test user before running.
        db_url = 'postgresql://{user}:{passwd}@{db_host}/{db}'.format(
            user='******', passwd='tester', db_host=db_host, db='idetect_test')
        engine = create_engine(db_url)
        Session.configure(bind=engine)
        # start every test from an empty schema
        Base.metadata.drop_all(engine)
        Base.metadata.create_all(engine)
        self.session = Session()

    def tearDown(self):
        """Discard uncommitted state, then delete every Gkg row."""
        self.session.rollback()
        for gkg in self.session.query(Gkg).all():
            self.session.delete(gkg)
        self.session.commit()

    def test_scrape_html(self):
        """Scraping an HTML page stores clean text content that is searchable via content_ts."""
        gkg = Gkg(
            document_identifier="http://www.cnn.com/2013/08/23/us/hurricane-katrina-statistics-fast-facts/index.html")
        analysis = Analysis(gkg=gkg, status=Status.NEW)
        self.session.add(analysis)
        self.session.commit()
        scrape(analysis)
        content = analysis.content
        self.assertEqual("text", content.content_type)
        self.assertIn("Katrina", content.content_clean)
        self.assertIn("Louisiana", content.content_clean)
        self.assertNotIn("\n", content.content_clean)
        self.assertIsNotNone(content.content_ts)
        matches = (
            self.session.query(DocumentContent)
                .filter(DocumentContent.content_ts.match('Katrina & Louisiana')).all()
        )
        self.assertIn(content, matches)

    def test_scrape_pdf(self):
        """Scraping a PDF stores its text, without newlines, as content of type 'pdf'."""
        gkg = Gkg(
            document_identifier="https://www1.ncdc.noaa.gov/pub/data/extremeevents/specialreports/Hurricane-Katrina.pdf")
        analysis = Analysis(gkg=gkg, status=Status.NEW)
        self.session.add(analysis)
        self.session.commit()
        scrape(analysis)
        content = analysis.content
        self.assertEqual("pdf", content.content_type)
        self.assertIn("Katrina", content.content)
        self.assertIn("Louisiana", content.content)
        self.assertNotIn("\n", content.content)

Example #3

Show file

    def work(self):
        """
        Claim one Analysis, run self.function on it and record the outcome.

        A new Session is opened for each call. The first Analysis (ordered by
        Analysis.updated) selected by self.filter_function is locked FOR
        UPDATE and moved to self.working_status. self.function is then run on
        it with signal.alarm(self.timeout_seconds) armed, and the Analysis is
        moved to self.success_status, or to self.failure_status (with
        error_msg set) if an exception is raised.

        Returns:
            True iff an Analysis was processed (successfully or not), False
            if none matched the filter.
        """
        # start a new session for each job
        session = Session()
        try:
            try:
                # Get an analysis
                # ... and lock it for updates
                # ... that meets the conditions specified in the filter function
                # ... sort by updated date
                # ... pick the first (oldest)
                analysis = self.filter_function(session.query(Analysis)) \
                    .with_for_update() \
                    .order_by(Analysis.updated) \
                    .first()
                if analysis is None:
                    return False  # no work to be done
                # remember the status we claimed it in, for logging
                analysis_status = analysis.status
                analysis.create_new_version(self.working_status)
                logger.info("Worker {} claimed Analysis {} in status {}".format(
                    os.getpid(), analysis.gkg_id, analysis_status))
            finally:
                # make sure to release a FOR UPDATE lock, if we got one
                session.rollback()

            start = time.time()
            try:
                # set a timeout so if this worker stalls, we recover
                signal.alarm(self.timeout_seconds)
                # actually run the work function on this analysis
                self.function(analysis)
                delta = time.time() - start
                logger.info("Worker {} processed Analysis {} {} -> {} {}s".format(
                    os.getpid(), analysis.gkg_id, analysis_status,
                    self.success_status, delta))
                analysis.error_msg = None
                analysis.processing_time = delta
                analysis.create_new_version(self.success_status)
            except Exception as e:
                delta = time.time() - start
                logger.warning(
                    "Worker {} failed to process Analysis {} {} -> {}".format(
                        os.getpid(), analysis.gkg_id, analysis_status,
                        self.failure_status),
                    exc_info=e)
                analysis.error_msg = str(e)
                analysis.processing_time = delta
                analysis.create_new_version(self.failure_status)
                session.commit()
            finally:
                # clear the timeout
                signal.alarm(0)
                session.rollback()
        finally:
            # close on every path, including the "no work" early return and
            # failures while claiming, which previously left the session open
            session.close()
        return True

Example #4

Show file

def homepage():
    """
    Render index.html with the 10 most recently updated Analyses plus the
    status and category counts reported by Analysis.
    """
    session = Session()
    try:
        newest_first = desc(Analysis.updated)
        recent = session.query(Analysis).order_by(newest_first).limit(10).all()
        # dict literal keeps the original evaluation order of the three queries
        context = {
            'articles': recent,
            'counts': Analysis.status_counts(session),
            'cat_counts': Analysis.category_counts(session),
        }
        return render_template('index.html', **context)
    finally:
        session.close()

Example #5

Show file

def article(doc_id):
    """
    Render article.html for the Analysis whose gkg_id equals doc_id.

    The template also receives the distinct coordinates of every location
    attached to the Analysis's facts, each as a tuple of the comma-separated
    parts of its latlong string. Locations without a latlong are skipped.
    """
    session = Session()
    try:
        lookup = session.query(Analysis).filter(Analysis.gkg_id == doc_id)
        analysis = lookup.one()
        unique_coords = set()
        for fact in analysis.facts:
            for location in fact.locations:
                if location.latlong is None:
                    continue
                unique_coords.add(tuple(location.latlong.split(",")))
        return render_template('article.html',
                               article=analysis,
                               coords=list(unique_coords))
    finally:
        session.close()

Example #6

Show file

File: test_model.py Project: idmc-labs/idetect

    def test_status_update(self):
        """create_new_version raises NotLatestException once another session has versioned the Analysis."""
        gkg = self.session.query(Gkg).first()
        analysis = Analysis(gkg=gkg, status=Status.NEW)
        self.session.add(analysis)
        self.session.commit()

        analysis.create_new_version(Status.SCRAPING)
        self.assertEqual(analysis.status, Status.SCRAPING)

        # meanwhile, some other process changed the status of this...
        session2 = Session()
        try:
            other = session2.query(Analysis).get(analysis.gkg_id)
            other.create_new_version(Status.SCRAPING_FAILED)
        finally:
            session2.rollback()
            # release the second session as well; it was previously left open
            session2.close()

        # our copy is now stale, so versioning it again must be refused
        with self.assertRaises(NotLatestException):
            analysis.create_new_version(Status.SCRAPED)

Example #7

Show file

def search_url():
    """
    Look up the most recent Gkg whose document_identifier contains ``url``.

    ``url`` is read from the request's query arguments.

    Returns:
        A JSON response ``{'doc_id': <gkg id>}`` with status 200 when a match
        is found, otherwise the JSON body ``{'success': False}`` with status
        422 (also used when ``url`` is missing or empty).
    """
    # 'Content-Type' is the real HTTP header name; the previous 'ContentType'
    # key only added an unrelated header and left the body typed as HTML.
    failure = (json.dumps({'success': False}), 422, {
        'Content-Type': 'application/json'
    })
    url = request.args.get('url')
    # an empty url would turn the pattern below into '%%' and match every row
    if not url:
        return failure
    session = Session()
    try:
        # NOTE(review): '%' and '_' inside url act as LIKE wildcards here --
        # confirm whether they should be escaped.
        gkg = session.query(Gkg).filter(
            Gkg.document_identifier.like("%" + url + "%")).order_by(
                Gkg.date.desc()).first()
        if not gkg:
            return failure
        resp = jsonify({'doc_id': gkg.id})
        resp.status_code = 200
        return resp
    finally:
        session.close()

Example #8

Show file

class TestSyriaYear(TestCase):
    """
    Query and timing checks for the filter helpers, scoped to the Syria
    location ids and dates starting 2017-01-01.

    NOTE(review): the tests connect to the 'idetect' database and assert on
    row counts and wall-clock time, so they presumably require a populated
    database -- confirm before running them elsewhere.
    """

    # location ids passed as location_ids to every Syria-scoped query below
    syria_location_ids = [
        127, 270, 281, 284, 307, 332, 372, 412, 429, 431, 531, 591, 612, 618,
        644, 671, 764, 807, 905, 958, 996, 1018, 1019, 1188, 1190, 1212, 1352,
        1357, 1524, 1678, 1898, 1981, 1990, 2058, 2060, 2272, 2378, 2735, 2933,
        3262, 3323, 3327, 3372, 3391, 3404, 3660, 3708, 3725, 3834, 3915, 3924,
        4069, 4172, 4399, 4509, 4648, 4824, 4890, 5017, 5285, 5833, 6053, 6070,
        6270, 6760, 6832, 7121, 7122, 7151, 7222, 7244, 7248, 7641, 7723, 7749,
        7757, 7827, 7919, 7970, 8078, 8107, 8131, 8166, 8176, 8210, 8222, 8240,
        8254, 8367, 8442, 8659, 8660, 8730, 8788, 8793, 8941, 9045, 9167, 9285,
        9370, 9531, 9606, 9775, 9909, 9913, 9916, 9917, 9933, 10136, 10312,
        10464, 10532, 10795, 10971, 11052, 11076, 11174, 11194, 11216, 11250,
        11311, 11501, 11703, 11727, 11916, 11933, 12242, 12387, 12990, 13126,
        13130, 13142, 13171, 13348, 13531, 13659, 13722, 14225, 14718, 14732,
        14737, 14917, 14930, 14988, 15215, 15257, 15984, 15993, 16188, 17034,
        17090, 17373, 17404, 17873, 18019, 18131, 18267, 18396, 18403, 18578,
        19550, 19641, 19721, 20180, 21339, 21894, 22003, 22022, 22162, 22201,
        22850, 23189, 23414, 23532, 23875, 24851, 25171, 25415, 25894, 25927,
        26024, 26283, 26458, 26545, 26909, 27027, 27393, 27507, 28185, 28626,
        28628, 29703, 29704, 29754, 29942, 30210, 30286, 30302, 30442, 30993,
        31492, 31743
    ]

    # fromdate / todate values used by the tests
    start_date = '2017-01-01'
    plus_1_yr = '2018-01-01'
    plus_6_mo = '2017-07-01'
    plus_3_mo = '2017-04-01'
    plus_1_mo = '2017-02-01'

    def setUp(self):
        """Connect to the 'idetect' database using the DB_* environment variables."""
        logger.debug("setUp")
        worker_logger = logging.getLogger("idetect.worker")
        worker_logger.setLevel(logging.INFO)

        logger.debug("Connecting to DB")
        db_host = os.environ.get('DB_HOST')
        db_port = os.environ.get('DB_PORT', 5432)
        db_user = os.environ.get('DB_USER', 'tester')
        db_pass = os.environ.get('DB_PASSWORD', 'tester')

        db_url = 'postgresql://{user}:{passwd}@{db_host}:{db_port}/{db}'.format(
            user=db_user,
            passwd=db_pass,
            db_host=db_host,
            db_port=db_port,
            db='idetect')
        self.engine = create_engine(db_url, echo=False)
        Session.configure(bind=self.engine)
        self.session = Session()
        # NOTE(review): presumably a warm-up / connectivity check so the
        # timed tests do not pay for the first query -- confirm.
        self.session.query(FactApi).count()
        logger.debug("setUp complete")

    def tearDown(self):
        """Roll back whatever the test left pending on the session."""
        logger.debug("tearDown")
        self.session.rollback()
        logger.debug("sessions rolled back")
        logger.debug("tearDown complete")

    def test_categories(self):
        """Counting facts per category for the year yields the three categories in under a second."""
        syr_year_by_category = add_filters(
            self.session.query(func.count(FactApi.fact), FactApi.category),
            fromdate=self.start_date,
            todate=self.plus_1_yr,
            location_ids=self.syria_location_ids).group_by(FactApi.category)

        t0 = time.time()
        result = {
            category: count
            for count, category in syr_year_by_category.all()
        }
        t1 = time.time()
        # print(result)
        self.assertEqual(set(result.keys()), {'Conflict', 'Disaster', 'Other'})
        # print(explain_text(self.session, syr_year_by_category))
        print(t1 - t0)
        self.assertLess(t1 - t0, 1.0)

    def test_filter_counts(self):
        """get_filter_counts returns more than 1000 entries for the year."""
        f_c = get_filter_counts(self.session,
                                fromdate=self.start_date,
                                todate=self.plus_1_yr,
                                location_ids=self.syria_location_ids)
        # print(f_c)
        self.assertGreater(len(f_c), 1000)

    def test_filter_counts_speed(self):
        """get_filter_counts finishes in under a second for 1, 3 and 6 month ranges."""
        for end_date in (self.plus_1_mo, self.plus_3_mo, self.plus_6_mo):
            # Adding this usually fails: , self.plus_1_yr):
            t0 = time.time()
            f_c = get_filter_counts(self.session,
                                    fromdate=self.start_date,
                                    todate=end_date,
                                    location_ids=self.syria_location_ids)
            t1 = time.time()
            print('{} - {}: {}s'.format(self.start_date, end_date, t1 - t0))
            self.assertLess(
                t1 - t0, 1.0,
                'Calculating filter counts {} - {} took too long'.format(
                    self.start_date, end_date))

    def test_timeline(self):
        """get_timeline_counts covers more than 180 days and all three categories in under a second."""
        t0 = time.time()
        counts = get_timeline_counts(self.session,
                                     fromdate=self.start_date,
                                     todate=self.plus_1_yr,
                                     location_ids=self.syria_location_ids)
        t1 = time.time()

        days = {t['gdelt_day'] for t in counts}
        self.assertGreater(len(days), 180)

        categories = {t['category'] for t in counts}
        self.assertEqual(categories, {'Conflict', 'Disaster', 'Other'})

        print(t1 - t0)
        self.assertLess(
            t1 - t0, 1.0,
            'Calculating timeline counts {} - {} took too long'.format(
                self.start_date, self.plus_1_yr))

    def test_histogram(self):
        """get_histogram_counts spans small and very large figures for both units in under a second."""
        t0 = time.time()
        counts = get_histogram_counts(self.session,
                                      fromdate=self.start_date,
                                      todate=self.plus_1_yr,
                                      location_ids=self.syria_location_ids)
        t1 = time.time()
        print(len(counts))

        # falsy figures (None, 0) are left out of the min/max check
        figures = {
            t['specific_reported_figure']
            for t in counts if t['specific_reported_figure']
        }
        self.assertLess(min(figures), 10)
        self.assertGreater(max(figures), 1000000)

        units = {t['unit'] for t in counts}
        self.assertEqual(units, {'Household', 'Person'})

        print(t1 - t0)
        self.assertLess(
            t1 - t0, 1.0,
            'Calculating histogram counts {} - {} took too long'.format(
                self.start_date, self.plus_1_yr))

    def test_wordcloud(self):
        """get_wordcloud finishes in under five seconds for the year."""
        t0 = time.time()
        terms = get_wordcloud(self.session,
                              self.engine,
                              fromdate=self.start_date,
                              todate=self.plus_1_yr,
                              location_ids=self.syria_location_ids)
        t1 = time.time()
        print(t1 - t0)
        print(len(terms))
        print(tabulate(terms))
        self.assertLess(
            t1 - t0, 5.0, 'Calculating wordcloud {} - {} took too long'.format(
                self.start_date, self.plus_1_yr))

    def test_none_location(self):
        """'NULL', 'null' and None are interchangeable as a location id filter."""
        # TODO this isn't about Syria, move it somewhere else
        counts = get_filter_counts(self.session, location_ids=['NULL'])
        self.assertGreater(len(counts), 1000)

        self.assertEqual(
            counts, get_filter_counts(self.session, location_ids=['null']))
        self.assertEqual(counts,
                         get_filter_counts(self.session, location_ids=[None]))

        counts2 = get_filter_counts(self.session, location_ids=['NULL', 1])
        self.assertGreater(len(counts2), len(counts))

    def test_none_specific_reported_figure_1(self):
        """Filtering on a null figure alone yields exactly one figure entry, whose value is None."""
        # TODO this isn't about Syria, move it somewhere else
        counts = get_filter_counts(self.session,
                                   specific_reported_figures=['NULL'])
        srf_counts = [
            c for c in counts if c['filter_type'] == 'specific_reported_figure'
        ]
        self.assertEqual(len(srf_counts), 1)
        self.assertTrue(any(c['value'] is None for c in srf_counts))

        self.assertEqual(
            counts,
            get_filter_counts(self.session,
                              specific_reported_figures=['null']))
        self.assertEqual(
            counts,
            get_filter_counts(self.session, specific_reported_figures=[None]))

    def test_none_specific_reported_figure_2(self):
        """Filtering on null and 1 yields exactly those two figure entries."""
        # TODO this isn't about Syria, move it somewhere else
        counts = get_filter_counts(self.session,
                                   specific_reported_figures=['NULL', 1])
        srf_counts = [
            c for c in counts if c['filter_type'] == 'specific_reported_figure'
        ]
        self.assertEqual(len(srf_counts), 2)
        self.assertTrue(any(c['value'] is None for c in srf_counts))
        self.assertTrue(any(c['value'] == 1 for c in srf_counts))

    def test_none_specific_reported_figure_3(self):
        """Filtering on null, 1 and 1000 yields more than two figure entries, including None and 1."""
        # TODO this isn't about Syria, move it somewhere else
        counts = get_filter_counts(self.session,
                                   specific_reported_figures=['NULL', 1, 1000])
        srf_counts = [
            c for c in counts if c['filter_type'] == 'specific_reported_figure'
        ]
        self.assertGreater(len(srf_counts), 2)
        self.assertTrue(any(c['value'] is None for c in srf_counts))
        self.assertTrue(any(c['value'] == 1 for c in srf_counts))

    def test_specific_reported_figure(self):
        """Filtering on 1 and 1000 yields more than two figure entries, none of them None."""
        # TODO this isn't about Syria, move it somewhere else
        counts = get_filter_counts(self.session,
                                   specific_reported_figures=[1, 1000])
        srf_counts = [
            c for c in counts if c['filter_type'] == 'specific_reported_figure'
        ]
        self.assertGreater(len(srf_counts), 2)
        self.assertFalse(any(c['value'] is None for c in srf_counts))
        self.assertTrue(any(c['value'] == 1 for c in srf_counts))

    def test_filter_ts(self):
        """Every row returned for ts='Jordan' mentions jordan in its clean content."""
        t0 = time.time()
        query = add_filters(self.session.query(FactApi.content_id,
                                               DocumentContent.content_clean),
                            fromdate=self.start_date,
                            todate=self.plus_1_yr,
                            location_ids=self.syria_location_ids,
                            ts='Jordan').order_by(FactApi.gdelt_day).limit(32)
        results = query.all()
        t1 = time.time()
        print(t1 - t0)
        for _content_id, content_clean in results:
            self.assertIn('jordan', content_clean.lower())

    @skip("Too slow in practice")
    def test_filter_ts_exhaustive(self):
        """The ts='Jordan' query returns exactly the rows whose content mentions jordan."""
        # make sure that the query found everything that it was supposed to
        t0 = time.time()
        query = add_filters(self.session.query(FactApi.content_id,
                                               DocumentContent.content_clean,
                                               FactApi.gdelt_day),
                            fromdate=self.start_date,
                            todate=self.plus_1_yr,
                            location_ids=self.syria_location_ids,
                            ts='Jordan').order_by(FactApi.gdelt_day).limit(32)
        results = query.all()
        t1 = time.time()
        print(t1 - t0)
        matched = set()
        max_day = date.min
        for content_id, content_clean, gdelt_day in results:
            self.assertIn('jordan', content_clean.lower())
            matched.add(content_id)
            max_day = max(max_day, gdelt_day)

        # NOTE(review): the first query is capped at 32 rows, so matches on
        # max_day beyond that cap would presumably fail the check below --
        # confirm if this test is ever re-enabled.
        t2 = time.time()
        query = add_filters(self.session.query(FactApi.content_id,
                                               DocumentContent.content_clean),
                            fromdate=self.start_date,
                            todate=self.plus_1_yr,
                            location_ids=self.syria_location_ids).filter(
                                FactApi.gdelt_day <= max_day)
        results = query.all()
        t3 = time.time()
        # elapsed time of the unfiltered query (was printed with the operands
        # swapped, i.e. as a negative number)
        print(t3 - t2)
        print(len(results))
        for content_id, content_clean in results:
            self.assertEqual(content_id in matched,
                             'jordan' in content_clean.lower())

    def test_urllist(self):
        """get_urllist pages are ordered by (gdelt_day, gkg_id) and carry display_color."""
        t0 = time.time()
        result1 = get_urllist(self.session,
                              fromdate=self.start_date,
                              todate=self.plus_1_yr,
                              location_ids=self.syria_location_ids)
        t1 = time.time()
        print(t1 - t0)
        # materialise once so the result can be counted, iterated and indexed
        rows1 = list(result1)
        self.assertEqual(32, len(rows1))
        t2 = time.time()
        result2 = get_urllist(self.session,
                              offset=32,
                              limit=100,
                              fromdate=self.start_date,
                              todate=self.plus_1_yr,
                              location_ids=self.syria_location_ids)
        t3 = time.time()
        print(t3 - t2)
        rows2 = list(result2)
        self.assertEqual(100, len(rows2))
        # everything on the first page must sort at or before the second page
        for r1 in rows1:
            # print(r1)
            for r2 in rows2:
                self.assertLessEqual(
                    (r1['gdelt_day'], r1['gkg_id']),
                    (r2['gdelt_day'], r2['gkg_id']),
                )
        self.assertIn('display_color', rows1[0])

    def test_urllist_unique_fact_id(self):
        """get_urllist never repeats a fact_id, even with a very large limit."""
        t0 = time.time()
        result1 = get_urllist(self.session,
                              fromdate=self.start_date,
                              todate=self.plus_1_yr,
                              limit=1000000,
                              location_ids=self.syria_location_ids)
        t1 = time.time()
        print(t1 - t0)
        ids = [f['fact_id'] for f in result1]
        self.assertGreater(len(ids), 1000)
        self.assertEqual(len(ids), len(set(ids)))

    def test_urllist_ts(self):
        """get_urllist with ts='Jordan' pages in (gdelt_day, gkg_id) order."""
        t0 = time.time()
        result1 = get_urllist(self.session,
                              fromdate=self.start_date,
                              todate=self.plus_1_yr,
                              location_ids=self.syria_location_ids,
                              ts='Jordan')
        t1 = time.time()
        print(t1 - t0)
        # materialise once so the result can be counted and iterated
        rows1 = list(result1)
        self.assertEqual(32, len(rows1))
        t2 = time.time()
        result2 = get_urllist(self.session,
                              offset=32,
                              limit=100,
                              fromdate=self.start_date,
                              todate=self.plus_1_yr,
                              location_ids=self.syria_location_ids,
                              ts='Jordan')
        t3 = time.time()
        print(t3 - t2)
        rows2 = list(result2)
        self.assertEqual(100, len(rows2))
        for r1 in rows1:
            # print(r1)
            for r2 in rows2:
                self.assertLessEqual(
                    (r1['gdelt_day'], r1['gkg_id']),
                    (r2['gdelt_day'], r2['gkg_id']),
                )

    def test_count_ts(self):
        """get_count with ts='Jordan' lands between 5000 and 10000 for the year."""
        t0 = time.time()
        c = get_count(self.session,
                      fromdate=self.start_date,
                      todate=self.plus_1_yr,
                      location_ids=self.syria_location_ids,
                      ts='Jordan')
        t1 = time.time()
        print(t1 - t0)
        self.assertGreater(c, 5000)
        self.assertLess(c, 10000)

    def test_urllist_grouped(self):
        """Each grouped entry agrees with its facts on figure, unit and term, and repeats no fact."""
        t0 = time.time()
        result = get_urllist_grouped(self.session,
                                     fromdate=self.start_date,
                                     todate=self.plus_1_yr,
                                     location_ids=self.syria_location_ids,
                                     limit=100)
        t1 = time.time()
        print(t1 - t0)
        self.assertEqual(100, len(result))
        for entry in result:
            self.assertEqual(entry['nfacts'], len(entry['entry']))
            for fact in entry['entry']:
                self.assertEqual(entry['specific_reported_figure'],
                                 fact['specific_reported_figure'])
                self.assertEqual(entry['unit'], fact['unit'])
                self.assertEqual(entry['term'], fact['term'])
                self.assertGreater(len(fact['content_clean']), 0)
            fact_ids = [f['fact'] for f in entry['entry']]
            self.assertEqual(len(fact_ids), len(set(fact_ids)),
                             "fact repeated")

Example #9

Show file

class TestGeoTagger(TestCase):
    """
    Tests for get_geo_info and process_locations against the
    ``idetect_test`` PostgreSQL database.

    NOTE(review): the get_geo_info tests assert on real place data, so they
    presumably call an external geocoding service -- confirm.
    """

    def setUp(self):
        """Recreate all tables on the DB_HOST database, open a session and load countries."""
        db_host = os.environ.get('DB_HOST')
        # NOTE(review): the user value looks redacted in this listing --
        # confirm the real test user before running.
        db_url = 'postgresql://{user}:{passwd}@{db_host}/{db}'.format(
            user='******', passwd='tester', db_host=db_host, db='idetect_test')
        engine = create_engine(db_url)
        Session.configure(bind=engine)
        # start every test from an empty schema
        Base.metadata.drop_all(engine)
        Base.metadata.create_all(engine)
        self.session = Session()
        load_countries(self.session)

    def tearDown(self):
        """Roll back whatever the test left pending on the session."""
        self.session.rollback()

    def _new_gkg(self):
        """Build the Gkg record shared by the process_locations tests (not yet added to the session)."""
        return Gkg(
            id=3771256,
            gkgrecordid="20170215174500-2503",
            date=20170215174500,
            document_identifier=
            "http://www.philstar.com/headlines/2017/02/16/1672746/yasay-harris-affirm-stronger-phl-us-ties"
        )

    def test_sets_no_results_flag(self):
        """Sets no-results flag if nothing found"""
        results = get_geo_info("xghijdshfkljdes")
        self.assertEqual(results['flag'], "no-results")

    def test_returns_detail_for_places(self):
        """Returns sufficient level of detail for results"""
        results = get_geo_info("Paris")
        self.assertNotEqual(results['country_code'], '')
        self.assertNotEqual(results['coordinates'], '')
        self.assertNotEqual(results['type'], '')

    def test_accuracy(self):
        """Returns the expected country code and coordinates for Beijing"""
        results = get_geo_info("Beijing")
        self.assertEqual(results['country_code'], 'CHN')
        self.assertEqual(results['coordinates'], "39.9059631,116.391248")

    def test_country_code(self):
        """Returns the expected country code for a camp, a city and a county"""
        results = get_geo_info("Bidibidi")
        self.assertEqual(results['country_code'], 'UGA')
        results = get_geo_info("Marrakech")
        self.assertEqual(results['country_code'], 'MAR')
        results = get_geo_info("Fairfax County")
        self.assertEqual(results['country_code'], 'USA')

    def test_location_types(self):
        """Correctly distinguishes between Countries, Cities and Subdivisions"""
        results = get_geo_info("London")
        self.assertEqual(results['type'], LocationType.CITY)
        results = get_geo_info("India")
        self.assertEqual(results['type'], LocationType.COUNTRY)
        results = get_geo_info("Alaska")
        self.assertEqual(results['type'], LocationType.SUBDIVISION)

    # Geotagging must not be run if location detail already exists.
    # NOTE(review): the name lacks the ``test_`` prefix, so unittest does not
    # collect this method -- confirm whether it is disabled on purpose.
    @mock.patch('idetect.geotagger.nominatim_coordinates')
    def dont_geotag_if_detail_exists(self, nominatim):
        """process_locations does not call nominatim_coordinates for a location already in the DB"""
        gkg = self._new_gkg()
        self.session.add(gkg)
        analysis = Analysis(gkg=gkg, status=Status.NEW)
        self.session.add(analysis)
        content = DocumentContent(
            content_clean=
            "It was early Saturday when a flash flood hit large parts of India and Pakistan and washed away more than 500 houses"
        )
        self.session.add(content)
        self.session.commit()
        analysis.content_id = content.id
        self.session.commit()
        fact = Fact(unit='person', term='displaced')
        self.session.add(fact)
        self.session.commit()
        loc1 = self.session.query(Location).filter(
            Location.location_name == 'India').one_or_none()
        fact.locations.append(loc1)
        analysis.facts.append(fact)
        self.session.commit()
        process_locations(analysis)
        self.assertFalse(nominatim.called)

    def test_create_duplicate_fact(self):
        """Creates duplicate fact if locations from multiple countries exist"""
        gkg = self._new_gkg()
        self.session.add(gkg)
        analysis = Analysis(gkg=gkg, status=Status.NEW)
        self.session.add(analysis)
        self.session.commit()
        fact = Fact(unit='person', term='displaced')
        self.session.add(fact)
        self.session.commit()
        loc1 = self.session.query(Location).filter(
            Location.location_name == 'India').one_or_none()
        loc2 = self.session.query(Location).filter(
            Location.location_name == 'Pakistan').one_or_none()
        fact.locations.append(loc1)
        fact.locations.append(loc2)
        analysis.facts.append(fact)
        self.session.commit()
        self.assertEqual(1, len(analysis.facts))
        process_locations(analysis)
        # one fact per country, each keeping only its own location
        self.assertEqual(2, len(analysis.facts))
        fact_countries = [f.iso3 for f in analysis.facts]
        self.assertIn('IND', fact_countries)
        self.assertIn('PAK', fact_countries)
        self.assertEqual(1, len(analysis.facts[0].locations))
        self.assertEqual(1, len(analysis.facts[1].locations))

    @mock.patch('idetect.geotagger.nominatim_coordinates')
    def test_fail_if_geotagging_fails(self, nominatim):
        """Location processing should fail if geotagging fails"""
        nominatim.side_effect = GeotagException()
        gkg = self._new_gkg()
        self.session.add(gkg)
        analysis = Analysis(gkg=gkg, status=Status.NEW)
        self.session.add(analysis)
        self.session.commit()
        fact = Fact(unit='person', term='displaced')
        self.session.add(fact)
        self.session.commit()
        loc1 = Location(location_name="Ruislip")
        fact.locations.append(loc1)
        analysis.facts.append(fact)
        self.session.commit()
        with self.assertRaises(GeotagException):
            process_locations(analysis)

Example #10

Show file

class TestFactExtractor(TestCase):
    """Database-backed tests for ``extract_facts``.

    Every test runs against a schema that ``setUp`` drops and recreates,
    with countries and terms loaded through ``load_countries`` and
    ``load_terms``.
    """

    def setUp(self):
        """Recreate the schema, open a session and load reference data."""
        db_host = os.environ.get('DB_HOST')
        # NOTE(review): '******' looks like a redacted user name rather than
        # a real account -- confirm the intended credentials.
        db_url = 'postgresql://{user}:{passwd}@{db_host}/{db}'.format(
            user='******', passwd='tester', db_host=db_host, db='idetect_test')
        # Kept on the instance so tearDown can dispose of its connections.
        self.engine = create_engine(db_url)
        Session.configure(bind=self.engine)
        Base.metadata.drop_all(self.engine)
        Base.metadata.create_all(self.engine)
        self.session = Session()
        load_countries(self.session)
        load_terms(self.session)

    def tearDown(self):
        """Delete every Gkg row, then release the session and the engine."""
        try:
            self.session.rollback()
            for article in self.session.query(Gkg).all():
                self.session.delete(article)
            self.session.commit()
        finally:
            # A new engine is created per test, so release this one's
            # connections even when the cleanup above fails.
            self.session.close()
            self.engine.dispose()

    def _extract(self, text):
        """Persist an Analysis whose cleaned content is *text*, run
        ``extract_facts`` on it and return the Analysis."""
        analysis = Analysis(gkg=Gkg(), status=Status.NEW)
        self.session.add(analysis)
        content = DocumentContent(content_clean=text)
        self.session.add(content)
        # Commit first, then link the content to the analysis through its id.
        self.session.commit()
        analysis.content_id = content.id
        self.session.commit()
        extract_facts(analysis)
        return analysis

    def _assert_first_term(self, text, expected_term):
        """Assert that the first fact extracted from *text* has *expected_term*."""
        analysis = self._extract(text)
        # Fail with a readable message instead of an IndexError below.
        self.assertTrue(analysis.facts, "no facts were extracted")
        self.assertEqual(expected_term, analysis.facts[0].term)

    def test_extract_facts_simple(self):
        """Extracts simple facts when present and saves to DB"""
        analysis = self._extract(
            "It was early Saturday when a flash flood hit the area and washed away more than 500 houses"
        )
        self.assertEqual(1, len(analysis.facts))

    def test_extract_refugee_facts(self):
        """Extracts refugee-related facts with Refugee Term"""
        self._assert_first_term(
            "It was early Saturday when government troops entered the area and forced more than 20000 refugees to flee.",
            FactTerm.REFUGEE)

    def test_extract_evicted_facts(self):
        """Extracts eviction-related facts ("have been evicted") with eviction Term"""
        self._assert_first_term(
            "2000 people have been evicted from their homes in Bosnia",
            FactTerm.EVICTED)

    def test_extract_eviction_facts(self):
        """Extracts eviction-related facts ("ordered eviction") with eviction Term"""
        self._assert_first_term(
            "ordered eviction for 2000 people from their homes in Bosnia",
            FactTerm.EVICTED)

    def test_extract_forced_eviction_facts(self):
        """Extracts eviction-related facts ("forced eviction") with eviction Term"""
        self._assert_first_term(
            "ordered forced eviction for 2000 people from their homes in Bosnia",
            FactTerm.EVICTED)

    def test_extract_forcibly_evicted_facts(self):
        """Extracts eviction-related facts ("forcibly evicted") with eviction Term"""
        self._assert_first_term(
            "2000 people were forcibly evicted from their homes in Bosnia",
            FactTerm.EVICTED)

    def test_extract_sacked_facts(self):
        """Extracts sacked-related facts with sacked Term"""
        self._assert_first_term(
            "last week 2000 people have been sacked from their homes in Nigeria",
            FactTerm.SACKED)

    def test_create_locations_with_names(self):
        """Creates locations for facts only with location names"""
        analysis = self._extract(
            "It was early Saturday when a flash flood hit large parts of London and Middlesex and washed away more than 500 houses"
        )
        facts = analysis.facts
        self.assertEqual(1, len(facts))
        fact = facts[0]
        self.assertEqual(2, len(fact.locations))
        loc_names = [loc.location_name for loc in fact.locations]
        self.assertIn('London', loc_names)
        self.assertIn('Middlesex', loc_names)
        self.assertEqual([None, None], [loc.country for loc in fact.locations])

    def test_use_existing_location(self):
        """Uses existing locations when they exist"""
        # Added before the helper's first commit, so the location is already
        # stored when extract_facts runs.
        location = Location(location_name='Bosnia')
        self.session.add(location)
        analysis = self._extract(
            "It was early Saturday when a flash flood hit large parts of Bosnia and washed away more than 500 houses"
        )
        self.assertTrue(analysis.facts, "no facts were extracted")
        fact = analysis.facts[0]
        self.assertTrue(fact.locations, "extracted fact has no locations")
        extracted_location = fact.locations[0]
        self.assertEqual(location.id, extracted_location.id)

Example #11

Show file

import string
import numpy as np
import pandas as pd
from idetect.nlp_models.category import *
from idetect.nlp_models.relevance import *
from idetect.nlp_models.base_model import CustomSklLsiModel

if __name__ == "__main__":

    # Create the Database
    engine = create_engine(db_url())
    Session.configure(bind=engine)
    Base.metadata.create_all(engine)

    session = Session()
    try:
        # Load the Countries data if necessary. first() returns None for an
        # empty table, so no full table load is needed just to test emptiness.
        if session.query(Country).first() is None:
            load_countries(session)

        # Load the Keywords if necessary
        if session.query(FactKeyword).first() is None:
            load_terms(session)
    finally:
        # Close the session even when one of the loaders raises.
        session.close()

    # Load the Classifier models once to ensure they are downloaded
    CategoryModel()
    RelevanceModel()

Example #12

Show file

File: test_manager.py Project: idmc-labs/idetect

class TestManager(TestCase):
    """Sanity and timing checks for ``get_timeline_counts``,
    ``get_histogram_counts``, ``get_wordcloud`` and ``get_map_week``.

    The tests connect to the ``idetect`` database and only read from it.
    NOTE(review): the assertions on row volumes presumably require a
    populated database -- confirm against the test environment.
    """
    start_date = '2017-01-01'
    plus_1_yr = '2018-01-01'
    plus_6_mo = '2017-07-01'
    plus_3_mo = '2017-04-01'
    plus_1_mo = '2017-02-01'

    def setUp(self):
        """Connect to the database described by the DB_* environment variables."""
        logger.debug("setUp")
        worker_logger = logging.getLogger("idetect.worker")
        worker_logger.setLevel(logging.INFO)

        logger.debug("Connecting to DB")
        db_host = os.environ.get('DB_HOST')
        db_port = os.environ.get('DB_PORT', 5432)
        db_user = os.environ.get('DB_USER', 'tester')
        db_pass = os.environ.get('DB_PASSWORD', 'tester')

        db_url = 'postgresql://{user}:{passwd}@{db_host}:{db_port}/{db}'.format(
            user=db_user,
            passwd=db_pass,
            db_host=db_host,
            db_port=db_port,
            db='idetect')
        self.engine = create_engine(db_url, echo=False)
        Session.configure(bind=self.engine)
        self.session = Session()
        # Issue one query up front, outside the timed sections of the tests.
        self.session.query(FactApi).count()
        logger.debug("setUp complete")

    def tearDown(self):
        """Roll back, then release the session and the per-test engine."""
        logger.debug("tearDown")
        try:
            self.session.rollback()
            logger.debug("sessions rolled back")
        finally:
            # setUp creates a new engine for every test; release its
            # connections so they do not accumulate across the suite.
            self.session.close()
            self.engine.dispose()
        logger.debug("tearDown complete")

    def _assert_fast(self, elapsed, limit, what):
        """Assert that *elapsed* seconds is below *limit* for the query *what*."""
        self.assertLess(
            elapsed, limit,
            'Calculating {} {} - {} took too long'.format(
                what, self.start_date, self.plus_1_yr))

    def _check_timeline(self, **date_range):
        """Time get_timeline_counts(**date_range) and validate its rows."""
        t0 = time.time()
        counts = get_timeline_counts(self.session, **date_range)
        elapsed = time.time() - t0

        days = {t['gdelt_day'] for t in counts}
        self.assertGreater(len(days), 180)

        categories = {t['category'] for t in counts}
        self.assertEqual(categories, {'Conflict', 'Disaster', 'Other'})

        print(elapsed)
        self._assert_fast(elapsed, 1.0, 'timeline counts')

    def _check_histogram(self, **date_range):
        """Time get_histogram_counts(**date_range) and validate its rows."""
        t0 = time.time()
        counts = get_histogram_counts(self.session, **date_range)
        elapsed = time.time() - t0
        print(len(counts))

        figures = {
            t['specific_reported_figure']
            for t in counts if t['specific_reported_figure']
        }
        # min()/max() raise ValueError on an empty set; fail as an assertion.
        self.assertTrue(figures, "no specific_reported_figure values returned")
        self.assertLess(min(figures), 10)
        self.assertGreater(max(figures), 1000000)

        units = {t['unit'] for t in counts}
        self.assertEqual(units, {'Household', 'Person'})

        print(elapsed)
        self._assert_fast(elapsed, 1.0, 'histogram counts')

    def _check_wordcloud(self, **date_range):
        """Time get_wordcloud(**date_range) and print the resulting terms."""
        t0 = time.time()
        terms = get_wordcloud(self.session, self.engine, **date_range)
        elapsed = time.time() - t0
        print(elapsed)
        print(len(terms))
        print(tabulate(terms))
        self._assert_fast(elapsed, 5.0, 'wordcloud')

    def test_timeline(self):
        """Timeline counts without a date filter."""
        self._check_timeline()

    def test_histogram(self):
        """Histogram counts without a date filter."""
        self._check_histogram()

    def test_wordcloud(self):
        """Wordcloud without a date filter."""
        self._check_wordcloud()

    def test_timeline_year(self):
        """Timeline counts restricted to start_date .. plus_1_yr."""
        self._check_timeline(fromdate=self.start_date, todate=self.plus_1_yr)

    def test_histogram_year(self):
        """Histogram counts restricted to start_date .. plus_1_yr."""
        self._check_histogram(fromdate=self.start_date, todate=self.plus_1_yr)

    def test_wordcloud_year(self):
        """Wordcloud restricted to start_date .. plus_1_yr."""
        self._check_wordcloud(fromdate=self.start_date, todate=self.plus_1_yr)

    def test_map_week(self):
        """get_map_week returns a single item that carries an 'entries' value."""
        t0 = time.time()
        entries = get_map_week(self.session)
        t1 = time.time()
        print(t1 - t0)
        # print(json.dumps(entries, indent=2))
        self.assertEqual(len(entries), 1)
        self.assertIsNotNone(entries[0].get('entries'))

Example #13

Show file

File: test_model.py Project: idmc-labs/idetect

class TestModel(TestCase):
    """Database-backed tests for Analysis versioning, status counts and
    country terms.

    ``setUp`` drops and recreates the schema and inserts two Gkg rows.
    """

    def setUp(self):
        """Recreate the schema, open a session and insert the sample rows."""
        db_host = os.environ.get('DB_HOST')
        # NOTE(review): '******' looks like a redacted user name rather than
        # a real account -- confirm the intended credentials.
        db_url = 'postgresql://{user}:{passwd}@{db_host}/{db}'.format(
            user='******', passwd='tester', db_host=db_host, db='idetect_test')
        # Kept on the instance so tearDown can dispose of its connections.
        self.engine = create_engine(db_url)
        Session.configure(bind=self.engine)
        Base.metadata.drop_all(self.engine)
        Base.metadata.create_all(self.engine)
        self.session = Session()
        self.sample_data()

    def sample_data(self):
        """Insert and commit the two Gkg rows the tests operate on."""
        gkg1 = Gkg(
            id=3771256,
            gkgrecordid="20170215174500-2503",
            date=20170215174500,
            document_identifier=
            "http://www.philstar.com/headlines/2017/02/16/1672746/yasay-harris-affirm-stronger-phl-us-ties"
        )
        self.session.add(gkg1)

        gkg2 = Gkg(
            id=3771257,
            gkgrecordid="20170215174500-1536",
            date=20170215174500,
            document_identifier=
            "http://wynkcountry.iheart.com/onair/cmt-cody-alan-54719/thomas-rhett-and-lauren-akins-are-15565244/"
        )
        self.session.add(gkg2)
        self.session.commit()

    def tearDown(self):
        """Delete every Gkg row, then release the session and the engine."""
        try:
            self.session.rollback()
            for gkg in self.session.query(Gkg).all():
                self.session.delete(gkg)
            self.session.commit()
        finally:
            # A new engine is created per test, so release this one's
            # connections even when the cleanup above fails.
            self.session.close()
            self.engine.dispose()

    def _assert_history(self, history, total, per_status):
        """Assert the size of *history* and its row count for each status.

        *history* is a query over AnalysisHistory, *total* the expected
        overall row count and *per_status* a mapping of Status to the
        expected number of rows with that status.
        """
        self.assertEqual(total, history.count())
        for status, expected in per_status.items():
            self.assertEqual(
                expected,
                history.filter(AnalysisHistory.status == status).count(),
                "unexpected number of history rows with status {}".format(
                    status))

    def test_status_update(self):
        """A stale Analysis cannot create a version after another session did."""
        gkg = self.session.query(Gkg).first()
        analysis = Analysis(gkg=gkg, status=Status.NEW)
        self.session.add(analysis)
        self.session.commit()

        analysis.create_new_version(Status.SCRAPING)
        self.assertEqual(analysis.status, Status.SCRAPING)

        # meanwhile, some other process changed the status of this...
        session2 = Session()
        try:
            other = session2.query(Analysis).get(analysis.gkg_id)
            other.create_new_version(Status.SCRAPING_FAILED)
        finally:
            session2.rollback()
            # Release the second session's connection as well.
            session2.close()

        with self.assertRaises(NotLatestException):
            analysis.create_new_version(Status.SCRAPED)

    def test_version_lifecycle(self):
        """Each create_new_version call archives the previous state in history."""
        gkg = self.session.query(Gkg).first()
        analysis = Analysis(gkg=gkg, status=Status.NEW)
        self.session.add(analysis)
        self.session.commit()

        analysis.create_new_version(Status.SCRAPING)

        history = self.session.query(AnalysisHistory).filter(
            AnalysisHistory.gkg == gkg)
        # Expected history rows per status, extended as versions are created.
        expected = {Status.NEW: 1}
        self._assert_history(history, 1, expected)

        content = DocumentContent(content_type="text/html",
                                  content="Lorem ipsum")
        analysis.content = content
        analysis.create_new_version(Status.SCRAPED)

        expected[Status.SCRAPING] = 1
        self._assert_history(history, 2, expected)

        analysis.create_new_version(Status.EXTRACTING)

        expected[Status.SCRAPED] = 1
        self._assert_history(history, 3, expected)

        # content is preserved
        scraped = history.filter(
            AnalysisHistory.status == Status.SCRAPED).one_or_none()
        self.assertIsNotNone(scraped)
        self.assertEqual(analysis.content, scraped.content)

        fact = Fact(analysis_date=datetime.now())
        analysis.facts = [fact]
        analysis.create_new_version(Status.EXTRACTED)

        expected[Status.EXTRACTING] = 1
        self._assert_history(history, 4, expected)

        # content still preserved
        extracting = history.filter(
            AnalysisHistory.status == Status.EXTRACTING).one_or_none()
        self.assertIsNotNone(extracting)
        self.assertEqual(analysis.content, extracting.content)

        analysis.create_new_version(Status.EDITING)
        analysis.content = DocumentContent(content_type="text/html",
                                           content="Lorem edited")
        analysis.create_new_version(Status.EDITED)

        expected[Status.EXTRACTED] = 1
        expected[Status.EDITING] = 1
        self._assert_history(history, 6, expected)

        # content has changed, but reports are preserved
        extracted = history.filter(
            AnalysisHistory.status == Status.EXTRACTED).one_or_none()
        self.assertIsNotNone(extracted)
        self.assertNotEqual(analysis.content.id, extracted.content.id)
        self.assertCountEqual([f.id for f in analysis.facts],
                              [f.id for f in extracted.facts])

        analysis.create_new_version(Status.EDITING)
        fact2 = Fact(analysis_date=datetime.now())
        analysis.facts.append(fact2)
        analysis.create_new_version(Status.EDITED)

        expected[Status.EDITING] = 2
        expected[Status.EDITED] = 1
        self._assert_history(history, 8, expected)

        # the archived EDITED version keeps only the first fact
        edited = history.filter(
            AnalysisHistory.status == Status.EDITED).one_or_none()
        self.assertIsNotNone(edited)
        self.assertCountEqual([f.id for f in analysis.facts],
                              [fact.id, fact2.id])
        self.assertCountEqual([f.id for f in edited.facts], [fact.id])

    def test_status_counts(self):
        """Analysis.status_counts reflects the current status of each analysis."""
        gkgs = self.session.query(Gkg).all()[:2]
        analysis1 = Analysis(gkg=gkgs[0], status=Status.NEW)
        self.session.add(analysis1)
        self.session.commit()

        self.assertEqual(Analysis.status_counts(self.session), {Status.NEW: 1})

        analysis1.create_new_version(Status.SCRAPING)

        self.assertEqual(Analysis.status_counts(self.session),
                         {Status.SCRAPING: 1})

        analysis2 = Analysis(gkg=gkgs[1], status=Status.NEW)
        self.session.add(analysis2)
        self.session.commit()

        self.assertEqual(Analysis.status_counts(self.session), {
            Status.NEW: 1,
            Status.SCRAPING: 1
        })

        analysis2.create_new_version(Status.SCRAPING)

        self.assertEqual(Analysis.status_counts(self.session),
                         {Status.SCRAPING: 2})

        analysis2.create_new_version(Status.SCRAPED)

        self.assertEqual(Analysis.status_counts(self.session), {
            Status.SCRAPED: 1,
            Status.SCRAPING: 1
        })

    def test_country_term(self):
        """Terms and locations built with the same Country share that country."""
        mmr = Country(iso3="MMR", preferred_term="Myanmar")
        myanmar = CountryTerm(term="Myanmar", country=mmr)
        burma = CountryTerm(term="Burma", country=mmr)
        yangon = Location(location_name="Yangon",
                          location_type=LocationType.CITY,
                          country=mmr,
                          latlong="16°51′N 96°11′E")

        self.assertEqual(yangon.country, myanmar.country)
        self.assertEqual(yangon.country, burma.country)

Example #14

Show file

def analyse_url():
    """Return the document and facts stored for the url given in the request.

    The url is read from the JSON body, falling back to the form data. If a
    Gkg row whose ``document_identifier`` contains the url already exists,
    the oldest such row is used; otherwise a new analysis is created and run
    through scraping, fact extraction and geotagging.

    Returns:
        A 200 JSON response with ``document``, ``facts`` and ``status``, or
        a ``(body, 422, headers)`` tuple when the url is missing or empty or
        when one of the processing steps raises.
    """
    try:
        # get_json(silent=True) yields None for a non-JSON body; fall back to
        # an empty mapping so form-encoded requests reach request.form.
        payload = request.get_json(silent=True) or {}
        url = payload.get('url') or request.form['url']
    except Exception as e:
        # Request boundary: any failure to read the parameter is a 422.
        return _unprocessable({
            'success': False,
            'Exception': str(e),
            'status': 'missing or null url parameter'
        })
    if not url:
        # An empty url would turn the LIKE pattern below into '%%' and match
        # every stored document.
        return _unprocessable({
            'success': False,
            'status': 'null url parameter'
        })

    # Open the session only once the input is valid, and close it on every
    # path out of the function.
    session = Session()
    try:
        # NOTE(review): '%' and '_' inside url act as LIKE wildcards here --
        # confirm whether a literal substring match is intended.
        gkg = session.query(Gkg.id).filter(
            Gkg.document_identifier.like("%" + url + "%")).order_by(
                Gkg.date.asc()).first()
        if gkg:
            gkg_id = gkg.id
            status = 'url already in IDETECT DB'
        else:
            analysis = create_new_analysis_from_url(session, url)
            gkg_id = analysis.gkg_id
            status = 'url added to IDETECT DB'
            try:
                work(session, analysis, Status.SCRAPING, Status.SCRAPED,
                     Status.SCRAPING_FAILED, scrape)
                # TODO add classification, missing modules
                # work(session,analysis,Status.CLASSIFYING,Status.CLASSIFIED,Status.CLASSIFYING_FAILED,lambda article: classify(article, get_c_m(), get_r_m()))
                work(session, analysis, Status.EXTRACTING, Status.EXTRACTED,
                     Status.EXTRACTING_FAILED, extract_facts)
                work(session, analysis, Status.GEOTAGGING, Status.GEOTAGGED,
                     Status.GEOTAGGING_FAILED, process_locations)
            except Exception as e:
                return _unprocessable({
                    'success': False,
                    'Exception': str(e)
                })
            finally:
                # Close the session once the pipeline is done, as before;
                # it is used again for the read queries below.
                session.close()
        document = get_document(session, gkg_id)
        entries = get_facts_for_document(session, gkg_id)
        resp = jsonify({
            'document': document,
            'facts': entries,
            'status': status
        })
        resp.status_code = 200
        return resp
    finally:
        session.close()


def _unprocessable(payload):
    """Serialize *payload* into a ``(body, 422, headers)`` JSON response tuple."""
    return json.dumps(payload), 422, {'Content-Type': 'application/json'}