def work(self):
    """
    Look for Documents in the given session for which no Analysis exists,
    and create an Analysis in Status.NEW for each.
    Returns True iff some Analyses were created
    """
    # start a new session for each job
    session = Session()
    try:
        # Get Documents
        # ... for which no Analysis exists
        # ... and lock them for updates
        # ... sort by created date
        # ... pick the oldest (up to 1000)
        gkgs = session.query(Gkg) \
            .filter(~session.query(Analysis).filter(Gkg.id == Analysis.gkg_id).exists()) \
            .with_for_update() \
            .order_by(Gkg.date) \
            .limit(1000).all()
        if len(gkgs) == 0:
            return False  # no work to be done
        for gkg in gkgs:
            analysis = Analysis(gkg=gkg, status=Status.NEW)
            session.add(analysis)
            session.commit()
            logger.info(
                "Worker {} created Analysis {} in status {}".format(
                    os.getpid(), analysis.gkg_id, analysis.status))
    finally:
        # make sure to release a FOR UPDATE lock, if we got one
        if session is not None:
            session.rollback()
            session.close()
    return True
def work(self):
    """
    Look for Analyses in the given session and run the work function on them
    if any are found, managing status appropriately.
    Return True iff some Analyses were processed (successfully or not)
    """
    # start a new session for each job
    session = Session()
    try:
        # Get an analysis
        # ... and lock it for updates
        # ... that meets the conditions specified in the filter function
        # ... sort by updated date
        # ... pick the first (oldest)
        analysis = self.filter_function(session.query(Analysis)) \
            .with_for_update() \
            .order_by(Analysis.updated) \
            .first()
        if analysis is None:
            return False  # no work to be done
        analysis_status = analysis.status
        analysis.create_new_version(self.working_status)
        logger.info("Worker {} claimed Analysis {} in status {}".format(
            os.getpid(), analysis.gkg_id, analysis_status))
    finally:
        # make sure to release a FOR UPDATE lock, if we got one
        session.rollback()
    start = time.time()
    try:
        # set a timeout so if this worker stalls, we recover
        signal.alarm(self.timeout_seconds)
        # actually run the work function on this analysis
        self.function(analysis)
        delta = time.time() - start
        logger.info("Worker {} processed Analysis {} {} -> {} {}s".format(
            os.getpid(), analysis.gkg_id, analysis_status,
            self.success_status, delta))
        analysis.error_msg = None
        analysis.processing_time = delta
        analysis.create_new_version(self.success_status)
    except Exception as e:
        delta = time.time() - start
        logger.warning(
            "Worker {} failed to process Analysis {} {} -> {}".format(
                os.getpid(), analysis.gkg_id, analysis_status,
                self.failure_status),
            exc_info=e)
        analysis.error_msg = str(e)
        analysis.processing_time = delta
        analysis.create_new_version(self.failure_status)
        session.commit()
    finally:
        # clear the timeout
        signal.alarm(0)
        if session is not None:
            session.rollback()
            session.close()
    return True
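# The work() method above returns False when there is nothing to claim, which
# makes it easy to drive from a simple polling loop. The sketch below is a
# hypothetical driver, assuming a worker object exposing work() as defined
# above; the loop itself and the sleep interval are illustrative and not part
# of the source.
import time

def run_forever(worker, idle_sleep_seconds=5):
    """Hypothetical driver: keep calling worker.work() and back off briefly
    whenever it reports that no work was found."""
    while True:
        did_work = worker.work()
        if not did_work:
            # nothing to claim right now; avoid busy-polling the database
            time.sleep(idle_sleep_seconds)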
def setUp(self):
    db_host = os.environ.get('DB_HOST')
    db_url = 'postgresql://{user}:{passwd}@{db_host}/{db}'.format(
        user='******', passwd='tester', db_host=db_host, db='idetect_test')
    engine = create_engine(db_url)
    Session.configure(bind=engine)
    Base.metadata.drop_all(engine)
    Base.metadata.create_all(engine)
    self.session = Session()
def map_week_mview():
    session = Session()
    try:
        entries = get_map_week(session)
        resp = jsonify(entries)
        resp.status_code = 200
        return resp
    finally:
        session.close()
def wordcloud():
    session = Session()
    try:
        data = request.get_json(silent=True) or request.form
        filters = filter_params(data)
        result = get_wordcloud(session, engine, **filters)
        resp = jsonify(result)
        resp.status_code = 200
        return resp
    finally:
        session.close()
def histogram():
    session = Session()
    try:
        data = request.get_json(silent=True) or request.form
        filters = filter_params(data)
        result = get_histogram_counts(session, **filters)
        resp = jsonify(result)
        resp.status_code = 200
        return resp
    finally:
        session.close()
def homepage():
    session = Session()
    try:
        articles = session.query(Analysis).order_by(
            desc(Analysis.updated)).limit(10).all()
        counts = Analysis.status_counts(session)
        cat_counts = Analysis.category_counts(session)
        return render_template('index.html',
                               articles=articles,
                               counts=counts,
                               cat_counts=cat_counts)
    finally:
        session.close()
def urllist():
    session = Session()
    try:
        data = request.get_json(silent=True) or request.form
        filters = filter_params(data)
        limit = data.get('limit', 32)
        offset = data.get('offset', 0)
        entries = get_urllist(session, limit=limit, offset=offset, **filters)
        count = get_count(session, **filters)
        resp = jsonify({'entries': entries, 'nentries': count})
        resp.status_code = 200
        return resp
    finally:
        session.close()
def article(doc_id):
    session = Session()
    try:
        analysis = session.query(Analysis) \
            .filter(Analysis.gkg_id == doc_id).one()
        coords = {
            tuple(l.latlong.split(","))
            for f in analysis.facts
            for l in f.locations
            if l.latlong is not None
        }
        return render_template('article.html',
                               article=analysis,
                               coords=list(coords))
    finally:
        session.close()
def add_url():
    # use .get() so a missing form field yields None instead of a KeyError
    url = request.form.get('url')
    logger.info("Scraping by url: {url}".format(url=url))
    if url is None:
        flash(u'Something went wrong. Please try again.', 'danger')
        return redirect('/')
    article = Gkg(document_identifier=url)
    session = Session()
    try:
        session.add(article)
        session.commit()
        flash(u"{} was successfully added".format(url), 'success')
        return redirect('/')
    finally:
        session.close()
def test_status_update(self):
    gkg = self.session.query(Gkg).first()
    analysis = Analysis(gkg=gkg, status=Status.NEW)
    self.session.add(analysis)
    self.session.commit()
    analysis.create_new_version(Status.SCRAPING)
    self.assertEqual(analysis.status, Status.SCRAPING)

    # meanwhile, some other process changed the status of this...
    session2 = Session()
    try:
        other = session2.query(Analysis).get(analysis.gkg_id)
        other.create_new_version(Status.SCRAPING_FAILED)
    finally:
        session2.rollback()

    with self.assertRaises(NotLatestException):
        analysis.create_new_version(Status.SCRAPED)
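# The test above relies on create_new_version raising NotLatestException when
# another session has already advanced the Analysis. The function below is a
# hypothetical sketch of that optimistic check, assuming the Analysis.updated
# value the object was loaded with is compared against the latest one stored
# for the same gkg_id; the real implementation in idetect may differ.
def create_new_version_sketch(session, analysis, new_status):
    """Hypothetical optimistic-concurrency check: refuse to write a new
    version if the row has changed since this object was loaded."""
    latest_updated = (session.query(Analysis.updated)
                      .filter(Analysis.gkg_id == analysis.gkg_id)
                      .order_by(Analysis.updated.desc())
                      .limit(1)
                      .scalar())
    if latest_updated is not None and latest_updated != analysis.updated:
        raise NotLatestException()
    analysis.status = new_status
    session.commit()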
def search_url():
    url = request.args.get('url')
    if url is None:
        return json.dumps({'success': False}), 422, {
            'Content-Type': 'application/json'
        }
    session = Session()
    try:
        gkg = session.query(Gkg).filter(
            Gkg.document_identifier.like("%" + url + "%")).order_by(
                Gkg.date.desc()).first()
        if gkg:
            resp = jsonify({'doc_id': gkg.id})
            resp.status_code = 200
            return resp
        else:
            return json.dumps({'success': False}), 422, {
                'Content-Type': 'application/json'
            }
    finally:
        session.close()
def setUp(self):
    logger.debug("setUp")
    worker_logger = logging.getLogger("idetect.worker")
    worker_logger.setLevel(logging.INFO)

    logger.debug("Connecting to DB")
    db_host = os.environ.get('DB_HOST')
    db_port = os.environ.get('DB_PORT', 5432)
    db_user = os.environ.get('DB_USER', 'tester')
    db_pass = os.environ.get('DB_PASSWORD', 'tester')
    db_url = 'postgresql://{user}:{passwd}@{db_host}:{db_port}/{db}'.format(
        user=db_user, passwd=db_pass, db_host=db_host, db_port=db_port,
        db='idetect')
    self.engine = create_engine(db_url, echo=False)
    Session.configure(bind=self.engine)
    self.session = Session()
    self.session.query(FactApi).count()
    logger.debug("setUp complete")
def urllist_grouped():
    session = Session()
    try:
        data = request.get_json(silent=True) or request.form
        filters = filter_params(data)
        limit = data.get('limit', 32)
        offset = data.get('offset', 0)
        entries = get_urllist_grouped(session, limit=limit, offset=offset,
                                      **filters)
        # TODO for url_list grouped count should be the number of groups
        # rather than the number of entries
        factcount = get_count(session, **filters)
        groupcount = get_group_count(session, **filters)
        resp = jsonify({
            'groups': entries,
            'ngroups': groupcount,
            'tot_nfacts': factcount
        })
        resp.status_code = 200
        return resp
    finally:
        session.close()
import re
import string

import numpy as np
import pandas as pd
from sqlalchemy import create_engine

from idetect.nlp_models.category import *
from idetect.nlp_models.relevance import *
from idetect.nlp_models.base_model import CustomSklLsiModel
# NOTE: assumed module paths for the project-level names used below
from idetect.model import db_url, Base, Session, Country, FactKeyword
from idetect.load_data import load_countries, load_terms

if __name__ == "__main__":
    # Create the Database
    engine = create_engine(db_url())
    Session.configure(bind=engine)
    Base.metadata.create_all(engine)
    session = Session()

    # Load the Countries data if necessary
    countries = session.query(Country).all()
    if len(countries) == 0:
        load_countries(session)

    # Load the Keywords if necessary
    keywords = session.query(FactKeyword).all()
    if len(keywords) == 0:
        load_terms(session)
    session.close()

    # Load the Classifier models once to ensure they are downloaded
    CategoryModel()
    RelevanceModel()
def analyse_url():
    session = Session()
    status = None
    gkg_id = None
    try:
        url = request.get_json(silent=True)['url'] or request.form['url']
    except Exception as e:
        return json.dumps({
            'success': False,
            'Exception': str(e),
            'status': 'missing or null url parameter'
        }), 422, {
            'Content-Type': 'application/json'
        }
    if url is None:
        return json.dumps({
            'success': False,
            'status': 'null url parameter'
        }), 422, {
            'Content-Type': 'application/json'
        }
    gkg = session.query(Gkg.id).filter(
        Gkg.document_identifier.like("%" + url + "%")).order_by(
            Gkg.date.asc()).first()
    if gkg:
        gkg_id = gkg.id
        status = 'url already in IDETECT DB'
    else:
        analysis = create_new_analysis_from_url(session, url)
        gkg_id = analysis.gkg_id
        status = 'url added to IDETECT DB'
        try:
            work(session, analysis, Status.SCRAPING, Status.SCRAPED,
                 Status.SCRAPING_FAILED, scrape)
            # TODO add classification, missing modules
            # work(session, analysis, Status.CLASSIFYING, Status.CLASSIFIED,
            #      Status.CLASSIFYING_FAILED,
            #      lambda article: classify(article, get_c_m(), get_r_m()))
            work(session, analysis, Status.EXTRACTING, Status.EXTRACTED,
                 Status.EXTRACTING_FAILED, extract_facts)
            work(session, analysis, Status.GEOTAGGING, Status.GEOTAGGED,
                 Status.GEOTAGGING_FAILED, process_locations)
        except Exception as e:
            return json.dumps({
                'success': False,
                'Exception': str(e)
            }), 422, {
                'Content-Type': 'application/json'
            }
        finally:
            session.close()
    try:
        document = get_document(session, gkg_id)
        entries = get_facts_for_document(session, gkg_id)
        resp = jsonify({
            'document': document,
            'facts': entries,
            'status': status
        })
        resp.status_code = 200
        return resp
    finally:
        session.close()
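# A quick way to exercise analyse_url() is to POST a JSON body containing the
# 'url' key, which is the only parameter the handler reads. The route path and
# host below are placeholders, since the snippet above does not show the
# @app.route decorator that registers this view.
import requests

resp = requests.post('http://localhost:5000/analyse_url',  # placeholder route
                     json={'url': 'http://example.com/some-article'})
if resp.status_code == 200:
    payload = resp.json()  # {'document': ..., 'facts': ..., 'status': ...}
    print(payload['status'])
else:
    print('request failed:', resp.status_code, resp.text)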