def doCall(self):
    self.log.info("Calling job %s", self.job_name)
    session = db.get_db_session()
    item = session.query(db.PluginStatus).filter(db.PluginStatus.plugin_name == self.job_name).one()

    # Re-entrancy guard: if the plugin is already flagged as running, bail out.
    if item.is_running:
        session.commit()
        self.log.error("Plugin %s is already running! Not doing re-entrant call!", self.job_name)
        return

    item.is_running = True
    item.last_run   = datetime.datetime.now()
    session.commit()

    try:
        self._doCall()
    except Exception:
        item.last_error     = datetime.datetime.now()
        item.last_error_msg = traceback.format_exc()
        raise
    finally:
        # Re-fetch the status row so the running flag is cleared even if the
        # session state was perturbed by the job itself.
        item2 = session.query(db.PluginStatus).filter(db.PluginStatus.plugin_name == self.job_name).one()
        item2.is_running   = False
        item2.last_run_end = datetime.datetime.now()
        session.commit()
        db.delete_db_session()

    self.log.info("Job %s complete.", self.job_name)
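# Context note: doCall() above and resetRunStates() below both depend on a
# handful of columns on db.PluginStatus. This is a minimal sketch, assuming a
# conventional declarative model, inferred from the attribute names used in
# this file (plugin_name, is_running, last_run, last_run_end, last_error,
# last_error_msg). The column types, table name, and Base class are
# assumptions, not the project's actual code.
from sqlalchemy import Boolean, Column, DateTime, Integer, Text
from sqlalchemy.orm import declarative_base

Base = declarative_base()

class PluginStatus(Base):
    __tablename__ = 'plugin_status'   # assumed table name

    id             = Column(Integer, primary_key=True)          # assumed surrogate key
    plugin_name    = Column(Text, nullable=False, unique=True)  # queried with .one() above
    is_running     = Column(Boolean, default=False)             # re-entrancy guard flag
    last_run       = Column(DateTime)                           # set when a job starts
    last_run_end   = Column(DateTime)                           # set when a job finishes
    last_error     = Column(DateTime)                           # set when a job raises
    last_error_msg = Column(Text)                               # holds traceback.format_exc()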
def resetRunStates():
    print("JobSetup call resetting run-states!")
    session = db.get_db_session()
    session.query(db.PluginStatus).update({db.PluginStatus.is_running : False})
    session.commit()
    db.delete_db_session()
    print("Run-states reset.")
def initializeStartUrls(rules):
    print("Initializing all start URLs in the database")
    sess = db.get_db_session()
    for ruleset in [rset for rset in rules if rset['starturls']]:
        for starturl in ruleset['starturls']:
            have = sess.query(db.WebPages) \
                .filter(db.WebPages.url == starturl) \
                .count()
            if not have:
                netloc = urlFuncs.getNetLoc(starturl)
                new = db.WebPages(
                        url               = starturl,
                        starturl          = starturl,
                        netloc            = netloc,
                        type              = ruleset['type'],
                        priority          = db.DB_IDLE_PRIORITY,
                        distance          = db.DB_DEFAULT_DIST,
                        normal_fetch_mode = ruleset['normal_fetch_mode'],
                    )
                print("Missing start-url for address: '{}'".format(starturl))
                sess.add(new)
                try:
                    sess.commit()
                except sqlalchemy.exc.SQLAlchemyError:
                    print("Failure inserting start url for address: '{}'".format(starturl))
                    sess.rollback()
    sess.close()
    db.delete_db_session()
def teardown_request(response):
    try:
        try:
            g.session.commit()
        except Exception:
            g.session.rollback()
        print("Returned session")
        database.delete_db_session(flask_sess_if_possible=False)
    except Exception:
        print("Failure in teardown_request()!")
        traceback.print_exc()
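# A hedged sketch of how teardown_request() would typically be registered with
# Flask; the `app` object and the before_request hook are assumptions (only
# the teardown handler itself appears in this file). Note that Flask passes
# the request's exception (or None) to teardown handlers, despite the
# parameter being named `response` above.
from flask import Flask, g

app = Flask(__name__)

@app.before_request
def attach_session():
    # Assumed pattern: stash a session on flask.g for the request, so the
    # teardown handler can commit/rollback and release it afterwards.
    g.session = database.get_db_session()

app.teardown_request(teardown_request)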
def do_task(self):
    db_handle = db.get_db_session()
    hadjob = False
    try:
        self.archiver = WebMirror.Engine.SiteArchiver(
                self.cookie_lock,
                new_job_queue  = self.new_job_queue,
                response_queue = self.resp_queue,
                db_interface   = db_handle,
            )
        hadjob = self.archiver.taskProcess()
    finally:
        # Clear out the sqlalchemy state
        db_handle.expunge_all()
        db.delete_db_session()
    return hadjob
def resetInProgress():
    print("Resetting any stalled downloads from the previous session.")
    sess = db.get_db_session()
    sess.query(db.WebPages)                             \
        .filter(
            (db.WebPages.state == "fetching")           |
            (db.WebPages.state == "processing")         |
            (db.WebPages.state == "specialty_deferred") |
            (db.WebPages.state == "specialty_ready")
        )                                               \
        .update({db.WebPages.state : "new"})
    sess.commit()
    sess.close()
    db.delete_db_session()
def get_times(self):
    conn = database.get_db_session()
    aps = conn.execute("SELECT job_state FROM apscheduler_jobs;")
    update_times = []
    for blob, in aps:
        # APScheduler serializes each job's state into the job_state column
        # as a pickle blob; unpack it to get at the id and next run time.
        job_dict = pickle.loads(blob)
        update_times.append((job_dict['id'], job_dict['next_run_time'].isoformat()))
    data = {
        "update-times" : update_times,
    }
    database.delete_db_session()
    return pack_message("system-update-times", data)
def exposed_raw_test_retrieve(url):
    '''
    Lower level fetch test, otherwise similar to `test_retreive`
    '''
    # try:
    #     WebMirror.SpecialCase.startAmqpFetcher()
    # except RuntimeError:  # Fetcher already started
    #     pass
    parsed = urllib.parse.urlparse(url)
    root = urllib.parse.urlunparse((parsed[0], parsed[1], "", "", "", ""))

    sess = db.get_db_session()
    row = sess.query(db.RawWebPages).filter(db.RawWebPages.url == url).scalar()
    if row:
        row.state = 'new'
    else:
        row = db.RawWebPages(
                url       = url,
                starturl  = root,
                netloc    = parsed.netloc,
                distance  = 50000,
                priority  = 500000,
                state     = 'new',
                fetchtime = datetime.datetime.now(),
            )
        sess.add(row)
    try:
        archiver = RawArchiver.RawEngine.RawSiteArchiver(
                total_worker_count = 1,
                worker_num         = 0,
                new_job_queue      = None,
                cookie_lock        = None,
                db_interface       = sess,
                response_queue     = None,
            )
        job = archiver.do_job(row)
    except Exception:
        traceback.print_exc()
    finally:
        db.delete_db_session()
def dump_scheduled_jobs(sched):
    print("Scheduled jobs:")
    existing = sched.get_jobs()
    if not existing:
        print("    No jobs in scheduler!")
    tznow = datetime.datetime.now(tz=pytz.utc)
    for job in existing:
        print("    ", job, job.args, "running in:", job.next_run_time - tznow, (job.id, ))

    session = db.get_db_session()
    running = session.query(db.PluginStatus).filter(db.PluginStatus.is_running == True).all()
    print("Running jobs:")
    for jitem in running:
        print("    ", jitem.plugin_name, jitem.is_running, jitem.last_run, jitem.last_error, jitem.last_error_msg)
    if not running:
        print("    <None!>")

    print("Running threads:")
    for thread in threading.enumerate():
        print("    ", thread.getName(), thread)
    db.delete_db_session()
def exposed_fetch(url, debug=True, rss_debug=False):
    '''
    Do a synchronous fetch of content from url `url`.
    '''
    # try:
    #     WebMirror.SpecialCase.startAmqpFetcher()
    # except RuntimeError:  # Fetcher already started
    #     pass

    if rss_debug:
        print("Debugging RSS")
        flags.RSS_DEBUG = True

    parsed = urllib.parse.urlparse(url)
    root = urllib.parse.urlunparse((parsed[0], parsed[1], "", "", "", ""))

    new = db.WebPages(
            url       = url,
            starturl  = root,
            netloc    = parsed.netloc,
            distance  = 50000,
            is_text   = True,
            priority  = 500000,
            type      = 'unknown',
            fetchtime = datetime.datetime.now(),
        )
    if debug:
        print(new)
    try:
        archiver = SiteArchiver(None, db.get_db_session(), None)
        job = archiver.synchronousJobRequest(url, ignore_cache=True)
    except Exception:
        traceback.print_exc()
    finally:
        db.delete_db_session()
def __init__(self, job_name):
    if job_name not in CALLABLE_LUT:
        raise JobNameException("Callable '%s' is not in the class lookup table: '%s'!" % (job_name, CALLABLE_LUT))
    self.runModule = CALLABLE_LUT[job_name]
    self.job_name  = job_name

    session = db.get_db_session()
    try:
        # Make sure a status row exists for this plugin, inserting one if needed.
        query = session.query(db.PluginStatus).filter(db.PluginStatus.plugin_name == job_name)
        have = query.scalar()
        if not have:
            new = db.PluginStatus(plugin_name=job_name)
            session.add(new)
            session.commit()
    except (sqlalchemy.exc.OperationalError, sqlalchemy.exc.InvalidRequestError):
        # Probably lost a race with another worker inserting the same row;
        # roll back and carry on.
        session.rollback()
    finally:
        db.delete_db_session()
def __del__(self):
    db.delete_db_session()
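# Hypothetical usage of the job-wrapper class that __init__(), doCall(), and
# __del__() belong to. The class name `JobCaller` and the job name string are
# illustrative assumptions; only the methods above are shown in this file.
caller = JobCaller('some_plugin_job')   # must be a key in CALLABLE_LUT
caller.doCall()                         # logs an error and returns if the job is already flagged as running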