def get_current_state(user_id, stage):
    """
    Fetch the persisted pipeline state for one (user, stage) pair.

    :param user_id: id of the user whose pipeline state is being queried
    :param stage: pipeline stage enum; its ``.value`` is what the db stores
    :return: a ``ps.PipelineState`` wrapping the stored document, or ``None``
        when no state has been saved yet for this user and stage
    """
    state_doc = edb.get_pipeline_state_db().find_one(
        {"user_id": user_id, "pipeline_stage": stage.value})
    # logging.debug("returning curr_state_doc %s for stage %s " % (state_doc, stage))
    if state_doc is None:
        return None
    return ps.PipelineState(state_doc)
def get_time_range_for_stage(user_id, stage):
    """
    Returns the start ts and the end ts of the entries in the stage.

    Loads (or lazily creates) the pipeline state for this user/stage,
    computes the query range [last_processed_ts, now - fuzz], marks the
    run as in progress by stamping ``curr_run_ts``, and persists the state.

    :param user_id: id of the user being processed
    :param stage: pipeline stage enum for which to build the range
    :return: an ``estt.TimeQuery`` over ``metadata.write_ts``
    """
    curr_state = get_current_state(user_id, stage)
    if curr_state is not None:
        start_ts = curr_state.last_processed_ts
    else:
        # First run for this user/stage: start from the beginning and
        # initialize a fresh, empty state object.
        start_ts = None
        curr_state = ps.PipelineState()
        curr_state.user_id = user_id
        curr_state.pipeline_stage = stage
        curr_state.curr_run_ts = None
        curr_state.last_processed_ts = None
        curr_state.last_ts_run = None

    if start_ts is not None:
        logging.info(
            "For stage %s, start_ts = %s" %
            (stage, pydt.datetime.utcfromtimestamp(start_ts).isoformat()))
    else:
        logging.info("For stage %s, start_ts is None" % stage)

    # A run must not already be in progress for this stage.
    assert curr_state.curr_run_ts is None, \
        "curr_state.curr_run_ts = %s" % curr_state.curr_run_ts

    # Let's pick a point 5 secs in the past. If we don't do this, then we will
    # read all entries upto the current ts and this may lead to lost data. For
    # example, let us say that the current ts is t1. At the time that we read
    # the data, we have 4 entries for t1. By the time we finish copying, we
    # have 6 entries for t1, we will end up deleting all 6, which will lose 2
    # entries.
    end_ts = time.time() - END_FUZZ_AVOID_LTE
    ret_query = estt.TimeQuery("metadata.write_ts", start_ts, end_ts)

    # Record that a run is now in flight, and persist the state so a
    # concurrent run of the same stage trips the assert above.
    curr_state.curr_run_ts = end_ts
    logging.debug("About to save object %s" % curr_state)
    edb.save(edb.get_pipeline_state_db(), curr_state)
    logging.debug(
        "After saving state %s, list is %s" %
        (curr_state,
         list(edb.get_pipeline_state_db().find({"user_id": user_id}))))
    return ret_query