def get_current_state(user_id, stage):
    """
    Look up the stored pipeline state for a user and stage.

    Queries the pipeline state collection for a single document whose
    "user_id" equals ``user_id`` and whose "pipeline_stage" equals
    ``stage.value``.

    Returns the matching document wrapped in a ``ps.PipelineState``, or
    None when no document matches.
    """
    state_coll = edb.get_pipeline_state_db()
    lookup = {"user_id": user_id, "pipeline_stage": stage.value}
    state_doc = state_coll.find_one(lookup)
    if state_doc is None:
        return None
    return ps.PipelineState(state_doc)
# Example #2
# 0
def get_time_range_for_stage(user_id, stage):
    """
    Returns the start ts and the end ts of the entries in the stage

    The range is returned as an ``estt.TimeQuery`` on "metadata.write_ts":
      - the start is the ``last_processed_ts`` of the saved state for this
        user and stage, or None if there is no saved state yet
      - the end is the current time minus ``END_FUZZ_AVOID_LTE``

    Side effect: the state (newly created if none was saved) is written
    back to the pipeline state collection with ``curr_run_ts`` set to the
    end of the range.

    Raises AssertionError if the saved state already has ``curr_run_ts``
    set.
    """
    curr_state = get_current_state(user_id, stage)

    if curr_state is None:
        # First time we see this user/stage pair: start from a blank state
        start_ts = None
        curr_state = ps.PipelineState()
        curr_state.user_id = user_id
        curr_state.pipeline_stage = stage
        curr_state.curr_run_ts = None
        curr_state.last_processed_ts = None
        curr_state.last_ts_run = None
    else:
        start_ts = curr_state.last_processed_ts

    # Pass the values as logging arguments so that the message is only
    # formatted if the record is actually emitted
    if start_ts is None:
        logging.info("For stage %s, start_ts is None", stage)
    else:
        # NOTE(review): utcfromtimestamp is deprecated as of python 3.12; kept
        # because switching to an aware datetime changes the logged string
        logging.info(
            "For stage %s, start_ts = %s",
            stage, pydt.datetime.utcfromtimestamp(start_ts).isoformat())

    # presumably a set curr_run_ts means that an earlier run of this stage
    # was never marked as finished - verify against the code that resets it
    assert curr_state.curr_run_ts is None, "curr_state.curr_run_ts = %s" % curr_state.curr_run_ts
    # Let's pick a point slightly in the past (END_FUZZ_AVOID_LTE secs). If
    # we don't do this, then we will read all entries upto the current ts and
    # this may lead to lost data. For example, let us say that the current ts
    # is t1. At the time that we read the data, we have 4 entries for t1. By
    # the time we finish copying, we have 6 entries for t1, we will end up
    # deleting all 6, which will lose 2 entries.
    end_ts = time.time() - END_FUZZ_AVOID_LTE

    ret_query = estt.TimeQuery("metadata.write_ts", start_ts, end_ts)

    curr_state.curr_run_ts = end_ts
    logging.debug("About to save object %s", curr_state)
    edb.save(edb.get_pipeline_state_db(), curr_state)
    # Listing every state of the user is an extra database query, so only
    # run it when the debug message will really be logged. logging.debug
    # goes through the root logger, so that is the logger to check.
    if logging.getLogger().isEnabledFor(logging.DEBUG):
        logging.debug(
            "After saving state %s, list is %s",
            curr_state,
            list(edb.get_pipeline_state_db().find({"user_id": user_id})))
    return ret_query