コード例 #1
0
    def testLocalDtFillTimesDailyOneTz(self):
        """Daily fill for a day whose sections all start in one timezone.

        Creates three sections on 2016-05-03 (hours 6, 10 and 23, all in
        the PST zone), converts them to a dataframe and checks that
        ``local_dt_fill_times_daily`` fills the summary with ts 1462258800
        (2016-05-03T07:00:00Z), local day 3 and the PST timezone.
        """
        key = (2016, 5, 3)
        # One section per start hour; creation order is preserved.
        test_section_list = [
            self._createTestSection(
                arrow.Arrow(2016, 5, 3, start_hour, tzinfo=tz.gettz(PST)),
                PST)
            for start_hour in (6, 10, 23)
        ]

        section_key = eac.get_section_key_for_analysis_results()
        section_group_df = self.ts.to_data_df(section_key, test_section_list)
        logging.debug("First row of section_group_df = %s" %
                      section_group_df.iloc[0])
        self.assertEqual(earmt._get_tz(section_group_df), PST)

        ms = ecwms.ModeStatTimeSummary()
        earmt.local_dt_fill_times_daily(key, section_group_df, ms)
        logging.debug("before starting checks, ms = %s" % ms)

        filled_local_dt = ms.local_dt
        self.assertEqual(ms.ts, 1462258800)
        self.assertEqual(filled_local_dt.day, 3)
        self.assertEqual(filled_local_dt.timezone, PST)
コード例 #2
0
def group_by_local_date(user_id, from_dt, to_dt, freq, summary_fn_list):
    """
    Summarize analysis sections grouped by local-date components.

    :param user_id: id for the user. None for aggregate.
    :param from_dt: start local dt object. Only the year, month and date
        entries are assumed to be filled in; together with ``to_dt`` it
        represents a date range.
    :param to_dt: end local dt object, with the same assumption as
        ``from_dt``.
    :param freq: since only certain local_dt fields are expanded, only the
        frequencies corresponding to them are supported. These are
        represented in the `LocalFreq` enum.
    :param summary_fn_list: summary functions; each one is applied to the
        mode-grouped sections of every time group and yields one entry in
        the "result" list.
    :return: a dict with two keys. "last_ts_processed" is the start_ts of
        the last row of the retrieved sections, and "result" holds one list
        of ModeStatTimeSummary objects per summary function.
        If there were no matching sections, "last_ts_processed" is None and
        "result" holds one empty list per summary function.
    """
    time_query = esttc.TimeComponentQuery("data.start_local_dt", from_dt, to_dt)
    section_df = esda.get_data_df(
        eac.get_section_key_for_analysis_results(),
        user_id=user_id,
        time_query=time_query,
        geo_query=None)

    # Guard clause: nothing matched, so return the empty-shaped response.
    if len(section_df) == 0:
        logging.info("Found no entries for user %s, time_query %s" % (user_id, time_query))
        empty_results = [[] for _ in range(len(summary_fn_list))]
        return {"last_ts_processed": None, "result": empty_results}

    grouped_sections = section_df.groupby(_get_local_group_by(freq))
    fill_fn = _get_local_key_to_fill_fn(freq)

    last_ts = section_df.iloc[-1].start_ts
    per_fn_summaries = []
    for summary_fn in summary_fn_list:
        per_fn_summaries.append(
            grouped_to_summary(grouped_sections, fill_fn, summary_fn))
    return {"last_ts_processed": last_ts, "result": per_fn_summaries}
コード例 #3
0
def group_by_local_date(user_id, from_dt, to_dt, freq, summary_fn_list):
    """
    Summarize analysis sections grouped by local-date components.

    :param user_id: id for the user. None for aggregate.
    :param from_dt: start local dt object. Only the year, month and date
        entries are assumed to be filled in; together with ``to_dt`` it
        represents a date range.
    :param to_dt: end local dt object, with the same assumption as
        ``from_dt``.
    :param freq: since only certain local_dt fields are expanded, only the
        frequencies corresponding to them are supported. These are
        represented in the `LocalFreq` enum.
    :param summary_fn_list: summary functions; each one is applied to the
        mode-grouped sections of every time group and yields one entry in
        the "result" list.
    :return: a dict with two keys. "last_ts_processed" is the start_ts of
        the last row of the retrieved sections, and "result" holds one list
        of ModeStatTimeSummary objects per summary function.
        If there were no matching sections, "last_ts_processed" is None and
        "result" holds one empty list per summary function.
    """
    time_query = esttc.TimeComponentQuery("data.start_local_dt", from_dt, to_dt)
    section_df = esda.get_data_df(
        eac.get_section_key_for_analysis_results(),
        user_id=user_id,
        time_query=time_query,
        geo_query=None)

    # Guard clause: nothing matched, so return the empty-shaped response.
    if len(section_df) == 0:
        logging.info("Found no entries for user %s, time_query %s" % (user_id, time_query))
        empty_results = [[] for _ in range(len(summary_fn_list))]
        return {"last_ts_processed": None, "result": empty_results}

    grouped_sections = section_df.groupby(_get_local_group_by(freq))
    fill_fn = _get_local_key_to_fill_fn(freq)

    last_ts = section_df.iloc[-1].start_ts
    per_fn_summaries = []
    for summary_fn in summary_fn_list:
        per_fn_summaries.append(
            grouped_to_summary(grouped_sections, fill_fn, summary_fn))
    return {"last_ts_processed": last_ts, "result": per_fn_summaries}
コード例 #4
0
    def _createTestSection(self, start_ardt, start_timezone):
        """Create a section entry for a test and insert it into ``self.ts``.

        :param start_ardt: value passed to ``_fillDates`` for both the
            start and the end of the section
        :param start_timezone: timezone passed to ``_fillDates`` for both
            the start and the end of the section
        :return: the entry that was created and inserted
        """
        new_section = ecws.Section()
        # The end_ fields are a hack: they are filled with the same values
        # as the start_ fields purely so that they exist. Tests where the
        # end matters (mainly range timezone calculation with local times)
        # can override them by calling _fillDates themselves.
        for field_prefix in ("start_", "end_"):
            self._fillDates(new_section, field_prefix,
                            start_ardt, start_timezone)
        logging.debug("created section %s" % (new_section.start_fmt_time))

        new_entry = ecwe.Entry.create_entry(
            self.testUUID,
            eac.get_section_key_for_analysis_results(),
            new_section,
            create_id=True)
        self.ts.insert(new_entry)
        return new_entry
コード例 #5
0
def grouped_to_summary(time_grouped_df, key_to_fill_fn, summary_fn):
    """
    Convert time-grouped sections into a list of ModeStatTimeSummary objects.

    :param time_grouped_df: iterable of (key, dataframe) pairs, one per
        time group
    :param key_to_fill_fn: called as ``key_to_fill_fn(key, group_df,
        summary)`` to fill in the time fields of each summary
    :param summary_fn: called with the group's sections grouped by
        'sensed_mode'; must return a mapping from mode to result
    :return: one ModeStatTimeSummary per time group, in iteration order
    """
    summaries = []
    # Note carried over from the original author: when grouping by a time
    # range, the key is the end of the range.
    for group_key, group_df in time_grouped_df:
        summary = ecwms.ModeStatTimeSummary()
        key_to_fill_fn(group_key, group_df, summary)
        summary.nUsers = len(group_df.user_id.unique())

        per_mode_results = summary_fn(group_df.groupby('sensed_mode'))
        for mode, result in per_mode_results.items():
            # The configured section key decides which enum names the mode.
            uses_inferred = (eac.get_section_key_for_analysis_results()
                             == "analysis/inferred_section")
            mode_enum = (ecwmp.PredictedModeTypes if uses_inferred
                         else ecwm.MotionTypes)
            summary[mode_enum(mode).name] = result
        summaries.append(summary)
    return summaries
コード例 #6
0
def grouped_to_summary(time_grouped_df, key_to_fill_fn, summary_fn):
    """
    Convert time-grouped sections into a list of ModeStatTimeSummary objects.

    :param time_grouped_df: iterable of (key, dataframe) pairs, one per
        time group
    :param key_to_fill_fn: called as ``key_to_fill_fn(key, group_df,
        summary)`` to fill in the time fields of each summary; the key is
        first passed through ``fix_int64_key_if_needed``
    :param summary_fn: called with the group's sections grouped by
        'sensed_mode'; must return a mapping from mode to result
    :return: one ModeStatTimeSummary per time group, in iteration order
    """
    summaries = []
    # Note carried over from the original author: when grouping by a time
    # range, the key is the end of the range.
    for raw_key, group_df in time_grouped_df:
        summary = ecwms.ModeStatTimeSummary()
        group_key = fix_int64_key_if_needed(raw_key)
        key_to_fill_fn(group_key, group_df, summary)
        summary.nUsers = len(group_df.user_id.unique())

        per_mode_results = summary_fn(group_df.groupby('sensed_mode'))
        for mode, result in per_mode_results.items():
            # The configured section key decides which enum names the mode.
            uses_inferred = (eac.get_section_key_for_analysis_results()
                             == "analysis/inferred_section")
            mode_enum = (ecwmp.PredictedModeTypes if uses_inferred
                         else ecwm.MotionTypes)
            summary[mode_enum(mode).name] = result
        summaries.append(summary)
    return summaries
コード例 #7
0
def group_by_timestamp(user_id, start_ts, end_ts, freq, summary_fn_list):
    """
    Summarize analysis sections grouped by timestamp at a given frequency.

    :param user_id: The user for whom we are computing this information.
        None for all users.
    :param start_ts: The start timestamp
    :param end_ts: The end timestamp
    :param freq: The frequency as specified in a pandas date_range frequency
        string. Only frequencies of a day or longer are supported in order
        to return the data in a format that makes sense.
        http://pandas.pydata.org/pandas-docs/stable/timeseries.html#offset-aliases
        The canonical list can be found at:
        > pandas.tseries.offsets.prefix_mapping
    :param summary_fn_list: summary functions; each one is applied to the
        mode-grouped sections of every time group and yields one entry in
        the "result" list.
    :return: a dict with two keys. "last_ts_processed" is the start_ts of
        the last row of the retrieved sections, and "result" holds one list
        of ModeStatTimeSummary objects per summary function.
        If there were no matching sections, "last_ts_processed" is None and
        "result" holds one empty list per summary function.
    """
    time_query = estt.TimeQuery("data.start_ts", start_ts, end_ts)
    section_df = esda.get_data_df(
        eac.get_section_key_for_analysis_results(),
        user_id=user_id,
        time_query=time_query,
        geo_query=None)

    # Guard clause: nothing matched, so return the empty-shaped response.
    if len(section_df) == 0:
        logging.info("Found no entries for user %s, time_query %s" % (user_id, time_query))
        empty_results = [[] for _ in range(len(summary_fn_list))]
        return {"last_ts_processed": None, "result": empty_results}

    logging.debug("first row is %s" % section_df.iloc[0])
    # start_ts is scaled by 10 ** 9 (seconds -> nanoseconds) before being
    # handed to pd.to_datetime; the result is stored as a new 'start_dt'
    # column on section_df and used as the grouping key.
    start_ts_nanos = section_df.start_ts * 10 ** 9
    section_df['start_dt'] = pd.to_datetime(start_ts_nanos)
    grouped_sections = section_df.groupby(
        pd.Grouper(freq=freq, key='start_dt'))

    last_ts = section_df.iloc[-1].start_ts
    per_fn_summaries = []
    for summary_fn in summary_fn_list:
        per_fn_summaries.append(
            grouped_to_summary(grouped_sections, timestamp_fill_times,
                               summary_fn))
    return {"last_ts_processed": last_ts, "result": per_fn_summaries}
コード例 #8
0
    def testLocalDtFillTimesDailyOneTz(self):
        """Daily fill for a day whose sections all start in one timezone.

        Creates three sections on 2016-05-03 (hours 6, 10 and 23, all in
        the PST zone), converts them to a dataframe and checks that
        ``local_dt_fill_times_daily`` fills the summary with ts 1462258800
        (2016-05-03T07:00:00Z), local day 3 and the PST timezone.
        """
        key = (2016, 5, 3)
        # One section per start hour; creation order is preserved.
        test_section_list = [
            self._createTestSection(
                arrow.Arrow(2016, 5, 3, start_hour, tzinfo=tz.gettz(PST)),
                PST)
            for start_hour in (6, 10, 23)
        ]

        section_key = eac.get_section_key_for_analysis_results()
        section_group_df = self.ts.to_data_df(section_key, test_section_list)
        logging.debug("First row of section_group_df = %s" % section_group_df.iloc[0])
        self.assertEqual(earmt._get_tz(section_group_df), PST)

        ms = ecwms.ModeStatTimeSummary()
        earmt.local_dt_fill_times_daily(key, section_group_df, ms)
        logging.debug("before starting checks, ms = %s" % ms)

        filled_local_dt = ms.local_dt
        self.assertEqual(ms.ts, 1462258800)
        self.assertEqual(filled_local_dt.day, 3)
        self.assertEqual(filled_local_dt.timezone, PST)
コード例 #9
0
def group_by_timestamp(user_id, start_ts, end_ts, freq, summary_fn_list):
    """
    Summarize analysis sections grouped by timestamp at a given frequency.

    :param user_id: The user for whom we are computing this information.
        None for all users.
    :param start_ts: The start timestamp
    :param end_ts: The end timestamp
    :param freq: The frequency as specified in a pandas date_range frequency
        string. Only frequencies of a day or longer are supported in order
        to return the data in a format that makes sense.
        http://pandas.pydata.org/pandas-docs/stable/timeseries.html#offset-aliases
        The canonical list can be found at:
        > pandas.tseries.offsets.prefix_mapping
    :param summary_fn_list: summary functions; each one is applied to the
        mode-grouped sections of every time group and yields one entry in
        the "result" list.
    :return: a dict with two keys. "last_ts_processed" is the start_ts of
        the last row of the retrieved sections, and "result" holds one list
        of ModeStatTimeSummary objects per summary function.
        If there were no matching sections, "last_ts_processed" is None and
        "result" holds one empty list per summary function.
    """
    time_query = estt.TimeQuery("data.start_ts", start_ts, end_ts)
    section_df = esda.get_data_df(
        eac.get_section_key_for_analysis_results(),
        user_id=user_id,
        time_query=time_query,
        geo_query=None)

    # Guard clause: nothing matched, so return the empty-shaped response.
    if len(section_df) == 0:
        logging.info("Found no entries for user %s, time_query %s" % (user_id, time_query))
        empty_results = [[] for _ in range(len(summary_fn_list))]
        return {"last_ts_processed": None, "result": empty_results}

    logging.debug("first row is %s" % section_df.iloc[0])
    # start_ts is scaled by 10 ** 9 (seconds -> nanoseconds) before being
    # handed to pd.to_datetime; the result is stored as a new 'start_dt'
    # column on section_df and used as the grouping key.
    start_ts_nanos = section_df.start_ts * 10 ** 9
    section_df['start_dt'] = pd.to_datetime(start_ts_nanos)
    grouped_sections = section_df.groupby(
        pd.Grouper(freq=freq, key='start_dt'))

    last_ts = section_df.iloc[-1].start_ts
    per_fn_summaries = []
    for summary_fn in summary_fn_list:
        per_fn_summaries.append(
            grouped_to_summary(grouped_sections, timestamp_fill_times,
                               summary_fn))
    return {"last_ts_processed": last_ts, "result": per_fn_summaries}
コード例 #10
0
def section_to_geojson(section, tl):
    """
    Build the geojson version of a section.

    This is the trickiest part of the visualization. The section is
    basically a collection of points with a line through them. The code
    below emits only the line feature (carrying the section's data as its
    properties) and wraps it in a feature collection; the point feature
    collection is not generated.

    :param section: the section to be converted
    :param tl: not referenced anywhere in this function body
    :return: a feature collection which is the geojson version of the section
    """
    user_ts = esta.TimeSeries.get_time_series(section.user_id)
    entry_it = user_ts.find_entries(
        ["analysis/recreated_location"],
        esda.get_time_query_for_trip_like("analysis/cleaned_section",
                                          section.get_id()))

    # TODO: Decide whether we want to rewrite this to use dataframes
    # throughout instead of python arrays. Dataframes insert nans. We could
    # use fillna to fill with default values, but if we are not actually
    # using dataframe features here, it is unclear how much that would help.
    section_location_entries = [ecwe.Entry(raw) for raw in entry_it]
    if section_location_entries:
        logging.debug("first element in section_location_array = %s" % section_location_entries[0])

        last_recreated = section_location_entries[-1]
        ends_match = ecc.compare_rounded_arrays(
            section.data.end_loc.coordinates,
            last_recreated.data.loc.coordinates,
            digits=4)
        if not ends_match:
            logging.info("section_location_array[-1].data.loc %s != section.data.end_loc %s even after df.ts fix, filling gap" % \
                    (last_recreated.data.loc, section.data.end_loc))
            # NOTE(review): this assert always fails, so the gap-filling
            # code below only runs when assertions are disabled (python -O).
            assert(False)
            last_loc_doc = user_ts.get_entry_at_ts(
                "background/filtered_location", "data.ts",
                section.data.end_ts)
            if last_loc_doc is None:
                logging.warning("can't find entry to patch gap, leaving gap")
            else:
                last_loc_entry = ecwe.Entry(last_loc_doc)
                logging.debug("Adding new entry %s to fill the end point gap between %s and %s"
                   % (last_loc_entry.data.loc, last_recreated.data.loc,
                        section.data.end_loc))
                section_location_entries.append(last_loc_entry)

    line_feature = point_array_to_line(section_location_entries)
    line_feature.id = str(section.get_id())
    line_feature.properties.update(copy.copy(section.data))
    # Update works on dicts, convert back to a section object to make the
    # modes work properly
    line_feature.properties = ecwcs.Cleanedsection(line_feature.properties)
    line_feature.properties["feature_type"] = "section"

    # Look up the inferred section only when inferred sections are the
    # configured analysis result; otherwise keep the section's own mode.
    ise = None
    if eac.get_section_key_for_analysis_results() == esda.INFERRED_SECTION_KEY:
        ise = esds.cleaned2inferred_section(section.user_id, section.get_id())

    if ise is None:
        line_feature.properties["sensed_mode"] = str(
            line_feature.properties.sensed_mode)
    else:
        logging.debug("mapped cleaned section %s -> inferred section %s" % 
            (section.get_id(), ise.get_id()))
        logging.debug("changing mode from %s -> %s" % 
            (line_feature.properties.sensed_mode, ise.data.sensed_mode))
        line_feature.properties["sensed_mode"] = str(ise.data.sensed_mode)

    _del_non_derializable(line_feature.properties, ["start_loc", "end_loc"])

    # Only the line feature is returned; a point feature collection used to
    # be appended here as well.
    return gj.FeatureCollection([line_feature])
コード例 #11
0
    def testLocalDtFillTimesDailyMultiTzGoingEast(self):
        """Daily fill for a day of eastbound air travel across timezones.

        This is perhaps an extreme use case, but it is actually a fairly
        common one with air travel. Two sections on 2016-05-03 (SFO -> JFK,
        then JFK -> LHR) form the group; a third section on the 4th stands
        for the first trip of the next day. The assertions pin the group
        timezone to PST and the filled ts to 1462258800
        (2016-05-03T07:00:00Z).
        """
        key = (2016, 5, 3)

        # Leg 1: user leaves SFO at 1am on the 3rd for JFK on a
        # cross-country flight. The end is recorded 8 hours later, at 9:00
        # in the PST zone, with the EST timezone. Given the timestamps
        # asserted below, that is 12:00pm (noon) Eastern.
        first_leg = self._createTestSection(
            arrow.Arrow(2016, 5, 3, 1, tzinfo=tz.gettz(PST)), PST)
        first_leg['data'] = self._fillDates(
            first_leg.data, "end_",
            arrow.Arrow(2016, 5, 3, 9, tzinfo=tz.gettz(PST)), EST)

        # Leg 2: user leaves JFK for LHR at 1pm in the EST zone. The end is
        # recorded at hour 21 in the EST zone with the BST timezone; the
        # original note describes this as 2am on the 4th, local time.
        second_leg = self._createTestSection(
            arrow.Arrow(2016, 5, 3, 13, tzinfo=tz.gettz(EST)), EST)
        second_leg['data'] = self._fillDates(
            second_leg.data, "end_",
            arrow.Arrow(2016, 5, 3, 21, tzinfo=tz.gettz(EST)), BST)

        # Then she catches the train from the airport to her hotel in
        # London at 3am local time on the 4th. As per local time, this is a
        # new trip, so it is not part of the group for the 3rd.
        #
        # NOTE(review): the original comments argued that the range
        # timestamp should be generated in the timezone of the end of the
        # last section (BST) rather than the timezone of the start of the
        # trip. The assertions below instead expect PST, the zone where the
        # first trip started -- confirm which behavior is intended.
        next_day_first_trip = self._createTestSection(
            arrow.Arrow(2016, 5, 4, 3, tzinfo=tz.gettz(BST)), BST)

        test_section_list = [first_leg, second_leg]
        section_group_df = self.ts.to_data_df(
            eac.get_section_key_for_analysis_results(), test_section_list)
        logging.debug("first row is %s" % section_group_df.loc[0])

        # Timestamps are monotonically increasing
        # (2016-05-03T08:00:00Z and 2016-05-03T17:00:00Z)
        expected_start_ts = [1462262400, 1462294800]
        self.assertEqual(section_group_df.start_ts.tolist(),
                         expected_start_ts)

        # The timezone for the end time is PST since that's where we
        # started the first trip from
        self.assertEqual(earmt._get_tz(section_group_df), PST)

        ms = ecwms.ModeStatTimeSummary()
        earmt.local_dt_fill_times_daily(key, section_group_df, ms)
        logging.debug("before starting checks, ms = %s" % ms)

        filled_local_dt = ms.local_dt
        self.assertEqual(ms.ts, 1462258800)
        self.assertEqual(filled_local_dt.day, 3)
        self.assertEqual(filled_local_dt.timezone, PST)

        # The first trip of the next day must start after the timestamp
        # assigned to this day's range. (Original note: "This test fails if
        # it is not BST".)
        self.assertGreater(next_day_first_trip.data.start_ts, ms.ts)
コード例 #12
0
    def testLocalDtFillTimesDailyMultiTzGoingWest(self):
        """Daily fill for a day of westbound air travel across timezones.

        This is perhaps an extreme use case, but it is actually a fairly
        common one with air travel. Two sections on 2016-05-03 (Delhi ->
        JFK, then JFK -> SFO) form the group; a third section at midnight
        on the 4th in the PST zone stands for the first trip of the next
        day. The assertions pin the group timezone to IST and the filled ts
        to 1462213800 (2016-05-02T18:30:00Z).
        """
        key = (2016, 5, 3)

        # Step 1: user leaves Delhi at 1am on the 3rd for JFK on the
        # non-stop. The non-stop takes 15 hours, so she arrives in New York
        # at 16:00 IST = 6:30am EDT (taking into account the time
        # difference). The end_ fields of this section are not overridden.
        delhi_leg = self._createTestSection(
            arrow.Arrow(2016, 5, 3, 1, tzinfo=tz.gettz(IST)), IST)

        # Step 2: user leaves JFK for SFO at 7am in the EST zone on a
        # non-stop. The cross-country flight takes 8 hours, so she arrives
        # in SFO at 15:00 EDT = 12:00 PDT.
        jfk_leg = self._createTestSection(
            arrow.Arrow(2016, 5, 3, 7, tzinfo=tz.gettz(EST)), EST)
        jfk_leg['data'] = self._fillDates(
            jfk_leg.data, "end_",
            arrow.Arrow(2016, 5, 3, 15, tzinfo=tz.gettz(EST)), PST)

        # Step 3: user starts a trip out of SFO at midnight of the 4th in
        # the PST zone (earliest possible trip). For our timestamp algo to
        # be correct, this has to be after the timestamp for the range.
        next_day_first_trip = self._createTestSection(
            arrow.Arrow(2016, 5, 4, 0, tzinfo=tz.gettz(PST)), PST)

        test_section_list = [delhi_leg, jfk_leg]
        section_group_df = self.ts.to_data_df(
            eac.get_section_key_for_analysis_results(), test_section_list)

        # Timestamps are monotonically increasing
        # (2016-05-02T19:30:00Z and 2016-05-03T11:00:00Z)
        expected_start_ts = [1462217400, 1462273200]
        self.assertEqual(section_group_df.start_ts.tolist(),
                         expected_start_ts)
        # 2016-05-04T07:00:00Z
        self.assertEqual(next_day_first_trip.data.start_ts, 1462345200)

        # The timezone for the end time is IST since that's where we
        # started the first trip
        self.assertEqual(earmt._get_tz(section_group_df), IST)

        ms = ecwms.ModeStatTimeSummary()
        earmt.local_dt_fill_times_daily(key, section_group_df, ms)
        logging.debug("before starting checks, ms = %s" % ms)

        # Design discussion carried over from the original author:
        # - If a trip straddles two timezones, we need to decide how the
        #   metrics are split. A similar issue occurs when a trip straddles
        #   two days. We have arbitrarily decided to bucket by start_time,
        #   so we follow the same logic and bucket by the timezone of the
        #   start time.
        # - We need a notion of when the bucket ended. To some extent it
        #   can be set arbitrarily between the end of the last trip on the
        #   3rd and the start of the first trip on the 4th.
        # - The worry is that the first trip on the next day may be on the
        #   next day in the end timezone of the trip but on the same day in
        #   the start timezone of the trip (e.g. the reverse trip), so
        #   using the end of the section may be best after all.
        #
        # NOTE(review): the original comments also described the bucket as
        # ending at the end of the day in PST / EDT. The assertions below
        # expect the IST timezone instead -- confirm which is intended.
        filled_local_dt = ms.local_dt
        self.assertEqual(ms.ts, 1462213800)
        self.assertEqual(filled_local_dt.day, 3)
        self.assertEqual(filled_local_dt.timezone, IST)
        self.assertGreater(next_day_first_trip.data.start_ts, ms.ts)
コード例 #13
0
def section_to_geojson(section, tl):
    """
    Build the geojson version of a section.

    This is the trickiest part of the visualization. The section is
    basically a collection of points with a line through them. The code
    below emits only the line feature (carrying the section's data as its
    properties) and wraps it in a feature collection; the point feature
    collection is not generated.

    :param section: the section to be converted
    :param tl: not referenced anywhere in this function body
    :return: a feature collection which is the geojson version of the section
    """
    user_ts = esta.TimeSeries.get_time_series(section.user_id)
    entry_it = user_ts.find_entries(
        ["analysis/recreated_location"],
        esda.get_time_query_for_trip_like("analysis/cleaned_section",
                                          section.get_id()))

    # TODO: Decide whether we want to rewrite this to use dataframes
    # throughout instead of python arrays. Dataframes insert nans. We could
    # use fillna to fill with default values, but if we are not actually
    # using dataframe features here, it is unclear how much that would help.
    section_location_entries = [ecwe.Entry(raw) for raw in entry_it]
    if section_location_entries:
        logging.debug("first element in section_location_array = %s" %
                      section_location_entries[0])

        last_recreated = section_location_entries[-1]
        ends_match = ecc.compare_rounded_arrays(
            section.data.end_loc.coordinates,
            last_recreated.data.loc.coordinates,
            digits=4)
        if not ends_match:
            logging.info("section_location_array[-1].data.loc %s != section.data.end_loc %s even after df.ts fix, filling gap" % \
                    (last_recreated.data.loc, section.data.end_loc))
            # NOTE(review): this assert always fails, so the gap-filling
            # code below only runs when assertions are disabled (python -O).
            assert (False)
            last_loc_doc = user_ts.get_entry_at_ts(
                "background/filtered_location", "data.ts",
                section.data.end_ts)
            if last_loc_doc is None:
                logging.warning("can't find entry to patch gap, leaving gap")
            else:
                last_loc_entry = ecwe.Entry(last_loc_doc)
                logging.debug(
                    "Adding new entry %s to fill the end point gap between %s and %s"
                    % (last_loc_entry.data.loc,
                       last_recreated.data.loc,
                       section.data.end_loc))
                section_location_entries.append(last_loc_entry)

    line_feature = point_array_to_line(section_location_entries)
    line_feature.id = str(section.get_id())
    line_feature.properties.update(copy.copy(section.data))
    # Update works on dicts, convert back to a section object to make the
    # modes work properly
    line_feature.properties = ecwcs.Cleanedsection(line_feature.properties)
    line_feature.properties["feature_type"] = "section"

    # Look up the inferred section only when inferred sections are the
    # configured analysis result; otherwise keep the section's own mode.
    ise = None
    if eac.get_section_key_for_analysis_results() == esda.INFERRED_SECTION_KEY:
        ise = esds.cleaned2inferred_section(section.user_id, section.get_id())

    if ise is None:
        line_feature.properties["sensed_mode"] = str(
            line_feature.properties.sensed_mode)
    else:
        logging.debug("mapped cleaned section %s -> inferred section %s" %
                      (section.get_id(), ise.get_id()))
        logging.debug("changing mode from %s -> %s" %
                      (line_feature.properties.sensed_mode,
                       ise.data.sensed_mode))
        line_feature.properties["sensed_mode"] = str(ise.data.sensed_mode)

    _del_non_derializable(line_feature.properties, ["start_loc", "end_loc"])

    # Only the line feature is returned; a point feature collection used to
    # be appended here as well.
    return gj.FeatureCollection([line_feature])
コード例 #14
0
    def testLocalDtFillTimesDailyMultiTzGoingEast(self):
        """Daily fill for a day of eastbound air travel across timezones.

        This is perhaps an extreme use case, but it is actually a fairly
        common one with air travel. Two sections on 2016-05-03 (SFO -> JFK,
        then JFK -> LHR) form the group; a third section on the 4th stands
        for the first trip of the next day. The assertions pin the group
        timezone to PST and the filled ts to 1462258800
        (2016-05-03T07:00:00Z).
        """
        key = (2016, 5, 3)

        # Leg 1: user leaves SFO at 1am on the 3rd for JFK on a
        # cross-country flight. The end is recorded 8 hours later, at 9:00
        # in the PST zone, with the EST timezone. Given the timestamps
        # asserted below, that is 12:00pm (noon) Eastern.
        first_leg = self._createTestSection(
            arrow.Arrow(2016, 5, 3, 1, tzinfo=tz.gettz(PST)), PST)
        first_leg['data'] = self._fillDates(
            first_leg.data, "end_",
            arrow.Arrow(2016, 5, 3, 9, tzinfo=tz.gettz(PST)), EST)

        # Leg 2: user leaves JFK for LHR at 1pm in the EST zone. The end is
        # recorded at hour 21 in the EST zone with the BST timezone; the
        # original note describes this as 2am on the 4th, local time.
        second_leg = self._createTestSection(
            arrow.Arrow(2016, 5, 3, 13, tzinfo=tz.gettz(EST)), EST)
        second_leg['data'] = self._fillDates(
            second_leg.data, "end_",
            arrow.Arrow(2016, 5, 3, 21, tzinfo=tz.gettz(EST)), BST)

        # Then she catches the train from the airport to her hotel in
        # London at 3am local time on the 4th. As per local time, this is a
        # new trip, so it is not part of the group for the 3rd.
        #
        # NOTE(review): the original comments argued that the range
        # timestamp should be generated in the timezone of the end of the
        # last section (BST) rather than the timezone of the start of the
        # trip. The assertions below instead expect PST, the zone where the
        # first trip started -- confirm which behavior is intended.
        next_day_first_trip = self._createTestSection(
            arrow.Arrow(2016, 5, 4, 3, tzinfo=tz.gettz(BST)), BST)

        test_section_list = [first_leg, second_leg]
        section_group_df = self.ts.to_data_df(
            eac.get_section_key_for_analysis_results(), test_section_list)
        logging.debug("first row is %s" % section_group_df.loc[0])

        # Timestamps are monotonically increasing
        # (2016-05-03T08:00:00Z and 2016-05-03T17:00:00Z)
        expected_start_ts = [1462262400, 1462294800]
        self.assertEqual(section_group_df.start_ts.tolist(),
                         expected_start_ts)

        # The timezone for the end time is PST since that's where we
        # started the first trip from
        self.assertEqual(earmt._get_tz(section_group_df), PST)

        ms = ecwms.ModeStatTimeSummary()
        earmt.local_dt_fill_times_daily(key, section_group_df, ms)
        logging.debug("before starting checks, ms = %s" % ms)

        filled_local_dt = ms.local_dt
        self.assertEqual(ms.ts, 1462258800)
        self.assertEqual(filled_local_dt.day, 3)
        self.assertEqual(filled_local_dt.timezone, PST)

        # The first trip of the next day must start after the timestamp
        # assigned to this day's range. (Original note: "This test fails if
        # it is not BST".)
        self.assertGreater(next_day_first_trip.data.start_ts, ms.ts)
コード例 #15
0
    def testLocalDtFillTimesDailyMultiTzGoingWest(self):
        """Daily fill for a day of westbound air travel across timezones.

        This is perhaps an extreme use case, but it is actually a fairly
        common one with air travel. Two sections on 2016-05-03 (Delhi ->
        JFK, then JFK -> SFO) form the group; a third section at midnight
        on the 4th in the PST zone stands for the first trip of the next
        day. The assertions pin the group timezone to IST and the filled ts
        to 1462213800 (2016-05-02T18:30:00Z).
        """
        key = (2016, 5, 3)

        # Step 1: user leaves Delhi at 1am on the 3rd for JFK on the
        # non-stop. The non-stop takes 15 hours, so she arrives in New York
        # at 16:00 IST = 6:30am EDT (taking into account the time
        # difference). The end_ fields of this section are not overridden.
        delhi_leg = self._createTestSection(
            arrow.Arrow(2016, 5, 3, 1, tzinfo=tz.gettz(IST)), IST)

        # Step 2: user leaves JFK for SFO at 7am in the EST zone on a
        # non-stop. The cross-country flight takes 8 hours, so she arrives
        # in SFO at 15:00 EDT = 12:00 PDT.
        jfk_leg = self._createTestSection(
            arrow.Arrow(2016, 5, 3, 7, tzinfo=tz.gettz(EST)), EST)
        jfk_leg['data'] = self._fillDates(
            jfk_leg.data, "end_",
            arrow.Arrow(2016, 5, 3, 15, tzinfo=tz.gettz(EST)), PST)

        # Step 3: user starts a trip out of SFO at midnight of the 4th in
        # the PST zone (earliest possible trip). For our timestamp algo to
        # be correct, this has to be after the timestamp for the range.
        next_day_first_trip = self._createTestSection(
            arrow.Arrow(2016, 5, 4, 0, tzinfo=tz.gettz(PST)), PST)

        test_section_list = [delhi_leg, jfk_leg]
        section_group_df = self.ts.to_data_df(
            eac.get_section_key_for_analysis_results(), test_section_list)

        # Timestamps are monotonically increasing
        # (2016-05-02T19:30:00Z and 2016-05-03T11:00:00Z)
        expected_start_ts = [1462217400, 1462273200]
        self.assertEqual(section_group_df.start_ts.tolist(),
                         expected_start_ts)
        # 2016-05-04T07:00:00Z
        self.assertEqual(next_day_first_trip.data.start_ts, 1462345200)

        # The timezone for the end time is IST since that's where we
        # started the first trip
        self.assertEqual(earmt._get_tz(section_group_df), IST)

        ms = ecwms.ModeStatTimeSummary()
        earmt.local_dt_fill_times_daily(key, section_group_df, ms)
        logging.debug("before starting checks, ms = %s" % ms)

        # Design discussion carried over from the original author:
        # - If a trip straddles two timezones, we need to decide how the
        #   metrics are split. A similar issue occurs when a trip straddles
        #   two days. We have arbitrarily decided to bucket by start_time,
        #   so we follow the same logic and bucket by the timezone of the
        #   start time.
        # - We need a notion of when the bucket ended. To some extent it
        #   can be set arbitrarily between the end of the last trip on the
        #   3rd and the start of the first trip on the 4th.
        # - The worry is that the first trip on the next day may be on the
        #   next day in the end timezone of the trip but on the same day in
        #   the start timezone of the trip (e.g. the reverse trip), so
        #   using the end of the section may be best after all.
        #
        # NOTE(review): the original comments also described the bucket as
        # ending at the end of the day in PST / EDT. The assertions below
        # expect the IST timezone instead -- confirm which is intended.
        filled_local_dt = ms.local_dt
        self.assertEqual(ms.ts, 1462213800)
        self.assertEqual(filled_local_dt.day, 3)
        self.assertEqual(filled_local_dt.timezone, IST)
        self.assertGreater(next_day_first_trip.data.start_ts, ms.ts)