Ejemplo n.º 1
0
 def evaluate_bins(self):
     """
     Label every binned trip and score the binning.

     Rebuilds ``self.labels`` with one entry per trip index stored in
     ``self.bins``; each label is the position of the bin holding that trip.

     :return: the result of ``metrics.silhouette_score`` computed over the
         [start_lat, start_lon, end_lat, end_lon] point of every binned
         trip, or None when there is no data, there are no bins, or fewer
         than two trips are binned.
     """
     # enumerate() yields the bin position directly. list.index() rescans
     # the bins for every trip and returns the first *equal* bin, which
     # would merge the labels of two bins with identical contents.
     self.labels = [bin_idx
                    for bin_idx, curr_bin in enumerate(self.bins)
                    for _ in curr_bin]
     if not self.data or not self.bins:
         return
     if len(self.labels) < 2:
         logging.debug('Everything is in one bin.')
         return
     labels = numpy.array(self.labels)
     points = []
     for curr_bin in self.bins:
         for trip_idx in curr_bin:
             tb = self.data[trip_idx]
             start_place = esda.get_entry(esda.CLEANED_PLACE_KEY,
                                          tb.data.start_place)
             end_place = esda.get_entry(esda.CLEANED_PLACE_KEY,
                                        tb.data.end_place)
             # The coordinates are read as (lon, lat); each point is
             # assembled as (lat, lon) pairs.
             start_lon = start_place.data.location["coordinates"][0]
             start_lat = start_place.data.location["coordinates"][1]
             end_lon = end_place.data.location["coordinates"][0]
             end_lat = end_place.data.location["coordinates"][1]
             points.append([start_lat, start_lon, end_lat, end_lon])
     logging.debug("number of labels are %d, number of points are = %d" %
                   (len(labels), len(points)))
     a = metrics.silhouette_score(numpy.array(points), labels)
     logging.debug('number of bins is %d' % len(self.bins))
     # %s rather than %d: %d truncates the float score to an integer and
     # raises ValueError if the score is NaN.
     logging.debug('silhouette score is %s' % a)
     return a
Ejemplo n.º 2
0
    def __init__(self, data, radius):
        """
        Store the trips and drop the ones that cannot be used.

        A trip is removed from ``self.data`` when ``self.distance`` returns a
        truthy value for its start and end place coordinates, or when looking
        up those places raises an exception.

        :param data: list of trip entries; a falsy value is replaced by an
            empty list. The list object that is passed in is filtered in
            place.
        :param radius: converted to float and stored as ``self.radius``
        """
        self.data = data
        if not data:
            self.data = []
        self.bins = []
        self.radius = float(radius)
        # Iterate over a snapshot: removing from the list that is being
        # iterated makes the loop skip the trip right after each removal,
        # so some trips would never be examined.
        for t in list(self.data):
            logging.debug("Considering trip %s" % t)
            try:
                start_place = esda.get_entry(esda.CLEANED_PLACE_KEY,
                                             t.data.start_place)
                end_place = esda.get_entry(esda.CLEANED_PLACE_KEY,
                                           t.data.end_place)
                start_lon = start_place.data.location["coordinates"][0]
                start_lat = start_place.data.location["coordinates"][1]
                end_lon = end_place.data.location["coordinates"][0]
                end_lat = end_place.data.location["coordinates"][1]
                logging.debug("endpoints are = (%s, %s) and (%s, %s)" %
                              (start_lon, start_lat, end_lon, end_lat))
                if self.distance(start_lat, start_lon, end_lat, end_lon):
                    self.data.remove(t)
            except Exception:
                # Best effort: a trip whose places cannot be resolved is
                # dropped. Exception (not a bare except) so that
                # KeyboardInterrupt/SystemExit still propagate.
                logging.exception(
                    "exception while getting start and end places for %s" % t)
                self.data.remove(t)

        logging.debug(
            'After removing trips that are points, there are %s data points' %
            len(self.data))
        self.size = len(self.data)
 def get_reps(self):
     """
     Build one representative Trip per cluster and store them in self.reps.

     The representative's start and end Coordinate are the means of the
     start and end (lat, lon) values of the cluster's members. Nothing is
     built when self.data is empty.
     """
     self.reps = []
     if not self.data:
         return
     for cluster in self.clusters:
         start_lats, start_lons, end_lats, end_lons = [], [], [], []
         points = [start_lats, start_lons, end_lats, end_lons]
         for member in cluster:
             if self.is_old:
                 start_lats.append(member.trip_start_location.lat)
                 start_lons.append(member.trip_start_location.lon)
                 end_lats.append(member.trip_end_location.lat)
                 end_lons.append(member.trip_end_location.lon)
                 continue
             # New-style entries keep geojson coordinates, i.e. (lon, lat),
             # so the indices are flipped to stay (lat, lon) like the old
             # format. The start/end *places* are used instead of the trip
             # endpoints, which may be some distance away due to geofencing.
             start_place = esda.get_entry(esda.CLEANED_PLACE_KEY,
                                          member.data.start_place)
             end_place = esda.get_entry(esda.CLEANED_PLACE_KEY,
                                        member.data.end_place)
             start_coords = start_place.data.location["coordinates"]
             start_lats.append(start_coords[1])
             start_lons.append(start_coords[0])
             end_coords = end_place.data.location["coordinates"]
             end_lats.append(end_coords[1])
             end_lons.append(end_coords[0])
             logging.debug("in representatives, endpoints are = %s" %
                           points)
         centers = numpy.mean(points, axis=1)
         rep_start = Coordinate(centers[0], centers[1])
         rep_end = Coordinate(centers[2], centers[3])
         self.reps.append(
             Trip(None, None, None, None, None, None, rep_start, rep_end))
Ejemplo n.º 4
0
    def get_reps(self):
        """
        Build one representative cleaned-trip entry per non-empty cluster.

        Each representative is an ``analysis/cleaned_trip`` entry whose start
        and end locations are the means of the member trips' start and end
        place coordinates. Results are stored in ``self.reps``; empty
        clusters are skipped and nothing is built when self.data is empty.
        """
        self.reps = []
        if not self.data:
            return
        for cluster_idx, cluster in enumerate(self.clusters):
            logging.debug("Considering cluster %d = %s" % (cluster_idx, cluster))

            # An empty cluster has nothing to average, so skip it
            if len(cluster) == 0:
                logging.info("Cluster %d = %s, has length %d, skipping" %
                             (cluster_idx, cluster, len(cluster)))
                continue

            start_lats, start_lngs, end_lats, end_lngs = [], [], [], []
            points = [start_lats, start_lngs, end_lats, end_lngs]
            for point_idx, c in enumerate(cluster):
                logging.debug("Consider point %d = %s" % (point_idx, c))
                start_place = esda.get_entry(esda.CLEANED_PLACE_KEY,
                                             c.data.start_place)
                end_place = esda.get_entry(esda.CLEANED_PLACE_KEY,
                                           c.data.end_place)
                # Coordinates are read as (lng, lat); store as lat, lng
                start_coords = start_place.data.location["coordinates"]
                start_lats.append(start_coords[1])
                start_lngs.append(start_coords[0])
                end_coords = end_place.data.location["coordinates"]
                end_lats.append(end_coords[1])
                end_lngs.append(end_coords[0])
                logging.debug("in representatives, endpoints have len = %s" %
                              len(points))
            centers = numpy.mean(points, axis=1)
            logging.debug("For cluster %d, centers are %s" %
                          (cluster_idx, centers))
            # Points are built back in (lng, lat) order
            rep_trip = ecwt.Trip({
                "start_loc": gj.Point([centers[1], centers[0]]),
                "end_loc": gj.Point([centers[3], centers[2]])
            })
            # c is the last member of the cluster; its user_id is used for
            # the representative entry
            self.reps.append(
                ecwe.Entry.create_entry(c.user_id, "analysis/cleaned_trip",
                                        rep_trip))
Ejemplo n.º 5
0
    def __init__(self, data, radius):
        """
        Keep the usable trips from data and record the bin radius.

        Trips are dropped from ``self.data`` when ``self.distance`` returns a
        truthy value for their start/end place coordinates, or when resolving
        their places raises an exception.

        :param data: list of trip entries (falsy means no trips). The list
            object that is passed in is filtered in place.
        :param radius: stored as ``self.radius`` after conversion to float
        """
        self.data = data
        if not data:
            self.data = []
        self.bins = []
        self.radius = float(radius)
        # Loop over a copy of the list. Calling remove() on the list that
        # drives the loop shifts the remaining items and the trip following
        # each removed one would be skipped.
        for t in list(self.data):
            logging.debug("Considering trip %s" % t)
            try:
                start_place = esda.get_entry(esda.CLEANED_PLACE_KEY,
                                             t.data.start_place)
                end_place = esda.get_entry(esda.CLEANED_PLACE_KEY,
                                           t.data.end_place)
                start_lon = start_place.data.location["coordinates"][0]
                start_lat = start_place.data.location["coordinates"][1]
                end_lon = end_place.data.location["coordinates"][0]
                end_lat = end_place.data.location["coordinates"][1]
                logging.debug("endpoints are = (%s, %s) and (%s, %s)" %
                              (start_lon, start_lat, end_lon, end_lat))
                if self.distance(start_lat, start_lon, end_lat, end_lon):
                    self.data.remove(t)
            except Exception:
                # Trips whose places cannot be resolved are dropped; catching
                # Exception instead of everything lets KeyboardInterrupt and
                # SystemExit through.
                logging.exception("exception while getting start and end places for %s" % t)
                self.data.remove(t)

        logging.debug('After removing trips that are points, there are %s data points' % len(self.data))
        self.size = len(self.data)
Ejemplo n.º 6
0
 def evaluate_bins(self):
     """
     Assign a bin label to each binned trip and compute a silhouette score.

     ``self.labels`` is rebuilt with one label per trip index found in
     ``self.bins``; the label is the position of the containing bin.

     :return: the value of ``metrics.silhouette_score`` over the
         [start_lat, start_lon, end_lat, end_lon] points of the binned
         trips, or None if there is no data, no bins, or fewer than two
         binned trips.
     """
     # Take the position from enumerate() instead of self.bins.index():
     # index() is a linear search per trip and returns the first bin that
     # compares equal, so bins with the same contents would share a label.
     self.labels = [position
                    for position, members in enumerate(self.bins)
                    for _ in members]
     if not self.data or not self.bins:
         return
     if len(self.labels) < 2:
         logging.debug('Everything is in one bin.')
         return
     labels = numpy.array(self.labels)
     points = []
     for members in self.bins:
         for trip_idx in members:
             tb = self.data[trip_idx]
             start_place = esda.get_entry(esda.CLEANED_PLACE_KEY,
                                          tb.data.start_place)
             end_place = esda.get_entry(esda.CLEANED_PLACE_KEY,
                                        tb.data.end_place)
             # Coordinates are read as (lon, lat) and stored as (lat, lon)
             start_lon = start_place.data.location["coordinates"][0]
             start_lat = start_place.data.location["coordinates"][1]
             end_lon = end_place.data.location["coordinates"][0]
             end_lat = end_place.data.location["coordinates"][1]
             points.append([start_lat, start_lon, end_lat, end_lon])
     logging.debug("number of labels are %d, number of points are = %d" %
                   (len(labels), len(points)))
     a = metrics.silhouette_score(numpy.array(points), labels)
     logging.debug('number of bins is %d' % len(self.bins))
     # The score is a float: %d would log it truncated to an integer and
     # raise ValueError for NaN, so format it with %s.
     logging.debug('silhouette score is %s' % a)
     return a
Ejemplo n.º 7
0
    def get_reps(self):
        """
        Compute a representative trip entry for every non-empty cluster.

        For each cluster the start and end place coordinates of its members
        are averaged and wrapped in an ``analysis/cleaned_trip`` entry, which
        is appended to ``self.reps``. Returns early, leaving ``self.reps``
        empty, when there is no data.
        """
        self.reps = []
        if not self.data:
            return
        for idx, cluster in enumerate(self.clusters):
            logging.debug("Considering cluster %d = %s" % (idx, cluster))

            # Nothing to average for a cluster without points
            if len(cluster) == 0:
                logging.info("Cluster %d = %s, has length %d, skipping" %
                             (idx, cluster, len(cluster)))
                continue

            # Row order: start lat, start lng, end lat, end lng
            points = [[], [], [], []]
            for member_idx, c in enumerate(cluster):
                logging.debug("Consider point %d = %s" % (member_idx, c))
                start_place = esda.get_entry(esda.CLEANED_PLACE_KEY,
                                             c.data.start_place)
                end_place = esda.get_entry(esda.CLEANED_PLACE_KEY,
                                           c.data.end_place)
                start_lng_lat = start_place.data.location["coordinates"]
                points[0].append(start_lng_lat[1])
                points[1].append(start_lng_lat[0])
                end_lng_lat = end_place.data.location["coordinates"]
                points[2].append(end_lng_lat[1])
                points[3].append(end_lng_lat[0])
                logging.debug("in representatives, endpoints have len = %s" %
                              len(points))
            centers = numpy.mean(points, axis=1)
            logging.debug("For cluster %d, centers are %s" % (idx, centers))
            # The Points take [lng, lat], hence the swapped indices
            start_loc = gj.Point([centers[1], centers[0]])
            end_loc = gj.Point([centers[3], centers[2]])
            t = ecwt.Trip({"start_loc": start_loc, "end_loc": end_loc})
            # The user_id comes from the last member visited in the loop
            rep_entry = ecwe.Entry.create_entry(c.user_id,
                                                "analysis/cleaned_trip", t)
            self.reps.append(rep_entry)
Ejemplo n.º 8
0
    def _addIfNotExists(self, place_id):
        """
        Return the place entry for place_id, loading and caching it on a miss.

        On a miss the entry is fetched with
        ``esda.get_entry(self.place_key, place_id)``, appended to
        ``self.places`` and stored in ``self.id_map``.

        :param place_id: id of the place to look up
        :return: the cached or newly retrieved place entry
        """
        # The function-local import of place_queries was dropped: nothing in
        # this method uses it.
        if place_id in self.id_map:
            return self.id_map[place_id]

        logging.debug("place id %s is not in the map, searching in database" % place_id)
        place_entry = esda.get_entry(self.place_key, place_id)
        self.places.append(place_entry)
        self.id_map[place_id] = place_entry
        logging.debug("retrieved object %s and added to id_map" % place_entry)
        return place_entry
Ejemplo n.º 9
0
    def _addIfNotExists(self, place_id):
        """
        Look up a place by id, adding it to the place list and map if needed.

        A place that is not yet in ``self.id_map`` is retrieved through
        ``esda.get_entry(self.place_key, place_id)``, appended to
        ``self.places`` and registered in ``self.id_map``.

        :param place_id: id of the place to return
        :return: the place entry, either from the map or freshly retrieved
        """
        # No local import of place_queries here: the method never used it.
        if place_id in self.id_map:
            return self.id_map[place_id]

        logging.debug(
            "place id %s is not in the map, searching in database" %
            place_id)
        place_entry = esda.get_entry(self.place_key, place_id)
        self.places.append(place_entry)
        self.id_map[place_id] = place_entry
        logging.debug("retrieved object %s and added to id_map" %
                      place_entry)
        return place_entry
    def testGetLastPlace(self):
        # Scenario: a raw place without exit_ts is reported as the last
        # place; once exit_ts is set and saved, no last place is reported.
        user_id = self.testUserId
        raw_place = ecwrp.Rawplace()
        raw_place.enter_ts = 5
        raw_place_id = esta.TimeSeries.get_time_series(user_id).insert_data(
            user_id, "segmentation/raw_place", raw_place)
        saved_entry = esda.get_entry(esda.RAW_PLACE_KEY, raw_place_id)
        logging.debug("old place entry is %s "% saved_entry)
        esta.TimeSeries.get_time_series(user_id).update(saved_entry)

        # The place saved above has no exit_ts yet, so it is the last place
        open_entry = esdp.get_last_place_entry(esda.RAW_PLACE_KEY, user_id)
        open_entry["data"]["exit_ts"] = 6
        logging.debug("About to update entry to %s" % open_entry)
        esta.TimeSeries.get_time_series(user_id).update(open_entry)

        # With exit_ts set and saved, there must be no last place any more
        self.assertIsNone(
            esdp.get_last_place_entry(esda.RAW_PLACE_KEY, user_id))
Ejemplo n.º 11
0
    def testGetLastPlace(self):
        # Checks that get_last_place_entry returns the place that has no
        # exit_ts, and returns None once that place has been closed.
        uid = self.testUserId
        place = ecwrp.Rawplace()
        place.enter_ts = 5
        place_id = esta.TimeSeries.get_time_series(uid).insert_data(
            uid, "segmentation/raw_place", place)
        place_entry = esda.get_entry(esda.RAW_PLACE_KEY, place_id)
        logging.debug("old place entry is %s " % place_entry)
        esta.TimeSeries.get_time_series(uid).update(place_entry)

        # No exit_ts has been set so far, so this place is the last place
        last_entry = esdp.get_last_place_entry(esda.RAW_PLACE_KEY, uid)
        last_entry["data"]["exit_ts"] = 6
        logging.debug("About to update entry to %s" % last_entry)
        esta.TimeSeries.get_time_series(uid).update(last_entry)

        # After saving the exit_ts, there is no last place left
        last_entry = esdp.get_last_place_entry(esda.RAW_PLACE_KEY, uid)
        self.assertIsNone(last_entry)
Ejemplo n.º 12
0
def get_last_place_before(place_key, reset_ts, user_id):
    """
    Unlike `get_last_place_entry` which returns the last place in the
    timeline, this returns the last place before a particular timestamp.
    Used to reset the pipeline, for example.

    To implement this, we can't just look for places before that timestamp,
    because then we will get a list. And we don't want to retrieve all of them
    and sort either.

    We can look for places that exit after that timestamp, but that will also
    give a list. But hopefully, a shorter list, so that we don't have to sort
    as much.  I can't think of an alternative that doesn't require sorting.

    Oh wait! There is an alternative!

    We can look for the place that has an enter timestamp before the ts and an
    exit timestamp after, or a trip that has a start timestamp before the ts
    and an end timestamp after. We should only find one. And if we find the
    trip then the place is its start place.

    Note that these correspond to the two use cases in
    https://github.com/e-mission/e-mission-server/issues/333

    :param place_key: metadata key of the places to search
    :param reset_ts: timestamp that the place (or trip) must straddle
    :param user_id: user whose timeline is searched
    :return: the straddling place (or the start place of the straddling
        trip) wrapped in an ``ecwe.Entry``. When nothing straddles the
        timestamp: the last place entry if it was entered before reset_ts,
        or None if there is no last place or it has no enter_ts.
    :raises ValueError: if nothing straddles reset_ts and the last place was
        not entered before it
    """
    trip_key_query = _get_trip_key_query(place_key)
    logging.debug("Looking for last place before %s" % reset_ts)

    ts = esta.TimeSeries.get_time_series(user_id)
    # This query exists only to feed the debug log, so skip the database
    # round trip when debug logging is disabled.
    if logging.getLogger().isEnabledFor(logging.DEBUG):
        all_user_places = list(edb.get_analysis_timeseries_db().find(
            {"user_id": user_id, "metadata.key": place_key},
            {"_id": True, "data.enter_fmt_time": True, "data.exit_fmt_time": True}))
        logging.debug("all places for this user = %s" % all_user_places)
    ret_place_doc = ts.analysis_timeseries_db.find_one({'user_id': user_id,
                                                        'metadata.key': place_key,
                                                        'data.exit_ts' : {'$gt': reset_ts},
                                                        'data.enter_ts': {'$lt': reset_ts}
                                                       })
    logging.debug("last place doc for user %s = %s" % (user_id, ret_place_doc))
    ret_trip_doc = ts.analysis_timeseries_db.find_one({'user_id': user_id,
                                                        'metadata.key': trip_key_query,
                                                        'data.end_ts' : {'$gt': reset_ts},
                                                        'data.start_ts': {'$lt': reset_ts}
                                                       })
    logging.debug("last trip doc for user %s = %s" % (user_id, ret_trip_doc))
    if ret_place_doc is None and ret_trip_doc is None:
        # Check to see if the pipeline ended before this
        last_place = get_last_place_entry(place_key, user_id)
        logging.debug("last_place = %s, reset_ts = %s" %
            (last_place, reset_ts))
        if last_place is None:
            return None
        elif last_place.data.enter_ts is None:
            # Comparing None with a number raises TypeError on Python 3;
            # a place without an enter_ts cannot be "before" reset_ts.
            return None
        elif last_place.data.enter_ts < reset_ts:
            return last_place
        else:
            raise ValueError("No trip or place straddling time %s for user %s" %
                (reset_ts, user_id))
    if ret_place_doc is None:
        # Only a trip straddles the timestamp: use its start place
        assert ret_trip_doc is not None
        logging.info("ret_trip_doc start = %s, end = %s" %
            (ret_trip_doc["data"]["start_fmt_time"],
             ret_trip_doc["data"]["end_fmt_time"]))
        ret_place_doc = esda.get_entry(place_key, ret_trip_doc["data"]['start_place'])

    assert ret_place_doc is not None
    ret_place = ecwe.Entry(ret_place_doc)
    return ret_place
Ejemplo n.º 13
0
def get_last_place_before(place_key, reset_ts, user_id):
    """
    Unlike `get_last_place_entry` which returns the last place in the
    timeline, this returns the last place before a particular timestamp.
    Used to reset the pipeline, for example.

    To implement this, we can't just look for places before that timestamp,
    because then we will get a list. And we don't want to retrieve all of them
    and sort either.

    We can look for places that exit after that timestamp, but that will also
    give a list. But hopefully, a shorter list, so that we don't have to sort
    as much.  I can't think of an alternative that doesn't require sorting.

    Oh wait! There is an alternative!

    We can look for the place that has an enter timestamp before the ts and an
    exit timestamp after, or a trip that has a start timestamp before the ts
    and an end timestamp after. We should only find one. And if we find the
    trip then the place is its start place.

    Note that these correspond to the two use cases in
    https://github.com/e-mission/e-mission-server/issues/333

    :param place_key: metadata key of the places to search
    :param reset_ts: timestamp that the place (or trip) must straddle
    :param user_id: user whose timeline is searched
    :return: the straddling place (or the start place of the straddling
        trip) wrapped in an ``ecwe.Entry``. When nothing straddles the
        timestamp: the last place entry if it was entered before reset_ts,
        or None if there is no last place or it has no enter_ts.
    :raises ValueError: if nothing straddles reset_ts and the last place was
        not entered before it
    """
    trip_key_query = _get_trip_key_query(place_key)
    logging.debug("Looking for last place before %s" % reset_ts)

    ts = esta.TimeSeries.get_time_series(user_id)
    # Listing every place of the user is only needed for the debug log, so
    # the query is not issued unless debug logging is enabled.
    if logging.getLogger().isEnabledFor(logging.DEBUG):
        all_user_places = list(edb.get_analysis_timeseries_db().find(
            {"user_id": user_id, "metadata.key": place_key},
            {"_id": True, "data.enter_fmt_time": True, "data.exit_fmt_time": True}))
        logging.debug("all places for this user = %s" % all_user_places)
    ret_place_doc = ts.analysis_timeseries_db.find_one({'user_id': user_id,
                                                        'metadata.key': place_key,
                                                        'data.exit_ts' : {'$gt': reset_ts},
                                                        'data.enter_ts': {'$lt': reset_ts}
                                                       })
    logging.debug("last place doc for user %s = %s" % (user_id, ret_place_doc))
    ret_trip_doc = ts.analysis_timeseries_db.find_one({'user_id': user_id,
                                                        'metadata.key': trip_key_query,
                                                        'data.end_ts' : {'$gt': reset_ts},
                                                        'data.start_ts': {'$lt': reset_ts}
                                                       })
    logging.debug("last trip doc for user %s = %s" % (user_id, ret_trip_doc))
    if ret_place_doc is None and ret_trip_doc is None:
        # Check to see if the pipeline ended before this
        last_place = get_last_place_entry(place_key, user_id)
        logging.debug("last_place = %s, reset_ts = %s" %
            (last_place, reset_ts))
        if last_place is None:
            return None
        elif last_place.data.enter_ts is None:
            # A place without an enter_ts cannot be compared with reset_ts
            return None
        elif last_place.data.enter_ts < reset_ts:
            return last_place
        else:
            raise ValueError("No trip or place straddling time %s for user %s" %
                (reset_ts, user_id))
    if ret_place_doc is None:
        # Only a trip straddles the timestamp: use its start place
        assert ret_trip_doc is not None
        logging.info("ret_trip_doc start = %s, end = %s" %
            (ret_trip_doc["data"]["start_fmt_time"],
             ret_trip_doc["data"]["end_fmt_time"]))
        ret_place_doc = esda.get_entry(place_key, ret_trip_doc["data"]['start_place'])

    assert ret_place_doc is not None
    ret_place = ecwe.Entry(ret_place_doc)
    return ret_place
Ejemplo n.º 14
0
def reset_user_to_ts(user_id, ts, is_dry_run):
    """
        Reset the analysis results of a user back to the place before ts.

        When we delete objects, we want to leave an open connection to the
        prior chain so that the newly created chain can be attached to it.
        In other words, if we want to delete after 2016-07-23, the place that
        we entered at 2016-07-22 is retained but with no starting trip, so
        that the newly identified trip can be rejoined to the existing place.
        The various use cases for this are documented under
        https://github.com/e-mission/e-mission-server/issues/333

        The steps are:
        a) find the place before the time
        b) clear all analysis results after it
        c) open the place
        d) reset pipeline states to its enter_ts

        FYI: this is how we did the query earlier
        edb.get_analysis_timeseries_db().find(first_affected_query).sort('data.exit_ts').limit(1)

        :param user_id: user to reset; None makes this a no-op
        :param ts: timestamp to reset to
        :param is_dry_run: passed through to every helper that is called
    """
    if user_id is None:
        logging.info("user_id = None, skipping reset...")
        return

    # a) find the place before the time
    try:
        last_cleaned_place = esdp.get_last_place_before(esda.CLEANED_PLACE_KEY, ts, user_id)
        logging.debug("last_cleaned_place = %s" % last_cleaned_place)
        if last_cleaned_place is None or last_cleaned_place.data.exit_ts is None:
            logging.info("Data collection for user %s stopped before reset time, early return" % user_id)
            return
    except ValueError:
        first_cleaned_place = esdp.get_first_place_entry(esda.CLEANED_PLACE_KEY, user_id)
        if first_cleaned_place is None or not (first_cleaned_place.data.exit_ts > ts):
            raise
        # The first place exits after ts, so reset to the very start
        logging.info("first_cleaned_place.exit = %s (%s), resetting to start" % 
            (first_cleaned_place.data.exit_ts,
            first_cleaned_place.data.exit_fmt_time))
        reset_user_to_start(user_id, is_dry_run)
        return

    raw_place_ids = last_cleaned_place["data"]["raw_places"]

    last_raw_place = esda.get_entry(esda.RAW_PLACE_KEY, raw_place_ids[-1])
    logging.debug("last_raw_place = %s" % last_raw_place)

    # Reason for using first_raw_place is
    # https://github.com/e-mission/e-mission-server/issues/333#issuecomment-312735236
    first_raw_place = esda.get_entry(esda.RAW_PLACE_KEY, raw_place_ids[0])
    logging.debug("first_raw_place = %s" % first_raw_place)

    reset_ts = first_raw_place.data.enter_ts
    logging.debug("last_place_enter_ts = %s" % reset_ts)
    logging.debug("reset_ts = %s" % reset_ts)

    # b) clear all analysis results after it
    del_objects_after(user_id, reset_ts, is_dry_run)

    # c) open the cleaned and raw places
    for place in (last_cleaned_place, last_raw_place):
        reset_last_place(place, is_dry_run)

    # d) reset pipeline states to its enter_ts
    reset_pipeline_state(user_id, reset_ts, is_dry_run)
Ejemplo n.º 15
0
        if last_cleaned_place is None or last_cleaned_place.data.exit_ts is None:
            logging.info("Data collection for user %s stopped before reset time, early return" % user_id)
            return
    except ValueError, e:
        first_cleaned_place = esdp.get_first_place_entry(esda.CLEANED_PLACE_KEY, user_id)
        if first_cleaned_place is not None and first_cleaned_place.data.exit_ts > ts:
            logging.info("first_cleaned_place.exit = %s (%s), resetting to start" % 
                (first_cleaned_place.data.exit_ts,
                first_cleaned_place.data.exit_fmt_time))
            reset_user_to_start(user_id, is_dry_run)
            return
        else:
            raise

    last_raw_place_id = last_cleaned_place["data"]["raw_places"][-1]
    last_raw_place = esda.get_entry(esda.RAW_PLACE_KEY, last_raw_place_id)
    logging.debug("last_raw_place = %s" % last_raw_place)

    # Reason for using first_raw_place is
    # https://github.com/e-mission/e-mission-server/issues/333#issuecomment-312735236
    first_raw_place_id = last_cleaned_place["data"]["raw_places"][0]
    first_raw_place = esda.get_entry(esda.RAW_PLACE_KEY, first_raw_place_id)
    logging.debug("first_raw_place = %s" % first_raw_place)

    last_place_enter_ts = first_raw_place.data.enter_ts
    logging.debug("last_place_enter_ts = %s" % last_place_enter_ts)

    reset_ts = last_place_enter_ts
    logging.debug("reset_ts = %s" % last_place_enter_ts)

    # clear all analysis results after it