class DataDefinition(models.Model):
    """
    Defines a common data definition to share among common trip data
    schema instances.
    """
    short_name = models.CharField(max_length=32)
    # Use ``dict`` (a callable) rather than ``{}`` so instances do not share
    # a single mutable default.
    definition = JSONField(default=dict, validators=[validators.JsonValidator()])

    def __str__(self):
        return self.short_name
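
# Usage sketch (illustrative only): registering a shared definition for a
# common schema. The keys inside ``definition`` are hypothetical, not part
# of this codebase.
def _example_data_definition():
    return DataDefinition.objects.create(
        short_name='gps-v1',
        definition={'columns': ['lat', 'lon', 'alt']},
    )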
class ClusterConfig(models.Model):
    """
    Represents the cluster algorithm and algorithm inputs.
    """
    ALGORITHMS = (
        ('AffinityPropagation', 'Affinity Propagation'),
        ('DBSCAN', 'DBSCAN'),
        ('Agglomerative', 'Agglomerative'),
        ('Birch', 'Birch'),
        ('KMeans', 'k-Means'),
        ('MiniBatchKMeans', 'Mini Batch k-Means'),
        ('MeanShift', 'Mean Shift'),
        ('Spectral', 'Spectral'),
        ('Ward', 'Ward'),
    )
    algorithm = models.CharField(max_length=20, choices=ALGORITHMS)
    arguments = JSONField(
        default=dict,
        validators=[validators.JsonValidator()],
        blank=True,
        null=True,
        help_text='Additional arguments to pass to the specific cluster model')

    def __str__(self):
        return '{} {}'.format(self.get_algorithm_display(), self.arguments)
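
# Usage sketch (illustrative only): ``arguments`` is forwarded as keyword
# arguments to the clustering implementation, so for DBSCAN it can carry
# options such as scikit-learn's ``eps`` and ``min_samples``. Whether those
# names are honored depends on processing.ClusterModel, which is not shown
# here; the values below are hypothetical.
def _example_cluster_config():
    return ClusterConfig.objects.create(
        algorithm='DBSCAN',
        arguments={'eps': 0.5, 'min_samples': 5},
    )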
class Attribute(models.Model, AttributeQueryMixin):
    objects = AttributeQueryObjectManager()

    name = models.CharField(max_length=64)
    attribute = JSONField(null=True, blank=True,
                          validators=[validators.JsonValidator()])

    def __str__(self):
        return self.name
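
# Usage sketch (illustrative only): attributes are free-form JSON documents
# keyed by name; the payload shown is hypothetical.
def _example_attribute():
    return Attribute.objects.create(
        name='weather',
        attribute={'condition': 'rain', 'visibility_km': 2.5},
    )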
class Organization(models.Model):
    name = models.CharField(max_length=64)
    # choices must be reiterable, so build a list rather than a generator
    # (a generator would be exhausted after the first pass over it).
    timezone = models.CharField(
        max_length=64, null=True,
        choices=[(k, timezones[k]) for k in sorted(timezones.keys())])
    metadata = JSONField(default=dict, validators=[validators.JsonValidator()],
                         blank=True)

    def __str__(self):
        return self.name
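
# Usage sketch (illustrative only): ``timezone`` must be a key in the
# ``timezones`` mapping this module is assumed to import, e.g. an IANA name
# such as 'America/New_York'. The metadata payload is hypothetical.
def _example_organization():
    return Organization.objects.create(
        name='Acme Transit',
        timezone='America/New_York',
        metadata={'fleet_size': 120},
    )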
class TopicModelConfig(models.Model):
    """
    Stores the configuration for a particular model that can be applied to
    multiple sets of data, allowing general comparison of the model across
    multiple data sets.
    """
    IMPLEMENTATIONS = (
        ('MalletLda', 'LDA MALLET'),
        ('GensimLda', 'LDA Gensim'),
        # ('GensimTfIdf', 'TF/IDF'),
        ('GensimLsi', 'LSI'),
    )
    algorithm = models.CharField(max_length=16, choices=IMPLEMENTATIONS)
    num_topics = models.IntegerField(default=300)
    arguments = JSONField(
        default=dict,
        blank=True,
        null=True,
        validators=[validators.JsonValidator()],
        help_text='Additional arguments to pass to the topic model.')

    def __str__(self):
        return '%s(%s)' % (self.get_algorithm_display(), self.num_topics)
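
# Usage sketch (illustrative only): a gensim LDA configuration whose extra
# ``arguments`` are passed through to the topic model. ``passes`` is a real
# gensim LdaModel keyword, but whether it is honored depends on the
# (unshown) model-building code.
def _example_topic_model_config():
    return TopicModelConfig.objects.create(
        algorithm='GensimLda',
        num_topics=100,
        arguments={'passes': 5},
    )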
class Entity(models.Model):
    """
    Represents a physical entity that can generate trajectories (aka Trips).
    It allows association of multiple trips with a single resource for better
    data connectivity and relationships.
    """
    objects = EntityManager()

    organization = models.ForeignKey(Organization, null=True)
    common_id = models.CharField(
        max_length=16, db_index=True,
        help_text='Commonly used identifier (e.g. tail number, license plate)')
    physical_id = models.CharField(
        max_length=64, db_index=True,
        help_text='Unique id following the physical resource (e.g. VIN)')
    metadata = JSONField(default=dict, validators=[validators.JsonValidator()],
                         blank=True)

    class Meta:
        # It seems tail numbers are more unique
        unique_together = ('physical_id',)  # 'common_id')
        verbose_name_plural = 'entities'

    def __str__(self):
        return self.common_id

    def natural_key(self):
        return self.physical_id

    def combine_trip_data(self, prepare=_prep_dataframe, trip_filter=None):
        trips = self.trip_set.all()
        if trip_filter:
            trips = trips.filter(trip_filter)
        qs = TripData.objects.filter(trip__in=trips)
        return TripManager.combine_trip_data(qs, prepare)
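
# Usage sketch (illustrative only): combine_trip_data merges the data of all
# (optionally filtered) trips for one entity. The Q filter below is
# hypothetical but uses the real ``start_datetime`` field on Trip.
def _example_combine_entity_data(entity, since):
    from django.db.models import Q
    return entity.combine_trip_data(trip_filter=Q(start_datetime__gte=since))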
class TripData(models.Model):
    """
    Isolates the large binary data from the rest of the record.
    """
    # String reference because Trip is defined later in this module.
    trip = models.ForeignKey('Trip', related_name='tripdata_set')
    _dataframe = models.BinaryField()
    metadata = JSONField(default=dict, validators=[validators.JsonValidator()])
    definition = models.ForeignKey(DataDefinition)

    class Meta:
        verbose_name_plural = 'Trip data'

    def __str__(self):
        return '%s|%s' % (str(self.trip), self.id)

    @classmethod
    def from_db(cls, db, field_names, values):
        instance = super().from_db(db, field_names, values)
        instance._db_to_dataframe()
        return instance

    def _dataframe_to_db(self):
        """
        This is basically the same code as pandas 0.18
        pandas.io.pickle.to_pickle, but keeping the bytes in memory. Since
        that method does not allow passing the pickle in memory (only via a
        file on the file system), the logic is duplicated.
        """
        self._dataframe = pickle.dumps(self.dataframe,
                                       protocol=pickle.HIGHEST_PROTOCOL)

    def _db_to_dataframe(self):
        """
        This is basically pd.read_pickle, but allowing in-memory objects and
        forcing Python 3. The idea is that pandas.read_pickle maintains some
        semblance of backward compatibility, making this more robust. The
        code is derived from pandas 0.18 pandas.io.pickle.
        """
        fh = BytesIO(self._dataframe)
        encoding = 'latin1'
        try:
            self.dataframe = pickle.load(fh)
        except Exception:
            try:
                # reg/patched pickle (rewind first; pandas reopens the file
                # at this point, which an in-memory buffer must emulate)
                fh.seek(0)
                self.dataframe = pc.load(fh, encoding=encoding, compat=False)
            except Exception:
                # compat pickle
                fh.seek(0)
                self.dataframe = pc.load(fh, encoding=encoding, compat=True)

    def save(self, *args, **kwargs):
        self._dataframe_to_db()
        super().save(*args, **kwargs)

    def dataframe_filter(self, params=None, times=(None, None)):
        """
        params is an iterable of parameters to retrieve.
        times is a tuple of (start_time, duration) to retrieve.
        """
        if times[0] is None and (params is None or len(params) == 0):
            return self.dataframe
        if times is not None and times[0] is not None:
            times = (times[0], times[0] + times[1])
        else:
            times = (None, None)
        params = params if params is not None else []
        if len(params) > 0:
            avail_params_set = set(self.paramlist())
            # .loc needs a list, not a set, for label-based column selection.
            params = list(avail_params_set.intersection(params))
            if len(params) == 0:
                return None  # This dataframe does not contain desired params
            return self.dataframe.loc[times[0]:times[1], params]
        else:
            return self.dataframe.loc[times[0]:times[1], :]

    def paramlist(self):
        return self.dataframe.columns

    @property
    def organization(self):
        return self.trip.entity.organization
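
# Usage sketch (illustrative only): ``times`` is (start_time, duration), so
# the slice below covers ten minutes of data beginning at ``start``. The
# parameter names are hypothetical; the call returns None when none of the
# requested parameters exist in this dataframe.
def _example_tripdata_slice(tripdata, start):
    import datetime
    return tripdata.dataframe_filter(
        params=['speed', 'altitude'],
        times=(start, datetime.timedelta(minutes=10)),
    )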
class Trip(geomodels.Model):
    """
    Represents a partition of the movement of an Entity. It is the base
    atomic element of most analysis. Although some analysis dissects further
    into the TripData or specific points, the results are usually associated
    back to the Trip. When a different granularity is required, the
    partitioning is usually modified and a new set of trips created.
    """
    objects = TripManager()

    # Not all taxi data associates with a single physical taxi
    id = geomodels.BigIntegerField(primary_key=True)
    entity = geomodels.ForeignKey(Entity, blank=True, null=True)
    start_datetime = geomodels.DateTimeField(db_index=True)
    duration = geomodels.DurationField()
    geometry = geomodels.LineStringField(dim=3, null=True)
    metadata = JSONField(
        default=dict,
        validators=[validators.JsonValidator()],
        blank=True,
        # db_index=True - Defined by migration so it uses a GIST instead of
        # a BTREE index
    )
    archive_uri = geomodels.CharField(max_length=1024)

    class Meta:
        # ordering = ['-start_datetime']  # Newer trips listed first
        # Trip IDs are generated as a function of entity and start_datetime,
        # plus another ID, so the PK already enforces this unique_together:
        # unique_together = ('entity', 'start_datetime')
        pass

    def __str__(self):
        return '%s|%s' % (self.label, self.start_datetime.isoformat())

    @property
    def label(self):
        return self.entity.common_id if self.entity else 'Trip(%s)' % self.id

    def natural_key(self):
        return self.entity, self.start_datetime

    @property
    def organization(self):
        return self.entity.organization

    def dataframe_filter(self, params=None, times=(None, None),
                         interpolate=None):
        return list(self.__iter__(params=params, times=times,
                                  interpolate=interpolate))

    def __iter__(self, params=None, times=(None, None), interpolate=None):
        # TODO: Best ways to carve up a data frame.
        # Is it faster to return a series when only one parameter is
        # requested? How will the code calling this most often iterate the
        # results?
        for tripdata in self.tripdata_set.all():
            data = tripdata.dataframe_filter(params=params, times=times)
            if data is None:
                continue
            if interpolate:
                data = data.interpolate(method=interpolate)
            yield data
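
# Usage sketch (illustrative only): iterating a Trip yields one filtered
# dataframe per underlying TripData chunk; __iter__ is called directly here
# because it takes keyword arguments. The 'speed' parameter and the pandas
# 'time' interpolation method are assumptions.
def _example_trip_means(trip):
    for df in trip.__iter__(params=['speed'], interpolate='time'):
        yield df.mean()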
class ClusterModel(models.Model):
    """
    Defines an execution of the clustering data.
    """
    config = models.ForeignKey(ClusterConfig)
    data = models.ForeignKey(TripQuery)
    topic_model = models.ForeignKey(TopicModel, null=True, blank=True)
    arguments = JSONField(blank=True, null=True,
                          validators=[validators.JsonValidator()],
                          help_text='Additional arguments for clustering')
    created = models.DateTimeField(auto_now_add=True)
    modified = models.DateTimeField(auto_now=True)

    def __init__(self, *args, **kwargs):
        self._model = None
        super().__init__(*args, **kwargs)

    def __str__(self):
        name = '%s with data (%s)' % (self.config, self.data)
        if self.topic_model is not None:
            name = '%s on topic model (%s)' % (name, self.topic_model)
        return name

    def geo_data(self, data):
        cluster_data = numpy.ndarray(shape=(len(data.q()), 4))
        for i, trip in enumerate(data.q()):
            geom_start = trip.start_point
            geom_end = trip.end_point
            cluster_data[i][0] = geom_start.coords[0]
            cluster_data[i][1] = geom_start.coords[1]
            cluster_data[i][2] = geom_end.coords[0]
            cluster_data[i][3] = geom_end.coords[1]
        return cluster_data

    def topic_data(self, data):
        cluster_data = numpy.zeros(shape=(len(data.q()),
                                          self.topic_model.model.num_topics))
        # ``arguments`` may be null, so guard before reading the option.
        if self.arguments and self.arguments.get('cluster_street_topics'):
            # Find the most probable topic for each street in the data by
            # creating a corpus of single-term documents, with the term being
            # the street id, and then inferring that with the model.
            streets_corpus = []
            for trip in data.q():
                gid_df = trip.dataframe_filter(params=['gid'])[0]
                for gid in gid_df['gid']:
                    gid = str(gid)
                    if gid not in streets_corpus:
                        streets_corpus.append(gid)
            streets_corpus = [[street] for street in streets_corpus]
            inferred_streets = self.topic_model.model[streets_corpus]
            street_topics = {}
            for i, topic in enumerate(inferred_streets):
                topics = sorted(topic, key=lambda t: -t[1])
                street_topics[int(streets_corpus[i][0])] = topics[0]
            adj = 1
            # For each document, create an entry in the cluster data by
            # iterating through each street in the trajectory and
            # incrementing the topic that is most probable for that street.
            for i, trip in enumerate(data.q()):
                streets_df = trip.dataframe_filter(params=['gid'])[0]
                for gid in streets_df['gid']:
                    street_topic = street_topics[gid][0]
                    cluster_data[i][street_topic] += adj
                for s in range(len(cluster_data[i])):
                    cluster_data[i][s] /= (len(streets_df) * adj)
        else:
            # Infer the data associated with the cluster.
            inferred_corpus = self.topic_model.model[
                trip_queryset_to_corpus(data.q(), data.id)]
            # Create cluster data based on the inferred corpus.
            for i, topics in enumerate(inferred_corpus):
                topics = sorted(topics, key=lambda t: t[1], reverse=True)
                for t in topics:
                    cluster_data[i][t[0]] = t[1]
        return cluster_data

    @property
    def model(self):
        # If the cluster model hasn't been accessed on this instance yet,
        # get it.
        if self._model is None:
            # First, attempt to get the model from the cache if available.
            if CLUSTER_SETTINGS['CACHE'] is not None:
                key = 'cluster:model%s' % self.id
                cache = caches[CLUSTER_SETTINGS['CACHE']]
                # cache.get returns None on a miss, which the check below
                # relies on.
                self._model = cache.get(key)
            # If the model was not in the cache then calculate it.
            if self._model is None:
                logger.info('Calculating cluster %s' % self)
                if self.topic_model is not None:
                    cluster_data = self.topic_data(self.data)
                else:
                    cluster_data = self.geo_data(self.data)
                # config.arguments may be null; fall back to an empty mapping.
                self._model = processing.ClusterModel(
                    impl=self.config.algorithm,
                    **(self.config.arguments or {}))
                self._model.fit(cluster_data)
                if CLUSTER_SETTINGS['CACHE'] is not None:
                    # key is calculated and the cache retrieved above
                    cache.set(key, self._model,
                              CLUSTER_SETTINGS['QUERY_CACHE_TIME'])
            else:
                logger.info('Cluster pulled from cache')
        return self._model
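
# Usage sketch (illustrative only): the ``model`` property lazily fits the
# configured clustering implementation (or pulls it from the cache).
# ``labels_`` is an assumption based on the scikit-learn estimators that the
# ALGORITHMS choices appear to wrap; processing.ClusterModel's actual
# interface is not shown here.
def _example_cluster_labels(cluster_model):
    return cluster_model.model.labels_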