def getHistoricFeatures(self, entropy_para):
    # Features measuring how the current event differs from background
    # activity: sample photos from the same region over the preceding
    # 14 days, fake a "historic event" from them, and compare.
    latest_time = self.getLatestPhotoTime()
    earliest_time = self.getEarliestPhotoTime()
    photo_interface = PhotoInterface()
    background_photos = []
    pad = 0  # optional padding (seconds) around each day's query window
    # 14 days of history is hard coded as the training background.
    for day_offset in xrange(1, 15):
        shift = day_offset * 24 * 3600
        window = [str(earliest_time - shift - pad / 2),
                  str(latest_time - shift + pad / 2)]
        # rangeQuery returns photos sorted newest-first, so the accumulated
        # list stays ordered from most recent to oldest.
        day_photos = photo_interface.rangeQuery(self._event['region'], window)
        background_photos.extend(day_photos)
    # Down-sample the background to the size of the current event.
    random.shuffle(background_photos)
    sample_size = min(len(self._event['photos']), len(background_photos))
    background_photos = background_photos[0:sample_size]
    if not background_photos:
        # TODO: refine
        return [1, 10, 10]
    # Fake a historic event from the background sample.
    historic_event = Event()
    historic_event.setPhotos(background_photos)
    historic_event.setRegion(self._event['region'])
    historic_event.setActualValue(historic_event._getActualValueByCounting())
    historic_event = BaseFeature(historic_event)
    # Entropy difference (the distributions have already been smoothed).
    current_dist = self._divideAndCount(entropy_para)
    historic_dist = historic_event._divideAndCount(entropy_para)
    entropy_divergence = KLDivergence.averageKLDivergence(current_dist, historic_dist)
    # Difference between top words.
    topic_divergence = self.computeWordKLDivergenceWith(historic_event)
    return [historic_event.getPhotoDisFeatures()[3],
            topic_divergence,
            # historic_event.getEntropy(entropy_para),
            entropy_divergence]
def getHistoricFeatures(self, entropy_para):
    """Compute features that capture the difference between the current
    event and background knowledge (photos from the same region over the
    preceding 14 days).

    Returns [photo distance feature, topic KL divergence, entropy KL
    divergence].
    """
    end_time = self.getLatestPhotoTime()
    begin_time = self.getEarliestPhotoTime()
    pi = PhotoInterface()
    pi.setDB('citybeat')
    pi.setCollection('photos')
    photos = []
    dt = 0  # padding (seconds) around each day's query window
    for day in xrange(1, 15):
        # here 15 is hard coded because we use 14 days' data as the training
        et = end_time - day * 24 * 3600 + dt / 2
        bt = begin_time - day * 24 * 3600 - dt / 2
        day_photos = pi.rangeQuery(self._event['region'], [str(bt), str(et)])
        # since rangeQuery sorts the photos from the most current to the most
        # early, all the photos in "photos" stay sorted newest-to-oldest
        for photo in day_photos:
            photos.append(photo)
    # down-sample the background to the size of the current event
    random.shuffle(photos)
    photos = photos[0:min(len(self._event['photos']), len(photos))]
    if len(photos) == 0:
        # BUGFIX: without any historic photos the fake event below would be
        # degenerate (empty distributions fed into the KL helpers); return
        # the same fallback the sibling version of this method uses.
        # TODO: refine
        return [1, 10, 10]
    # fake a historic event from the background sample
    historic_event = Event()
    historic_event.setPhotos(photos)
    historic_event.setRegion(self._event['region'])
    historic_event.setActualValue(historic_event._getActualValueByCounting())
    historic_event = EventFeature(historic_event)
    # compute the difference between entropy; this has been smoothed
    pro1 = self._divideAndCount(entropy_para)
    pro2 = historic_event._divideAndCount(entropy_para)
    entropy_divergence = KLDivergence.averageKLDivergence(pro1, pro2)
    # compute the difference between top words
    topic_divergence = self.computeWordKLDivergenceWith(historic_event)
    return [historic_event.getPhotoDisFeatures()[3],
            topic_divergence,
            # historic_event.getEntropy(entropy_para),
            entropy_divergence]
def getHistoricFeatures(self, entropy_para):
    """Compute features that capture the difference between the current
    event and background knowledge (photos from the same region over the
    preceding 14 days).

    Returns [avg photo distance, topic KL divergence, entropy KL
    divergence, avg caption length, people-to-photo ratio].
    """
    end_time = self.getLatestPhotoTime()
    begin_time = self.getEarliestPhotoTime()
    pi = PhotoInterface()
    pi.setDB("citybeat")
    pi.setCollection("photos")
    photos = []
    dt = 3600  # pad each day's window by dt/2 seconds on both sides
    for day in xrange(1, 15):
        # here 15 is hard coded because we use 14 days' data as the training
        et = end_time - day * 24 * 3600 + dt / 2
        bt = begin_time - day * 24 * 3600 - dt / 2
        # rangeQuery returns photos sorted from the most current to the most
        # early, so "photos" stays sorted newest-to-oldest
        photos.extend(pi.rangeQuery(self._event["region"], [str(bt), str(et)]))
    # fake a historic event from all background photos
    # NOTE(review): if no historic photos were found the fake event below is
    # empty -- TODO confirm the downstream helpers tolerate that
    event = Event()
    event.setPhotos(photos)
    event.setRegion(self._event["region"])
    event.setActualValue(event.getActualValueByCounting())
    event = EventFeature(event)
    # compute the difference between entropy; this has been smoothed
    pro1 = self._divideAndCount(entropy_para)
    pro2 = event._divideAndCount(entropy_para)
    entropy_divergence = KLDivergence.averageKLDivergence(pro1, pro2)
    # compute the difference between top words: build a shared word index,
    # then align both frequency vectors on it
    event_topword_list = self._getTopWords(-1, True)
    historic_topword_list = event._getTopWords(-1, True)
    n_ind = 0
    ind = {}
    for word, freq in event_topword_list + historic_topword_list:
        # BUGFIX: dict.has_key() is deprecated in Python 2 and removed in
        # Python 3; use the "in" operator instead
        if word not in ind:
            ind[word] = n_ind
            n_ind += 1
    freq1 = [0] * n_ind
    freq2 = [0] * n_ind
    for word, freq in event_topword_list:
        freq1[ind[word]] = freq
    for word, freq in historic_topword_list:
        freq2[ind[word]] = freq
    topic_divergence = KLDivergence.averageKLDivergence(freq1, freq2)
    return [
        event.getAvgPhotoDis(),
        topic_divergence,
        # event.getEntropy(entropy_para),
        entropy_divergence,
        event.getAvgCaptionLen(),
        event.getRatioOfPeopleToPhoto(),
    ]
def getHistoricFeatures(self, entropy_para):
    """Compute features that capture the difference between the current
    event and background knowledge (photos from the same region over the
    preceding 14 days).

    Returns [avg photo distance, topic KL divergence, entropy KL
    divergence, avg caption length, people-to-photo ratio].
    """
    end_time = self.getLatestPhotoTime()
    begin_time = self.getEarliestPhotoTime()
    pi = PhotoInterface()
    pi.setDB('citybeat')
    pi.setCollection('photos')
    photos = []
    dt = 3600  # pad each day's window by dt/2 seconds on both sides
    for day in xrange(1, 15):
        # here 15 is hard coded because we use 14 days' data as the training
        et = end_time - day * 24 * 3600 + dt / 2
        bt = begin_time - day * 24 * 3600 - dt / 2
        # rangeQuery returns photos sorted from the most current to the most
        # early, so "photos" stays sorted newest-to-oldest
        photos.extend(pi.rangeQuery(self._event['region'], [str(bt), str(et)]))
    # fake a historic event from all background photos
    # NOTE(review): if no historic photos were found the fake event below is
    # empty -- TODO confirm the downstream helpers tolerate that
    event = Event()
    event.setPhotos(photos)
    event.setRegion(self._event['region'])
    event.setActualValue(event.getActualValueByCounting())
    event = EventFeature(event)
    # compute the difference between entropy; this has been smoothed
    pro1 = self._divideAndCount(entropy_para)
    pro2 = event._divideAndCount(entropy_para)
    entropy_divergence = KLDivergence.averageKLDivergence(pro1, pro2)
    # compute the difference between top words: build a shared word index,
    # then align both frequency vectors on it
    event_topword_list = self._getTopWords(-1, True)
    historic_topword_list = event._getTopWords(-1, True)
    n_ind = 0
    ind = {}
    for word, freq in event_topword_list + historic_topword_list:
        # BUGFIX: dict.has_key() is deprecated in Python 2 and removed in
        # Python 3; use the "in" operator instead
        if word not in ind:
            ind[word] = n_ind
            n_ind += 1
    freq1 = [0] * n_ind
    freq2 = [0] * n_ind
    for word, freq in event_topword_list:
        freq1[ind[word]] = freq
    for word, freq in historic_topword_list:
        freq2[ind[word]] = freq
    topic_divergence = KLDivergence.averageKLDivergence(freq1, freq2)
    return [
        event.getAvgPhotoDis(),
        topic_divergence,
        # event.getEntropy(entropy_para),
        entropy_divergence,
        event.getAvgCaptionLen(),
        event.getRatioOfPeopleToPhoto()
    ]