def getHistoricFeatures(self, entropy_para):
    """Features comparing this event against background from the past week.

    Samples at most 40 elements per historic day, trims the sample to the
    current event's size, builds a fake historic event, and returns
    [element-distribution feature, topic KL divergence, entropy KL divergence].
    Falls back to [1, 10, 10] when no background elements are available.
    """
    latest = self.getLatestElementTime()
    earliest = self.getEarliestElementTime()
    # pick the data interface matching this event's element type
    if self._element_type == "photos":
        interface = PhotoInterface()
    else:
        interface = TweetInterface()
    dt = 0
    one_day = 24 * 3600
    sampled = []
    for offset in xrange(1, 8):  # previous 7 days serve as background
        upper = latest - offset * one_day + dt / 2
        lower = earliest - offset * one_day - dt / 2
        day_elements = interface.rangeQuery(self._event["region"],
                                            [str(lower), str(upper)])
        picks = range(0, day_elements.count())
        # cap the per-day sample at 40 elements
        if len(picks) > 40:
            random.shuffle(picks)
            picks = picks[0:40]
        for i in picks:
            sampled.append(day_elements[i])
    random.shuffle(sampled)
    sampled = sampled[0:min(len(self._event[self._element_type]), len(sampled))]
    if not sampled:
        # TODO: refine
        return [1, 10, 10]
    # fake a historic event of the same element type
    historic = BaseEvent(self._element_type)
    historic.setElements(sampled)
    historic.setRegion(self._event["region"])
    historic.setActualValue(historic._getActualValueByCounting())
    historic = BaseFeature(historic)
    # entropy divergence (the distributions are already smoothed)
    entropy_divergence = KLDivergence.averageKLDivergence(
        self._divideAndCount(entropy_para),
        historic._divideAndCount(entropy_para))
    # divergence between the two events' top words
    topic_divergence = self.computeWordKLDivergenceWith(historic)
    return [
        historic.getElementDisFeatures()[1],
        topic_divergence,
        # historic.getEntropy(entropy_para),
        entropy_divergence,
    ]
def getHistoricFeatures(self, entropy_para):
    """Features capturing how this event differs from background knowledge.

    Collects photos from the same region over the previous 14 days, trims
    the sample to the current event's size, builds a fake historic event,
    and returns [photo-distribution feature, topic KL divergence,
    entropy KL divergence]. Falls back to [1, 10, 10] when no background
    photos are available.
    """
    latest = self.getLatestPhotoTime()
    earliest = self.getEarliestPhotoTime()
    interface = PhotoInterface()
    dt = 0
    one_day = 24 * 3600
    background = []
    # 15 is hard coded because 14 days' data form the training background
    for offset in xrange(1, 15):
        upper = latest - offset * one_day + dt / 2
        lower = earliest - offset * one_day - dt / 2
        # rangeQuery returns photos newest-first, so "background" stays
        # ordered from the most current to the most early
        for photo in interface.rangeQuery(self._event['region'],
                                          [str(lower), str(upper)]):
            background.append(photo)
    random.shuffle(background)
    keep = min(len(self._event['photos']), len(background))
    background = background[0:keep]
    if not background:
        # TODO: refine
        return [1, 10, 10]
    # fake a historic event
    historic = Event()
    historic.setPhotos(background)
    historic.setRegion(self._event['region'])
    historic.setActualValue(historic._getActualValueByCounting())
    historic = BaseFeature(historic)
    # entropy divergence (the distributions are already smoothed)
    current_dist = self._divideAndCount(entropy_para)
    historic_dist = historic._divideAndCount(entropy_para)
    entropy_divergence = KLDivergence.averageKLDivergence(current_dist,
                                                          historic_dist)
    # divergence between the two events' top words
    topic_divergence = self.computeWordKLDivergenceWith(historic)
    return [historic.getPhotoDisFeatures()[3],
            topic_divergence,
            # historic.getEntropy(entropy_para),
            entropy_divergence]
def getHistoricFeatures(self, entropy_para):
    """Compute features that capture the difference between the current
    event and background knowledge from the previous 14 days.

    Returns:
        [photo-distribution feature, topic KL divergence,
         entropy KL divergence]; falls back to [1, 10, 10] when no
        historic photos exist for this region.
    """
    end_time = self.getLatestPhotoTime()
    begin_time = self.getEarliestPhotoTime()
    pi = PhotoInterface()
    pi.setDB('citybeat')
    pi.setCollection('photos')
    photos = []
    dt = 0
    for day in xrange(1, 15):
        # here 15 is hard coded because we use 14 days' data as the training
        et = end_time - day * 24 * 3600 + dt / 2
        bt = begin_time - day * 24 * 3600 - dt / 2
        day_photos = pi.rangeQuery(self._event['region'], [str(bt), str(et)])
        for photo in day_photos:
            # since rangeQuery sorts the photos from the most current to the
            # most early, all the photos in the list "photos" are sorted by
            # their created time from the most current to the most early
            photos.append(photo)
    random.shuffle(photos)
    photos = photos[0:min(len(self._event['photos']), len(photos))]
    if len(photos) == 0:
        # Guard against an empty background sample: without it the fake
        # historic event below would be built from no photos and the
        # divergence computations would operate on empty distributions.
        # TODO: refine
        return [1, 10, 10]
    # fake a historic event
    historic_event = Event()
    historic_event.setPhotos(photos)
    historic_event.setRegion(self._event['region'])
    historic_event.setActualValue(historic_event._getActualValueByCounting())
    historic_event = EventFeature(historic_event)
    # compute the difference between entropy; this has been smoothed
    pro1 = self._divideAndCount(entropy_para)
    pro2 = historic_event._divideAndCount(entropy_para)
    entropy_divergence = KLDivergence.averageKLDivergence(pro1, pro2)
    # compute the difference between top words
    topic_divergence = self.computeWordKLDivergenceWith(historic_event)
    return [historic_event.getPhotoDisFeatures()[3],
            topic_divergence,
            # historic_event.getEntropy(entropy_para),
            entropy_divergence]
def computeWordKLDivergenceWith(self, event):
    """Average KL divergence between the top-word frequency vectors of
    this event and another event (a raw event dict is wrapped first)."""
    # a plain dict is wrapped so it exposes _getTopWords
    other = BaseFeature(event) if type(event) is types.DictType else event
    words_a = self._getTopWords(-1, True)
    words_b = other._getTopWords(-1, True)
    # assign each distinct word a slot in a shared vector space
    index = {}
    for word, freq in words_a + words_b:
        if word not in index:
            index[word] = len(index)
    vec_a = [0] * len(index)
    vec_b = [0] * len(index)
    for word, freq in words_a:
        vec_a[index[word]] = freq
    for word, freq in words_b:
        vec_b[index[word]] = freq
    return KLDivergence.averageKLDivergence(vec_a, vec_b)
def computeWordKLDivergenceWith(self, event):
    """Average KL divergence between the top-word frequency vectors of
    this event and another event (a raw event dict is wrapped first)."""
    # wrap a raw dict so it exposes _getTopWords
    if type(event) is types.DictType:
        other = EventFeature(event)
    else:
        other = event
    own_words = self._getTopWords(-1, True)
    other_words = other._getTopWords(-1, True)
    # map every distinct word to a slot in a shared vector space
    slot = {}
    next_slot = 0
    for word, freq in own_words + other_words:
        if not slot.has_key(word):
            slot[word] = next_slot
            next_slot += 1
    own_vec = [0] * next_slot
    other_vec = [0] * next_slot
    for word, freq in own_words:
        own_vec[slot[word]] = freq
    for word, freq in other_words:
        other_vec[slot[word]] = freq
    return KLDivergence.averageKLDivergence(own_vec, other_vec)
def getHistoricFeatures(self, entropy_para):
    """Features capturing how this event differs from 14 days of background.

    Builds a fake historic event from photos of the same region over the
    previous 14 days and returns [avg photo distance, topic KL divergence,
    entropy KL divergence, avg caption length, people-to-photo ratio].
    """
    latest = self.getLatestPhotoTime()
    earliest = self.getEarliestPhotoTime()
    interface = PhotoInterface()
    interface.setDB("citybeat")
    interface.setCollection("photos")
    slack = 3600  # widen each historic window by half of this on each side
    one_day = 24 * 3600
    background = []
    # 15 is hard coded because 14 days' data form the training background
    for offset in xrange(1, 15):
        upper = latest - offset * one_day + slack / 2
        lower = earliest - offset * one_day - slack / 2
        # rangeQuery returns photos newest-first, so "background" stays
        # ordered from the most current to the most early
        for photo in interface.rangeQuery(self._event["region"],
                                          [str(lower), str(upper)]):
            background.append(photo)
    historic = Event()
    historic.setPhotos(background)
    historic.setRegion(self._event["region"])
    historic.setActualValue(historic.getActualValueByCounting())
    historic = EventFeature(historic)
    # entropy divergence (the distributions are already smoothed)
    entropy_divergence = KLDivergence.averageKLDivergence(
        self._divideAndCount(entropy_para),
        historic._divideAndCount(entropy_para))
    # top-word divergence: map every word to a shared index, then compare
    words_now = self._getTopWords(-1, True)
    words_then = historic._getTopWords(-1, True)
    index = {}
    for word, freq in words_now + words_then:
        if word not in index:
            index[word] = len(index)
    vec_now = [0] * len(index)
    vec_then = [0] * len(index)
    for word, freq in words_now:
        vec_now[index[word]] = freq
    for word, freq in words_then:
        vec_then[index[word]] = freq
    topic_divergence = KLDivergence.averageKLDivergence(vec_now, vec_then)
    return [
        historic.getAvgPhotoDis(),
        topic_divergence,
        # historic.getEntropy(entropy_para),
        entropy_divergence,
        historic.getAvgCaptionLen(),
        historic.getRatioOfPeopleToPhoto(),
    ]
def getHistoricFeatures(self, entropy_para):
    """Features capturing the difference between the current event and
    background knowledge built from the previous 14 days.

    Returns [avg photo distance, topic KL divergence, entropy KL
    divergence, avg caption length, people-to-photo ratio].
    """
    newest = self.getLatestPhotoTime()
    oldest = self.getEarliestPhotoTime()
    photo_db = PhotoInterface()
    photo_db.setDB('citybeat')
    photo_db.setCollection('photos')
    window_pad = 3600  # each historic window is padded by half of this
    day_secs = 24 * 3600
    history = []
    # 15 is hard coded because we use 14 days' data as the training set
    for back in xrange(1, 15):
        hi = newest - back * day_secs + window_pad / 2
        lo = oldest - back * day_secs - window_pad / 2
        day_batch = photo_db.rangeQuery(self._event['region'],
                                        [str(lo), str(hi)])
        # rangeQuery sorts newest-first, so "history" keeps that ordering
        for photo in day_batch:
            history.append(photo)
    pseudo = Event()
    pseudo.setPhotos(history)
    pseudo.setRegion(self._event['region'])
    pseudo.setActualValue(pseudo.getActualValueByCounting())
    pseudo = EventFeature(pseudo)
    # entropy divergence (the distributions are already smoothed)
    dist_now = self._divideAndCount(entropy_para)
    dist_hist = pseudo._divideAndCount(entropy_para)
    entropy_divergence = KLDivergence.averageKLDivergence(dist_now, dist_hist)
    # top-word divergence over a shared word index
    top_now = self._getTopWords(-1, True)
    top_hist = pseudo._getTopWords(-1, True)
    word_slot = {}
    slots = 0
    for word, freq in top_now + top_hist:
        if not word_slot.has_key(word):
            word_slot[word] = slots
            slots += 1
    freq_now = [0] * slots
    freq_hist = [0] * slots
    for word, freq in top_now:
        freq_now[word_slot[word]] = freq
    for word, freq in top_hist:
        freq_hist[word_slot[word]] = freq
    topic_divergence = KLDivergence.averageKLDivergence(freq_now, freq_hist)
    return [
        pseudo.getAvgPhotoDis(),
        topic_divergence,
        # pseudo.getEntropy(entropy_para),
        entropy_divergence,
        pseudo.getAvgCaptionLen(),
        pseudo.getRatioOfPeopleToPhoto()
    ]