def getHistoricFeatures(self, entropy_para):
    """Compare this event against a randomly sampled historic event.

    Photos for the event's region are fetched over the event's own time
    window shifted back by 1 to 14 whole days.  The pooled photos are
    shuffled and truncated to at most the number of photos in the current
    event, then wrapped in a fake Event / EventFeature so the same feature
    code can be run on it.

    entropy_para is passed unchanged to _divideAndCount() on both events.

    Returns a list of three values, in this order:
      * element 3 of the historic event's getPhotoDisFeatures() result,
      * self.computeWordKLDivergenceWith(historic_event),
      * KLDivergence.averageKLDivergence() of the two _divideAndCount()
        results (current event first, historic event second).
    """
    latest = self.getLatestPhotoTime()
    earliest = self.getEarliestPhotoTime()

    photo_store = PhotoInterface()
    photo_store.setDB('citybeat')
    photo_store.setCollection('photos')

    # Extra slack (in seconds) added around each daily window; currently none.
    padding = 0
    background = []
    # The upper bound 15 is hard coded: 14 days of data form the background.
    for days_back in xrange(1, 15):
        shift = days_back * 24 * 3600
        window_end = latest - shift + padding / 2
        window_begin = earliest - shift - padding / 2
        # The order in which photos come back does not matter here because
        # the pooled list is shuffled below.
        background.extend(
            photo_store.rangeQuery(self._event['region'],
                                   [str(window_begin), str(window_end)]))

    # Draw a random sample no larger than the current event.
    random.shuffle(background)
    sample_size = min(len(self._event['photos']), len(background))
    sample = background[:sample_size]

    # Fake a historic event out of the sampled photos.
    fake_event = Event()
    fake_event.setPhotos(sample)
    fake_event.setRegion(self._event['region'])
    fake_event.setActualValue(fake_event._getActualValueByCounting())
    historic_event = EventFeature(fake_event)

    # Difference between the two (smoothed) distributions.
    current_dist = self._divideAndCount(entropy_para)
    historic_dist = historic_event._divideAndCount(entropy_para)
    entropy_divergence = KLDivergence.averageKLDivergence(current_dist,
                                                          historic_dist)

    # Difference between the top words of the two events.
    topic_divergence = self.computeWordKLDivergenceWith(historic_event)

    photo_dis_feature = historic_event.getPhotoDisFeatures()[3]
    return [photo_dis_feature, topic_divergence, entropy_divergence]
def getHistoricFeatures(self, entropy_para):
    """Compute features contrasting this event with its historic background.

    Photos for the event's region are fetched over the event's own time
    window, widened by dt / 2 seconds on each side and shifted back by 1 to
    14 whole days.  All of them are wrapped in a historic Event /
    EventFeature so the same feature code can be run on the background.

    entropy_para is passed unchanged to _divideAndCount() on both events.

    Returns a list of five values, in this order:
      * historic_event.getAvgPhotoDis(),
      * averaged KL divergence between the two top-word frequency vectors,
      * averaged KL divergence between the two _divideAndCount() results,
      * historic_event.getAvgCaptionLen(),
      * historic_event.getRatioOfPeopleToPhoto().
    """
    end_time = self.getLatestPhotoTime()
    begin_time = self.getEarliestPhotoTime()

    pi = PhotoInterface()
    pi.setDB('citybeat')
    pi.setCollection('photos')

    photos = []
    dt = 3600  # total slack in seconds; half is added on each side
    # The upper bound 15 is hard coded: 14 days of data form the background.
    for day in xrange(1, 15):
        et = end_time - day * 24 * 3600 + dt / 2
        bt = begin_time - day * 24 * 3600 - dt / 2
        photos.extend(
            pi.rangeQuery(self._event['region'], [str(bt), str(et)]))

    # Named historic_event (not event) so it is not mistaken for self.
    historic_event = Event()
    historic_event.setPhotos(photos)
    historic_event.setRegion(self._event['region'])
    historic_event.setActualValue(historic_event.getActualValueByCounting())
    historic_event = EventFeature(historic_event)

    # Difference between the two (smoothed) distributions.
    pro1 = self._divideAndCount(entropy_para)
    pro2 = historic_event._divideAndCount(entropy_para)
    entropy_divergence = KLDivergence.averageKLDivergence(pro1, pro2)

    # Difference between top words: give every word seen in either list a
    # slot in a shared vocabulary, then lay both frequency lists out over it.
    event_topword_list = self._getTopWords(-1, True)
    historic_topword_list = historic_event._getTopWords(-1, True)

    ind = {}
    for word, _ in event_topword_list + historic_topword_list:
        # `in` replaces the deprecated dict.has_key(); slots are assigned
        # in first-seen order, exactly as before.
        if word not in ind:
            ind[word] = len(ind)

    freq1 = [0] * len(ind)
    freq2 = [0] * len(ind)
    for word, freq in event_topword_list:
        freq1[ind[word]] = freq
    for word, freq in historic_topword_list:
        freq2[ind[word]] = freq

    topic_divergence = KLDivergence.averageKLDivergence(freq1, freq2)

    return [
        historic_event.getAvgPhotoDis(),
        topic_divergence,
        # historic_event.getEntropy(entropy_para),
        entropy_divergence,
        historic_event.getAvgCaptionLen(),
        historic_event.getRatioOfPeopleToPhoto()
    ]
def filterRegions(self, region_list, percentage=InstagramConfig.region_percentage, test=False, n=10, m=10): if test: #n and m should be set if test is true #this is only for test new_region_list = [] folder = '/res/users/kx19/Citybeat/CityBeat/distributed_gp/utility/region_cache/' file_name = str(n) + '_' + str(m) + '.txt' fid = open(folder + file_name) for line in fid: region = line.split() for i in xrange(0, 4): region[i] = float(region[i]) region = Region(region) new_region_list.append(region) return new_region_list # this method should not be a member of this class # TODO: change the period to one week # print 'Begin to filter sparse regions with less photos than the threshold' end_time = 1359704845 - 7 * 3600 * 24 begin_time = end_time - 14 * 3600 * 24 pi = PhotoInterface() photos = pi.rangeQuery(period=[str(begin_time), str(end_time)]) region_number = len(region_list) number_photo_in_region = [0] * region_number for photo in photos: lat = float(photo['location']['latitude']) lng = float(photo['location']['longitude']) flag = 0 for i in xrange(region_number): if region_list[i].insideRegion([lat, lng]): number_photo_in_region[i] += 1 flag = 1 break if flag == 0: print 'bad photo:', photo region_tuples = [] for i in xrange(0, region_number): region_tuples.append((region_list[i], number_photo_in_region[i])) region_tuples.sort(key=operator.itemgetter(1), reverse=True) valid_region_number = int(0.5 + 1.0 * region_number * percentage) valid_regions = [] # print region_tuples[valid_region_number-1][1] for i in xrange(0, valid_region_number): region = region_tuples[i][0] lat = (self._region['min_lat'] + self._region['max_lat']) / 2 lng = (self._region['min_lng'] + self._region['max_lng']) / 2 cnt = region_tuples[i][1] for i in xrange(0, valid_region_number): valid_regions.append(region_tuples[i][0]) return valid_regions
# NOTE(review): the statements down to `return valid_regions` use `self` and
# `return`, so they appear to be the tail of a method whose header is not
# visible from this point -- confirm where they belong before touching them.
    # Number of regions to keep: region_number * percentage, rounded to the
    # nearest integer.
    valid_region_number = int(0.5 + 1.0 * region_number * percentage)
    valid_regions = []
    # print region_tuples[valid_region_number-1][1]
    # NOTE(review): this loop only assigns locals (region, lat, lng, cnt)
    # that are never read afterwards -- presumably leftover debugging code.
    for i in xrange(0, valid_region_number):
        region = region_tuples[i][0]
        lat = (self._region['min_lat'] + self._region['max_lat']) / 2
        lng = (self._region['min_lng'] + self._region['max_lng']) / 2
        cnt = region_tuples[i][1]
    # Collect the first valid_region_number regions of region_tuples.
    for i in xrange(0, valid_region_number):
        valid_regions.append(region_tuples[i][0])
    return valid_regions


# Script entry point: build a Region from the bounding box configured in
# InstagramConfig and print the bounds of the filtered sub-regions.
if __name__ == "__main__":
    coordinates = [
        InstagramConfig.photo_min_lat, InstagramConfig.photo_min_lng,
        InstagramConfig.photo_max_lat, InstagramConfig.photo_max_lng
    ]
    nyc = Region(coordinates)
    pi = PhotoInterface()
    # NOTE(review): the result of this query is discarded -- confirm whether
    # the call is needed for a side effect or is a leftover.
    pi.rangeQuery(nyc)
    # Split the bounding box using divideRegions(20, 20).
    region_list = nyc.divideRegions(20, 20)
    # With test=True, filterRegions as defined in this file ignores the list
    # passed in and loads the regions from the '10_10.txt' cache file.
    region_list = nyc.filterRegions(region_list, test=True, n=10, m=10)
    for region in region_list:
        # toJSON() is expected to return a mapping with the four bound keys
        # printed below.
        region = region.toJSON()
        print region['min_lat'], region['min_lng'], region['max_lat'], region[
            'max_lng']