Example #1
0
	def getHistoricFeatures(self, entropy_para):
		"""Compute features capturing the difference between the current event
		and the historic background activity of the same region.

		For each of the preceding 14 days, photos from the same time window and
		region are collected, shuffled, and truncated to the size of the current
		event's photo set, building a comparable "fake" historic event.

		Args:
			entropy_para: binning parameter forwarded to _divideAndCount() when
				computing the (smoothed) entropy distributions.

		Returns:
			list: [historic photo-distance feature, topic KL divergence,
			entropy KL divergence] comparing this event with the historic one.
		"""
		end_time = self.getLatestPhotoTime()
		begin_time = self.getEarliestPhotoTime()

		pi = PhotoInterface()
		pi.setDB('citybeat')
		pi.setCollection('photos')

		photos = []
		# dt pads the query window on both sides; 0 means the exact window.
		dt = 0
		for day in xrange(1, 15):
			# here 15 is hard coded because we use 14 days' data as the training
			et = end_time - day * 24 * 3600 + dt / 2
			bt = begin_time - day * 24 * 3600 - dt / 2
			# rangeQuery sorts photos from the most current to the most early,
			# so "photos" preserves that per-day ordering as well.
			photos.extend(pi.rangeQuery(self._event['region'], [str(bt), str(et)]))

		# Sample at most as many historic photos as the current event has
		# (slicing past the end of a list is safe in Python).
		random.shuffle(photos)
		photos = photos[:len(self._event['photos'])]

		# fake a historic event
		historic_event = Event()
		historic_event.setPhotos(photos)
		historic_event.setRegion(self._event['region'])
		historic_event.setActualValue(historic_event._getActualValueByCounting())
		historic_event = EventFeature(historic_event)

		# compute the difference between entropy
		# this has been smoothed
		pro1 = self._divideAndCount(entropy_para)
		pro2 = historic_event._divideAndCount(entropy_para)
		entropy_divergence = KLDivergence.averageKLDivergence(pro1, pro2)

		# compute the difference between top words
		topic_divergence = self.computeWordKLDivergenceWith(historic_event)

		return [historic_event.getPhotoDisFeatures()[3], topic_divergence,
		        entropy_divergence]
Example #2
0
    def getHistoricFeatures(self, entropy_para):
        """Compute features capturing the difference between the current event
        and the historic background activity of the same region.

        For each of the preceding 14 days, photos from the same (slightly
        padded) time window and region are collected to build a "historic"
        event, which is then compared against the current event.

        Args:
            entropy_para: binning parameter forwarded to _divideAndCount()
                when computing the (smoothed) entropy distributions.

        Returns:
            list: [avg photo distance, topic KL divergence, entropy KL
            divergence, avg caption length, people-to-photo ratio] of the
            historic event versus the current one.
        """
        end_time = self.getLatestPhotoTime()
        begin_time = self.getEarliestPhotoTime()

        pi = PhotoInterface()
        pi.setDB('citybeat')
        pi.setCollection('photos')

        photos = []
        # dt pads the query window by half an hour on each side.
        dt = 3600
        for day in xrange(1, 15):
            # here 15 is hard coded because we use 14 days' data as the training
            et = end_time - day * 24 * 3600 + dt / 2
            bt = begin_time - day * 24 * 3600 - dt / 2
            # rangeQuery sorts photos from the most current to the most early,
            # so "photos" preserves that per-day ordering as well.
            photos.extend(pi.rangeQuery(self._event['region'],
                                        [str(bt), str(et)]))

        # fake a historic event out of the background photos
        event = Event()
        event.setPhotos(photos)
        event.setRegion(self._event['region'])
        event.setActualValue(event.getActualValueByCounting())
        event = EventFeature(event)

        # compute the difference between entropy
        # this has been smoothed
        pro1 = self._divideAndCount(entropy_para)
        pro2 = event._divideAndCount(entropy_para)
        entropy_divergence = KLDivergence.averageKLDivergence(pro1, pro2)

        # compute the difference between top words:
        # build one shared word -> index mapping over both word lists
        event_topword_list = self._getTopWords(-1, True)
        historic_topword_list = event._getTopWords(-1, True)

        n_ind = 0
        ind = {}
        for word, _ in event_topword_list + historic_topword_list:
            # 'in' test instead of the deprecated dict.has_key()
            if word not in ind:
                ind[word] = n_ind
                n_ind += 1

        # aligned frequency vectors (0 where a word is absent from one list)
        freq1 = [0] * n_ind
        freq2 = [0] * n_ind

        for word, freq in event_topword_list:
            freq1[ind[word]] = freq
        for word, freq in historic_topword_list:
            freq2[ind[word]] = freq

        topic_divergence = KLDivergence.averageKLDivergence(freq1, freq2)

        return [
            event.getAvgPhotoDis(),
            topic_divergence,
            entropy_divergence,
            event.getAvgCaptionLen(),
            event.getRatioOfPeopleToPhoto()
        ]
Example #3
0
    def filterRegions(self,
                      region_list,
                      percentage=InstagramConfig.region_percentage,
                      test=False,
                      n=10,
                      m=10):
        if test:
            #n and m should be set if test is true
            #this is only for test
            new_region_list = []
            folder = '/res/users/kx19/Citybeat/CityBeat/distributed_gp/utility/region_cache/'
            file_name = str(n) + '_' + str(m) + '.txt'
            fid = open(folder + file_name)
            for line in fid:
                region = line.split()
                for i in xrange(0, 4):
                    region[i] = float(region[i])
                region = Region(region)
                new_region_list.append(region)
            return new_region_list

        # this method should not be a member of this class
        # TODO: change the period to one week


#		print 'Begin to filter sparse regions with less photos than the threshold'
        end_time = 1359704845 - 7 * 3600 * 24
        begin_time = end_time - 14 * 3600 * 24
        pi = PhotoInterface()
        photos = pi.rangeQuery(period=[str(begin_time), str(end_time)])
        region_number = len(region_list)
        number_photo_in_region = [0] * region_number
        for photo in photos:
            lat = float(photo['location']['latitude'])
            lng = float(photo['location']['longitude'])
            flag = 0
            for i in xrange(region_number):
                if region_list[i].insideRegion([lat, lng]):
                    number_photo_in_region[i] += 1
                    flag = 1
                    break
            if flag == 0:
                print 'bad photo:', photo

        region_tuples = []
        for i in xrange(0, region_number):
            region_tuples.append((region_list[i], number_photo_in_region[i]))

        region_tuples.sort(key=operator.itemgetter(1), reverse=True)

        valid_region_number = int(0.5 + 1.0 * region_number * percentage)
        valid_regions = []

        #		print region_tuples[valid_region_number-1][1]

        for i in xrange(0, valid_region_number):
            region = region_tuples[i][0]
            lat = (self._region['min_lat'] + self._region['max_lat']) / 2
            lng = (self._region['min_lng'] + self._region['max_lng']) / 2
            cnt = region_tuples[i][1]

        for i in xrange(0, valid_region_number):
            valid_regions.append(region_tuples[i][0])

        return valid_regions
Example #4
0
        # Number of regions to keep: the top `percentage` fraction of
        # region_tuples (assumed sorted descending by photo count), rounded
        # to the nearest integer.
        valid_region_number = int(0.5 + 1.0 * region_number * percentage)
        valid_regions = []

        #		print region_tuples[valid_region_number-1][1]

        # NOTE(review): this loop only assigns locals (region/lat/lng/cnt)
        # that are never read afterwards -- looks like leftover debug code;
        # confirm before removing.
        for i in xrange(0, valid_region_number):
            region = region_tuples[i][0]
            lat = (self._region['min_lat'] + self._region['max_lat']) / 2
            lng = (self._region['min_lng'] + self._region['max_lng']) / 2
            cnt = region_tuples[i][1]

        # Collect the top regions in descending photo-count order.
        for i in xrange(0, valid_region_number):
            valid_regions.append(region_tuples[i][0])

        return valid_regions

if __name__ == "__main__":
    # Bounding box of the whole photo area, taken from the config:
    # [min_lat, min_lng, max_lat, max_lng].
    coordinates = [
        InstagramConfig.photo_min_lat, InstagramConfig.photo_min_lng,
        InstagramConfig.photo_max_lat, InstagramConfig.photo_max_lng
    ]
    nyc = Region(coordinates)
    pi = PhotoInterface()
    # NOTE(review): the query result is discarded -- presumably executed
    # for a side effect; confirm whether this call is needed at all.
    pi.rangeQuery(nyc)
    # Split the bounding box into a 20x20 grid, then filter it via the
    # cached 10x10 test file (test=True path of filterRegions).
    region_list = nyc.divideRegions(20, 20)
    region_list = nyc.filterRegions(region_list, test=True, n=10, m=10)
    # Print each surviving region's bounding box, one region per line.
    for region in region_list:
        region = region.toJSON()
        print region['min_lat'], region['min_lng'], region['max_lat'], region[
            'max_lng']