def main(): pi = PhotoInterface() pi.setDB('citybeat') pi.setCollection('photos') pi2 = PhotoInterface() pi2.setDB('citybeat') pi2.setCollection('photos_no_duplicate') region = {} region['min_lat'] = 40.690531 region['min_lng'] = -74.058151 region['max_lat'] = 40.823163 region['max_lng'] = -73.857994 st = '1352937600' et = '1355615999' pc = pi.rangeQuery(region, [st, et]) # print pc.count() ids = set() for photo in pc: ids.add(photo['link']) print len(ids) print pi2.rangeQuery(region, [st, et]).count()
def getHistoricFeatures(self, entropy_para):
    # this method computes the features that capture the difference between current
    # event and background knowledge
    #
    # Builds a "historic" pseudo-event from photos taken in the same region and
    # daily time window over the preceding 14 days, then returns a feature list:
    # [photo-distance feature, topic KL divergence, entropy KL divergence].
    end_time = self.getLatestPhotoTime()
    begin_time = self.getEarliestPhotoTime()
    pi = PhotoInterface()
    pi.setDB('citybeat')
    pi.setCollection('photos')
    photos = []
    # NOTE(review): dt is 0 here, so the +/- dt/2 padding below is a no-op;
    # sibling variants of this method use dt = 3600 — confirm which is intended.
    dt = 0
    for day in xrange(1,15):
        # here 15 is hard coded because we use 14 days' data as the training
        # Shift the event's own time window back by `day` whole days,
        # padded by half of dt on each side.
        et = end_time - day * 24 * 3600 + dt / 2
        bt = begin_time - day * 24 * 3600 - dt / 2
        day_photos = pi.rangeQuery(self._event['region'], [str(bt), str(et)])
        for photo in day_photos:
            # since rangeQuery sorts the photos from the most current to the most early
            # thus all the photos in the List "photos" are sorted by their created time from
            # the most current to the most early
            photos.append(photo)
    # Randomly subsample the background so it has at most as many photos as
    # the current event (destroys the time ordering noted above on purpose).
    random.shuffle(photos)
    photos = photos[0:min(len(self._event['photos']), len(photos))]
    # fake a historic event
    historic_event = Event()
    historic_event.setPhotos(photos)
    historic_event.setRegion(self._event['region'])
    historic_event.setActualValue(historic_event._getActualValueByCounting())
    historic_event = EventFeature(historic_event)
    # compute the difference between entropy
    # this has been smoothed
    pro1 = self._divideAndCount(entropy_para)
    pro2 = historic_event._divideAndCount(entropy_para)
    entropy_divergence = KLDivergence.averageKLDivergence(pro1, pro2)
    # compute the difference between top words
    topic_divergence = self.computeWordKLDivergenceWith(historic_event)
    # [3] picks one element of the photo-distance feature vector;
    # presumably a specific distance statistic — TODO confirm in EventFeature.
    return [historic_event.getPhotoDisFeatures()[3], topic_divergence,
            # historic_event.getEntropy(entropy_para),
            entropy_divergence]
def getCaptionStatistics(): pi = PhotoInterface() pi.setDB('citybeat') pi.setCollection('photos_no_duplicate') tot = 0 withCap = 0 l = 0 for photo in pi.getAllDocuments(): cap = Photo(photo).getCaption() tot += 1 if len(cap) == 0: continue withCap += 1 l += len(cap) print 1.0*withCap / tot print 1.0*l / withCap
from photo_interface import PhotoInterface
from caption_parser import CaptionParser
from photo import Photo
from mongodb_interface import MongoDBInterface
import random

if __name__ == '__main__':
    # Sample roughly 1 in 11 photos from citybeat/photos and copy their
    # non-empty captions into the test_caption/captions collection.
    source = PhotoInterface()
    source.setDB('citybeat')
    source.setCollection('photos')

    sink = MongoDBInterface()
    sink.setDB('test_caption')
    sink.setCollection('captions')

    for doc in source.getAllDocuments():
        # Keep the photo only when the draw lands on 0 (probability 1/11).
        if random.randint(0, 10) > 0:
            continue
        caption_text = Photo(doc).getCaption()
        if len(caption_text) > 0:
            sink.saveDocument({'caption': caption_text})
def getHistoricFeatures(self, entropy_para):
    """Contrast this event with a 14-day historic background.

    Collects photos from the same region and daily time window over each of
    the preceding 14 days, wraps them in a background EventFeature, and
    returns [avg photo distance, topic KL divergence, entropy KL divergence,
    avg caption length, people-to-photo ratio].
    """
    latest = self.getLatestPhotoTime()
    earliest = self.getEarliestPhotoTime()

    photo_db = PhotoInterface()
    photo_db.setDB("citybeat")
    photo_db.setCollection("photos")

    background_photos = []
    half_window = 3600 / 2  # pad each side of the daily window by 30 min
    # 15 is hard coded: 14 days of data serve as the training background.
    for day_offset in xrange(1, 15):
        shift = day_offset * 24 * 3600
        upper = latest - shift + half_window
        lower = earliest - shift - half_window
        background_photos.extend(
            photo_db.rangeQuery(self._event["region"], [str(lower), str(upper)]))

    background = Event()
    background.setPhotos(background_photos)
    background.setRegion(self._event["region"])
    background.setActualValue(background.getActualValueByCounting())
    background = EventFeature(background)

    # Entropy divergence between the (smoothed) spatial distributions.
    current_dist = self._divideAndCount(entropy_para)
    background_dist = background._divideAndCount(entropy_para)
    entropy_divergence = KLDivergence.averageKLDivergence(current_dist, background_dist)

    # Build one shared word index, then aligned frequency vectors.
    current_words = self._getTopWords(-1, True)
    background_words = background._getTopWords(-1, True)
    index = {}
    for word, _freq in current_words + background_words:
        if word not in index:
            index[word] = len(index)
    current_freqs = [0] * len(index)
    background_freqs = [0] * len(index)
    for word, freq in current_words:
        current_freqs[index[word]] = freq
    for word, freq in background_words:
        background_freqs[index[word]] = freq
    topic_divergence = KLDivergence.averageKLDivergence(current_freqs, background_freqs)

    return [
        background.getAvgPhotoDis(),
        topic_divergence,
        entropy_divergence,
        background.getAvgCaptionLen(),
        background.getRatioOfPeopleToPhoto(),
    ]
def getHistoricFeatures(self, entropy_para):
    """Return features comparing this event to its 14-day historic baseline.

    The baseline is a pseudo-event assembled from photos in the same region
    and daily time window over each of the previous 14 days. Returns
    [avg photo distance, topic divergence, entropy divergence,
    avg caption length, people-to-photo ratio].
    """
    end_time = self.getLatestPhotoTime()
    begin_time = self.getEarliestPhotoTime()

    interface = PhotoInterface()
    interface.setDB('citybeat')
    interface.setCollection('photos')

    dt = 3600
    day_seconds = 24 * 3600
    historic_photos = []
    # 14 training days, hence the hard-coded xrange(1, 15).
    for day in xrange(1, 15):
        et = end_time - day * day_seconds + dt / 2
        bt = begin_time - day * day_seconds - dt / 2
        historic_photos.extend(
            interface.rangeQuery(self._event['region'], [str(bt), str(et)]))

    historic = Event()
    historic.setPhotos(historic_photos)
    historic.setRegion(self._event['region'])
    historic.setActualValue(historic.getActualValueByCounting())
    historic = EventFeature(historic)

    # Divergence of the (pre-smoothed) entropy distributions.
    entropy_divergence = KLDivergence.averageKLDivergence(
        self._divideAndCount(entropy_para),
        historic._divideAndCount(entropy_para))

    # Shared word -> slot index across both word lists.
    words_now = self._getTopWords(-1, True)
    words_then = historic._getTopWords(-1, True)
    ind = {}
    for word, freq in words_now + words_then:
        if word not in ind:
            ind[word] = len(ind)

    freq_now = [0] * len(ind)
    freq_then = [0] * len(ind)
    for word, freq in words_now:
        freq_now[ind[word]] = freq
    for word, freq in words_then:
        freq_then[ind[word]] = freq
    topic_divergence = KLDivergence.averageKLDivergence(freq_now, freq_then)

    return [historic.getAvgPhotoDis(),
            topic_divergence,
            entropy_divergence,
            historic.getAvgCaptionLen(),
            historic.getRatioOfPeopleToPhoto()]
def findPhotos():
    # NOTE(review): stub — the DB and collection names were never filled in,
    # so this configures an interface pointing at empty-string targets.
    interface = PhotoInterface()
    interface.setDB('')
    interface.setCollection('')
from photo_interface import PhotoInterface
from caption_parser import CaptionParser
from photo import Photo
from mongodb_interface import MongoDBInterface
import random

if __name__ == '__main__':
    # Randomly sample photos (expected 1 in 11) and archive any non-empty
    # captions into test_caption/captions.
    photo_source = PhotoInterface()
    photo_source.setDB('citybeat')
    photo_source.setCollection('photos')

    caption_store = MongoDBInterface()
    caption_store.setDB('test_caption')
    caption_store.setCollection('captions')

    for raw_photo in photo_source.getAllDocuments():
        # Only the draw 0 out of 0..10 is kept.
        if random.randint(0, 10) != 0:
            continue
        text = Photo(raw_photo).getCaption()
        if len(text) > 0:
            caption_store.saveDocument({'caption': text})