Beispiel #1
0
def main():
	"""Compare raw vs. de-duplicated photo counts for one NYC region/time window.

	Counts distinct photo links in the 'photos' collection and prints that
	alongside the document count of 'photos_no_duplicate' for the same query.
	"""
	pi = PhotoInterface()
	pi.setDB('citybeat')
	pi.setCollection('photos')

	pi2 = PhotoInterface()
	pi2.setDB('citybeat')
	pi2.setCollection('photos_no_duplicate')

	# Bounding box (lat/lng degrees) covering Manhattan and nearby areas.
	region = {}
	region['min_lat'] = 40.690531
	region['min_lng'] = -74.058151
	region['max_lat'] = 40.823163
	region['max_lng'] = -73.857994
	# Unix timestamps, passed as strings as rangeQuery expects.
	st = '1352937600'
	et = '1355615999'
	pc = pi.rangeQuery(region, [st, et])

	# Use the 'link' field as the identity key to count unique photos.
	ids = set()
	for photo in pc:
		ids.add(photo['link'])

	# Parenthesized single-argument prints are valid in both Python 2 and 3;
	# the original bare `print x` statements are a SyntaxError under Python 3.
	print(len(ids))
	print(pi2.rangeQuery(region, [st, et]).count())
Beispiel #2
0
	def getHistoricFeatures(self, entropy_para):
		"""Compute features capturing how this event differs from background
		activity in the same region over the previous 14 days.

		Builds a synthetic "historic" event from background photos sampled to
		the current event's size, then returns
		[photo-distance feature, topic KL divergence, entropy KL divergence].
		"""
		# Time window spanned by the current event's photos.
		end_time = self.getLatestPhotoTime()
		begin_time = self.getEarliestPhotoTime()

		pi = PhotoInterface()
		pi.setDB('citybeat')
		pi.setCollection('photos')

		photos = []
		dt = 0  # extra padding (seconds) around each daily window; currently disabled
		# 15 is hard coded because we use 14 days' data as the training background.
		for day in range(1, 15):
			# Floor division keeps timestamps integral under Python 3 too; plain
			# `/` would yield floats there and change the stringified bounds.
			et = end_time - day * 24 * 3600 + dt // 2
			bt = begin_time - day * 24 * 3600 - dt // 2
			day_photos = pi.rangeQuery(self._event['region'], [str(bt), str(et)])
			# rangeQuery sorts photos from most recent to earliest, so `photos`
			# stays ordered by creation time, newest first.
			for photo in day_photos:
				photos.append(photo)

		# Sample background photos down to at most the current event's count.
		random.shuffle(photos)
		photos = photos[0:min(len(self._event['photos']), len(photos))]

		# Fake a historic event from the background sample.
		historic_event = Event()
		historic_event.setPhotos(photos)
		historic_event.setRegion(self._event['region'])
		historic_event.setActualValue(historic_event._getActualValueByCounting())
		historic_event = EventFeature(historic_event)

		# Difference between the (smoothed) entropy distributions.
		pro1 = self._divideAndCount(entropy_para)
		pro2 = historic_event._divideAndCount(entropy_para)
		entropy_divergence = KLDivergence.averageKLDivergence(pro1, pro2)

		# Difference between the top-word distributions.
		topic_divergence = self.computeWordKLDivergenceWith(historic_event)

		return [historic_event.getPhotoDisFeatures()[3], topic_divergence,
		        entropy_divergence]
Beispiel #3
0
def getCaptionStatistics():
	"""Print the fraction of photos with a caption and the mean caption
	length among captioned photos in 'citybeat.photos_no_duplicate'.
	"""
	pi = PhotoInterface()
	pi.setDB('citybeat')
	pi.setCollection('photos_no_duplicate')
	tot = 0        # photos examined
	withCap = 0    # photos with a non-empty caption
	total_len = 0  # summed caption length over captioned photos
	for photo in pi.getAllDocuments():
		cap = Photo(photo).getCaption()
		tot += 1
		if len(cap) == 0:
			continue
		withCap += 1
		total_len += len(cap)

	# Guard the ratios so an empty collection (or one with no captions at
	# all) reports 0.0 instead of raising ZeroDivisionError; parenthesized
	# print works under both Python 2 and 3.
	print(1.0 * withCap / tot if tot else 0.0)
	print(1.0 * total_len / withCap if withCap else 0.0)
from photo_interface import PhotoInterface
from caption_parser import CaptionParser
from photo import Photo
from mongodb_interface import MongoDBInterface

import random


if __name__ == '__main__':
	# Sample roughly 1 photo in 11 from 'citybeat.photos' and copy each
	# non-empty caption into 'test_caption.captions'.
	photo_src = PhotoInterface()
	photo_src.setDB('citybeat')
	photo_src.setCollection('photos')

	caption_sink = MongoDBInterface()
	caption_sink.setDB('test_caption')
	caption_sink.setCollection('captions')

	for doc in photo_src.getAllDocuments():
		# Keep a document only when the uniform draw lands on 0.
		draw = random.randint(0, 10)
		if draw > 0:
			continue
		caption_text = Photo(doc).getCaption()
		if len(caption_text) > 0:
			caption_sink.saveDocument({'caption': caption_text})
Beispiel #5
0
    def getHistoricFeatures(self, entropy_para):
        """Compute features capturing how this event differs from background
        activity in the same region over the previous 14 days.

        Returns [avg photo distance, topic KL divergence, entropy KL
        divergence, avg caption length, people-to-photo ratio] for a
        synthetic background event built from the collected photos.
        """
        # Time window spanned by the current event's photos.
        end_time = self.getLatestPhotoTime()
        begin_time = self.getEarliestPhotoTime()

        pi = PhotoInterface()
        pi.setDB("citybeat")
        pi.setCollection("photos")

        photos = []
        dt = 3600  # padding (seconds) applied around each shifted daily window
        # 15 is hard coded because we use 14 days' data as the training background.
        for day in range(1, 15):
            # Floor division keeps the timestamps integral under Python 3 as
            # well; true division would yield floats and change str(bt)/str(et).
            et = end_time - day * 24 * 3600 + dt // 2
            bt = begin_time - day * 24 * 3600 - dt // 2
            day_photos = pi.rangeQuery(self._event["region"], [str(bt), str(et)])
            # rangeQuery returns photos sorted most-recent-first, so `photos`
            # stays ordered by creation time, newest first.
            for photo in day_photos:
                photos.append(photo)

        # Fake a background event from the collected photos.
        event = Event()
        event.setPhotos(photos)
        event.setRegion(self._event["region"])
        event.setActualValue(event.getActualValueByCounting())
        event = EventFeature(event)

        # Difference between the (smoothed) entropy distributions.
        pro1 = self._divideAndCount(entropy_para)
        pro2 = event._divideAndCount(entropy_para)
        entropy_divergence = KLDivergence.averageKLDivergence(pro1, pro2)

        # Difference between top-word distributions: index every word seen in
        # either list, then build aligned frequency vectors.
        event_topword_list = self._getTopWords(-1, True)
        historic_topword_list = event._getTopWords(-1, True)

        n_ind = 0
        ind = {}
        for word, freq in event_topword_list + historic_topword_list:
            # dict.has_key() was removed in Python 3; `in` works in both.
            if word not in ind:
                ind[word] = n_ind
                n_ind += 1

        freq1 = [0] * n_ind
        freq2 = [0] * n_ind

        for word, freq in event_topword_list:
            freq1[ind[word]] = freq
        for word, freq in historic_topword_list:
            freq2[ind[word]] = freq

        topic_divergence = KLDivergence.averageKLDivergence(freq1, freq2)

        return [
            event.getAvgPhotoDis(),
            topic_divergence,
            entropy_divergence,
            event.getAvgCaptionLen(),
            event.getRatioOfPeopleToPhoto(),
        ]
Beispiel #6
0
    def getHistoricFeatures(self, entropy_para):
        """Compute features capturing how this event differs from background
        activity in the same region over the previous 14 days.

        Returns [avg photo distance, topic KL divergence, entropy KL
        divergence, avg caption length, people-to-photo ratio] for a
        synthetic background event built from the collected photos.
        """
        # Time window spanned by the current event's photos.
        end_time = self.getLatestPhotoTime()
        begin_time = self.getEarliestPhotoTime()

        pi = PhotoInterface()
        pi.setDB('citybeat')
        pi.setCollection('photos')

        photos = []
        dt = 3600  # padding (seconds) applied around each shifted daily window
        # 15 is hard coded because we use 14 days' data as the training background.
        for day in range(1, 15):
            # Floor division keeps the timestamps integral under Python 3 as
            # well; true division would yield floats and change str(bt)/str(et).
            et = end_time - day * 24 * 3600 + dt // 2
            bt = begin_time - day * 24 * 3600 - dt // 2
            day_photos = pi.rangeQuery(self._event['region'],
                                       [str(bt), str(et)])
            # rangeQuery returns photos sorted most-recent-first, so `photos`
            # stays ordered by creation time, newest first.
            for photo in day_photos:
                photos.append(photo)

        # Fake a background event from the collected photos.
        event = Event()
        event.setPhotos(photos)
        event.setRegion(self._event['region'])
        event.setActualValue(event.getActualValueByCounting())
        event = EventFeature(event)

        # Difference between the (smoothed) entropy distributions.
        pro1 = self._divideAndCount(entropy_para)
        pro2 = event._divideAndCount(entropy_para)
        entropy_divergence = KLDivergence.averageKLDivergence(pro1, pro2)

        # Difference between top-word distributions: index every word seen in
        # either list, then build aligned frequency vectors.
        event_topword_list = self._getTopWords(-1, True)
        historic_topword_list = event._getTopWords(-1, True)

        n_ind = 0
        ind = {}
        for word, freq in event_topword_list + historic_topword_list:
            # dict.has_key() was removed in Python 3; `in` works in both.
            if word not in ind:
                ind[word] = n_ind
                n_ind += 1

        freq1 = [0] * n_ind
        freq2 = [0] * n_ind

        for word, freq in event_topword_list:
            freq1[ind[word]] = freq
        for word, freq in historic_topword_list:
            freq2[ind[word]] = freq

        topic_divergence = KLDivergence.averageKLDivergence(freq1, freq2)

        return [
            event.getAvgPhotoDis(),
            topic_divergence,
            entropy_divergence,
            event.getAvgCaptionLen(),
            event.getRatioOfPeopleToPhoto()
        ]
Beispiel #7
0
def findPhotos():
    """Stub: construct a PhotoInterface with empty DB/collection names.

    NOTE(review): DB and collection are left blank — presumably to be
    filled in later; confirm before use.
    """
    interface = PhotoInterface()
    interface.setDB('')
    interface.setCollection('')
Beispiel #8
0
from photo_interface import PhotoInterface
from caption_parser import CaptionParser
from photo import Photo
from mongodb_interface import MongoDBInterface

import random

if __name__ == '__main__':
    # Sample roughly 1 photo in 11 from 'citybeat.photos' and store each
    # non-empty caption into 'test_caption.captions'.
    source = PhotoInterface()
    source.setDB('citybeat')
    source.setCollection('photos')

    sink = MongoDBInterface()
    sink.setDB('test_caption')
    sink.setCollection('captions')

    for doc in source.getAllDocuments():
        # Keep a document only when the uniform draw lands on 0.
        draw = random.randint(0, 10)
        if draw > 0:
            continue
        caption = Photo(doc).getCaption()
        if len(caption) > 0:
            sink.saveDocument({'caption': caption})