Example #1
0
	def filterRegions(self, region_list, percentage=InstagramConfig.region_percentage,test=False, n=10, m=10):
		if test:
			#n and m should be set if test is true
			#this is only for test
			new_region_list = []
			folder = '/res/users/kx19/Citybeat/CityBeat/distributed_gp/utility/region_cache/'
			file_name = str(n)+'_'+str(m)+'.txt'
			fid = open(folder + file_name)
			for line in fid:
				region = line.split()
				for i in xrange(0,4):
					region[i] = float(region[i])
				region = Region(region)
				new_region_list.append(region)
			return new_region_list
			
			
			
		# this method should not be a member of this class
		# TODO: change the period to one week
#		print 'Begin to filter sparse regions with less photos than the threshold'
		end_time = 1359704845 - 7*3600*24
		begin_time = end_time - 14*3600*24
		pi = PhotoInterface()
		photos = pi.rangeQuery(period=[str(begin_time), str(end_time)])
		region_number = len(region_list)
		number_photo_in_region = [0]*region_number
		for photo in photos:
			lat = float(photo['location']['latitude'])
			lng = float(photo['location']['longitude'])
			flag = 0
			for i in xrange(region_number):
				if region_list[i].insideRegion([lat, lng]):
					number_photo_in_region[i] += 1
					flag = 1
					break
			if flag == 0:
				print 'bad photo:',photo
		
		region_tuples = []
		for i in xrange(0, region_number):
			region_tuples.append((region_list[i], number_photo_in_region[i]))
		
		region_tuples.sort(key=operator.itemgetter(1), reverse=True)

		valid_region_number = int(0.5 + 1.0 * region_number * percentage)
		valid_regions = []
		
#		print region_tuples[valid_region_number-1][1]

		for i in xrange(0, valid_region_number):
			region = region_tuples[i][0]
			lat = (self._region['min_lat'] + self._region['max_lat'])/2
			lng = (self._region['min_lng'] + self._region['max_lng'])/2
			cnt = region_tuples[i][1]
		
		for i in xrange(0, valid_region_number):
			valid_regions.append(region_tuples[i][0])
		
		return valid_regions
Example #2
0
    def getHistoricFeatures(self, entropy_para):
        """Contrast this event with a synthetic "historic" background event.

        Samples elements from the same region over the previous 7 days,
        builds a fake background event from them, and returns a 3-element
        feature list: [element-distribution feature, topic KL divergence,
        entropy KL divergence]. Returns [1, 10, 10] when no historic
        elements are found.
        """
        # this method computes the features that capture the difference between current
        # event and background knowledge

        end_time = self.getLatestElementTime()
        begin_time = self.getEarliestElementTime()
        # pick the data interface matching this event's element type
        if self._element_type == "photos":
            pi = PhotoInterface()
        else:
            pi = TweetInterface()

        elements = []
        # dt pads the query window symmetrically; currently disabled (0)
        dt = 0
        for day in xrange(1, 8):
            # NOTE(review): this loops over the previous 7 days, although the
            # old comment claimed 14 days of training data — confirm which
            # window is intended.
            et = end_time - day * 24 * 3600 + dt / 2
            bt = begin_time - day * 24 * 3600 - dt / 2
            day_elements = pi.rangeQuery(self._event["region"], [str(bt), str(et)])
            inds = range(0, day_elements.count())
            # only select 40 elements per day, chosen at random
            if len(inds) > 40:
                random.shuffle(inds)
                inds = inds[0:40]
            for i in inds:
                elements.append(day_elements[i])

        # match the background sample size to the current event's size
        random.shuffle(elements)
        elements = elements[0 : min(len(self._event[self._element_type]), len(elements))]

        if len(elements) == 0:
            # TODO: refine — fixed fallback feature values when no history exists
            return [1, 10, 10]

        # fake a historic event from the sampled background elements
        historic_event = BaseEvent(self._element_type)
        historic_event.setElements(elements)
        historic_event.setRegion(self._event["region"])
        historic_event.setActualValue(historic_event._getActualValueByCounting())
        historic_event = BaseFeature(historic_event)

        # compute the difference between entropy
        # this has been smoothed
        pro1 = self._divideAndCount(entropy_para)
        pro2 = historic_event._divideAndCount(entropy_para)
        entropy_divergence = KLDivergence.averageKLDivergence(pro1, pro2)

        # compute the difference between top words

        topic_divergence = self.computeWordKLDivergenceWith(historic_event)

        return [
            historic_event.getElementDisFeatures()[1],
            topic_divergence,
            #               historic_event.getEntropy(entropy_para),
            entropy_divergence,
        ]
    def getHistoricFeatures(self, entropy_para):
        """Build a background event from the previous 14 days of photos in
        the same region and return divergence features against it:
        [photo-distribution feature, topic KL divergence, entropy KL
        divergence]. Falls back to [1, 10, 10] when no historic photos exist.
        """
        end_time = self.getLatestPhotoTime()
        begin_time = self.getEarliestPhotoTime()

        pi = PhotoInterface()

        # Collect photos from the same time window on each of the previous
        # 14 days. rangeQuery yields photos newest-first, so the accumulated
        # list stays ordered from most recent to oldest.
        photos = []
        dt = 0
        one_day = 24 * 3600
        for day in xrange(1, 15):
            window = [str(begin_time - day * one_day - dt / 2),
                      str(end_time - day * one_day + dt / 2)]
            photos.extend(pi.rangeQuery(self._event['region'], window))

        # Downsample the background to at most the current event's photo count.
        random.shuffle(photos)
        photos = photos[:min(len(self._event['photos']), len(photos))]

        if not photos:
            # TODO: refine
            return [1, 10, 10]

        # Fake a historic event from the sampled background photos.
        background = Event()
        background.setPhotos(photos)
        background.setRegion(self._event['region'])
        background.setActualValue(background._getActualValueByCounting())
        background = BaseFeature(background)

        # Entropy difference (smoothed).
        current_dist = self._divideAndCount(entropy_para)
        historic_dist = background._divideAndCount(entropy_para)
        entropy_divergence = KLDivergence.averageKLDivergence(current_dist, historic_dist)

        # Top-word difference.
        topic_divergence = self.computeWordKLDivergenceWith(background)

        return [background.getPhotoDisFeatures()[3], topic_divergence,
                entropy_divergence]
Example #4
0
    def buildCorpus(self, region, time_interval, element_type='photos', paras={}):
        """Fit the TF-IDF vectorizer on element text from a region/time window.

        Args:
            region: region passed through to the data interface's rangeQuery.
            time_interval: [start, end] pair of stringified timestamps.
            element_type: 'photos' (uses caption text) or anything else
                          (uses tweet text).
            paras: optional overrides for TfidfVectorizer parameters.
                   NOTE: mutable default is safe here because paras is only
                   read, never mutated.
        """
        text = []
        if element_type == 'photos':
            ei = PhotoInterface()
            cur = ei.rangeQuery(region, time_interval, 'caption.text')
        else:
            ei = TweetInterface()
            cur = ei.rangeQuery(region, time_interval, 'text')
        for t in cur:
            # Skip documents without the expected text field: KeyError for a
            # missing key, TypeError for a None caption. (Previously a bare
            # except hid every possible error.)
            try:
                if element_type == 'photos':
                    text.append(t['caption']['text'])
                else:
                    text.append(t['text'])
            except (KeyError, TypeError):
                pass

        # it is not proper here to set up stopwords
        self._vectorizer = TfidfVectorizer(max_df=paras.get('max_df', 0.2),
                                           min_df=paras.get('min_df', 0.0),
                                           strip_accents=paras.get('strip_accents', 'ascii'),
                                           preprocessor=paras.get('preprocessor', tool.textPreprocessor),
                                           smooth_idf=paras.get('smooth_idf', True),
                                           sublinear_tf=paras.get('sublinear_tf', True),
                                           norm=paras.get('norm', 'l2'),
                                           analyzer=paras.get('analyzer', 'word'),
                                           ngram_range=paras.get('ngram_range', (1, 1)),
                                           stop_words=paras.get('stop_words', 'english')
        )

        # An empty/degenerate corpus makes fit_transform raise; log and
        # continue rather than break the pipeline.
        try:
            self._vectorizer.fit_transform(text)
        except Exception as error:
            # logging.warn is a deprecated alias of logging.warning
            logging.warning(error)
Example #5
0
	def getHistoricFeatures(self, entropy_para):
		"""Compute features contrasting this event with a background event
		built from the previous 14 days of photos in the same region.

		Returns [photo-distribution feature, topic KL divergence, entropy
		KL divergence]; falls back to [1, 10, 10] when no historic photos
		exist (consistent with sibling implementations of this method).
		"""
		end_time = self.getLatestPhotoTime()
		begin_time = self.getEarliestPhotoTime()
		
		pi = PhotoInterface()
		pi.setDB('citybeat')
		pi.setCollection('photos')
		
		photos = []
		# dt pads the query window symmetrically; currently disabled (0)
		dt = 0
		for day in xrange(1,15):
			# use the previous 14 days' data as the training background
			et = end_time - day * 24 * 3600 + dt / 2
			bt = begin_time - day * 24 * 3600 - dt / 2
			day_photos = pi.rangeQuery(self._event['region'], [str(bt), str(et)])
			for photo in day_photos:
				# rangeQuery sorts photos newest-first, so "photos" stays
				# ordered from the most current to the most early
				photos.append(photo)
				
		# downsample the background to at most the current event's size
		random.shuffle(photos)
		photos = photos[0:min(len(self._event['photos']), len(photos))]
		
		if len(photos) == 0:
			# guard added: an empty background would fail downstream; this
			# fallback matches the sibling versions of this method
			return [1, 10, 10]
		
		# fake a historic event
		historic_event = Event()
		historic_event.setPhotos(photos)
		historic_event.setRegion(self._event['region'])
		historic_event.setActualValue(historic_event._getActualValueByCounting())
		historic_event = EventFeature(historic_event)
		
		# compute the difference between entropy
		# this has been smoothed
		pro1 = self._divideAndCount(entropy_para)
		pro2 = historic_event._divideAndCount(entropy_para)
		entropy_divergence = KLDivergence.averageKLDivergence(pro1, pro2)
		
		# compute the difference between top words
		topic_divergence = self.computeWordKLDivergenceWith(historic_event)
		
		return [historic_event.getPhotoDisFeatures()[3], topic_divergence,
		        entropy_divergence]
Example #6
0
def getCaptionStatistics():
	pi = PhotoInterface()
	pi.setDB('citybeat')
	pi.setCollection('photos_no_duplicate')
	tot = 0
	withCap = 0
	l = 0
	for photo in pi.getAllDocuments():
		cap = Photo(photo).getCaption()
		tot += 1
		if len(cap) == 0:
			continue
		withCap += 1
		l += len(cap)
	
	print 1.0*withCap / tot
	print 1.0*l / withCap
Example #7
0
			region_tuples.append((region_list[i], number_photo_in_region[i]))
		
		region_tuples.sort(key=operator.itemgetter(1), reverse=True)

		valid_region_number = int(0.5 + 1.0 * region_number * percentage)
		valid_regions = []
		
#		print region_tuples[valid_region_number-1][1]

		for i in xrange(0, valid_region_number):
			region = region_tuples[i][0]
			lat = (self._region['min_lat'] + self._region['max_lat'])/2
			lng = (self._region['min_lng'] + self._region['max_lng'])/2
			cnt = region_tuples[i][1]
		
		for i in xrange(0, valid_region_number):
			valid_regions.append(region_tuples[i][0])
		
		return valid_regions

if __name__=="__main__":
	coordinates = [InstagramConfig.photo_min_lat, InstagramConfig.photo_min_lng,
	               InstagramConfig.photo_max_lat, InstagramConfig.photo_max_lng]
	nyc = Region(coordinates)
	pi = PhotoInterface()
	pi.rangeQuery(nyc)
	region_list = nyc.divideRegions(20, 20)
	region_list = nyc.filterRegions(region_list, test=True, n=10, m=10)
	for region in region_list:
		region = region.toJSON()
		print region['min_lat'], region['min_lng'], region['max_lat'], region['max_lng']
Example #8
0
from photo_interface import PhotoInterface
from caption_parser import CaptionParser
from photo import Photo
from mongodb_interface import MongoDBInterface

import random


if __name__ == '__main__':
	# Source: raw photo collection.
	photo_src = PhotoInterface()
	photo_src.setDB('citybeat')
	photo_src.setCollection('photos')
	
	# Sink: caption test collection.
	caption_sink = MongoDBInterface()
	caption_sink.setDB('test_caption')
	caption_sink.setCollection('captions')
	
	# Keep roughly 1 photo in 11 and save its caption when non-empty.
	for doc in photo_src.getAllDocuments():
		if random.randint(0, 10) != 0:
			continue
		caption = Photo(doc).getCaption()
		if len(caption) > 0:
			caption_sink.saveDocument({'caption': caption})
Example #9
0
    def getHistoricFeatures(self, entropy_para):
        """Compute features contrasting this event with a background event
        built from the same region over the previous 14 days.

        Returns a 5-element list: [avg photo distance, topic KL divergence,
        entropy KL divergence, avg caption length, people-to-photo ratio].
        """
        end_time = self.getLatestPhotoTime()
        begin_time = self.getEarliestPhotoTime()

        pi = PhotoInterface()
        pi.setDB("citybeat")
        pi.setCollection("photos")

        photos = []
        # pad the query window by +/- dt/2 seconds
        dt = 3600
        for day in xrange(1, 15):
            # use the previous 14 days' data as the training background
            et = end_time - day * 24 * 3600 + dt / 2
            bt = begin_time - day * 24 * 3600 - dt / 2
            day_photos = pi.rangeQuery(self._event["region"], [str(bt), str(et)])
            for photo in day_photos:
                # rangeQuery sorts photos newest-first, so "photos" stays
                # ordered from the most current to the most early
                photos.append(photo)

        # NOTE(review): unlike sibling implementations, this version neither
        # subsamples the background nor guards against an empty photo list —
        # confirm downstream behavior when no historic photos exist.
        event = Event()
        event.setPhotos(photos)
        event.setRegion(self._event["region"])
        event.setActualValue(event.getActualValueByCounting())
        event = EventFeature(event)

        # compute the difference between entropy
        # this has been smoothed
        pro1 = self._divideAndCount(entropy_para)
        pro2 = event._divideAndCount(entropy_para)
        entropy_divergence = KLDivergence.averageKLDivergence(pro1, pro2)

        # compute the difference between top words: build a shared word
        # index, then align both frequency vectors over it
        event_topword_list = self._getTopWords(-1, True)
        historic_topword_list = event._getTopWords(-1, True)

        n_ind = 0
        ind = {}
        for word, freq in event_topword_list + historic_topword_list:
            # dict.has_key() is deprecated (removed in Python 3); use "in"
            if word not in ind:
                ind[word] = n_ind
                n_ind += 1

        freq1 = [0] * n_ind
        freq2 = [0] * n_ind

        for word, freq in event_topword_list:
            freq1[ind[word]] = freq
        for word, freq in historic_topword_list:
            freq2[ind[word]] = freq

        topic_divergence = KLDivergence.averageKLDivergence(freq1, freq2)

        return [
            event.getAvgPhotoDis(),
            topic_divergence,
            # 		        event.getEntropy(entropy_para),
            entropy_divergence,
            event.getAvgCaptionLen(),
            event.getRatioOfPeopleToPhoto(),
        ]
Example #10
0
def main():
	pi = PhotoInterface()
	pi.setDB('citybeat')
	pi.setCollection('photos')
	
	pi2 = PhotoInterface()
	pi2.setDB('citybeat')
	pi2.setCollection('photos_no_duplicate')
	
	region = {}
	region['min_lat'] = 40.690531
	region['min_lng'] = -74.058151
	region['max_lat'] = 40.823163
	region['max_lng'] = -73.857994
	st = '1352937600'
	et = '1355615999'
	pc = pi.rangeQuery(region, [st, et])
#	print pc.count()
	
	ids = set()
	for photo in pc:
		ids.add(photo['link'])

	print len(ids)
	print pi2.rangeQuery(region, [st, et]).count()
Example #11
0
    def getHistoricFeatures(self, entropy_para):
        """Compute features contrasting this event with a background event
        built from the same region over the previous 14 days.

        Returns a 5-element list: [avg photo distance, topic KL divergence,
        entropy KL divergence, avg caption length, people-to-photo ratio].
        """
        end_time = self.getLatestPhotoTime()
        begin_time = self.getEarliestPhotoTime()

        pi = PhotoInterface()
        pi.setDB('citybeat')
        pi.setCollection('photos')

        photos = []
        # pad the query window by +/- dt/2 seconds
        dt = 3600
        for day in xrange(1, 15):
            # use the previous 14 days' data as the training background
            et = end_time - day * 24 * 3600 + dt / 2
            bt = begin_time - day * 24 * 3600 - dt / 2
            day_photos = pi.rangeQuery(self._event['region'],
                                       [str(bt), str(et)])
            for photo in day_photos:
                # rangeQuery sorts photos newest-first, so "photos" stays
                # ordered from the most current to the most early
                photos.append(photo)

        # NOTE(review): unlike sibling implementations, this version neither
        # subsamples the background nor guards against an empty photo list —
        # confirm downstream behavior when no historic photos exist.
        event = Event()
        event.setPhotos(photos)
        event.setRegion(self._event['region'])
        event.setActualValue(event.getActualValueByCounting())
        event = EventFeature(event)

        # compute the difference between entropy
        # this has been smoothed
        pro1 = self._divideAndCount(entropy_para)
        pro2 = event._divideAndCount(entropy_para)
        entropy_divergence = KLDivergence.averageKLDivergence(pro1, pro2)

        # compute the difference between top words: build a shared word
        # index, then align both frequency vectors over it
        event_topword_list = self._getTopWords(-1, True)
        historic_topword_list = event._getTopWords(-1, True)

        n_ind = 0
        ind = {}
        for word, freq in event_topword_list + historic_topword_list:
            # dict.has_key() is deprecated (removed in Python 3); use "in"
            if word not in ind:
                ind[word] = n_ind
                n_ind += 1

        freq1 = [0] * n_ind
        freq2 = [0] * n_ind

        for word, freq in event_topword_list:
            freq1[ind[word]] = freq
        for word, freq in historic_topword_list:
            freq2[ind[word]] = freq

        topic_divergence = KLDivergence.averageKLDivergence(freq1, freq2)

        return [
            event.getAvgPhotoDis(),
            topic_divergence,
            #		        event.getEntropy(entropy_para),
            entropy_divergence,
            event.getAvgCaptionLen(),
            event.getRatioOfPeopleToPhoto()
        ]
Example #12
0
    def filterRegions(self, region_list, percentage=InstagramConfig.region_percentage, test=False, n=10, m=10,
                      element_type='photos'):
        assert element_type in ['photos', 'tweets']
        if test:
            #n and m should be set if test is true
            #this is only for test
            new_region_list = []
            #folder = '/res/users/kx19/Citybeat/CityBeat/distributed_gp/utility/region_cache/'
            # grand : res ; joust : grad 
            folder = BaseConfig.getRegionListPath()
            file_name = element_type + '_'
            file_name += str(n) + '_' + str(m) + '.txt'
            fid = open(folder + file_name)
            for line in fid:
                region = line.split()
                for i in xrange(0, 4):
                    region[i] = float(region[i])
                region = Region(region)
                new_region_list.append(region)
            return new_region_list

            # this method should not be a member of this class
            # TODO: change the period to one week

        #       end_time = 1359704845
        #       begin_time = 1299704845
        end_time = 1962096000
        begin_time = 1362096000
        if element_type == 'photos':
            di = PhotoInterface()
        else:
            di = TweetInterface()
        document_cur = di.rangeQuery(period=[str(begin_time), str(end_time)])
        region_number = len(region_list)
        number_document_in_region = [0] * region_number
        bad_documents = 0
        total_documents = 0
        for document in document_cur:
            total_documents += 1
            lat = float(document['location']['latitude'])
            lng = float(document['location']['longitude'])
            flag = 0
            for i in xrange(region_number):
                if region_list[i].insideRegion([lat, lng]):
                    number_document_in_region[i] += 1
                    flag = 1
                    break
            if flag == 0:
                bad_documents += 1

        print str(bad_documents) + ' out of ' + str(total_documents) + ' documents are bad(not in NY)'

        region_tuples = []
        for i in xrange(0, region_number):
            region_tuples.append((region_list[i], number_document_in_region[i]))

        region_tuples.sort(key=operator.itemgetter(1), reverse=True)

        valid_region_number = int(0.5 + 1.0 * region_number * percentage)
        valid_regions = []

        #       print region_tuples[valid_region_number-1][1]

        for i in xrange(0, valid_region_number):
            region = region_tuples[i][0]
            lat = (self._region['min_lat'] + self._region['max_lat']) / 2
            lng = (self._region['min_lng'] + self._region['max_lng']) / 2
            cnt = region_tuples[i][1]

        for i in xrange(0, valid_region_number):
            valid_regions.append(region_tuples[i][0])

        return valid_regions
Example #13
0
    def filterRegions(self,
                      region_list,
                      percentage=InstagramConfig.region_percentage,
                      test=False,
                      n=10,
                      m=10):
        if test:
            #n and m should be set if test is true
            #this is only for test
            new_region_list = []
            folder = '/res/users/kx19/Citybeat/CityBeat/distributed_gp/utility/region_cache/'
            file_name = str(n) + '_' + str(m) + '.txt'
            fid = open(folder + file_name)
            for line in fid:
                region = line.split()
                for i in xrange(0, 4):
                    region[i] = float(region[i])
                region = Region(region)
                new_region_list.append(region)
            return new_region_list

        # this method should not be a member of this class
        # TODO: change the period to one week


#		print 'Begin to filter sparse regions with less photos than the threshold'
        end_time = 1359704845 - 7 * 3600 * 24
        begin_time = end_time - 14 * 3600 * 24
        pi = PhotoInterface()
        photos = pi.rangeQuery(period=[str(begin_time), str(end_time)])
        region_number = len(region_list)
        number_photo_in_region = [0] * region_number
        for photo in photos:
            lat = float(photo['location']['latitude'])
            lng = float(photo['location']['longitude'])
            flag = 0
            for i in xrange(region_number):
                if region_list[i].insideRegion([lat, lng]):
                    number_photo_in_region[i] += 1
                    flag = 1
                    break
            if flag == 0:
                print 'bad photo:', photo

        region_tuples = []
        for i in xrange(0, region_number):
            region_tuples.append((region_list[i], number_photo_in_region[i]))

        region_tuples.sort(key=operator.itemgetter(1), reverse=True)

        valid_region_number = int(0.5 + 1.0 * region_number * percentage)
        valid_regions = []

        #		print region_tuples[valid_region_number-1][1]

        for i in xrange(0, valid_region_number):
            region = region_tuples[i][0]
            lat = (self._region['min_lat'] + self._region['max_lat']) / 2
            lng = (self._region['min_lng'] + self._region['max_lng']) / 2
            cnt = region_tuples[i][1]

        for i in xrange(0, valid_region_number):
            valid_regions.append(region_tuples[i][0])

        return valid_regions
Example #14
0
        valid_region_number = int(0.5 + 1.0 * region_number * percentage)
        valid_regions = []

        #		print region_tuples[valid_region_number-1][1]

        for i in xrange(0, valid_region_number):
            region = region_tuples[i][0]
            lat = (self._region['min_lat'] + self._region['max_lat']) / 2
            lng = (self._region['min_lng'] + self._region['max_lng']) / 2
            cnt = region_tuples[i][1]

        for i in xrange(0, valid_region_number):
            valid_regions.append(region_tuples[i][0])

        return valid_regions

if __name__ == "__main__":
    coordinates = [
        InstagramConfig.photo_min_lat, InstagramConfig.photo_min_lng,
        InstagramConfig.photo_max_lat, InstagramConfig.photo_max_lng
    ]
    nyc = Region(coordinates)
    pi = PhotoInterface()
    pi.rangeQuery(nyc)
    region_list = nyc.divideRegions(20, 20)
    region_list = nyc.filterRegions(region_list, test=True, n=10, m=10)
    for region in region_list:
        region = region.toJSON()
        print region['min_lat'], region['min_lng'], region['max_lat'], region[
            'max_lng']
Example #15
0
def findPhotos():
    """Prepare a PhotoInterface with empty DB/collection names.

    NOTE(review): the target database and collection are blank strings —
    this function appears to be an unfinished stub.
    """
    photo_interface = PhotoInterface()
    photo_interface.setDB('')
    photo_interface.setCollection('')
Example #16
0
from photo_interface import PhotoInterface
from caption_parser import CaptionParser
from photo import Photo
from mongodb_interface import MongoDBInterface

import random

if __name__ == '__main__':
    # Source: raw photo collection.
    photo_src = PhotoInterface()
    photo_src.setDB('citybeat')
    photo_src.setCollection('photos')

    # Sink: caption test collection.
    caption_sink = MongoDBInterface()
    caption_sink.setDB('test_caption')
    caption_sink.setCollection('captions')

    # Keep roughly 1 photo in 11 and save its caption when non-empty.
    for doc in photo_src.getAllDocuments():
        if random.randint(0, 10) != 0:
            continue
        caption = Photo(doc).getCaption()
        if len(caption) > 0:
            caption_sink.saveDocument({'caption': caption})