def filterRegions(self, region_list, percentage=InstagramConfig.region_percentage,
                  test=False, n=10, m=10):
    if test:
        # n and m must be set when test is True; this branch is only for
        # tests. It loads a pre-computed n-by-m grid from the region cache.
        new_region_list = []
        folder = '/res/users/kx19/Citybeat/CityBeat/distributed_gp/utility/region_cache/'
        file_name = str(n) + '_' + str(m) + '.txt'
        fid = open(folder + file_name)
        for line in fid:
            region = line.split()
            for i in xrange(0, 4):
                region[i] = float(region[i])
            region = Region(region)
            new_region_list.append(region)
        fid.close()
        return new_region_list

    # This method should not be a member of this class.
    # TODO: change the period to one week
    # print 'Begin to filter sparse regions with less photos than the threshold'
    end_time = 1359704845 - 7 * 3600 * 24
    begin_time = end_time - 14 * 3600 * 24
    pi = PhotoInterface()
    photos = pi.rangeQuery(period=[str(begin_time), str(end_time)])

    # Count how many photos fall inside each candidate region.
    region_number = len(region_list)
    number_photo_in_region = [0] * region_number
    for photo in photos:
        lat = float(photo['location']['latitude'])
        lng = float(photo['location']['longitude'])
        flag = 0
        for i in xrange(region_number):
            if region_list[i].insideRegion([lat, lng]):
                number_photo_in_region[i] += 1
                flag = 1
                break
        if flag == 0:
            print 'bad photo:', photo

    # Keep the top `percentage` of regions, ranked by photo count.
    region_tuples = []
    for i in xrange(0, region_number):
        region_tuples.append((region_list[i], number_photo_in_region[i]))
    region_tuples.sort(key=operator.itemgetter(1), reverse=True)
    valid_region_number = int(0.5 + 1.0 * region_number * percentage)
    valid_regions = []
    for i in xrange(0, valid_region_number):
        valid_regions.append(region_tuples[i][0])
    return valid_regions
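The test branch above reads the cached grid from disk, one bounding box per line in min_lat min_lng max_lat max_lng order. A minimal sketch of a matching writer, assuming Region.toJSON() exposes the four bounds as in the __main__ block further below; the helper name writeRegionCache is hypothetical:

def writeRegionCache(region_list, folder, n, m):
    # One region per line: min_lat min_lng max_lat max_lng (hypothetical helper).
    fid = open(folder + str(n) + '_' + str(m) + '.txt', 'w')
    for region in region_list:
        r = region.toJSON()
        fid.write('%f %f %f %f\n' % (r['min_lat'], r['min_lng'],
                                     r['max_lat'], r['max_lng']))
    fid.close()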
def getHistoricFeatures(self, entropy_para):
    # Computes the features that capture the difference between the current
    # event and the background knowledge.
    end_time = self.getLatestElementTime()
    begin_time = self.getEarliestElementTime()
    if self._element_type == "photos":
        pi = PhotoInterface()
    else:
        pi = TweetInterface()

    elements = []
    dt = 0
    for day in xrange(1, 8):
        # Sample the same time window on each of the previous 7 days as the
        # background.
        et = end_time - day * 24 * 3600 + dt / 2
        bt = begin_time - day * 24 * 3600 - dt / 2
        day_elements = pi.rangeQuery(self._event["region"], [str(bt), str(et)])
        inds = range(0, day_elements.count())
        # Select at most 40 elements per day.
        if len(inds) > 40:
            random.shuffle(inds)
            inds = inds[0:40]
        for i in inds:
            elements.append(day_elements[i])

    random.shuffle(elements)
    elements = elements[0:min(len(self._event[self._element_type]), len(elements))]
    if len(elements) == 0:
        # TODO: refine
        return [1, 10, 10]

    # Fake a historic event from the background sample.
    historic_event = BaseEvent(self._element_type)
    historic_event.setElements(elements)
    historic_event.setRegion(self._event["region"])
    historic_event.setActualValue(historic_event._getActualValueByCounting())
    historic_event = BaseFeature(historic_event)

    # Difference between entropies; both sides have been smoothed.
    pro1 = self._divideAndCount(entropy_para)
    pro2 = historic_event._divideAndCount(entropy_para)
    entropy_divergence = KLDivergence.averageKLDivergence(pro1, pro2)

    # Difference between top words.
    topic_divergence = self.computeWordKLDivergenceWith(historic_event)

    return [historic_event.getElementDisFeatures()[1],
            topic_divergence,
            # historic_event.getEntropy(entropy_para),
            entropy_divergence]
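KLDivergence.averageKLDivergence is not shown in this section. A plausible reading, given the smoothing comment above, is a smoothed, symmetrized KL divergence; the sketch below is an assumption about its semantics, and the smoothing constant eps is invented:

import math

def averageKLDivergence(p, q, eps=1e-6):
    # ASSUMPTION: smooth and renormalize both distributions so every log is
    # defined, then average KL(p||q) and KL(q||p).
    p = [v + eps for v in p]
    q = [v + eps for v in q]
    sp, sq = sum(p), sum(q)
    p = [v / sp for v in p]
    q = [v / sq for v in q]
    kl_pq = sum(x * math.log(x / y) for x, y in zip(p, q))
    kl_qp = sum(y * math.log(y / x) for x, y in zip(p, q))
    return 0.5 * (kl_pq + kl_qp)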
def getHistoricFeatures(self, entropy_para):
    # Computes the features that capture the difference between the current
    # event and the background knowledge.
    end_time = self.getLatestPhotoTime()
    begin_time = self.getEarliestPhotoTime()
    pi = PhotoInterface()

    photos = []
    dt = 0
    for day in xrange(1, 15):
        # 15 is hard-coded because the previous 14 days' data serve as the
        # training background.
        et = end_time - day * 24 * 3600 + dt / 2
        bt = begin_time - day * 24 * 3600 - dt / 2
        day_photos = pi.rangeQuery(self._event['region'], [str(bt), str(et)])
        for photo in day_photos:
            # rangeQuery sorts photos from the most recent to the earliest,
            # so the list "photos" stays sorted by creation time, most
            # recent first.
            photos.append(photo)

    random.shuffle(photos)
    photos = photos[0:min(len(self._event['photos']), len(photos))]
    if len(photos) == 0:
        # TODO: refine
        return [1, 10, 10]

    # Fake a historic event from the background sample.
    historic_event = Event()
    historic_event.setPhotos(photos)
    historic_event.setRegion(self._event['region'])
    historic_event.setActualValue(historic_event._getActualValueByCounting())
    historic_event = BaseFeature(historic_event)

    # Difference between entropies; both sides have been smoothed.
    pro1 = self._divideAndCount(entropy_para)
    pro2 = historic_event._divideAndCount(entropy_para)
    entropy_divergence = KLDivergence.averageKLDivergence(pro1, pro2)

    # Difference between top words.
    topic_divergence = self.computeWordKLDivergenceWith(historic_event)

    return [historic_event.getPhotoDisFeatures()[3],
            topic_divergence,
            # historic_event.getEntropy(entropy_para),
            entropy_divergence]
def buildCorpus(self, region, time_interval, element_type='photos', paras={}):
    # time_interval should be [start, end]
    text = []
    if element_type == 'photos':
        ei = PhotoInterface()
        cur = ei.rangeQuery(region, time_interval, 'caption.text')
    else:
        ei = TweetInterface()
        cur = ei.rangeQuery(region, time_interval, 'text')
    for t in cur:
        try:
            if element_type == 'photos':
                text.append(t['caption']['text'])
            else:
                text.append(t['text'])
        except (KeyError, TypeError):
            # Skip documents without a caption/text field.
            pass

    # It is not ideal to hard-code the stop-word list here.
    self._vectorizer = TfidfVectorizer(max_df=paras.get('max_df', 0.2),
                                       min_df=paras.get('min_df', 0.0),
                                       strip_accents=paras.get('strip_accents', 'ascii'),
                                       preprocessor=paras.get('preprocessor', tool.textPreprocessor),
                                       smooth_idf=paras.get('smooth_idf', True),
                                       sublinear_tf=paras.get('sublinear_tf', True),
                                       norm=paras.get('norm', 'l2'),
                                       analyzer=paras.get('analyzer', 'word'),
                                       ngram_range=paras.get('ngram_range', (1, 1)),
                                       stop_words=paras.get('stop_words', 'english'))
    # If fit_transform raises (e.g. on an empty corpus), log the error and
    # leave the vectorizer unfitted rather than crashing.
    try:
        self._vectorizer.fit_transform(text)
    except Exception as error:
        logging.warning(error)
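For reference, a toy, self-contained run of the same vectorizer on invented captions; only a few of the defaults above are reproduced:

from sklearn.feature_extraction.text import TfidfVectorizer

captions = ['sunset over the brooklyn bridge',
            'brooklyn pizza with friends',
            'morning run in central park']
vectorizer = TfidfVectorizer(stop_words='english', sublinear_tf=True, norm='l2')
X = vectorizer.fit_transform(captions)
print X.shape                       # (3, number of distinct non-stopword terms)
print vectorizer.get_feature_names()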
def getHistoricFeatures(self, entropy_para):
    # Computes the features that capture the difference between the current
    # event and the background knowledge.
    end_time = self.getLatestPhotoTime()
    begin_time = self.getEarliestPhotoTime()
    pi = PhotoInterface()
    pi.setDB('citybeat')
    pi.setCollection('photos')

    photos = []
    dt = 0
    for day in xrange(1, 15):
        # 15 is hard-coded because the previous 14 days' data serve as the
        # training background.
        et = end_time - day * 24 * 3600 + dt / 2
        bt = begin_time - day * 24 * 3600 - dt / 2
        day_photos = pi.rangeQuery(self._event['region'], [str(bt), str(et)])
        for photo in day_photos:
            # rangeQuery sorts photos from the most recent to the earliest,
            # so the list "photos" stays sorted by creation time, most
            # recent first.
            photos.append(photo)

    random.shuffle(photos)
    photos = photos[0:min(len(self._event['photos']), len(photos))]

    # Fake a historic event from the background sample.
    historic_event = Event()
    historic_event.setPhotos(photos)
    historic_event.setRegion(self._event['region'])
    historic_event.setActualValue(historic_event._getActualValueByCounting())
    historic_event = EventFeature(historic_event)

    # Difference between entropies; both sides have been smoothed.
    pro1 = self._divideAndCount(entropy_para)
    pro2 = historic_event._divideAndCount(entropy_para)
    entropy_divergence = KLDivergence.averageKLDivergence(pro1, pro2)

    # Difference between top words.
    topic_divergence = self.computeWordKLDivergenceWith(historic_event)

    return [historic_event.getPhotoDisFeatures()[3],
            topic_divergence,
            # historic_event.getEntropy(entropy_para),
            entropy_divergence]
def getCaptionStatistics():
    pi = PhotoInterface()
    pi.setDB('citybeat')
    pi.setCollection('photos_no_duplicate')
    tot = 0
    withCap = 0
    l = 0
    for photo in pi.getAllDocuments():
        cap = Photo(photo).getCaption()
        tot += 1
        if len(cap) == 0:
            continue
        withCap += 1
        l += len(cap)
    # Fraction of photos that have a caption, then the average caption
    # length among the captioned photos.
    print 1.0 * withCap / tot
    print 1.0 * l / withCap
    region_tuples = []
    for i in xrange(0, region_number):
        region_tuples.append((region_list[i], number_photo_in_region[i]))
    region_tuples.sort(key=operator.itemgetter(1), reverse=True)
    valid_region_number = int(0.5 + 1.0 * region_number * percentage)
    valid_regions = []
    for i in xrange(0, valid_region_number):
        valid_regions.append(region_tuples[i][0])
    return valid_regions


if __name__ == '__main__':
    coordinates = [InstagramConfig.photo_min_lat,
                   InstagramConfig.photo_min_lng,
                   InstagramConfig.photo_max_lat,
                   InstagramConfig.photo_max_lng]
    nyc = Region(coordinates)
    pi = PhotoInterface()
    pi.rangeQuery(nyc)
    region_list = nyc.divideRegions(20, 20)
    region_list = nyc.filterRegions(region_list, test=True, n=10, m=10)
    for region in region_list:
        region = region.toJSON()
        print region['min_lat'], region['min_lng'], region['max_lat'], region['max_lng']
from photo_interface import PhotoInterface
from caption_parser import CaptionParser
from photo import Photo
from mongodb_interface import MongoDBInterface
import random

if __name__ == '__main__':
    pi = PhotoInterface()
    pi.setDB('citybeat')
    pi.setCollection('photos')

    mi = MongoDBInterface()
    mi.setDB('test_caption')
    mi.setCollection('captions')

    photos = pi.getAllDocuments()
    for photo in photos:
        # Keep roughly 1 in 11 photos as a random sample.
        i = random.randint(0, 10)
        if i > 0:
            continue
        p = Photo(photo)
        cap = p.getCaption()
        if len(cap) > 0:
            cap = {'caption': cap}
            mi.saveDocument(cap)
def getHistoricFeatures(self, entropy_para):
    # Computes the features that capture the difference between the current
    # event and the background knowledge.
    end_time = self.getLatestPhotoTime()
    begin_time = self.getEarliestPhotoTime()
    pi = PhotoInterface()
    pi.setDB("citybeat")
    pi.setCollection("photos")

    photos = []
    dt = 3600
    for day in xrange(1, 15):
        # 15 is hard-coded because the previous 14 days' data serve as the
        # training background.
        et = end_time - day * 24 * 3600 + dt / 2
        bt = begin_time - day * 24 * 3600 - dt / 2
        day_photos = pi.rangeQuery(self._event["region"], [str(bt), str(et)])
        for photo in day_photos:
            # rangeQuery sorts photos from the most recent to the earliest,
            # so the list "photos" stays sorted by creation time, most
            # recent first.
            photos.append(photo)

    event = Event()
    event.setPhotos(photos)
    event.setRegion(self._event["region"])
    event.setActualValue(event.getActualValueByCounting())
    event = EventFeature(event)

    # Difference between entropies; both sides have been smoothed.
    pro1 = self._divideAndCount(entropy_para)
    pro2 = event._divideAndCount(entropy_para)
    entropy_divergence = KLDivergence.averageKLDivergence(pro1, pro2)

    # Difference between top words: map both word lists onto a shared
    # vocabulary index, then compare the aligned frequency vectors.
    event_topword_list = self._getTopWords(-1, True)
    historic_topword_list = event._getTopWords(-1, True)
    n_ind = 0
    ind = {}
    for word, freq in event_topword_list + historic_topword_list:
        if word not in ind:
            ind[word] = n_ind
            n_ind += 1
    freq1 = [0] * n_ind
    freq2 = [0] * n_ind
    for word, freq in event_topword_list:
        freq1[ind[word]] = freq
    for word, freq in historic_topword_list:
        freq2[ind[word]] = freq
    topic_divergence = KLDivergence.averageKLDivergence(freq1, freq2)

    return [event.getAvgPhotoDis(),
            topic_divergence,
            # event.getEntropy(entropy_para),
            entropy_divergence,
            event.getAvgCaptionLen(),
            event.getRatioOfPeopleToPhoto()]
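To see what the shared-index construction above produces, a toy run with invented word lists:

event_topword_list = [('parade', 5), ('street', 3)]
historic_topword_list = [('street', 4), ('coffee', 2)]
ind = {}
for word, freq in event_topword_list + historic_topword_list:
    if word not in ind:
        ind[word] = len(ind)
freq1 = [0] * len(ind)
freq2 = [0] * len(ind)
for word, freq in event_topword_list:
    freq1[ind[word]] = freq
for word, freq in historic_topword_list:
    freq2[ind[word]] = freq
print freq1  # [5, 3, 0] -> parade, street, coffee
print freq2  # [0, 4, 2]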
def main():
    pi = PhotoInterface()
    pi.setDB('citybeat')
    pi.setCollection('photos')

    pi2 = PhotoInterface()
    pi2.setDB('citybeat')
    pi2.setCollection('photos_no_duplicate')

    region = {'min_lat': 40.690531,
              'min_lng': -74.058151,
              'max_lat': 40.823163,
              'max_lng': -73.857994}
    st = '1352937600'
    et = '1355615999'
    pc = pi.rangeQuery(region, [st, et])

    # Count distinct photos (by Instagram link) in the raw collection and
    # compare with the size of the de-duplicated collection.
    ids = set()
    for photo in pc:
        ids.add(photo['link'])
    print len(ids)
    print pi2.rangeQuery(region, [st, et]).count()
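A minimal sketch of how photos_no_duplicate could be produced from photos, keying on the Instagram 'link' field exactly as the count above does. getAllDocuments and saveDocument follow the interfaces used elsewhere in this section, but this writer is an assumption, not the project's actual de-duplication code:

def deduplicateByLink(src, dst):
    # Hypothetical helper: copy each photo the first time its 'link' is
    # seen; drop repeats.
    seen = set()
    for photo in src.getAllDocuments():
        link = photo.get('link')
        if link is None or link in seen:
            continue
        seen.add(link)
        dst.saveDocument(photo)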
def filterRegions(self, region_list, percentage=InstagramConfig.region_percentage,
                  test=False, n=10, m=10, element_type='photos'):
    assert element_type in ['photos', 'tweets']

    if test:
        # n and m must be set when test is True; this branch is only for
        # tests. It loads a pre-computed n-by-m grid from the region cache.
        new_region_list = []
        # folder = '/res/users/kx19/Citybeat/CityBeat/distributed_gp/utility/region_cache/'
        # grand : res ; joust : grad
        folder = BaseConfig.getRegionListPath()
        file_name = element_type + '_'
        file_name += str(n) + '_' + str(m) + '.txt'
        fid = open(folder + file_name)
        for line in fid:
            region = line.split()
            for i in xrange(0, 4):
                region[i] = float(region[i])
            region = Region(region)
            new_region_list.append(region)
        fid.close()
        return new_region_list

    # This method should not be a member of this class.
    # TODO: change the period to one week
    # end_time = 1359704845
    # begin_time = 1299704845
    end_time = 1962096000
    begin_time = 1362096000
    if element_type == 'photos':
        di = PhotoInterface()
    else:
        di = TweetInterface()
    document_cur = di.rangeQuery(period=[str(begin_time), str(end_time)])

    # Count how many documents fall inside each candidate region.
    region_number = len(region_list)
    number_document_in_region = [0] * region_number
    bad_documents = 0
    total_documents = 0
    for document in document_cur:
        total_documents += 1
        lat = float(document['location']['latitude'])
        lng = float(document['location']['longitude'])
        flag = 0
        for i in xrange(region_number):
            if region_list[i].insideRegion([lat, lng]):
                number_document_in_region[i] += 1
                flag = 1
                break
        if flag == 0:
            bad_documents += 1
    print str(bad_documents) + ' out of ' + str(total_documents) + ' documents are bad (not in NY)'

    # Keep the top `percentage` of regions, ranked by document count.
    region_tuples = []
    for i in xrange(0, region_number):
        region_tuples.append((region_list[i], number_document_in_region[i]))
    region_tuples.sort(key=operator.itemgetter(1), reverse=True)
    valid_region_number = int(0.5 + 1.0 * region_number * percentage)
    valid_regions = []
    for i in xrange(0, valid_region_number):
        valid_regions.append(region_tuples[i][0])
    return valid_regions
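Region.insideRegion itself is not shown in this section. A minimal point-in-bounding-box check consistent with how it is called above, written against the toJSON() field names used elsewhere; the standalone form is an assumption:

def insideRegion(region_json, point):
    # point is [lat, lng]; region_json holds the four bounds, as returned
    # by Region.toJSON() elsewhere in this section.
    lat, lng = point
    return (region_json['min_lat'] <= lat <= region_json['max_lat'] and
            region_json['min_lng'] <= lng <= region_json['max_lng'])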
def findPhotos():
    pi = PhotoInterface()
    pi.setDB('')
    pi.setCollection('')