def run(data_source):
    coordinates = [InstagramConfig.photo_min_lat,
                   InstagramConfig.photo_min_lng,
                   InstagramConfig.photo_max_lat,
                   InstagramConfig.photo_max_lng]
    alarm_region_size = 25
    nyc_region = Region(coordinates)
    regions = nyc_region.divideRegions(alarm_region_size, alarm_region_size)
    if data_source == 'twitter':
        regions = nyc_region.filterRegions(region_list=regions, test=True,
                                           n=alarm_region_size, m=alarm_region_size,
                                           element_type='tweets')
    elif data_source == 'instagram':
        regions = nyc_region.filterRegions(region_list=regions, test=True,
                                           n=alarm_region_size, m=alarm_region_size,
                                           element_type='photos')
    cur_utc_time = getCurrentStampUTC()
    for region in regions:
        start_of_time = cur_utc_time
        end_of_time = cur_utc_time
        if data_source == 'twitter':
            alarm = Alarm(region, start_of_time, end_of_time,
                          TwitterConfig.prediction_collection,
                          TwitterConfig.event_collection, data_source)
        elif data_source == 'instagram':
            alarm = Alarm(region, start_of_time, end_of_time,
                          InstagramConfig.prediction_collection,
                          InstagramConfig.event_collection, data_source)
        # for test only:
        # alarm = Alarm(region, start_of_time, end_of_time,
        #               InstagramConfig.prediction_collection, "tmp_remove", data_source)
        region.display()
        alarm.fireAlarm()
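# A minimal sketch of how run(data_source) above might be launched as a script.
# The file name and argument handling here are assumptions for illustration,
# not part of the original code.
if __name__ == '__main__':
    import sys
    source = sys.argv[1] if len(sys.argv) > 1 else 'instagram'
    if source not in ('twitter', 'instagram'):
        print 'usage: python run_alarm.py [twitter|instagram]'
        sys.exit(1)
    run(source)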
def findLast24HourEvents():
    ei = EventInterface()
    ei.setCollection(InstagramConfig.front_end_events)
    now = int(getCurrentStampUTC())
    # delay by one hour so events still being merged are not picked up
    offset = 60 * 60
    end_time = now - offset
    begin_time = end_time - 24 * 3600
    conditions = {'created_time': {'$gte': str(begin_time), '$lte': str(end_time)}}
    fields = ['_id']
    cur = ei.getAllFields(fields=fields, condition=conditions)
    event_count = 0
    # csv_file is expected to be defined at module level
    with open(csv_file, 'wb') as csvfile:
        event_writer = csv.writer(csvfile, delimiter=',')
        events = []
        for event in cur:
            url = 'http://ec2-23-22-67-45.compute-1.amazonaws.com/cb/event/' + str(event['_id'])
            events.append([url])
            event_count += 1
        event_writer.writerows(events)
    return event_count
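# findLast24HourEvents() reads a module-level csv_file path that is not shown
# here; a hedged sketch of how it might be configured (the path below is
# illustrative only, not the project's real output location):
csv_file = '/tmp/last_24_hour_events.csv'  # hypothetical output path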
def run():
    coordinates = [InstagramConfig.photo_min_lat,
                   InstagramConfig.photo_min_lng,
                   InstagramConfig.photo_max_lat,
                   InstagramConfig.photo_max_lng]
    huge_region = Region(coordinates)
    alarm_region_size = 25
    regions = huge_region.divideRegions(alarm_region_size, alarm_region_size)
    filtered_regions = huge_region.filterRegions(region_list=regions, test=True,
                                                 n=alarm_region_size, m=alarm_region_size)
    cur_utc_time = getCurrentStampUTC()
    regions = filtered_regions
    print 'all regions', len(regions)
    for region in regions:
        start_of_time = cur_utc_time
        end_of_time = cur_utc_time
        alarm = Alarm(region, start_of_time, end_of_time,
                      'online_prediction', 'online_candidate')
        region.display()
        alarm.fireAlarm()
def goThroughCandidateDB(self):
    """Go through the candidate event db and classify whatever is left."""
    ei = EventInterface(self.candidate_db, self.candidate_collection)
    ei_classified = EventInterface(self.classified_event_db, self.classified_event_collection)
    cnt = 0
    # only consider the past 2 hours, for merging
    low_bound = str(int(getCurrentStampUTC()) - 60 * 60 * 2)
    condition = {'created_time': {'$gte': low_bound}}
    for e in ei.getAllDocuments(condition=condition):
        logging.warning("Classifying %d-th candidate event..." % cnt)
        e = Event(e)
        cnt += 1
        region = Region(e.getRegion())
        corpus = self.all_corpus[region.getKey()]
        ef = BaseFeatureProduction(e, corpus)
        prob = self.clf.classify(ef.extractFeatures())
        if ei_classified.getEventByID(e.getID()) is not None:
            if prob > 0.5:
                print 'already in front end collection, merge it'
                ei_classified.addEvent(e)
            else:
                print 'after merging it becomes a non-event, delete it'
                ei_classified.deleteEventByID(e.getID())
        else:
            if prob > 0.5:
                print 'new event found in collection but not in front end, add it'
                ei_classified.addEvent(e)
def _extractTweetTopMentions(self, k=10):
    # look at the past 60 minutes
    now = int(getCurrentStampUTC())
    time_span = 60 * 60
    end_time = now
    begin_time = end_time - time_span
    cur = self._tweet_interface.rangeQuery(period=[begin_time, end_time], fields=['text'])
    users = {}
    twitter_username_re = re.compile(r'(?<=^|(?<=[^a-zA-Z0-9-_\.]))@([A-Za-z]+[A-Za-z0-9_-]+)')
    for tweet in cur:
        text = tweet['text']
        mentions = twitter_username_re.findall(text)
        for mention in mentions:
            users[mention] = users.get(mention, 0) + 1
    users = sorted(users.iteritems(), key=operator.itemgetter(1), reverse=True)
    res = []
    for key, value in users:
        res.append({'user_name': key, 'count': value})
        if len(res) >= k:
            break
    return res
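# Quick, self-contained sanity check of the @-mention pattern used above; the
# helper name and sample text are made up for illustration only.
def _demoMentionRegex():
    pattern = re.compile(r'(?<=^|(?<=[^a-zA-Z0-9-_\.]))@([A-Za-z]+[A-Za-z0-9_-]+)')
    sample = 'thanks @nyc_parks and @JoeSmith, email me@example.com'
    # email addresses are skipped because the character before '@' must not be alphanumeric
    print pattern.findall(sample)  # ['nyc_parks', 'JoeSmith']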
def _extractPhotoCount(self):
    now = int(getCurrentStampUTC())
    # 4 minutes of offset, presumably to allow for photo ingestion latency
    offset = 4 * 60
    current_count = self._photo_interface.rangeQuery(
        period=[now - offset - 60, now - offset]).count()
    # baseline: average per-minute count over the preceding 20 minutes
    baseline_count = self._photo_interface.rangeQuery(
        period=[now - 60 * 21 - offset, now - offset - 60]).count() / 20.0
    if baseline_count == 0.0:
        return [current_count, stats_config.NO_BASE_LINE]
    else:
        return [current_count, (current_count - baseline_count) / baseline_count]
def _extractTweetCount(self):
    now = int(getCurrentStampUTC())
    # 5 seconds as the latency
    current_count = self._tweet_interface.rangeQuery(period=[now - 65, now - 5]).count()
    # baseline: average per-minute count over the preceding 20 minutes
    baseline_count = self._tweet_interface.rangeQuery(
        period=[now - 65 - 60 * 20, now - 65]).count() / 20.0
    if baseline_count == 0.0:
        return [current_count, stats_config.NO_BASE_LINE]
    else:
        return [current_count, (current_count - baseline_count) / baseline_count]
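# Worked example of the relative-change computation shared by the two count
# extractors above, factored into a hypothetical helper for illustration (the
# helper name and numbers are not part of the original code).
def _relativeChange(current_count, baseline_count):
    # same formula as above: fractional change of the last minute vs. the baseline
    if baseline_count == 0.0:
        return stats_config.NO_BASE_LINE
    return (current_count - baseline_count) / baseline_count

# e.g. with 12 tweets in the last minute and 100 tweets over the preceding 20
# minutes, the baseline is 100 / 20.0 = 5.0 and
# _relativeChange(12, 5.0) == 1.4, i.e. 140% above baseline.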
def run():
    coordinates = [InstagramConfig.photo_min_lat,
                   InstagramConfig.photo_min_lng,
                   InstagramConfig.photo_max_lat,
                   InstagramConfig.photo_max_lng]
    huge_region = Region(coordinates)
    regions = huge_region.divideRegions(25, 25)
    filtered_regions = huge_region.filterRegions(regions)
    regions = filtered_regions
    for r in regions:
        r.display()
    cur_utc_timestamp = getCurrentStampUTC()
    # experiment start time: Dec 1 00:00 (UTC); the replay clock begins one week later
    clock = 1354320000 + 7 * 24 * 3600
    end_of_time = 1354320000 + 7 * 24 * 3600 + 7 * 24 * 3600
    days_passed = 0
    _results = {}
    _saved = {}
    redis_conn = Redis("tall4")
    redis_queue = Queue(connection=redis_conn)
    while clock < end_of_time:
        print 'working on day', days_passed
        days_passed += 1
        # use the previous 14 days of data as training
        fourteen_days_ago = clock - 14 * 24 * 3600
        for i in range(len(regions)):
            test_region = regions[i]
            try:
                gp = GaussianProcessJob(test_region, str(fourteen_days_ago),
                                        str(clock), redis_queue)
                res, pred_time = gp.submit()
            except Exception as e:
                print 'Initialization of gp failed, skipping region. Error: %s' % e
                continue
            _results[gp.getID()] = (test_region, res, pred_time)
            _saved[gp.getID()] = False
        save_to_mongo(_results, _saved, cur_utc_timestamp)
        clock += 3600 * 24
    done = False
    while not done:
        done = save_to_mongo(_results, _saved, cur_utc_timestamp)
        time.sleep(10)
    print 'finished work'
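# Sanity check for the hard-coded epoch constants above (stdlib only):
# 1354320000 is 2012-12-01 00:00:00 UTC, so the loop replays the second week of
# December 2012, one simulated day per iteration.
from datetime import datetime
assert datetime.utcfromtimestamp(1354320000) == datetime(2012, 12, 1, 0, 0)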
def _extract24HoursCountsStats(self, past_week=False, type='tweets'):
    now = int(getCurrentStampUTC())
    offset = 0
    if past_week:
        # shift every hourly bucket back by exactly one week
        offset = 7 * 24
    count_during_past_24_hours = []
    for hour in xrange(24):
        end_time = now - 3600 * (hour + offset)
        begin_time = end_time - 3600
        if type == 'tweets':
            count_during_past_24_hours.append(
                self._tweet_interface.rangeQuery(period=[begin_time, end_time]).count())
        else:
            count_during_past_24_hours.append(
                self._photo_interface.rangeQuery(period=[begin_time, end_time]).count())
    return count_during_past_24_hours
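# Hedged usage sketch (inside the owning class): because past_week shifts the
# 24 hourly buckets back by exactly one week, the two calls below line up
# hour-for-hour and can be compared to spot unusual activity.
#   today = self._extract24HoursCountsStats(past_week=False, type='tweets')
#   last_week = self._extract24HoursCountsStats(past_week=True, type='tweets')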
def run(data_source):
    coordinates = [InstagramConfig.photo_min_lat,
                   InstagramConfig.photo_min_lng,
                   InstagramConfig.photo_max_lat,
                   InstagramConfig.photo_max_lng]
    nyc_region = Region(coordinates)
    regions = nyc_region.divideRegions(25, 25)
    if data_source == "twitter":
        regions = nyc_region.filterRegions(regions, test=True, n=25, m=25,
                                           element_type="tweets")
    elif data_source == "instagram":
        regions = nyc_region.filterRegions(regions, test=True, n=25, m=25,
                                           element_type="photos")
    for r in regions:
        r.display()
    cur_utc_timestamp = getCurrentStampUTC()
    _results = {}
    _saved = {}
    redis_conn = Redis("tall4")
    redis_queue = Queue(connection=redis_conn)
    # train on the previous 14 days of data
    fourteen_days_ago = cur_utc_timestamp - 24 * 14 * 3600
    for i in range(len(regions)):
        logging.warn("Working on region %d" % i)
        test_region = regions[i]
        # try:
        gp = GaussianProcessJob(test_region, str(fourteen_days_ago),
                                str(cur_utc_timestamp), redis_queue)
        res, pred_time = gp.submit()
        # except Exception as e:
        #     logging.warn("Initialization of gp error. continue, error message %s" % e)
        #     continue
        _results[gp.getID()] = (test_region, res, pred_time)
        _saved[gp.getID()] = False
    save_to_mongo(_results, _saved, cur_utc_timestamp, data_source)
    done = False
    while not done:
        done = save_to_mongo(_results, _saved, cur_utc_timestamp, data_source)
        time.sleep(10)
        logging.warn("Waiting for completion...")
    logging.warn("Work done.")
def run():
    coordinates = [InstagramConfig.photo_min_lat,
                   InstagramConfig.photo_min_lng,
                   InstagramConfig.photo_max_lat,
                   InstagramConfig.photo_max_lng]
    huge_region = Region(coordinates)
    regions = huge_region.divideRegions(25, 25)
    filtered_regions = huge_region.filterRegions(regions)
    regions = filtered_regions
    for r in regions:
        r.display()
    cur_utc_timestamp = getCurrentStampUTC()
    _results = {}
    _saved = {}
    redis_conn = Redis("tall4")
    redis_queue = Queue(connection=redis_conn)
    # train on the previous 14 days of data
    fourteen_days_ago = cur_utc_timestamp - 24 * 14 * 3600
    for i in range(len(regions)):
        test_region = regions[i]
        try:
            gp = GaussianProcessJob(test_region, str(fourteen_days_ago),
                                    str(cur_utc_timestamp), redis_queue)
            res, pred_time = gp.submit()
        except Exception as e:
            print 'Initialization of gp failed, skipping region. Error: %s' % e
            continue
        _results[gp.getID()] = (test_region, res, pred_time)
        _saved[gp.getID()] = False
    save_to_mongo(_results, _saved, cur_utc_timestamp)
    done = False
    while not done:
        done = save_to_mongo(_results, _saved, cur_utc_timestamp)
        time.sleep(10)
    print 'finished work'
def getTweetAndPhotoStats(self):
    stats = {}
    tweet_basic_count = {}
    photo_basic_count = {}
    photo_basic_count['last_minute'] = self._getCurrentCountStats('photos')
    photo_basic_count['last_24_hour'] = self._get24HoursCountStats('photos')
    tweet_basic_count['last_minute'] = self._getCurrentCountStats('tweets')
    tweet_basic_count['last_24_hour'] = self._get24HoursCountStats('tweets')
    res = self._extractMostPopularTweet()
    stats['photo_basic_count'] = photo_basic_count
    stats['tweet_basic_count'] = tweet_basic_count
    stats['created_time'] = str(getCurrentStampUTC())
    stats['tweet_top_mentions'] = self._extractTweetTopMentions()
    stats['most_popular_tweet'] = res[0]
    stats['tweet_vs_retweet'] = res[1]
    return stats
def getAllEvents(self):
    now = int(getCurrentStampUTC())
    three_days_before = now - 3 * 24 * 3600
    event_cursor = self.ei.getAllDocuments({'created_time': {'$gte': str(three_days_before)}})
    events = []
    for e in event_cursor:
        # representor (disabled):
        # rep_photos = self.representor.getRepresentivePhotos(e)
        # e['photos'] = rep_photos[:min(5, len(rep_photos))]
        e['_id'] = str(e['_id'])
        # hard-coded placeholder stats
        e['urgency'] = 58
        e['volume'] = 99
        e['stats'] = {'photos': 50, 'tweets': 0, 'checkins': 0}
        if e['actual_value'] >= 6 and e['zscore'] > 3.0:
            events.append(e)
    events = sorted(events, key=lambda x: x['created_time'], reverse=True)
    for w in events:
        print w['created_time']
    events = events[:5]
    return json.dumps(events)
def _extractMostPopularTweet(self):
    ti = TweetInterface(collection=TwitterConfig.extended_tweet_collection)
    tweets = {}
    most_popular_tweet_text = ''
    max_retweet_count = -1
    user_name = ''
    # look at the past 60 minutes
    now = int(getCurrentStampUTC())
    time_span = 60 * 60
    end_time = now
    begin_time = end_time - time_span
    for tweet in ti.rangeQuery(period=[begin_time, end_time],
                               fields=['text', 'user.screen_name']):
        text = tweet['text']
        count = tweets.get(text, 0) + 1
        tweets[text] = count
        if count > max_retweet_count:
            max_retweet_count = count
            most_popular_tweet_text = text
            user_name = tweet['user']['screen_name']
    single_tweet_count = 0
    retweet_count = 0
    for key, value in tweets.items():
        if value == 1:
            single_tweet_count += 1
        else:
            retweet_count += value
    most_popular_tweet = {'user_name': user_name,
                          'text': most_popular_tweet_text,
                          'count': max_retweet_count}
    tweets_count = {
        'tweet_percentage': 1.0 * single_tweet_count / (single_tweet_count + retweet_count),
        'retweet_percentage': 1.0 * retweet_count / (single_tweet_count + retweet_count),
    }
    return [most_popular_tweet, tweets_count]
def getLatestStats(self):
    # only consider stats documents created within the past 5 minutes
    now = int(getCurrentStampUTC()) - 5 * 60
    condition = {'created_time': {"$gte": str(now)}}
    most_recent_stats = self.stats_interface.getAllDocuments(
        condition=condition).sort('created_time', -1)[0]
    most_recent_stats['_id'] = str(most_recent_stats['_id'])
    return json.dumps(most_recent_stats)