Example #1
def run(data_source):
    coordinates = [InstagramConfig.photo_min_lat,
                   InstagramConfig.photo_min_lng,
                   InstagramConfig.photo_max_lat,
                   InstagramConfig.photo_max_lng
    ]

    alarm_region_size = 25
    nyc_region = Region(coordinates)
    regions = nyc_region.divideRegions(alarm_region_size, alarm_region_size)

    if data_source == 'twitter':
        regions = nyc_region.filterRegions(region_list=regions, test=True, n=alarm_region_size, m=alarm_region_size,
                                           element_type='tweets')
    elif data_source == 'instagram':
        regions = nyc_region.filterRegions(region_list=regions, test=True, n=alarm_region_size, m=alarm_region_size,
                                           element_type='photos')
    else:
        # without this guard, `alarm` is never bound below and fireAlarm() raises NameError
        raise ValueError('unsupported data_source: %r' % data_source)

    cur_utc_time = getCurrentStampUTC()

    for region in regions:
        start_of_time = cur_utc_time
        end_of_time = cur_utc_time
        if data_source == 'twitter':
            alarm = Alarm(region, start_of_time, end_of_time, TwitterConfig.prediction_collection,
                          TwitterConfig.event_collection, data_source)
        elif data_source == 'instagram':
            alarm = Alarm(region, start_of_time, end_of_time, InstagramConfig.prediction_collection,
                          InstagramConfig.event_collection, data_source)
            #for test only
            #alarm = Alarm(region, start_of_time, end_of_time, InstagramConfig.prediction_collection, "tmp_remove", data_source)
        region.display()
        alarm.fireAlarm()
Example #2
def findLast24HourEvents():
    ei = EventInterface()
    ei.setCollection(InstagramConfig.front_end_events)

    now = int(getCurrentStampUTC())
    # delay by one hour so events that are still being merged are excluded
    offset = 60 * 60
    end_time = now - offset
    begin_time = end_time - 24 * 3600

    conditions = {'created_time':{'$gte':str(begin_time), '$lte':str(end_time)}}
    fields = ['_id']
    cur = ei.getAllFields(fields=fields, condition=conditions)

    event_count = 0
    # NOTE: `csv` is assumed to be imported and `csv_file` to be an output path
    # defined elsewhere in the module; neither appears in this snippet
    with open(csv_file, 'wb') as csvfile:
        event_writer = csv.writer(csvfile, delimiter=',')
        events = []
        for event in cur:
            url = 'http://ec2-23-22-67-45.compute-1.amazonaws.com/cb/event/' + str(event['_id'])
            events.append([url])
            event_count += 1
        event_writer.writerows(events)

    return event_count
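Every example on this page leans on getCurrentStampUTC(). Its definition is not shown here, but the int(...) and str(...) conversions around it suggest it returns the current UTC epoch time in seconds, as a string. A minimal sketch consistent with that usage (an assumption, not the project's actual code):

import calendar
import time

def getCurrentStampUTC():
    # current UTC time as integer epoch seconds, stringified so it can be
    # compared against the string timestamps these collections store
    return str(calendar.timegm(time.gmtime()))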
Example #3
def run():
    coordinates = [
        InstagramConfig.photo_min_lat, InstagramConfig.photo_min_lng,
        InstagramConfig.photo_max_lat, InstagramConfig.photo_max_lng
    ]
    huge_region = Region(coordinates)

    alarm_region_size = 25

    regions = huge_region.divideRegions(alarm_region_size, alarm_region_size)
    filtered_regions = huge_region.filterRegions(region_list=regions,
                                                 test=True,
                                                 n=alarm_region_size,
                                                 m=alarm_region_size)

    cur_utc_time = getCurrentStampUTC()

    regions = filtered_regions
    print 'all regions', len(regions)
    for region in regions:
        start_of_time = cur_utc_time
        end_of_time = cur_utc_time
        alarm = Alarm(region, start_of_time, end_of_time, 'online_prediction',
                      'online_candidate')
        region.display()
        alarm.fireAlarm()
Example #4
    def goThroughCandidateDB(self):
        """Go through candidate event db and classify whatever is left"""
        ei = EventInterface(self.candidate_db, self.candidate_collection)
        ei_classified = EventInterface(self.classified_event_db, self.classified_event_collection)
        cnt = 0
        # consider past 2 hours for merge
        low_bound = str(int(getCurrentStampUTC()) - 60 * 60 * 2)
        condition = {'created_time': {'$gte': low_bound}}
        for e in ei.getAllDocuments(condition=condition):
            logging.warning("Classifying %d-th candidate event..." % cnt)
            e = Event(e)
            cnt += 1
            region = Region(e.getRegion())
            corpus = self.all_corpus[region.getKey()]
            ef = BaseFeatureProduction(e, corpus)
            prob = self.clf.classify(ef.extractFeatures())

            if ei_classified.getEventByID(e.getID()) is not None:
                if prob > 0.5:
                    print 'already in the front-end collection, merging it'
                    ei_classified.addEvent(e)
                else:
                    print 'after merging it is no longer an event, deleting it'
                    ei_classified.deleteEventByID(e.getID())
            else:
                if prob > 0.5:
                    print 'new event found in candidates but not in the front end, adding it'
                    ei_classified.addEvent(e)
Example #5
    def _extractTweetTopMentions(self, k=10):
        # consider mentions from tweets in the past 60 minutes
        now = int(getCurrentStampUTC())
        time_span = 60 * 60
        end_time = now
        begin_time = end_time - time_span
        cur = self._tweet_interface.rangeQuery(period=[begin_time, end_time], fields=['text'])

        users = {}
        twitter_username_re = re.compile(r'(?<=^|(?<=[^a-zA-Z0-9-_\.]))@([A-Za-z]+[A-Za-z0-9_-]+)')
        for tweet in cur:
            text = tweet['text']
            mentions = twitter_username_re.findall(text)
            for mention in mentions:
                count = users.get(mention, 0) + 1
                users[mention] = count

        users = sorted(users.iteritems(), key=operator.itemgetter(1), reverse=True)
        res = []
        for key, value in users:
            res_pair = {}
            res_pair['user_name'] = key
            res_pair['count'] = value
            res.append(res_pair)
            if len(res) >= k:
                break
        return res
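A quick illustration of what the mention regex above extracts; the sample text is made up for this sketch:

import re

twitter_username_re = re.compile(r'(?<=^|(?<=[^a-zA-Z0-9-_\.]))@([A-Za-z]+[A-Za-z0-9_-]+)')
print twitter_username_re.findall('RT @alice: ping @bob_99, not email@example.com')
# -> ['alice', 'bob_99']; the '@' inside the e-mail address is not a mention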
Example #6
    def _extractPhotoCount(self):
        now = int(getCurrentStampUTC())
        # measure the minute ending 4 minutes ago; recent photos may still be arriving
        offset = 4 * 60
        current_count = self._photo_interface.rangeQuery(period=[now - offset - 60, now - offset]).count()
        # baseline: average photos per minute over the preceding 20 minutes
        baseline_count = self._photo_interface.rangeQuery(period=[now - 60 * 21 - offset, now - offset - 60]).count() / 20.0
        if baseline_count == 0.0:
            return [current_count, stats_config.NO_BASE_LINE]
        else:
            return [current_count, (current_count - baseline_count) / baseline_count]
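A worked instance of the relative-change math above, with made-up counts:

current_count = 30             # photos in the minute being measured
baseline_count = 400 / 20.0    # 400 photos over the prior 20 minutes -> 20.0 per minute
print (current_count - baseline_count) / baseline_count
# -> 0.5: the current minute runs 50% above the recent per-minute average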
Example #7
    def _extractTweetCount(self):
        now = int(getCurrentStampUTC())
        # 5 seconds as the ingestion latency; measure the most recent full minute
        current_count = self._tweet_interface.rangeQuery(period=[now - 65, now - 5]).count()
        # baseline: average tweets per minute over the preceding 20 minutes
        baseline_count = self._tweet_interface.rangeQuery(period=[now - 65 - 60 * 20, now - 65]).count() / 20.0
        if baseline_count == 0.0:
            return [current_count, stats_config.NO_BASE_LINE]
        else:
            return [current_count, (current_count - baseline_count) / baseline_count]
Example #8
def run():
    coordinates = [
        InstagramConfig.photo_min_lat, InstagramConfig.photo_min_lng,
        InstagramConfig.photo_max_lat, InstagramConfig.photo_max_lng
    ]
    huge_region = Region(coordinates)

    regions = huge_region.divideRegions(25, 25)
    filtered_regions = huge_region.filterRegions(regions)
    regions = filtered_regions

    for r in regions:
        r.display()

    cur_utc_timestamp = getCurrentStampUTC()
    # experiment start: Dec 1 2012 00:00 UTC (epoch 1354320000); the simulated
    # clock begins one week in and replays a further seven days
    clock = 1354320000 + 7 * 24 * 3600
    end_of_time = 1354320000 + 7 * 24 * 3600 + 7 * 24 * 3600
    days_passed = 0
    _results = {}
    _saved = {}

    redis_conn = Redis("tall4")
    redis_queue = Queue(connection=redis_conn)

    while clock < end_of_time:
        print 'working on day ', days_passed
        days_passed += 1
        # use 14 days of data as training
        fourteen_days_ago = clock - 14 * 24 * 3600

        for i in range(len(regions)):
            #for i in range(1):
            test_region = regions[i]
            try:
                gp = GaussianProcessJob(test_region, str(fourteen_days_ago),
                                        str(clock), redis_queue)
                res, pred_time = gp.submit()
            except Exception as e:
                print 'GP job submission failed for this region, skipping it. Error: %s' % e
                continue
            _results[gp.getID()] = (test_region, res, pred_time)
            _saved[gp.getID()] = False
        save_to_mongo(_results, _saved, cur_utc_timestamp)
        clock += 3600 * 24
    done = False
    while not done:
        done = save_to_mongo(_results, _saved, cur_utc_timestamp)
        time.sleep(10)

    print 'work finished'
Example #10
    def _extract24HoursCountsStats(self, past_week=False, type='tweets'):
        now = int(getCurrentStampUTC())
        offset = 0
        if past_week:
            # shift the whole 24-hour window back by exactly one week
            offset = 7 * 24
        count_during_past_24_hours = []
        for hour in xrange(24):
            end_time = now - 3600 * (hour + offset)
            begin_time = end_time - 3600
            if type == 'tweets':
                count_during_past_24_hours.append(self._tweet_interface.rangeQuery(period=[begin_time, end_time]).count())
            else:
                count_during_past_24_hours.append(self._photo_interface.rangeQuery(period=[begin_time, end_time]).count())
        return count_during_past_24_hours
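An illustrative call of the method above, assuming an instance named stats (hypothetical):

# counts_now      = stats._extract24HoursCountsStats(type='tweets')
# counts_week_ago = stats._extract24HoursCountsStats(past_week=True, type='tweets')
# each holds 24 hourly counts, most recent hour first, so counts_now[i] and
# counts_week_ago[i] cover the same hour of day, exactly one week apart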
Example #11
def run(data_source):
    coordinates = [
        InstagramConfig.photo_min_lat,
        InstagramConfig.photo_min_lng,
        InstagramConfig.photo_max_lat,
        InstagramConfig.photo_max_lng,
    ]
    nyc_region = Region(coordinates)
    regions = nyc_region.divideRegions(25, 25)
    if data_source == "twitter":
        regions = nyc_region.filterRegions(regions, test=True, n=25, m=25, element_type="tweets")
    elif data_source == "instagram":
        regions = nyc_region.filterRegions(regions, test=True, n=25, m=25, element_type="photos")

    for r in regions:
        r.display()

    cur_utc_timestamp = getCurrentStampUTC()

    _results = {}
    _saved = {}

    redis_conn = Redis("tall4")
    redis_queue = Queue(connection=redis_conn)
    # cast first: getCurrentStampUTC() returns the stamp as a string
    fourteen_days_ago = int(cur_utc_timestamp) - 24 * 14 * 3600

    for i in range(len(regions)):
        logging.warn("Working on region %d" % i)
        test_region = regions[i]
        # try:
        gp = GaussianProcessJob(test_region, str(fourteen_days_ago), str(cur_utc_timestamp), redis_queue)
        res, pred_time = gp.submit()
        # except Exception as e:
        #    logging.warn("Initialization of gp error. continue, error message %s" % e)
        #    continue
        _results[gp.getID()] = (test_region, res, pred_time)
        _saved[gp.getID()] = False

    save_to_mongo(_results, _saved, cur_utc_timestamp, data_source)
    done = False
    while not done:
        done = save_to_mongo(_results, _saved, cur_utc_timestamp, data_source)
        time.sleep(10)
        logging.warn("Waiting for completing...")

    logging.warn("Work done.")
Example #12
def run():
    coordinates = [
        InstagramConfig.photo_min_lat, InstagramConfig.photo_min_lng,
        InstagramConfig.photo_max_lat, InstagramConfig.photo_max_lng
    ]
    huge_region = Region(coordinates)

    regions = huge_region.divideRegions(25, 25)
    filtered_regions = huge_region.filterRegions(regions)
    regions = filtered_regions

    for r in regions:
        r.display()

    cur_utc_timestamp = getCurrentStampUTC()

    _results = {}
    _saved = {}

    redis_conn = Redis("tall4")
    redis_queue = Queue(connection=redis_conn)
    # cast first: getCurrentStampUTC() returns the stamp as a string
    fourteen_days_ago = int(cur_utc_timestamp) - 24 * 14 * 3600

    for i in range(len(regions)):
        test_region = regions[i]
        try:
            gp = GaussianProcessJob(test_region, str(fourteen_days_ago),
                                    str(cur_utc_timestamp), redis_queue)
            res, pred_time = gp.submit()
        except Exception as e:
            print 'GP job submission failed for this region, skipping it. Error: %s' % e
            continue
        _results[gp.getID()] = (test_region, res, pred_time)
        _saved[gp.getID()] = False

    save_to_mongo(_results, _saved, cur_utc_timestamp)
    done = False
    while not done:
        done = save_to_mongo(_results, _saved, cur_utc_timestamp)
        time.sleep(10)

    print 'work finished'
Example #13
    def getTweetAndPhotoStats(self):
        stats = {}
        tweet_basic_count = {}
        photo_basic_count = {}

        photo_basic_count['last_minute'] = self._getCurrentCountStats('photos')
        photo_basic_count['last_24_hour'] = self._get24HoursCountStats('photos')

        tweet_basic_count['last_minute'] = self._getCurrentCountStats('tweets')
        tweet_basic_count['last_24_hour'] = self._get24HoursCountStats('tweets')

        res = self._extractMostPopularTweet()
        stats['photo_basic_count'] = photo_basic_count
        stats['tweet_basic_count'] = tweet_basic_count
        stats['created_time'] = str(getCurrentStampUTC())
        stats['tweet_top_mentions'] = self._extractTweetTopMentions()
        stats['most_popular_tweet'] = res[0]
        stats['tweet_vs_retweet'] = res[1]
        return stats
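For reference, the stats document assembled above has this shape (values illustrative, not taken from real data):

# {
#     'photo_basic_count': {'last_minute': ..., 'last_24_hour': ...},
#     'tweet_basic_count': {'last_minute': ..., 'last_24_hour': ...},
#     'created_time': '1354320000',
#     'tweet_top_mentions': [{'user_name': 'alice', 'count': 12}, ...],
#     'most_popular_tweet': {'user_name': ..., 'text': ..., 'count': ...},
#     'tweet_vs_retweet': {'tweet_percentage': ..., 'retweet_percentage': ...},
# }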
Example #15
    def getAllEvents(self):
        now = int(getCurrentStampUTC())
        three_days_before = now - 3 * 24 * 3600
        event_cursor = self.ei.getAllDocuments({'created_time': {'$gte': str(three_days_before)}})
        events = []
        for e in event_cursor:
            #representor
            #rep_photos = self.representor.getRepresentivePhotos(e)
            #e['photos'] = rep_photos[:min(5,len(rep_photos))]
            e['_id'] = str(e['_id'])
            # hard-coded display values, apparently placeholders
            e['urgency'] = 58
            e['volume'] = 99
            e['stats'] = {'photos': 50, 'tweets': 0, 'checkins': 0}
            #print e['photos']
            if e['actual_value'] >= 6 and e['zscore'] > 3.0:
                events.append(e)
        events = sorted(events, key=lambda x: x['created_time'], reverse=True)
        for w in events:
            print w['created_time']
        events = events[:5]
        return json.dumps(events)
Example #16
    def _extractMostPopularTweet(self):
        ti = TweetInterface(collection=TwitterConfig.extended_tweet_collection)
        tweets = {}
        most_popular_tweet_text = ''
        max_retweet_count = -1
        user_name = ''

        # consider tweets from the past 60 minutes
        now = int(getCurrentStampUTC())
        time_span = 60 * 60
        end_time = now
        begin_time = end_time - time_span

        for tweet in ti.rangeQuery(period=[begin_time, end_time], fields=['text', 'user.screen_name']):
            text = tweet['text']
            count = tweets.get(text, 0) + 1
            tweets[text] = count
            if count > max_retweet_count:
                max_retweet_count = count
                most_popular_tweet_text = text
                user_name = tweet['user']['screen_name']

        single_tweet_count = 0
        retweet_count = 0
        for key, value in tweets.items():
            if value == 1:
                single_tweet_count += 1
            else:
                retweet_count += value

        most_popular_tweet = {}
        most_popular_tweet['user_name'] = user_name
        most_popular_tweet['text'] = most_popular_tweet_text
        most_popular_tweet['count'] = max_retweet_count

        tweets_count = {}
        total = single_tweet_count + retweet_count
        if total > 0:
            tweets_count['tweet_percentage'] = 1.0 * single_tweet_count / total
            tweets_count['retweet_percentage'] = 1.0 * retweet_count / total
        else:
            # no tweets in the window; report zeros instead of dividing by zero
            tweets_count['tweet_percentage'] = 0.0
            tweets_count['retweet_percentage'] = 0.0

        return [most_popular_tweet, tweets_count]
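The retweet heuristic above counts any text seen more than once in the hour as a retweet cluster. A toy tally showing how the percentages fall out (texts made up):

tweets = {'good morning NYC': 1, 'RT: big news downtown': 3, 'lunch!': 1}
single_tweet_count = sum(1 for v in tweets.values() if v == 1)  # -> 2
retweet_count = sum(v for v in tweets.values() if v > 1)        # -> 3
print 1.0 * single_tweet_count / (single_tweet_count + retweet_count)  # 0.4
print 1.0 * retweet_count / (single_tweet_count + retweet_count)       # 0.6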
Example #18
    def getLatestStats(self):
        # look back five minutes and take the newest stats document in that window
        now = int(getCurrentStampUTC()) - 5 * 60
        condition = {'created_time': {"$gte": str(now)}}
        most_recent_stats = self.stats_interface.getAllDocuments(condition=condition).sort('created_time', -1)[0]
        most_recent_stats['_id'] = str(most_recent_stats['_id'])
        return json.dumps(most_recent_stats)
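One detail that recurs in every query on this page: created_time is stored and compared as a string, and MongoDB's $gte on strings is lexicographic. That matches numeric order only while all stamps have the same digit count, which holds for 10-digit epoch values (through the year 2286). A quick check:

print str(1354320000) < str(1354323600)  # True, matches the numeric comparison
print '999999999' < '1000000000'         # False, though 999999999 < 1000000000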