Beispiel #1
0
    def testlastBaselined(self):
        ''' Checks that lastBaselined() reports when the baseline was last built.

        A baseline document is built for a keyword observed at
        2011-06-22 12:10:45 and inserted; lastBaselined() is then expected
        to return 2011-06-22 12:10 (the expected value carries no seconds).
        '''

        # Connect and get handle
        c, dbh = mdb.getHandle()
        try:
            # Start from empty collections so earlier runs cannot interfere
            dbh = mdb.setupCollections(dbh, dropCollections=True)

            # Build a keyword object
            testKywd = kw(keyword='keyword1',
                          timeStamp=datetime.datetime(2011, 6, 22, 12, 10, 45),
                          lat=34.4, lon=45.5,
                          text='this text contained the hashtag #keyword1',
                          tweetID=346664, userID=4444, source='twitter')

            # Create a new baseline object
            baseLine = bl.baseline(kywd=testKywd, cellBuildPeriod=600)

            # Populate the outputs that end up in the stored document
            baseLine.outputs['days30_all']      = 0.5
            baseLine.outputs['days7_all']       = 0.4
            baseLine.outputs['hrs30_all']       = 0.3
            baseLine.outputs['days30_weekly']   = 0.2
            baseLine.outputs['days7_daily']     = 0.1

            doc = baseLine.buildDoc()
            bl.insertBaselineDoc(dbh, doc)

            # Method returns the date of last baseline calculation
            lastBaseline = baseLine.lastBaselined()
            self.assertEqual(lastBaseline, datetime.datetime(2011, 6, 22, 12, 10))
        finally:
            # Close the connection even when an assertion above fails
            mdb.close(c, dbh)
Beispiel #2
0
    def testInsertBaselineDoc(self):
        ''' Inserts a completed baseline document into the baseline collection.

        After the insert, the first document found in dbh.baseline must carry
        the keyword, MGRS reference and MGRS precision asserted below.
        '''

        # Connect and get handle
        c, dbh = mdb.getHandle()
        try:
            # Start from empty collections so find()[0] is the document
            # inserted by this test
            dbh = mdb.setupCollections(dbh, dropCollections=True)

            # Build a keyword object
            testKywd = kw(keyword='keyword1',
                          timeStamp=datetime.datetime(2011, 6, 22, 12, 10, 45),
                          lat=34.4, lon=45.5,
                          text='this text contained the hashtag #keyword1',
                          tweetID=346664, userID=4444, source='twitter')

            # Instantiate the baseline object/class
            baseLine = bl.baseline(kywd=testKywd, cellBuildPeriod=600)

            # Build the document and insert it
            doc = baseLine.buildDoc()
            bl.insertBaselineDoc(dbh, doc)

            # Read the stored document back and show it for debugging
            res = dbh.baseline.find()[0]
            print(res)

            self.assertEqual(res['keyword'], 'keyword1')
            self.assertEqual(res['mgrs'], '38SND4595706622')
            self.assertEqual(res['mgrsPrecision'], 10)
        finally:
            # Close the connection even when an assertion above fails
            mdb.close(c, dbh)
Beispiel #3
0
def main():
    ''' Command-line entry point: updates one baseline document if it is due.

    Reads the connection details (-H/-p/-d) and the baseline parameters
    (-m/-M/-t/-k/-u/-v) from the command line, builds a baseline object and,
    when it reports needUpdate, processes it against blank time series data.

    --port, --mgrsprecision and --baselineValue must be integers and
    --timestamp must look like 2011-06-22T12:10:45.  A missing or malformed
    value ends the program through parser.error() (usage message, exit
    status 2) instead of an unhandled TypeError/ValueError.
    '''

    # NOTE: most of these are mandatory, so they should really be positional
    # arguments rather than options.
    parser = OptionParser()
    parser.add_option("-H", "--host",   dest="host")
    parser.add_option("-p", "--port",   dest="port")
    parser.add_option("-d", "--db",     dest="db")

    parser.add_option("-m", "--mgrs",               dest="mgrs")
    parser.add_option("-M", "--mgrsprecision",      dest="mgrsPrecision")
    parser.add_option("-t", "--timestamp",          dest="timeStamp")
    parser.add_option("-k", "--keyword",            dest="keyword")
    parser.add_option("-u", "--baselineUnit",       dest="baselineUnit")
    parser.add_option("-v", "--baselineValue",      dest="baselineValue")

    options, _ = parser.parse_args()

    # Only the options that are converted below are checked here; the others
    # are handed on unchanged, exactly as before.
    required = (("--port",          options.port),
                ("--timestamp",     options.timeStamp),
                ("--mgrsprecision", options.mgrsPrecision),
                ("--baselineValue", options.baselineValue))
    missing = [flag for flag, value in required if value is None]
    if missing:
        parser.error("missing required option(s): %s" % ", ".join(missing))

    # Format the option inputs
    try:
        port              = int(options.port)
        timeStamp         = datetime.datetime.strptime(options.timeStamp, "%Y-%m-%dT%H:%M:%S")
        mgrsPrecision     = int(options.mgrsPrecision)
        baselinePrecision = [options.baselineUnit, int(options.baselineValue)]
    except ValueError as err:
        parser.error("invalid option value: %s" % err)

    c, dbh = mdb.getHandle(host=options.host, port=port, db=options.db)

    try:
        # Build the baseline object for the requested cell/keyword/time
        base = baseline(options.mgrs, mgrsPrecision, options.keyword, timeStamp,
                        c=c, dbh=dbh, baselinePrecision=baselinePrecision)

        # Does the baseline document need updating?
        # (explicit comparison kept: the type of needUpdate is not visible here)
        if base.needUpdate == True:

            # This method takes care of update and insert
            base.processBaseline(tsd.buildBlankData())
    finally:
        # Closing is best effort: a failure to close must not mask the outcome
        # of the work above, but Ctrl-C / SystemExit are no longer swallowed.
        try:
            mdb.close(c, dbh)
        except Exception:
            pass
Beispiel #4
0
 def processKeyword(self, eventIn):
     ''' Process an event coming off JMS into keywords and store them.

     The event is wrapped in an eventRecord for self.source, split into
     keywords (hashtags plus self.lookup), and each keyword is written as a
     time series document.  A baseline object is built per keyword and
     processed when it reports needUpdate.

     Exits the process when self.source is not a recognised source, and
     returns without storing anything when the record has no position or
     no timestamp.
     '''
     
     # Every recognised source is handled the same way
     knownSources = ('twitter', 'flickr', 'instagram', 'panoramia', 'foursquares')
     if self.source not in knownSources:
         print('No recognised source of data.')
         sys.exit()
     
     record = eventRecord(self.source, eventIn)
     
     print('%s %s %s' % (record.lat, record.lon, record.timeStamp))
     
     def coordMissing(value):
         # A coordinate of exactly 0 (equator / prime meridian) is valid, so
         # only falsy values that are not numerically zero count as missing.
         return not value and (isinstance(value, bool) or value != 0)
     
     if coordMissing(record.lat) or coordMissing(record.lon) or not record.timeStamp:
         print('Geo or time was not present, skipping this record.')
         return
     
     # Goes from single tweet --> n-keywords
     kywd = processEvent(record, self.source, self.mgrsPrecision)
     
     # Add keywords to the list based on hashtags
     kywd.fromHashTag()                  # Get keyword from the hashtags
     kywd.fromLookup(self.lookup)        # Get keywords from a lookup
     
     #=====================================================================================================
     # PUT ADDITIONAL .fromXXX functions in here to get content from other types (tags, keywords, nlp, etc) 
     #=====================================================================================================
     
     # If we don't find anything, build a blank keyword so that the observation isn't lost
     if len(kywd.keywords) == 0:
         kywd.whenNothingFound(tokenName=self.nothingFound)
     
     # Now loop the resultant keywords
     for extractedKeyword in kywd.keywords:
         
         # Pass keyword object into a class
         ts = timeSeries(c=self.c, dbh=self.dbh)
         ts.importData(extractedKeyword, blockPrecision=24)
 
         # Insert the time series document (the return value is not used)
         ts.insertDoc(self.cleanUpWorker, blankData=self.blankData, incrementBy=self.increment)
         
         # Build the baseline objects as we go so that they can be updated at the end of the period.
         base = baseline(mgrs              = extractedKeyword.mgrs,
                         mgrsPrecision     = extractedKeyword.mgrsPrecision, 
                         keyword           = extractedKeyword.keyword, 
                         timeStamp         = extractedKeyword.timeStamp,
                         c                 = self.c,
                         dbh               = self.dbh,
                         baselinePrecision = self.baselineParameters)
     
         # Does the baseline document need updating?
         if base.needUpdate == True:
             base.processBaseline(self.blankData)
Beispiel #5
0
    def testProcessBaselineLast30Days(self):
        ''' Checks the arrays that processBaseline() builds for one keyword.

        Time series documents are stored for the same keyword 30 days ago,
        one hour after that, now, and two days ago.  After processBaseline()
        the test verifies:
          * the length of the 30 day array, and a count of 1 (with 0 either
            side) at the minute of each of the two oldest observations;
          * the length of the weekly-sliced 30 day array;
          * the lengths of the 7 day and 30 hour arrays.
        '''
        
        # Connect and get handle
        c, dbh = mdb.getHandle()
        try:
            dbh = mdb.setupCollections(dbh, dropCollections=True)

            # Set up some times to work with
            tweetTime = datetime.datetime.utcnow()
            thisMinute = tweetTime.replace(second=0, microsecond=0)
            today = tweetTime.replace(hour=0, minute=0, second=0, microsecond=0)
            
            # Thirty days ago - same time of day as tweetTime
            lastMonthTweet = tweetTime - datetime.timedelta(days=30)
            
            # Build a keyword object for the observation 30 days ago
            testKywd = kw(keyword='keyword1',
                          timeStamp=lastMonthTweet,
                          lat=34.4, lon=45.5,
                          text='this text contained the hashtag #keyword1',
                          tweetID=346664, userID=4444, source='twitter')
            
            # Insert a new timeseries object for the tweet 30 days ago
            ts = timeSeries()
            ts.importData(testKywd)
            ts.insertBlankDoc()
            ts.updateCount()
            
            # Create a keyword object one hour after the 30 day old tweet
            testKywd2 = kw(keyword='keyword1',
                           timeStamp=lastMonthTweet + datetime.timedelta(hours=1),
                           lat=34.4, lon=45.5,
                           text='this text contained the hashtag #keyword1',
                           tweetID=346664, userID=4444, source='twitter')
            
            # Insert it - this one goes in BEFORE the baseline build
            ts2 = timeSeries()
            ts2.importData(testKywd2)
            ts2.insertBlankDoc()
            ts2.updateCount()
            
            # Create a keyword object for the current tweet.
            # NOTE: testKywd3 is the same object as testKywd, so from here on
            # testKywd.timeStamp is tweetTime as well.
            testKywd3 = testKywd
            testKywd3.timeStamp = tweetTime
            
            # Instantiate the baseline object/class
            base = bl.baseline(kywd=testKywd3, cellBuildPeriod=600)
            if base.needUpdate == True:
                if not base.lastBaselined():
                    doc = base.buildDoc()
                    bl.insertBaselineDoc(dbh, doc)
            
            # Insert the current keyword - NOTE HOW THIS IS AFTER THE BASELINE BUILD
            ts3 = timeSeries()
            ts3.importData(testKywd3)
            ts3.insertBlankDoc()
            ts3.updateCount()

            tweetTimeMinus2Days = tweetTime - datetime.timedelta(days=2)
            
            # Create a new keyword object to test the daily slicing
            testKywd5 = kw(keyword='keyword1',
                           timeStamp=tweetTimeMinus2Days,
                           lat=34.4, lon=45.5,
                           text='this text contained the hashtag #keyword1',
                           tweetID=346664, userID=4444, source='twitter')
            
            # Insert the observation from two days ago
            ts5 = timeSeries()
            ts5.importData(testKywd5)
            ts5.insertBlankDoc()
            ts5.updateCount()

            # Process Baseline
            base.processBaseline()
            
            # Get back the 30 day array
            arr = base.test30DayArray
            
            # Minutes elapsed so far today
            soFarToday = (thisMinute - today).seconds/60.0
            
            # The start of the array datetime
            lastMonthDay = lastMonthTweet.replace(hour=0, minute=0, second=0, microsecond=0)
            
            # The number of days between today and the start of the array (then in minutes)
            dateDiff = (today - lastMonthDay)
            minsDiff = dateDiff.days*1440 + dateDiff.seconds/60.0 
            total = minsDiff + soFarToday
            
            # Confirm its the right length
            self.assertEqual(total, len(arr))
            
            # Get the minutes for the first 2 keywords (the third shouldn't be there).
            # .seconds only carries the time-of-day part of the difference, so
            # although testKywd.timeStamp is tweetTime by now (see the alias
            # above), the offset equals that of the tweet exactly 30 days earlier.
            kwd1Min = int((testKywd.timeStamp - lastMonthDay).seconds/60)
            kwd2Min = int((testKywd2.timeStamp - lastMonthDay).seconds/60)
            
            kwd1Test = [arr[kwd1Min-1], arr[kwd1Min], arr[kwd1Min+1]]
            kwd2Test = [arr[kwd2Min-1], arr[kwd2Min], arr[kwd2Min+1]]
            
            # Show every populated minute (index and count) for debugging
            for minute, count in enumerate(arr):
                if count > 0:
                    print('%s %s' % (minute, count))
            
            self.assertEqual(kwd1Test, [0,1,0])
            self.assertEqual(kwd2Test, [0,1,0])
            
            # 30 DAY TIME SLICE CHECK
            arr = base.test30DaySliced
            # weekly: whole weeks in 30 days, times 6 * 60 per week
            testSliced = int(30/7) * 6 * 60
            self.assertEqual(testSliced, len(arr))
            
            # 7 days of minutes plus the minutes elapsed so far today
            arr7Day = base.test7DayArray
            test7DayAll = (thisMinute - today).seconds/60.0 + 1440*7
            self.assertEqual(len(arr7Day), int(test7DayAll))
            
            # 30 hours of minutes
            arr30Hrs = base.test30hrArray
            test30Hours = 30*60
            self.assertEqual(len(arr30Hrs), int(test30Hours))
        finally:
            # Close the connection even when an assertion above fails
            mdb.close(c, dbh)