def testlastBaselined(self):
    ''' Builds a baseline document for inserting.'''
    # Open the mongo connection and reset the test collections.
    conn, dbh = mdb.getHandle()
    dbh = mdb.setupCollections(dbh, dropCollections=True)

    # A single geolocated twitter observation to baseline against.
    kywdObj = kw(keyword='keyword1',
                 timeStamp=datetime.datetime(2011, 6, 22, 12, 10, 45),
                 lat=34.4, lon=45.5,
                 text='this text contained the hashtag #keyword1',
                 tweetID=346664, userID=4444, source='twitter')

    # Build a baseline document with known output values and insert it.
    base = bl.baseline(kywd=kywdObj, cellBuildPeriod=600)
    base.outputs['days30_all'] = 0.5
    base.outputs['days7_all'] = 0.4
    base.outputs['hrs30_all'] = 0.3
    base.outputs['days30_weekly'] = 0.2
    base.outputs['days7_daily'] = 0.1
    doc = base.buildDoc()
    bl.insertBaselineDoc(dbh, doc)

    # lastBaselined() should report the insert timestamp truncated to
    # the minute.
    lastBaseline = base.lastBaselined()
    self.assertEquals(lastBaseline,
                      datetime.datetime(2011, 6, 22, 12, 10))

    # Tidy up the connection.
    mdb.close(conn, dbh)
def testInsertBaselineDoc(self):
    ''' Inserts a completed baseline document into the baseline collection.'''
    # Connect and get handle, dropping any existing test collections.
    c, dbh = mdb.getHandle()
    dbh = mdb.setupCollections(dbh, dropCollections=True)

    # Build a keyword object representing a single geolocated tweet.
    testKywd = kw(keyword='keyword1',
                  timeStamp=datetime.datetime(2011, 6, 22, 12, 10, 45),
                  lat=34.4, lon=45.5,
                  text='this text contained the hashtag #keyword1',
                  tweetID=346664, userID=4444, source='twitter')

    # Instantiate the baseline object/class.
    baseLine = bl.baseline(kywd=testKywd, cellBuildPeriod=600)

    # Build the document and insert it.
    doc = baseLine.buildDoc()
    bl.insertBaselineDoc(dbh, doc)

    # Fetch the inserted document back and check its key fields.
    # (Removed a stray debug `print res` that was left in the test.)
    res = dbh.baseline.find()[0]
    self.assertEquals(res['keyword'], 'keyword1')
    self.assertEquals(res['mgrs'], '38SND4595706622')
    self.assertEquals(res['mgrsPrecision'], 10)

    # Close the connection.
    mdb.close(c, dbh)
def main():
    '''Command-line entry point: parses connection/keyword options, builds a
    baseline object for the given MGRS cell + keyword + timestamp, and
    processes the baseline if it needs updating.'''
    # NOTE(review): these should really be positional arguments rather than
    # options, since none of them have defaults.
    parser = OptionParser()
    parser.add_option("-H", "--host", dest="host")
    parser.add_option("-p", "--port", dest="port")
    parser.add_option("-d", "--db", dest="db")
    parser.add_option("-m", "--mgrs", dest="mgrs")
    parser.add_option("-M", "--mgrsprecision", dest="mgrsPrecision")
    parser.add_option("-t", "--timestamp", dest="timeStamp")
    parser.add_option("-k", "--keyword", dest="keyword")
    parser.add_option("-u", "--baselineUnit", dest="baselineUnit")
    parser.add_option("-v", "--baselineValue", dest="baselineValue")
    (options, args) = parser.parse_args()

    # Coerce the option strings into their real types.
    port = int(options.port)
    timeStamp = datetime.datetime.strptime(options.timeStamp,
                                           "%Y-%m-%dT%H:%M:%S")
    mgrsPrecision = int(options.mgrsPrecision)
    baselinePrecision = [options.baselineUnit, int(options.baselineValue)]

    c, dbh = mdb.getHandle(host=options.host, port=port, db=options.db)

    # Build the baseline objects as we go so that they can be updated at
    # the end of the period.
    base = baseline(options.mgrs, mgrsPrecision, options.keyword, timeStamp,
                    c=c, dbh=dbh, baselinePrecision=baselinePrecision)

    # Does the baseline document need updating?
    if base.needUpdate:
        # This method takes care of update and insert.
        base.processBaseline(tsd.buildBlankData())

    # Best-effort close. Catch Exception rather than a bare `except:` so
    # that SystemExit/KeyboardInterrupt are not silently swallowed.
    try:
        mdb.close(c, dbh)
    except Exception:
        pass
def processKeyword(self, eventIn): ''' Process tweets coming off JMS into keywords''' # For processing tweets if self.source == 'twitter': record = eventRecord('twitter', eventIn) elif self.source == 'flickr': record = eventRecord('flickr', eventIn) elif self.source == 'instagram': record = eventRecord('instagram', eventIn) elif self.source == 'panoramia': record = eventRecord('panoramia', eventIn) elif self.source == 'foursquares': record = eventRecord('foursquares', eventIn) else: print 'No recognised source of data.' sys.exit() print record.lat, record.lon, record.timeStamp if not record.lat or not record.lon or not record.timeStamp: print 'Geo or time was not present, skipping this record.' return # Goes from single tweet --> n-keywords kywd = processEvent(record, self.source, self.mgrsPrecision) # Add keywords to the list based on hashtags kywd.fromHashTag() # Get keyword from the hashtags kywd.fromLookup(self.lookup) # Get keywords from a lookup #===================================================================================================== # PUT ADDITIONAL .fromXXX functions in here to get content from other types (tags, keywords, nlp, etc) #===================================================================================================== # If we don't find anything, build a blank keyword so that the observation isn't lost if len(kywd.keywords) == 0: kywd.whenNothingFound(tokenName=self.nothingFound) #Now loop the resultant keywords for extractedKeyword in kywd.keywords: # Pass keyword object into a class ts = timeSeries(c=self.c, dbh=self.dbh) ts.importData(extractedKeyword, blockPrecision=24) # Insert the time series document success = ts.insertDoc(self.cleanUpWorker, blankData=self.blankData, incrementBy=self.increment) # Build the baseline objects as we go so that they can be updated at the end of the period. 
base = baseline(mgrs = extractedKeyword.mgrs, mgrsPrecision = extractedKeyword.mgrsPrecision, keyword = extractedKeyword.keyword, timeStamp = extractedKeyword.timeStamp, c = self.c, dbh = self.dbh, baselinePrecision = self.baselineParameters) # Does the baseline document need updating? if base.needUpdate == True: base.processBaseline(self.blankData)
def testProcessBaselineLast30Days(self): ''' Checks accurate population of an array for 30 day all ''' # Connect and get handle c, dbh = mdb.getHandle() dbh = mdb.setupCollections(dbh, dropCollections=True) # Set up some times to work with tweetTime = datetime.datetime.utcnow() thisMinute = tweetTime.replace(second=0,microsecond=0) today = tweetTime.replace(hour=0, minute=0, second=0, microsecond=0) # Thirty days ago - at the start of the day lastMonthTweet = tweetTime - datetime.timedelta(days=30) # Build a keyword object testKywd = kw(keyword='keyword1', timeStamp=lastMonthTweet, lat=34.4, lon=45.5, text='this text contained the hashtag #keyword1', tweetID=346664, userID=4444, source='twitter') # Insert a new timeseries object for the tweet 30 days ago ts = timeSeries() ts.importData(testKywd) success = ts.insertBlankDoc() ts.updateCount() # Create a keyword object for the current tweet testKywd2 = kw(keyword='keyword1', timeStamp=lastMonthTweet + datetime.timedelta(hours=1), lat=34.4, lon=45.5, text='this text contained the hashtag #keyword1', tweetID=346664, userID=4444, source='twitter') # Insert the current keyword - NOTE HOW THIS IS AFTER THE BASELINE BUILD ts2 = timeSeries() ts2.importData(testKywd2) success = ts2.insertBlankDoc() ts2.updateCount() # Create a keyword object for the current tweet testKywd3 = testKywd testKywd3.timeStamp = tweetTime # Instantiate the baseline object/class base = bl.baseline(kywd=testKywd3, cellBuildPeriod=600) if base.needUpdate == True: if not base.lastBaselined(): doc = base.buildDoc() bl.insertBaselineDoc(dbh, doc) # Insert the current keyword - NOTE HOW THIS IS AFTER THE BASELINE BUILD ts3 = timeSeries() ts3.importData(testKywd3) success = ts3.insertBlankDoc() ts3.updateCount() tweetTimeMinus2Days = tweetTime - datetime.timedelta(days=2) # Create a new keyword object to test the daily slicing testKywd5 = kw(keyword='keyword1', timeStamp=tweetTimeMinus2Days, lat=34.4, lon=45.5, text='this text contained the hashtag 
#keyword1', tweetID=346664, userID=4444, source='twitter') # Insert the current keyword - NOTE HOW THIS IS AFTER THE BASELINE BUILD ts5 = timeSeries() ts5.importData(testKywd5) success = ts5.insertBlankDoc() ts5.updateCount() # Process Baseline base.processBaseline() # Get back the 30 day array arr = base.test30DayArray # Calculate what the array length should be soFarToday = (thisMinute - today).seconds/60.0 # The start of the array datetime lastMonthDay = lastMonthTweet.replace(hour=0, minute=0, second=0, microsecond=0) # The number of days between today and the start of the array (then in minutes) dateDiff = (today - lastMonthDay) minsDiff = dateDiff.days*1440 + dateDiff.seconds/60.0 total = minsDiff + soFarToday # Confirm its the right length self.assertEqual(total, len(arr)) # Get the minutes for the first 2 keywords (the third shouldn't be there) kwd1Min = int((testKywd.timeStamp - lastMonthDay).seconds/60) kwd2Min = int((testKywd2.timeStamp - lastMonthDay).seconds/60) kwd1Test = [arr[kwd1Min-1], arr[kwd1Min], arr[kwd1Min+1]] kwd2Test = [arr[kwd2Min-1], arr[kwd2Min], arr[kwd2Min+1]] for j in arr: if arr[j] > 0: print j, arr[j] self.assertEquals(kwd1Test, [0,1,0]) self.assertEquals(kwd2Test, [0,1,0]) # 30 DAY TIME SLICE CHECK arr = base.test30DaySliced # weekly testSliced = int(30/7) * 6 * 60 self.assertEquals(testSliced, len(arr)) arr7Day = base.test7DayArray test7DayAll = (thisMinute - today).seconds/60.0 + 1440*7 self.assertEquals(len(arr7Day), int(test7DayAll)) arr30Hrs = base.test30hrArray test30Hours = 30*60 self.assertEquals(len(arr30Hrs), int(test30Hours)) # Close the connection mdb.close(c, dbh)