def testvastTweet2Json(self): ''' Check the conversion to a json object: ''' d = {'timestamp': datetime.datetime(2011,1,1,12,1,1).strftime('%Y-%m-%dT%H:%M:%S'), 'lat' : 34.4, 'lon' : 45.5, 'text' : 'Hello world again #Hashtag1 #hashtag2 #hashtag3.', 'tweet_id' : 346664, 'user_id' : 4444, 'hashtags' : None } jTweet = json.dumps(d) vt = vastTweet() vt.importData(jTweet=jTweet) dumped = json.loads(vt.vastTweet2Json()) # Reassign hashtags, now they've been generated d['hashtags'] = ['hashtag1', 'hashtag2', 'hashtag3'] for key in d.keys(): print key, d[key], dumped[key] self.assertEquals(d[key], dumped[key])
def testFinHashTagsAdvancedUser(self): ''' Test the finding of hashtags in text.''' text = "@brantinr, @helloworld - this is a #Hashtag #test-link #A should#not#work" vt = vastTweet() hts = vt.findHashTagsAdvanced(text, user=True) truth = ['brantinr', 'helloworld'] print 'Advanced', hts
def testFinHashTagsAdvancedSubject(self): ''' Test the finding of hashtags in text.''' text = "This is a #Hashtag #test-link #A should#not#work" vt = vastTweet() hts = vt.findHashTagsAdvanced(text) truth = ['hashtag', 'test', 'a'] print 'Advanced', hts
def testFinHashTags(self): ''' Test the finding of hashtags in text.''' text = "This is a #Hashtag #tesT-link #A should#not#work and #this_link" vt = vastTweet() hts = vt.findHashTags(text) print "normal", hts truth = ['hashtag', 'test', 'a', 'this_link'] self.assertEqual(hts, truth)
def setUp(self):
    ''' Build a single populated vastTweet fixture shared by the tests. '''
    self.vt = vastTweet()
    self.vt.importData(timeStamp=datetime.datetime(2011, 1, 1, 12, 1, 1),
                       lat=34.4,
                       lon=45.5,
                       text='Hello germ world again #Hashtag1 #hashtag2 #hashtag3 virus.',
                       userId=4444,
                       tweetId=346664)
def testImportData(self):
    ''' Check importData() with keyword arguments (no JSON object):
        every attribute must round-trip, and hashtags are extracted
        (lowercased) from the text. '''
    when = datetime.datetime(2011, 1, 1, 12, 1, 1)
    latitude, longitude = 34.4, 45.5
    message = 'Hello world again #Hashtag1 #hashtag2 #hashtag3.'
    tweetId = 346664
    userId = 4444
    vt = vastTweet()
    vt.importData(timeStamp=when, lat=latitude, lon=longitude,
                  text=message, userId=userId, tweetId=tweetId)
    # Attribute-by-attribute comparison against the inputs
    self.assertEquals(vt.timeStamp, when)
    self.assertEquals(vt.lat, latitude)
    self.assertEquals(vt.lon, longitude)
    self.assertEquals(vt.hashTags, ['hashtag1', 'hashtag2', 'hashtag3'])
    self.assertEquals(vt.userId, userId)
    self.assertEquals(vt.tweetId, tweetId)
def testImportDataJson(self):
    ''' Check importData() when fed a serialised JSON tweet: the ISO
        timestamp string must be parsed back to a datetime and the
        hashtags derived from the text. '''
    source = {'timestamp': datetime.datetime(2011, 1, 1, 12, 1, 1).strftime('%Y-%m-%dT%H:%M:%S'),
              'lat': 34.4,
              'lon': 45.5,
              'text': 'Hello world again #Hashtag1 #hashtag2 #hashtag3.',
              'tweet_id': 346664,
              'user_id': 4444,
              'hashtags': None,
              }
    vt = vastTweet()
    vt.importData(jTweet=json.dumps(source))
    expectedTime = datetime.datetime.strptime(source['timestamp'], '%Y-%m-%dT%H:%M:%S')
    self.assertEquals(vt.timeStamp, expectedTime)
    self.assertEquals(vt.lat, source['lat'])
    self.assertEquals(vt.lon, source['lon'])
    self.assertEquals(vt.hashTags, ['hashtag1', 'hashtag2', 'hashtag3'])
    self.assertEquals(vt.userId, source['user_id'])
    self.assertEquals(vt.tweetId, source['tweet_id'])
def main():
    ''' Script to build tweet objects from the VAST dataset and place them on a Queue and/or JMS
        for testing purposes.

        Reads the VAST microblog CSV line by line, builds a vastTweet per row,
        extracts keywords (hashtags + name lookup), inserts each keyword's
        time series into mongo and kicks off a baseline subprocess. Per-tweet
        processing times are written to a stats CSV.

        LIKELY SPEED IMPROVEMENTS:
        - BUILDING BLANK ARRAYS IN THE TIME SERIES TAKES A WHILE
        - PUTTING THE KEYWORDS IN A QUEUE, HAVING SET UP THE THREADS TO PROCESS EACH ONE.
        - ANY DUPLICATION CHECKS? '''
    # Mongo connection parameters (local instance)
    db = 'bam'
    host = 'localhost'
    port = 27017

    # Wall-clock start and a running total of per-tweet processing time
    start = datetime.datetime.utcnow()
    tweetProcessTimes = datetime.timedelta(seconds=0)

    # Parameters handed to the baseline subprocess (units + precision)
    blUnits = 'minute'
    blPrecision = 10
    baselineParameters = [blUnits, blPrecision]
    mgrsPrecision = 2
    #dripRate = 1.5

    # JMS destination
    #destination = '/topic/test.vasttweets'
    #hostIn = 'localhost'
    #portIn = 61613

    # Reset the collections
    c, dbh = mdb.getHandle()
    dbh = mdb.setupCollections(dbh, dropCollections=True)    # Set up collections
    dbh = mdb.setupIndexes(dbh)

    #jms = jmsCode.jmsHandler(hostIn, portIn, verbose=True)
    # Make the JMS connection via STOMP and the jmsCode class
    #jms.connect()

    # NOTE(review): hard-coded developer paths - presumably should come from config
    path = "/Users/brantinghamr/Documents/Code/eclipseWorkspace/bam/data/"
    #fName= "MicroblogsSample.csv"
    fName= "MicroblogsOrdered.csv"
    tweetStats = 'tweetStatsFile_50000.csv'
    tptFile = open(path+tweetStats, 'w')

    # The script used to generate the baseline
    baselinePath = '/Users/brantinghamr/Documents/Code/eclipseWorkspace/bam/src/scripts/'
    baselineScript = 'subprocessBaseline.py'
    scriptFile = os.path.join(baselinePath, baselineScript)

    f = retrieveFile(path, fName)
    x = 0  # row counter (row 0 is the CSV header)

    # Start time: earliest tweet in the dataset, as seconds since UNIX epoch
    earliestTweet = datetime.datetime(2011, 4, 30, 0, 0)
    earliestTweet = time.mktime(time.struct_time(earliestTweet.timetuple()))
    lastTweetTime = earliestTweet
    print "First Tweet Time: ", lastTweetTime

    # This speeds things up from seconds to minutes
    speedUpRate = 1000

    # Build a blank timeseries array to save it being built everytime
    blankData = buildBlankData(hours=24)

    # Loop the lines build tweet objects
    for line in f.readlines():
        #print line
        # Extract content from each line
        line = line.rstrip('\r').rstrip('\n').rstrip('\r')
        # Skip the header row
        if x == 0:
            x+=1
            continue
        # Progress indicator every 100 rows
        if x % 100 == 0:
            print "processed: ", x
        # Hard stop after 100k rows
        if x >100000:
            print line
            break
            sys.exit(0)  # NOTE(review): unreachable - follows 'break'
        line = line.split(',')
        tweetProcessStart = datetime.datetime.utcnow()
        # Expected CSV columns: id, datetime, "lat lon", text
        tweetId, dt, latLon, text = line

        # Get the geos
        geos = getGeos(tweetId, latLon)
        if not geos:
            print "skipping this record - bad or no geos"
            continue

        # Get the datetime group into seconds since UNIX time
        dtg = getTime(tweetId, dt)
        if not dtg:
            print "skipping this record - bad or no time"
            continue

        # Get the tweettime into seconds from UNIX
        tweetTime = time.mktime(time.struct_time(dtg.timetuple()))
        #print "The time of this tweet", tweetTime

        # Get the tweet time in seconds since the last tweet
        sinceLastTweet = tweetTime - lastTweetTime
        #print "Time since last tweet", sinceLastTweet

        #delay = sinceLastTweet / speedUpRate
        #print "Delay: ", delay

        # Apply a scaling to it
        #time.sleep(delay)

        # Assign this tweet time to be the last tweet time
        lastTweetTime = tweetTime

        # Build a tweet object
        twt = vastTweet()
        twt.importData(timeStamp=dtg, lat=geos[0], lon=geos[1], text=text, tweetId=tweetId)

        #----------------------------------------------------------------------------------
        # PROCESS INTO KEYWORDS
        # Build into keywords - skipping a step for development
        kywd = processTweet(twt, mgrsPrecision)
        # Add keywords to the list based on hashtags
        kywd.fromHashTag()
        # Add keywords to the list based on name lookup
        kywd.fromLookup()
        if len(kywd.keywords) == 0:
            pass
            #print "No matches: ", twt.text

        xx = 0  # number of keywords handled for this tweet (written to stats)
        #Now loop the resultant keywords
        for kwObj in kywd.keywords:
            xx += 1
            #print "------------------"
            #print kwObj.keyword
            #print kwObj.text

            #-------------------------------------------------------
            # Pass keyword object into a class
            #ts = timeSeries(host='localhost', port=27017, db='bam')
            ts = timeSeries(c=c, dbh=dbh)
            ts.importData(kwObj, blockPrecision=24)
            success = ts.insertDoc(blankData=blankData, incrementBy=100)

            # Launch the baseline calculation as a subprocess for this keyword
            callBaseliner(scriptFile, host, port, db, kwObj, baselineParameters, mac=1)

        # METRICS - currently about 0.05 seconds per tweet
        tweetProcessStop = datetime.datetime.utcnow()
        tweetProcessTimes += (tweetProcessStop - tweetProcessStart)
        processDif = (tweetProcessStop - tweetProcessStart)
        # Stats row: row index, keyword count, elapsed seconds (float)
        tptFile.write(str(x)+","+str(xx)+","+str(processDif.seconds + processDif.microseconds/1000000.)+"\n")

        #----------------------------------------------------------------------------------
        # SEND TO JMS WITH THIS CODE
        # Convert it into a JSON object
        #jTwt = twt.vastTweet2Json()
        #print jTwt
        # Push the JSON version of the tweet to the JMS
        #jms.sendData(destination, jTwt, x)
        #----------------------------------------------------------------------------------

        x += 1
        #time.sleep(dripRate)

    # Disconnect from the JMS
    #jms.disConnect()

    # Summary timings
    end = datetime.datetime.utcnow()
    dif = end - start
    print "Total Tweet Process Time: %s" %tweetProcessTimes.seconds
    print "Average Tweet process time: %s" % (float(tweetProcessTimes.seconds)/float(x))
    print "Tweet Processed: %s" %x
    print "Total Process Time: %s" %(dif)

    # Close the mongo connection
    mdb.close(c, dbh)

    f.close()
    tptFile.close()