Beispiel #1
0
    def testvastTweet2Json(self):
        ''' Check the conversion to a json object: '''


        d = {'timestamp': datetime.datetime(2011,1,1,12,1,1).strftime('%Y-%m-%dT%H:%M:%S'),
             'lat'      : 34.4,
             'lon'      : 45.5,
             'text'     : 'Hello world again #Hashtag1 #hashtag2 #hashtag3.',
             'tweet_id' : 346664,
             'user_id'  : 4444,
             'hashtags' : None
             }
        
        jTweet = json.dumps(d)
        vt = vastTweet()
        vt.importData(jTweet=jTweet)
        
        dumped = json.loads(vt.vastTweet2Json())
        
        # Reassign hashtags, now they've been generated
        d['hashtags'] = ['hashtag1', 'hashtag2', 'hashtag3']
        
        for key in d.keys():
            print key, d[key], dumped[key]
            self.assertEquals(d[key], dumped[key])
Beispiel #2
0
    def testFinHashTagsAdvancedUser(self):
        ''' Test the finding of hashtags in text.'''

        text = "@brantinr, @helloworld - this is a #Hashtag #test-link #A should#not#work"
        vt = vastTweet()
        hts = vt.findHashTagsAdvanced(text, user=True)
        truth = ['brantinr', 'helloworld']
        print 'Advanced', hts
Beispiel #3
0
    def testFinHashTagsAdvancedSubject(self):
        ''' Test the finding of hashtags in text.'''

        text = "This is a #Hashtag #test-link #A should#not#work"
        vt = vastTweet()
        hts = vt.findHashTagsAdvanced(text)
        truth = ['hashtag', 'test', 'a']
        print 'Advanced', hts
Beispiel #4
0
    def testFinHashTags(self):
        ''' Test the finding of hashtags in text.'''

        text = "This is a #Hashtag #tesT-link #A should#not#work and #this_link"
        vt = vastTweet()
        hts = vt.findHashTags(text)
        print "normal", hts
        truth = ['hashtag', 'test', 'a', 'this_link']
        self.assertEqual(hts, truth)
 def setUp(self):
     ''' Build a single vastTweet instance for the tests to share. '''

     # Fixture values for one tweet
     when = datetime.datetime(2011, 1, 1, 12, 1, 1)
     latitude = 34.4
     longitude = 45.5
     message = 'Hello germ world again #Hashtag1 #hashtag2 #hashtag3 virus.'
     tweet_id = 346664
     user_id = 4444

     self.vt = vastTweet()
     self.vt.importData(timeStamp=when, lat=latitude, lon=longitude,
                        text=message, userId=user_id, tweetId=tweet_id)
Beispiel #6
0
    def testImportData(self):
        ''' Check importData populates all attributes from keyword args (no JSON). '''

        ts = datetime.datetime(2011,1,1,12,1,1)
        lat, lon = 34.4, 45.5
        text = 'Hello world again #Hashtag1 #hashtag2 #hashtag3.'
        tId = 346664
        uId = 4444
        vt = vastTweet()
        vt.importData(timeStamp=ts, lat=lat, lon=lon, text=text, userId=uId, tweetId=tId)

        # assertEqual rather than the deprecated assertEquals alias
        self.assertEqual(vt.timeStamp, ts)
        self.assertEqual(vt.lat, lat)
        self.assertEqual(vt.lon, lon)
        # hashtags are extracted from the text and lowercased
        self.assertEqual(vt.hashTags, ['hashtag1','hashtag2','hashtag3'])
        self.assertEqual(vt.userId, uId)
        self.assertEqual(vt.tweetId, tId)
Beispiel #7
0
    def testImportDataJson(self):
        ''' Check importData populates all attributes from a JSON object. '''

        d = {'timestamp': datetime.datetime(2011,1,1,12,1,1).strftime('%Y-%m-%dT%H:%M:%S'),
             'lat'      : 34.4,
             'lon'      : 45.5,
             'text'     : 'Hello world again #Hashtag1 #hashtag2 #hashtag3.',
             'tweet_id' : 346664,
             'user_id'  : 4444,
             'hashtags' : None,
             }

        jTweet = json.dumps(d)

        vt = vastTweet()
        vt.importData(jTweet=jTweet)

        # assertEqual rather than the deprecated assertEquals alias.
        # The timestamp string must parse back to the original datetime.
        self.assertEqual(vt.timeStamp, datetime.datetime.strptime(d['timestamp'], '%Y-%m-%dT%H:%M:%S'))
        self.assertEqual(vt.lat, d['lat'])
        self.assertEqual(vt.lon, d['lon'])
        # hashtags are extracted from the text and lowercased
        self.assertEqual(vt.hashTags, ['hashtag1','hashtag2','hashtag3'])
        self.assertEqual(vt.userId, d['user_id'])
        self.assertEqual(vt.tweetId, d['tweet_id'])
Beispiel #8
0
def main(): 
    '''
    Script to build tweet objects from the VAST dataset and place them on a Queue and/or JMS
    for testing purposes.
    
    LIKELY SPEED IMPROVEMENTS:
    - BUILDING BLANK ARRAYS IN THE TIME SERIES TAKES A WHILE
    - PUTTING THE KEYWORDS IN A QUEUE, HAVING SET UP THE THREADS TO PROCESS EACH ONE.
    - ANY DUPLICATION CHECKS?
    
    
    
    '''
    db = 'bam'
    host = 'localhost'
    port = 27017
    
    start = datetime.datetime.utcnow()
    tweetProcessTimes = datetime.timedelta(seconds=0)
    
    blUnits     = 'minute'
    blPrecision = 10
    baselineParameters = [blUnits, blPrecision] 
    mgrsPrecision = 2
    
    #dripRate = 1.5
    
    # JMS destination
    #destination = '/topic/test.vasttweets'
    #hostIn      = 'localhost'
    #portIn      = 61613

    # Reset the collections
    c, dbh = mdb.getHandle()
    dbh = mdb.setupCollections(dbh, dropCollections=True)         # Set up collections
    dbh = mdb.setupIndexes(dbh)
    
    #jms = jmsCode.jmsHandler(hostIn, portIn, verbose=True)
    # Make the JMS connection via STOMP and the jmsCode class
    #jms.connect()
     
    path = "/Users/brantinghamr/Documents/Code/eclipseWorkspace/bam/data/"
    #fName= "MicroblogsSample.csv"
    fName= "MicroblogsOrdered.csv"
    tweetStats = 'tweetStatsFile_50000.csv'
    tptFile = open(path+tweetStats, 'w')
    
    # The script used to generate the baseline
    baselinePath = '/Users/brantinghamr/Documents/Code/eclipseWorkspace/bam/src/scripts/'
    baselineScript = 'subprocessBaseline.py'
    scriptFile = os.path.join(baselinePath, baselineScript)

    
    f = retrieveFile(path, fName)
    x = 0
    
    # Start time
    earliestTweet = datetime.datetime(2011, 4, 30, 0, 0)
    earliestTweet = time.mktime(time.struct_time(earliestTweet.timetuple()))
    lastTweetTime = earliestTweet
    print "First Tweet Time: ", lastTweetTime
    
    # This speeds things up from seconds to minutes
    speedUpRate = 1000
    
    # Build a blank timeseries array to save it being built everytime
    blankData = buildBlankData(hours=24)
    
    # Loop the lines build tweet objects
    for line in f.readlines():
        
        #print line
        # Extract content from each line
        line = line.rstrip('\r').rstrip('\n').rstrip('\r')

        if x == 0:
            x+=1
            continue
        
        if x % 100 == 0:
            print "processed: ", x
        
        if x >100000:
            print line
            break
            sys.exit(0)
            
        line = line.split(',')
        
        tweetProcessStart = datetime.datetime.utcnow()
        
        tweetId, dt, latLon, text = line
        
        # Get the geos
        geos = getGeos(tweetId, latLon)
        if not geos:
            print "skipping this record - bad or no geos"
            continue
        
        # Get the datetime group into seconds since UNIX time
        dtg = getTime(tweetId, dt)

        if not dtg:
            print "skipping this record - bad or no time"
            continue
        
        # Get the tweettime into seconds from UNIX
        tweetTime = time.mktime(time.struct_time(dtg.timetuple()))
        #print "The time of this tweet", tweetTime
        
        # Get the tweet time in seconds since the last tweet
        sinceLastTweet = tweetTime - lastTweetTime
        #print "Time since last tweet", sinceLastTweet
        
        #delay = sinceLastTweet / speedUpRate
        #print "Delay: ", delay
                
        # Apply a scaling to it
        #time.sleep(delay)
        
        # Assign this tweet time to be the last tweet time
        lastTweetTime = tweetTime
        
        # Build a tweet object
        twt = vastTweet()
        twt.importData(timeStamp=dtg, lat=geos[0], lon=geos[1], text=text, tweetId=tweetId)
        
        #----------------------------------------------------------------------------------
        # PROCESS INTO KEYWORDS
                
        # Build into keywords - skipping a step for development
        kywd = processTweet(twt, mgrsPrecision)
        
        # Add keywords to the list based on hashtags
        kywd.fromHashTag()
        
        # Add keywords to the list based on name lookup
        kywd.fromLookup()

        if len(kywd.keywords) == 0:
            pass
            #print "No matches: ", twt.text
        
        xx = 0
        #Now loop the resultant keywords
        for kwObj in kywd.keywords:
            
            xx += 1
            
            #print "------------------"
            #print kwObj.keyword
            #print kwObj.text
        
            #-------------------------------------------------------
            # Pass keyword object into a class
            #ts = timeSeries(host='localhost', port=27017, db='bam')
            ts = timeSeries(c=c, dbh=dbh)
            ts.importData(kwObj, blockPrecision=24)
    
            success = ts.insertDoc(blankData=blankData, incrementBy=100)
  
            callBaseliner(scriptFile, host, port, db, kwObj, baselineParameters, mac=1)
  
        # METRICS - currently about 0.05 seconds per tweet
        tweetProcessStop = datetime.datetime.utcnow()
        tweetProcessTimes += (tweetProcessStop - tweetProcessStart)
        processDif = (tweetProcessStop - tweetProcessStart) 
        tptFile.write(str(x)+","+str(xx)+","+str(processDif.seconds + processDif.microseconds/1000000.)+"\n")
        #----------------------------------------------------------------------------------
        # SEND TO JMS WITH THIS CODE

        # Convert it into a JSON object
        #jTwt = twt.vastTweet2Json()
        #print jTwt

        # Push the JSON version of the tweet to the JMS
        #jms.sendData(destination, jTwt, x)

        #----------------------------------------------------------------------------------
        
        x += 1
    
        #time.sleep(dripRate)
        
    # Disconnect from the JMS
    #jms.disConnect()    

    end = datetime.datetime.utcnow()
    dif = end - start
    
    print "Total Tweet Process Time: %s" %tweetProcessTimes.seconds
    print "Average Tweet process time: %s" % (float(tweetProcessTimes.seconds)/float(x))

    print "Tweet Processed: %s" %x
    print "Total Process Time: %s" %(dif)
    
    # Close the mongo connection
    mdb.close(c, dbh)
    f.close()
    tptFile.close()