Example #1
def searchTwitter(tags, fileName):    
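    # Note: relies on a module-level "import time" and the project's own IO helper module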
    print "Start Twitter scraping for " + str(tags)
    j=1
    fileName = "data/" + fileName
    fileExt = ".txt"
    sOptions = twitterRPP(100)
    sQuery = twitterBuildTagString(tags)
    
    # If data file exists, read latest tweet, otherwise skip
    from os.path import exists
    if (exists(fileName+fileExt)):
        lastID, lastTime = getLastTweetID(fileName+fileExt)
        print "Last tweet ID: " + lastID + " at time: " + lastTime
        sOptions += twitterSinceID(lastID) # Only fetch tweets newer than the latest one already on disk
    else:
        print "No file " + fileName + fileExt + " found, searching without since_id"
    
    # Initial search
    tweets = getTweets(sQuery + sOptions)
    if (len(tweets) < 2):
        print "No search results"
        return
    
    # Keep paging backwards from the oldest tweet returned in each batch
    oldestID = tweets[-1][0] # Get ID of the oldest tweet for the next query
    go_on = True
    i=1    
    
    while(go_on):        
        sOptions2 = sOptions + twitterMaxID(oldestID)
        results = getTweets(sQuery + sOptions2, i)
        
        if (len(results) < 2): # Stop once a page comes back empty or the request fails
            go_on = False
        else:
            time.sleep(1.1) # Sleep briefly so Twitter doesn't rate-limit us
            i += 1
            oldestID = results[-1][0] # Get ID of the oldest tweet for the next query
            
        tweets += results[1:] # First result is tweet with "oldestID", so drop it
        
        if (i>=250): # Every 250 pages, flush tweets to a numbered buffer file to limit memory use
            IO.writeData(fileName + "_P" + str(j) + fileExt, tweets, overWrite=True)
            j += 1
            tweets = []
            i = 0
    
    # Save the data; if buffer files were written, append them in reverse order and clean up
    IO.writeData(fileName+fileExt, tweets, True, False)
    if (j > 1):
        j -= 1
        while (j >= 1):
            bfr = IO.readData(fileName + "_P" + str(j) + fileExt)
            IO.writeData(fileName+fileExt, bfr, True, False)
            IO.deleteFile(fileName + "_P" + str(j) + fileExt) # Remove temporary buffer file
            j -= 1
    print "Finished Twitter scrape"
Example #2
def searchTwitter(tags, fileName, oldestID=0, j=1):
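    # Note: relies on a module-level "import time" and the project's own IO helper module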
    print "Start Twitter scraping for " + str(tags)
    fileName = "data/scrapes/" + fileName
    fileExt = ".txt"
    sOptions = twitterRPP(100)
    sQuery = twitterBuildTagString(tags)
    
    # If data file exists, read latest tweet, otherwise skip
    from os.path import exists
    if (exists(fileName+fileExt)):
        lastID, lastTime = getLastTweetID(fileName+fileExt)
        print "Last tweet ID: " + lastID + " at time: " + lastTime
        sOptions += twitterSinceID(lastID) # Only fetch tweets newer than the latest one already on disk
    else:
        print "No file " + fileName + fileExt + " found, searching without since_id"
    
    # Initial search
    if (oldestID == 0):
        tweets = getTweets(sQuery + sOptions)
    else:
        sOptions2 = sOptions + twitterMaxID(oldestID)
        tweets = getTweets(sQuery + sOptions2)
    if (len(tweets) < 2):
        print "No search results"
        return
    
    # Keep paging backwards from the oldest tweet returned in each batch
    oldestID = tweets[-1][0] # Get ID of the oldest tweet for the next query
    go_on = True
    i=1    
    
    while(go_on):        
        sOptions2 = sOptions + twitterMaxID(oldestID)
        results = getTweets(sQuery + sOptions2, i)
        
        if (len(results) < 2): # Stop once a page comes back empty or the request fails
            go_on = False
        else:
            time.sleep(1.1) # Sleep briefly so Twitter doesn't rate-limit us
            i += 1
            oldestID = results[-1][0] # Get ID of the oldest tweet for the next query
            
        tweets += results[1:] # First result is tweet with "oldestID", so drop it
        
        if (i>=250): # Every 250 pages, flush tweets to a numbered buffer file to limit memory use
            IO.writeData(fileName + "_P" + str(j) + fileExt, tweets, overWrite=True)
            j += 1
            tweets = []
            i = 0
    
    # Save the data; if buffer files were written, append them in reverse order and clean up
    IO.writeData(fileName+fileExt, tweets, True, False)
    if (j > 1):
        j -= 1
        while (j >= 1):
            bfr = IO.readData(fileName + "_P" + str(j) + fileExt)
            IO.writeData(fileName+fileExt, bfr, True, False)
            IO.deleteFile(fileName + "_P" + str(j) + fileExt) # Remove temporary buffer file
            j -= 1
    print "Finished " + fileName + " scrape"