Example #1
def searchTwitter(tags, fileName):    
    print "Start Twitter scraping for " + str(tags)
    j=1
    fileName = "data/" + fileName
    fileExt = ".txt"
    sOptions = twitterRPP(100)
    sQuery = twitterBuildTagString(tags)
    
    # If data file exists, read latest tweet, otherwise skip
    from os.path import exists
    if (exists(fileName+fileExt)):
        lastID, lastTime = getLastTweetID(fileName+fileExt)
        print "Last tweet ID: " + lastID + " at time: " + lastTime
        sOptions += twitterSinceID(lastID) # Only fetch tweets newer than the last one already scraped
    else:
        print "No file " + fileName + fileExt + " found, searching without sinceID"
    
    # Initial search
    tweets = getTweets(sQuery + sOptions)
    if (len(tweets) < 2):
        print "No search results"
        return
    
    # Keep paging backwards from the oldest tweet returned by each batch
    oldestID = tweets[-1][0] # Get ID of the oldest tweet for the next query
    go_on = True
    i=1    
    
    while(go_on):        
        sOptions2 = sOptions + twitterMaxID(oldestID)
        results = getTweets(sQuery + sOptions2, i)
        
        if (len(results) < 2): # Catch empty results, errors and sleep if we'll continue
            go_on = False            
        else:
            time.sleep(1.1) # Sleep a bit so twitter doesn't throw us out
            i += 1
            oldestID = results[-1][0] # Get ID of the oldest tweet for the next query
            
        tweets += results[1:] # First result is tweet with "oldestID", so drop it
        
        if (i>=250): # Backup data if we acquire a lot
            IO.writeData(fileName + "_P" + str(j) + fileExt, tweets, overWrite=True)
            j += 1
            tweets = []
            i = 0
    
    # Save data; if buffer files were written, append them in reverse order
    IO.writeData(fileName+fileExt, tweets, True, False)
    j -= 1
    while (j>=1):
        bfr = IO.readData(fileName + "_P" + str(j) + fileExt)
        IO.writeData(fileName+fileExt, bfr, True, False)
        IO.deleteFile(fileName + "_P" + str(j) + fileExt) # Remove temporary file
        j -= 1
    print "Finished Twitter scrape"
Example #2
def classifyTweetsCompany(tag, _offset=3):
    tweetFile = open("data/scrapeCompanies.txt")
    priceData = IO.readData("data/" + tag + ".csv", ',')
    priceIter = iter(priceData)
    next(priceIter)
    priceHist = priceHistory(priceIter, "%Y-%m-%d", 2)
    
    classifyTweets(tweetFile, priceHist, tag, "data/Classified" + tag + ".txt", offset=_offset)
Example #3
def classifyTweetsDJIA(_offset=3):
    tweetFile = open("data/scrapeDJIA.txt")
    priceData = IO.readData("data/DJIA.tsv")
    priceHist = priceHistory(priceData, "%b %d, %Y", 1)

    classifyTweets(tweetFile,
                   priceHist,
                   "DJIA",
                   "data/ClassifiedDJIA.txt",
                   offset=_offset)
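The two classifiers above assume different price-file layouts: the per-company CSV has a header row (skipped with next(priceIter)), ISO-formatted dates, and the price in column 2, while the DJIA TSV uses "Mon DD, YYYY" dates with the price in column 1. A short illustration of how those two date formats parse; the sample date strings are placeholders, not values from the data files.

# Illustration of the two date formats passed to priceHistory above;
# the sample dates are placeholders.
from datetime import datetime

print datetime.strptime("2013-01-02", "%Y-%m-%d")     # company CSV dates
print datetime.strptime("Jan 02, 2013", "%b %d, %Y")  # DJIA TSV dates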
Example #4
def classifyTweetsCompany(tag, _offset=3):
    tweetFile = open("data/scrapeCompanies.txt")
    priceData = IO.readData("data/" + tag + ".csv", ',')
    priceIter = iter(priceData)
    next(priceIter)
    priceHist = priceHistory(priceIter, "%Y-%m-%d", 2)

    classifyTweets(tweetFile,
                   priceHist,
                   tag,
                   "data/Classified" + tag + ".txt",
                   offset=_offset)
Example #5
def classifyTweetsDJIA(_offset=3):
    tweetFile = open("data/scrapeDJIA.txt")
    priceData = IO.readData("data/DJIA.tsv")
    priceHist = priceHistory(priceData, "%b %d, %Y", 1)
    
    classifyTweets(tweetFile, priceHist, "DJIA", "data/ClassifiedDJIA.txt", offset=_offset)
Example #6
def getStopWords():
    arr = IO.readData("data/StopWords.txt")
    return arr
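IO.readData is the project's own helper and its body is not shown in these examples. Judging from how it is called (an optional delimiter argument, rows indexed by column), a rough sketch might look like the following; this is an assumption, not the project's actual code.

# Hypothetical sketch of IO.readData: read a text file and split each
# non-empty line on a delimiter (tab by default). The real helper may
# handle encodings, quoting, or blank lines differently.
def readData(fileName, delimiter='\t'):
    rows = []
    with open(fileName) as f:
        for line in f:
            line = line.rstrip('\n')
            if line:
                rows.append(line.split(delimiter))
    return rows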
Example #7
def getEmotions():
    arr = IO.readData("data/Emotions.txt")
    for index in range(0, len(arr)):
        arr[index][0] = stem(arr[index][0])
    return arr
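getEmotions stems the keyword in each emotion row so later lookups also match inflected forms. The stem helper itself is not shown; a minimal stand-in using NLTK's Porter stemmer is sketched below (an assumption, the project may use a different stemmer).

# Hypothetical stand-in for stem(), built on NLTK's Porter stemmer.
from nltk.stem.porter import PorterStemmer

_stemmer = PorterStemmer()

def stem(word):
    # Reduce a word to its stem so "loved" and "loving" map to the same entry
    return _stemmer.stem(word.lower())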
Example #8
def searchTwitter(tags, fileName, oldestID = 0, j=1):    
    print "Start Twitter scraping for " + str(tags)
    fileName = "data/scrapes/" + fileName
    fileExt = ".txt"
    sOptions = twitterRPP(100)
    sQuery = twitterBuildTagString(tags)
    
    # If data file exists, read latest tweet, otherwise skip
    from os.path import exists
    if (exists(fileName+fileExt)):
        lastID, lastTime = getLastTweetID(fileName+fileExt)
        print "Last tweet ID: " + lastID + " at time: " + lastTime
        sOptions += twitterSinceID(lastID) # Only fetch tweets newer than the last one already scraped
    else:
        print "No file " + fileName + fileExt + " found, searching without sinceID"
    
    # Initial search
    if (oldestID == 0):
        tweets = getTweets(sQuery + sOptions)
    else:
        sOptions2 = sOptions + twitterMaxID(oldestID)
        tweets = getTweets(sQuery + sOptions2)
    if (len(tweets) < 2):
        print "No search results"
        return
    
    # Keep paging backwards from the oldest tweet returned by each batch
    oldestID = tweets[-1][0] # Get ID of the oldest tweet for the next query
    go_on = True
    i=1    
    
    while(go_on):        
        sOptions2 = sOptions + twitterMaxID(oldestID)
        results = getTweets(sQuery + sOptions2, i)
        
        if (len(results) < 2): # Catch empty results, errors and sleep if we'll continue
            go_on = False            
        else:
            time.sleep(1.1) # Sleep a bit so twitter doesn't throw us out
            i += 1
            oldestID = results[-1][0] # Get ID of the oldest tweet for the next query
            
        tweets += results[1:] # First result is tweet with "oldestID", so drop it
        
        if (i>=250): # Backup data if we acquire a lot
            IO.writeData(fileName + "_P" + str(j) + fileExt, tweets, overWrite=True)
            j += 1
            tweets = []
            i = 0
    
    # Save data; if buffer files were written, append them in reverse order
    IO.writeData(fileName+fileExt, tweets, True, False)
    j -= 1
    while (j>=1):
        bfr = IO.readData(fileName + "_P" + str(j) + fileExt)
        IO.writeData(fileName+fileExt, bfr, True, False)
        IO.deleteFile(fileName + "_P" + str(j) + fileExt) # Remove temporary file
        j -= 1
    print "Finished " + fileName + " scrape"