Ejemplo n.º 1
0
def runModel():
    """Interactively score freshly collected tweets against the model.

    Fetches 10 tweets through ``tweetstreamer.getTweets`` into
    ``data.testTweets``, asks whether learning should be enabled, and then
    offers a small menu until the tweets run out or the user exits.

    For every evaluated tweet the function pops it from ``data.testTweets``,
    splits it into lower-cased ``\\w+`` tokens, obtains a probability from
    ``probOfFlu`` and compares it with ``data.THRESHOLD``.  Tweets above the
    threshold increment ``data.numFluTweets``.  When learning is enabled the
    tokens and the verdict are passed on to ``learner.updateProbs``.
    """
    print("Model running mode!\n")

    print("Collecting 10 tweets to iterate through!")
    data.testTweets = tweetstreamer.getTweets(10)

    print("Use learning?")
    # Keep asking until the answer is exactly "y" or "n".
    while True:
        answer = raw_input("(y/n): ")
        if answer in ("y", "n"):
            useLearning = (answer == "y")
            break
        print("Please enter y or n")

    # Menu loop: runs until every collected tweet has been consumed.
    while len(data.testTweets) > 0:
        print("\n1: Evaluate new tweet\n2: Exit running mode")
        choice = raw_input("Enter option: ")

        if choice == "2":
            return
        if choice != "1":
            print("\nThat is not an option!\n")
            continue

        tweet = data.testTweets.pop()
        tokens = re.findall(r'\w+', tweet.lower())
        fluProbability = probOfFlu(tokens)
        looksLikeFlu = fluProbability > data.THRESHOLD
        if looksLikeFlu:
            data.numFluTweets += 1

        # Printing may fail for tweets the console codec cannot encode;
        # report that instead of aborting the session.
        try:
            print("\nNew tweet: " + str(tweet))
            print("Probability the user has flu: " + str(fluProbability))
        except UnicodeEncodeError:
            print("Codec cannot encode characters in this tweet!")

        if useLearning:
            learner.updateProbs(tokens, looksLikeFlu)

    print("All test tweets evaluated!!!")
Ejemplo n.º 2
0
def trainModel():
    """Interactively build, or reuse, the model held in ``data.probabilities``.

    The function walks the user through these steps:

    1. If ``data.probabilities`` is non-empty, ask whether to keep it.
       "y" returns immediately, "n" clears it and continues, "q" quits.
    2. If ``./tweets.txt`` exists, read the tweet count from its first line
       into ``data.tweetsStored`` and load the tweets with
       ``readTweetsFromFile``.  If ``./trainingdata.txt`` also exists, read
       its count into ``data.tweetsCategorized`` and load it with
       ``readCategorizationFromFile`` unless it claims more categorizations
       than there are stored tweets, in which case it is disregarded.
    3. Ask how many tweets the training set should contain, then either
       append newly fetched tweets, overwrite the stored ones, or reuse the
       stored ones, depending on the user's choice.
    4. Record ``len(data.tweets)`` in ``data.tweetsStored`` and call
       ``trainer.calculateProbs``.

    Entering "q" at any of the choice prompts calls ``quit()``.
    """
    # Offer to reuse an already-computed model.
    if len(data.probabilities) != 0:
        print("Saved model already exists.\nUse this model?\n")

        while True:
            inputstring = raw_input("(y/n): ")
            if inputstring == "y":
                print("Model trained!")
                return
            elif inputstring == "n":
                data.probabilities.clear()
                break
            elif inputstring == "q":
                print("Good bye!")
                quit()
            else:
                print("Please enter y, n, or q to quit")

    # Read tweets and categorization data from files.
    if os.path.exists("./tweets.txt"):
        # The first line of the file holds the number of stored tweets.
        # ``with`` closes the handle even if the line is not a valid integer.
        with open("./tweets.txt", "r") as tweetsFile:
            data.tweetsStored = int(tweetsFile.readline())

        print("Found tweets.txt with " + str(data.tweetsStored) + " tweets in it!")

        # Load tweets into memory.
        readTweetsFromFile()

        # Check if training data already exists for these tweets.
        if os.path.exists("./trainingdata.txt"):
            with open("./trainingdata.txt") as trainingDataFile:
                data.tweetsCategorized = int(trainingDataFile.readline())

            print("Found trainingdata.txt with " + str(data.tweetsCategorized) + " categorizations in it!")

            if data.tweetsStored < data.tweetsCategorized:
                # More categorizations than tweets cannot be valid.
                print("Mismatch: more tweets are categorized than exist! Disregarding categorization data.")
                data.tweetsCategorized = 0
            else:
                readCategorizationFromFile()
        else:
            print("No categorization data exists for these tweets!\n")

    # Ask the user how many tweets they would like in the training set.
    # Re-prompt on non-numeric input instead of crashing with ValueError.
    print("Please enter the number of tweets you would like in the training set")
    while True:
        inputstring = raw_input(">> ")
        try:
            tweetsWanted = int(inputstring)
        except ValueError:
            print("Please enter a whole number")
        else:
            break

    if data.tweetsStored < tweetsWanted:
        # Fewer tweets stored than requested.
        print(str(data.tweetsStored) + " tweets already stored.")
        print(str(tweetsWanted - data.tweetsStored) + " more tweets needed.")

        print("Append more tweets or overwrite and collect new tweets?")

        while True:
            inputstring = raw_input("(a/o): ")
            if inputstring == "a":
                # getTweets returns a collection of tweets (the overwrite
                # branch assigns it directly to data.tweets), so extend
                # rather than append to avoid nesting it as one element.
                data.tweets.extend(tweetstreamer.getTweets(tweetsWanted - data.tweetsStored, data.searchterms))
                trainer.categorizeTweets()
                break
            elif inputstring == "o":
                data.tweetsStored = 0
                data.tweetsCategorized = 0
                # Nothing is kept, so the full requested amount is fetched.
                data.tweets = tweetstreamer.getTweets(tweetsWanted, data.searchterms)
                trainer.categorizeTweets()
                break
            elif inputstring == "q":
                print("Good bye!")
                quit()
            else:
                print("Please enter a to append, o to overwrite, or q to quit")

    else:
        # At least as many tweets stored as requested.
        print(str(data.tweetsStored) + " tweets already stored.")

        print("Use these tweets or overwrite and collect new tweets?")

        while True:
            inputstring = raw_input("(u/o): ")
            if inputstring == "u":
                readCategorizationFromFile()
                break
            elif inputstring == "o":
                data.tweetsStored = 0
                data.tweetsCategorized = 0
                data.tweets = tweetstreamer.getTweets(tweetsWanted, data.searchterms)
                trainer.categorizeTweets()
                break
            elif inputstring == "q":
                print("Good bye!")
                quit()
            else:
                print("Please enter u to use, o to overwrite, or q to quit")

    # The tweets (and whatever categorization exists) are now in memory;
    # compute the model, which trainer stores in data.probabilities.
    data.tweetsStored = len(data.tweets)
    trainer.calculateProbs()