def createNewTrainingSet(self, training_data_file):
    """Build parallel training arrays from a CSV of labelled tweets.

    Expected row layout (after a header row, which is skipped):
        row[1] = raw tweet text
        row[2] = latitude, row[3] = longitude
        row[4] = tweet frequency count (int)
        row[5] = personality label, e.g. '["label"]'

    Returns:
        tuple: (XTrain, YTrain, XTrainFeatures, XTrainSentiment,
                XTrainFreqTweets, geo_latitude, geo_longitude).
                XTrain is always empty (kept for interface compatibility).

    Raises:
        ValueError: if row[2]/row[3]/row[4] are not numeric.
    """
    XTrain = []
    YTrain = []
    XTrainFeatures = []
    XTrainSentiment = []
    XTrainFreqTweets = []
    geo_latitude = []
    geo_longitude = []

    objFilterStopWords = FilterStopWords()
    objPreprocessTweets = PreprocessTweets()

    stopWords = objFilterStopWords.getStopWordList('../../TwitterData/StopWords.txt')

    # Hoisted out of the loop: stopwords.words('english') re-reads the NLTK
    # corpus on every call, and a set gives O(1) membership tests instead of
    # an O(m) list scan per word.
    englishStopWords = set(stopwords.words('english'))

    # Read the tweets one by one and process them; 'with' guarantees the
    # file handle is closed even if a row is malformed (the original leaked it).
    with open(training_data_file, 'rb') as csvFile:
        inpTweets = csv.reader(csvFile, delimiter=',')
        next(inpTweets)  # skip header (next() works on Py2.6+ and Py3)
        for row in inpTweets:
            personality = row[5]
            tweet = row[1]
            # Strip CSV double-quote escaping artifacts from the raw text.
            cleanTweet = tweet.replace('"",""', " ").replace('""', " ")
            processedTweet = objPreprocessTweets.processTweet(cleanTweet)

            XTrainFreqTweets.append(int(row[4]))

            # Remove English stop words from the processed tweet.
            filtered_words = [word for word in processedTweet.split()
                              if word not in englishStopWords]
            filteredTweets = ' '.join(filtered_words)

            geo_latitude.append(float(row[2]))
            geo_longitude.append(float(row[3]))

            # Average per-sentence polarity; guard the empty-sentence case,
            # which made the original raise ZeroDivisionError.
            blob = TextBlob(processedTweet)
            if blob.sentences:
                totSentiment = (sum(s.sentiment.polarity for s in blob.sentences)
                                / len(blob.sentences))
            else:
                totSentiment = 0.0
            XTrainSentiment.append(totSentiment)

            XTrainFeatures.append(filteredTweets)

            # Label arrives bracketed/quoted (e.g. '["x"]'); keep only the text.
            YTrain.append(personality.replace('[', '').replace('"', '').replace(']', ''))

    # Dead locals from the original removed: 'tweets', 'i', and the unused
    # 'featureVector' result of getFeatureVector (its return value was discarded).
    return XTrain, YTrain, XTrainFeatures, XTrainSentiment, XTrainFreqTweets, geo_latitude, geo_longitude
# --- Beispiel #2 (second scraped example; the stray "Beispiel #2" / "0"
# site artifacts broke Python syntax and are preserved here as a comment) ---
    def createNewTrainingSet(self, training_data_file):
        """Build parallel training arrays from a CSV of labelled tweets.

        Expected row layout (after a header row, which is skipped):
            row[1] = raw tweet text
            row[2] = latitude, row[3] = longitude
            row[4] = tweet frequency count (int)
            row[5] = personality label, e.g. '["label"]'

        Returns:
            tuple: (XTrain, YTrain, XTrainFeatures, XTrainSentiment,
                    XTrainFreqTweets, geo_latitude, geo_longitude).
                    XTrain is always empty (kept for interface compatibility).

        Raises:
            ValueError: if row[2]/row[3]/row[4] are not numeric.
        """
        XTrain = []
        YTrain = []
        XTrainFeatures = []
        XTrainSentiment = []
        XTrainFreqTweets = []
        geo_latitude = []
        geo_longitude = []

        objFilterStopWords = FilterStopWords()
        objPreprocessTweets = PreprocessTweets()

        stopWords = objFilterStopWords.getStopWordList(
            '../../TwitterData/StopWords.txt')

        # Hoisted out of the loop: stopwords.words('english') re-reads the
        # NLTK corpus on every call, and a set gives O(1) membership tests.
        englishStopWords = set(stopwords.words('english'))

        # Read the tweets one by one and process them; 'with' guarantees
        # the handle is closed (the original open() leaked it).
        with open(training_data_file, 'rb') as csvFile:
            inpTweets = csv.reader(csvFile, delimiter=',')
            next(inpTweets)  # skip header (works on Py2.6+ and Py3)
            for row in inpTweets:
                personality = row[5]
                tweet = row[1]
                # Strip CSV double-quote escaping artifacts.
                cleanTweet = tweet.replace('"",""', " ").replace('""', " ")
                processedTweet = objPreprocessTweets.processTweet(cleanTweet)

                XTrainFreqTweets.append(int(row[4]))

                # Remove English stop words from the processed tweet.
                filtered_words = [
                    word for word in processedTweet.split()
                    if word not in englishStopWords
                ]
                filteredTweets = ' '.join(filtered_words)

                geo_latitude.append(float(row[2]))
                geo_longitude.append(float(row[3]))

                # Average per-sentence polarity; guard the empty-sentence
                # case, which made the original raise ZeroDivisionError.
                blob = TextBlob(processedTweet)
                if blob.sentences:
                    totSentiment = (
                        sum(s.sentiment.polarity for s in blob.sentences)
                        / len(blob.sentences))
                else:
                    totSentiment = 0.0
                XTrainSentiment.append(totSentiment)

                XTrainFeatures.append(filteredTweets)

                # Label arrives bracketed/quoted; keep only the text.
                YTrain.append(
                    personality.replace('[', '').replace('"', '').replace(']', ''))

        # Dead locals removed: 'tweets', 'i', and the unused 'featureVector'
        # result of getFeatureVector (its return value was discarded).
        return XTrain, YTrain, XTrainFeatures, XTrainSentiment, XTrainFreqTweets, geo_latitude, geo_longitude
# import preprocess_tweets
# import filter_stop_words
from mmds.supervised.filter_stop_words import FilterStopWords
from mmds.supervised.preprocess_tweets import PreprocessTweets

# Read each user tweet, preprocess it, and print its feature vector.
objFilterStopWords = FilterStopWords()
objPreprocessTweets = PreprocessTweets()

stopWords = objFilterStopWords.getStopWordList('../../TwitterData/StopWords.txt')

# Iterate the file directly and let 'with' close it. The original used a
# manual readline() loop, never closed the handle, and also opened
# StopWords.txt a second time ('st') without ever using or closing it.
with open('../../TwitterData/UserTweets.txt', 'r') as fp:
    for line in fp:
        processedTweet = objPreprocessTweets.processTweet(line)
        featureVector = objFilterStopWords.getFeatureVector(processedTweet, stopWords)
        # print(x) with a single argument behaves identically on Py2 and Py3.
        print(featureVector)