def _load_json(path):
    """Return the JSON document stored at *path*, closing the file afterwards."""
    with open(path) as handle:
        return json.load(handle)


def cleanUpTrainingSet(a, b, c, cat):
    """Drop tweets from one training set that score higher for the other category.

    This cleans up a training set using the previously trained frequency
    tables, to eliminate confusing examples. It is meant to be executed only
    once.

    Each tweet in ``<cat>trainingset`` is processed and tokenised with
    ``TweetProcessor``. Every non-empty token contributes to a sports total
    and a politics total using the add-one-smoothed counts from the
    ``sports*`` / ``politics*`` tables:

    * tokens starting with ``#`` use the hashtag tables, weighted by ``a``;
    * tokens starting with ``@`` use the mention tables, weighted by ``b``;
    * everything else uses the word tables, weighted by ``c``.

    A tweet is removed when ``cat`` is ``"politics"`` and its sports total is
    strictly larger, or when ``cat`` is ``"sports"`` and its politics total is
    strictly larger. The surviving tweets overwrite ``<cat>trainingset``.

    Args:
        a: Weight applied to hashtag tokens.
        b: Weight applied to mention tokens.
        c: Weight applied to ordinary word tokens.
        cat: Category prefix of the training-set file; only ``"sports"`` and
            ``"politics"`` trigger any removal.
    """
    tweetprocessor = TweetProcessor()
    sportswords = _load_json("sportswords")
    politicswords = _load_json("politicswords")
    sportshashtags = _load_json("sportshashtags")
    politicshashtags = _load_json("politicshashtags")
    sportsmentions = _load_json("sportsmentions")
    politicsmentions = _load_json("politicsmentions")
    tweets = _load_json(cat + "trainingset")

    # Collect survivors in a new list: removing from `tweets` while iterating
    # over it makes the loop skip the element that follows each removal.
    kept = []
    for actualtweet in tweets:
        tweet = tweetprocessor.processTweet(actualtweet)
        words = tweetprocessor.getwords(tweet)
        totalsportsweight = 0.0
        totalpoliticsweight = 0.0
        for word in words:
            if word == '':
                continue
            single_token = len(word.split()) < 2
            if word[0] == '#' and single_token:
                sportwordweight = sportshashtags.get(word, 0.0) + 1.0
                politicwordweight = politicshashtags.get(word, 0.0) + 1.0
                combined = sportwordweight + politicwordweight
                # Guard the denominator; the original test compared the
                # sports weight with zero twice and never looked at politics.
                if combined != 0:
                    totalsportsweight += a * (sportwordweight / combined)
                    totalpoliticsweight += a * (politicwordweight / combined)
            elif word[0] == '@' and single_token:
                sportwordweight = sportsmentions.get(word, 0.0) + 1.0
                politicwordweight = politicsmentions.get(word, 0.0) + 1.0
                combined = sportwordweight + politicwordweight
                if combined != 0:
                    totalsportsweight += b * (sportwordweight / combined)
                    totalpoliticsweight += b * (politicwordweight / combined)
            else:
                sportwordweight = sportswords.get(word, 0.0) + 1.0
                politicwordweight = politicswords.get(word, 0.0) + 1.0
                combined = sportwordweight + politicwordweight
                if combined != 0:
                    # Multiplication before division, as in the original.
                    totalsportsweight += c * sportwordweight / combined
                    totalpoliticsweight += c * politicwordweight / combined

        if cat == "politics" and totalsportsweight > totalpoliticsweight:
            continue
        if cat == "sports" and totalsportsweight < totalpoliticsweight:
            continue
        kept.append(actualtweet)

    # Text mode: json.dump emits str, which a 'wb' handle rejects on Python 3.
    with open(cat + "trainingset", 'w') as handle:
        json.dump(kept, handle, indent=True)
def __init__(self):
    """Load the seed vocabularies and training sets, then build the tables.

    Reads the four ``top*`` files as lists of lines and the two
    ``*trainingset`` files as JSON, stores them on the instance, and finally
    calls ``self.categorisetweets()``.
    """
    self.tweetprocessor = TweetProcessor()

    # Every file is read inside a `with` block so its handle is closed
    # deterministically instead of being left to the garbage collector.
    with open(r'topsportswords', 'r') as handle:
        self.topsportswords = handle.read().splitlines()
    with open(r'toppoliticswords', 'r') as handle:
        self.toppoliticswords = handle.read().splitlines()
    with open(r'topsportshashtags', 'r') as handle:
        self.topsportshashtags = handle.read().splitlines()
    with open(r'toppoliticshashtags', 'r') as handle:
        self.toppoliticshashtags = handle.read().splitlines()

    with open("sportstrainingset") as handle:
        self.sportstweets = json.load(handle)
    with open("politicstrainingset") as handle:
        self.politicstweets = json.load(handle)

    self.categorisetweets()
def addTrainingToTweets():
    """Split ``training.txt`` into per-category JSON training sets.

    Each line's second whitespace-separated field is its label. For lines
    labelled ``Sports`` or ``Politics`` the tweet text is sliced out of the
    line, run through ``TweetProcessor.processTweet`` and appended to the
    matching list. The lists are written to ``sportstrainingset`` and
    ``politicstrainingset``.

    Lines with fewer than two fields (blank lines, for instance) are skipped
    rather than raising ``IndexError``.
    """
    with open(r'training.txt', 'r') as handle:
        trainingset = handle.read().splitlines()

    tweetprocessor = TweetProcessor()
    sportstweets = []
    politicstweets = []
    for line in trainingset:
        fields = line.split()
        if len(fields) < 2:
            continue
        label = fields[1]
        first_space = line.find(" ")
        # The offsets skip the label plus surrounding characters, and the -1
        # drops the line's last character.
        # NOTE(review): presumably the tweet is wrapped in quotes - confirm
        # against the training.txt format.
        if label == 'Sports':
            sportstweets.append(
                tweetprocessor.processTweet(line[first_space + 9:-1]))
        elif label == 'Politics':
            politicstweets.append(
                tweetprocessor.processTweet(line[first_space + 11:-1]))

    # Text mode: json.dump emits str, which a 'wb' handle rejects on Python 3.
    with open("sportstrainingset", 'w') as handle:
        json.dump(sportstweets, handle, indent=True)
    with open("politicstrainingset", 'w') as handle:
        json.dump(politicstweets, handle, indent=True)
def _load_json(path):
    """Return the JSON document stored at *path*, closing the file afterwards."""
    with open(path) as handle:
        return json.load(handle)


def cleanUpTrainingSet(a, b, c, cat):
    """Drop tweets from one training set that score higher for the other category.

    This cleans up a training set using the previously trained frequency
    tables, to eliminate confusing examples. It is meant to be executed only
    once.

    Each tweet in ``<cat>trainingset`` is processed and tokenised with
    ``TweetProcessor``. Every non-empty token contributes to a sports total
    and a politics total using the add-one-smoothed counts from the
    ``sports*`` / ``politics*`` tables:

    * tokens starting with ``#`` use the hashtag tables, weighted by ``a``;
    * tokens starting with ``@`` use the mention tables, weighted by ``b``;
    * everything else uses the word tables, weighted by ``c``.

    A tweet is removed when ``cat`` is ``"politics"`` and its sports total is
    strictly larger, or when ``cat`` is ``"sports"`` and its politics total is
    strictly larger. The surviving tweets overwrite ``<cat>trainingset``.

    Args:
        a: Weight applied to hashtag tokens.
        b: Weight applied to mention tokens.
        c: Weight applied to ordinary word tokens.
        cat: Category prefix of the training-set file; only ``"sports"`` and
            ``"politics"`` trigger any removal.
    """
    tweetprocessor = TweetProcessor()
    sportswords = _load_json("sportswords")
    politicswords = _load_json("politicswords")
    sportshashtags = _load_json("sportshashtags")
    politicshashtags = _load_json("politicshashtags")
    sportsmentions = _load_json("sportsmentions")
    politicsmentions = _load_json("politicsmentions")
    tweets = _load_json(cat + "trainingset")

    # Collect survivors in a new list: removing from `tweets` while iterating
    # over it makes the loop skip the element that follows each removal.
    kept = []
    for actualtweet in tweets:
        tweet = tweetprocessor.processTweet(actualtweet)
        words = tweetprocessor.getwords(tweet)
        totalsportsweight = 0.0
        totalpoliticsweight = 0.0
        for word in words:
            if word == '':
                continue
            single_token = len(word.split()) < 2
            if word[0] == '#' and single_token:
                sportwordweight = sportshashtags.get(word, 0.0) + 1.0
                politicwordweight = politicshashtags.get(word, 0.0) + 1.0
                combined = sportwordweight + politicwordweight
                # Guard the denominator; the original test compared the
                # sports weight with zero twice and never looked at politics.
                if combined != 0:
                    totalsportsweight += a * (sportwordweight / combined)
                    totalpoliticsweight += a * (politicwordweight / combined)
            elif word[0] == '@' and single_token:
                sportwordweight = sportsmentions.get(word, 0.0) + 1.0
                politicwordweight = politicsmentions.get(word, 0.0) + 1.0
                combined = sportwordweight + politicwordweight
                if combined != 0:
                    totalsportsweight += b * (sportwordweight / combined)
                    totalpoliticsweight += b * (politicwordweight / combined)
            else:
                sportwordweight = sportswords.get(word, 0.0) + 1.0
                politicwordweight = politicswords.get(word, 0.0) + 1.0
                combined = sportwordweight + politicwordweight
                if combined != 0:
                    # Multiplication before division, as in the original.
                    totalsportsweight += c * sportwordweight / combined
                    totalpoliticsweight += c * politicwordweight / combined

        if cat == "politics" and totalsportsweight > totalpoliticsweight:
            continue
        if cat == "sports" and totalsportsweight < totalpoliticsweight:
            continue
        kept.append(actualtweet)

    # Text mode: json.dump emits str, which a 'wb' handle rejects on Python 3.
    with open(cat + "trainingset", 'w') as handle:
        json.dump(kept, handle, indent=True)
def addTrainingToTweets():
    """Split ``training.txt`` into per-category JSON training sets.

    Each line's second whitespace-separated field is its label. For lines
    labelled ``Sports`` or ``Politics`` the tweet text is sliced out of the
    line, run through ``TweetProcessor.processTweet`` and appended to the
    matching list. The lists are written to ``sportstrainingset`` and
    ``politicstrainingset``.

    Lines with fewer than two fields (blank lines, for instance) are skipped
    rather than raising ``IndexError``.
    """
    with open(r'training.txt', 'r') as handle:
        trainingset = handle.read().splitlines()

    tweetprocessor = TweetProcessor()
    sportstweets = []
    politicstweets = []
    for line in trainingset:
        fields = line.split()
        if len(fields) < 2:
            continue
        label = fields[1]
        first_space = line.find(" ")
        # The offsets skip the label plus surrounding characters, and the -1
        # drops the line's last character.
        # NOTE(review): presumably the tweet is wrapped in quotes - confirm
        # against the training.txt format.
        if label == 'Sports':
            sportstweets.append(
                tweetprocessor.processTweet(line[first_space + 9:-1]))
        elif label == 'Politics':
            politicstweets.append(
                tweetprocessor.processTweet(line[first_space + 11:-1]))

    # Text mode: json.dump emits str, which a 'wb' handle rejects on Python 3.
    with open("sportstrainingset", 'w') as handle:
        json.dump(sportstweets, handle, indent=True)
    with open("politicstrainingset", 'w') as handle:
        json.dump(politicstweets, handle, indent=True)
def __init__(self):
    """Load the six persisted frequency tables and create a tweet processor.

    The word, hashtag and mention tables for both categories are read as JSON
    from files named after the attributes they populate.
    """
    def load(path):
        # Read inside `with` so the handle is closed as soon as parsing ends.
        with open(path) as handle:
            return json.load(handle)

    self.sportswords = load("sportswords")
    self.politicswords = load("politicswords")
    self.sportshashtags = load("sportshashtags")
    self.politicshashtags = load("politicshashtags")
    self.sportsmentions = load("sportsmentions")
    self.politicsmentions = load("politicsmentions")
    self.tweetprocessor = TweetProcessor()
class Classifier:
    """Build per-category token frequency tables from the training tweets.

    Constructing an instance loads the seed vocabularies and both training
    sets from disk and immediately runs ``categorisetweets``, which writes
    six JSON tables (words, hashtags and mentions for sports and politics).
    """

    def __init__(self):
        """Load seed vocabularies and training sets, then build the tables."""
        self.tweetprocessor = TweetProcessor()
        self.topsportswords = self._read_lines(r'topsportswords')
        self.toppoliticswords = self._read_lines(r'toppoliticswords')
        self.topsportshashtags = self._read_lines(r'topsportshashtags')
        self.toppoliticshashtags = self._read_lines(r'toppoliticshashtags')
        self.sportstweets = self._load_json("sportstrainingset")
        self.politicstweets = self._load_json("politicstrainingset")
        self.categorisetweets()

    @staticmethod
    def _read_lines(path):
        """Return the lines of the text file at *path*, closing it afterwards."""
        with open(path, 'r') as handle:
            return handle.read().splitlines()

    @staticmethod
    def _load_json(path):
        """Return the JSON document stored at *path*, closing it afterwards."""
        with open(path) as handle:
            return json.load(handle)

    @staticmethod
    def _dump_json(table, path):
        """Write *table* as JSON to *path*.

        The file is opened in text mode because ``json.dump`` emits ``str``,
        which a binary-mode handle rejects on Python 3.
        """
        with open(path, 'w') as handle:
            json.dump(table, handle)

    def _count_tokens(self, tweets, words, hashtags, mentions):
        """Add every token of *tweets* to the matching frequency table.

        Tokens starting with ``#`` are counted in *hashtags*, tokens starting
        with ``@`` in *mentions*, and all others in *words*. The tables are
        updated in place.
        """
        for tweet in tweets:
            processed = self.tweetprocessor.processTweet(tweet)
            for word in self.tweetprocessor.getwords(processed):
                if not word:
                    # An empty token has no first character to inspect.
                    continue
                if word[0] == '#':
                    table = hashtags
                elif word[0] == '@':
                    table = mentions
                else:
                    table = words
                table[word] = table.get(word, 0) + 1

    def categorisetweets(self):
        """Count tokens per category and persist the six frequency tables.

        The tables start from the seed vocabularies (seed words get a count
        of 10, seed hashtags a count of 5) and are then incremented once per
        token occurrence in the corresponding training tweets.
        """
        # Filling in most common words and hash tags
        sportswords = dict((word, 10) for word in self.topsportswords)
        sportshashtags = dict((word, 5) for word in self.topsportshashtags)
        politicswords = dict((word, 10) for word in self.toppoliticswords)
        politicshashtags = dict((word, 5) for word in self.toppoliticshashtags)
        sportsmentions = {}
        politicsmentions = {}

        # Analyzing the tweets
        self._count_tokens(
            self.sportstweets, sportswords, sportshashtags, sportsmentions)
        self._count_tokens(
            self.politicstweets, politicswords, politicshashtags,
            politicsmentions)

        # Saving the categorised tweets
        self._dump_json(sportswords, "sportswords")
        self._dump_json(sportshashtags, "sportshashtags")
        self._dump_json(politicswords, "politicswords")
        self._dump_json(politicshashtags, "politicshashtags")
        self._dump_json(politicsmentions, "politicsmentions")
        self._dump_json(sportsmentions, "sportsmentions")
class Classifier:
    """Build per-category token frequency tables from the training tweets.

    Constructing an instance loads the seed vocabularies and both training
    sets from disk and immediately runs ``categorisetweets``, which writes
    six JSON tables (words, hashtags and mentions for sports and politics).
    """

    def __init__(self):
        """Load seed vocabularies and training sets, then build the tables."""
        self.tweetprocessor = TweetProcessor()
        self.topsportswords = self._read_lines(r'topsportswords')
        self.toppoliticswords = self._read_lines(r'toppoliticswords')
        self.topsportshashtags = self._read_lines(r'topsportshashtags')
        self.toppoliticshashtags = self._read_lines(r'toppoliticshashtags')
        self.sportstweets = self._load_json("sportstrainingset")
        self.politicstweets = self._load_json("politicstrainingset")
        self.categorisetweets()

    @staticmethod
    def _read_lines(path):
        """Return the lines of the text file at *path*, closing it afterwards."""
        with open(path, 'r') as handle:
            return handle.read().splitlines()

    @staticmethod
    def _load_json(path):
        """Return the JSON document stored at *path*, closing it afterwards."""
        with open(path) as handle:
            return json.load(handle)

    @staticmethod
    def _dump_json(table, path):
        """Write *table* as JSON to *path*.

        The file is opened in text mode because ``json.dump`` emits ``str``,
        which a binary-mode handle rejects on Python 3.
        """
        with open(path, 'w') as handle:
            json.dump(table, handle)

    def _count_tokens(self, tweets, words, hashtags, mentions):
        """Add every token of *tweets* to the matching frequency table.

        Tokens starting with ``#`` are counted in *hashtags*, tokens starting
        with ``@`` in *mentions*, and all others in *words*. The tables are
        updated in place.
        """
        for tweet in tweets:
            processed = self.tweetprocessor.processTweet(tweet)
            for word in self.tweetprocessor.getwords(processed):
                if not word:
                    # An empty token has no first character to inspect.
                    continue
                if word[0] == '#':
                    table = hashtags
                elif word[0] == '@':
                    table = mentions
                else:
                    table = words
                table[word] = table.get(word, 0) + 1

    def categorisetweets(self):
        """Count tokens per category and persist the six frequency tables.

        The tables start from the seed vocabularies (seed words get a count
        of 10, seed hashtags a count of 5) and are then incremented once per
        token occurrence in the corresponding training tweets.
        """
        # Filling in most common words and hash tags
        sportswords = dict((word, 10) for word in self.topsportswords)
        sportshashtags = dict((word, 5) for word in self.topsportshashtags)
        politicswords = dict((word, 10) for word in self.toppoliticswords)
        politicshashtags = dict((word, 5) for word in self.toppoliticshashtags)
        sportsmentions = {}
        politicsmentions = {}

        # Analyzing the tweets
        self._count_tokens(
            self.sportstweets, sportswords, sportshashtags, sportsmentions)
        self._count_tokens(
            self.politicstweets, politicswords, politicshashtags,
            politicsmentions)

        # Saving the categorised tweets
        self._dump_json(sportswords, "sportswords")
        self._dump_json(sportshashtags, "sportshashtags")
        self._dump_json(politicswords, "politicswords")
        self._dump_json(politicshashtags, "politicshashtags")
        self._dump_json(politicsmentions, "politicsmentions")
        self._dump_json(sportsmentions, "sportsmentions")