Ejemplo n.º 1
0
def cleanUpTrainingSet(a, b, c, cat):
    """Drop tweets from one training set that score higher for the other category.

    Every token of every tweet in ``<cat>trainingset`` is looked up in the
    sports and politics frequency tables stored as JSON in the working
    directory.  Each token contributes its add-one-smoothed share of the two
    counts, scaled by a per-token-kind multiplier, to a sports total and a
    politics total.  Tweets whose totals favour the other category are
    removed and the file is rewritten in place.

    This is meant to be run once, after the frequency tables have been built.

    Args:
        a: multiplier for hashtag tokens (tokens starting with ``#``).
        b: multiplier for mention tokens (tokens starting with ``@``).
        c: multiplier for every other token.
        cat: ``"sports"`` or ``"politics"``; selects the training-set file and
            which tweets are dropped.  Any other value drops nothing.
    """
    def load(path):
        # Close each file as soon as it has been parsed.
        with open(path) as handle:
            return json.load(handle)

    tweetprocessor = TweetProcessor()
    sportswords = load("sportswords")
    politicswords = load("politicswords")
    sportshashtags = load("sportshashtags")
    politicshashtags = load("politicshashtags")
    sportsmentions = load("sportsmentions")
    politicsmentions = load("politicsmentions")
    tweets = load(cat + "trainingset")

    # Collect survivors in a new list: removing from ``tweets`` while
    # iterating over it would skip the tweet following each removal.
    kept = []
    for actualtweet in tweets:
        tweet = tweetprocessor.processTweet(actualtweet)
        totalsportsweight = 0.0
        totalpoliticsweight = 0.0
        for word in tweetprocessor.getwords(tweet):
            if word == '':
                continue
            singletoken = len(word.split()) < 2
            if word[0] == '#' and singletoken:
                sportstable, politicstable, factor = (
                    sportshashtags, politicshashtags, a)
            elif word[0] == '@' and singletoken:
                sportstable, politicstable, factor = (
                    sportsmentions, politicsmentions, b)
            else:
                sportstable, politicstable, factor = (
                    sportswords, politicswords, c)
            # Add-one smoothing so unseen tokens split evenly.
            sportwordweight = sportstable.get(word, 0.0) + 1.0
            politicwordweight = politicstable.get(word, 0.0) + 1.0
            combined = sportwordweight + politicwordweight
            if combined != 0:
                totalsportsweight += factor * (sportwordweight / combined)
                totalpoliticsweight += factor * (politicwordweight / combined)

        misfiled = (
            (cat == "politics" and totalsportsweight > totalpoliticsweight)
            or (cat == "sports" and totalsportsweight < totalpoliticsweight))
        if not misfiled:
            kept.append(actualtweet)

    # Text mode: json.dump writes str, which a binary handle rejects on
    # Python 3.
    with open(cat + "trainingset", 'w') as handle:
        json.dump(kept, handle, indent=True)
Ejemplo n.º 2
0
 def __init__(self):
     """Load seed lists and training sets, then build the frequency tables.

     Reads four newline-separated seed files and the two JSON training
     sets from the working directory, then calls ``categorisetweets``.
     """
     def read_lines(path):
         # Context manager so the handle is closed right after reading.
         with open(path, 'r') as handle:
             return handle.read().splitlines()

     def read_json(path):
         with open(path) as handle:
             return json.load(handle)

     self.tweetprocessor = TweetProcessor()
     self.topsportswords = read_lines(r'topsportswords')
     self.toppoliticswords = read_lines(r'toppoliticswords')
     self.topsportshashtags = read_lines(r'topsportshashtags')
     self.toppoliticshashtags = read_lines(r'toppoliticshashtags')
     self.sportstweets = read_json("sportstrainingset")
     self.politicstweets = read_json("politicstrainingset")
     self.categorisetweets()
Ejemplo n.º 3
0
def addTrainingToTweets():
	"""Split ``training.txt`` into per-category JSON training sets.

	Each line's second whitespace-separated field is its category label.
	Lines labelled ``Sports`` or ``Politics`` have their tweet text cut out,
	passed through ``TweetProcessor.processTweet`` and appended to the
	matching list.  The lists are written to ``sportstrainingset`` and
	``politicstrainingset``.  Lines with fewer than two fields or another
	label are skipped.
	"""
	# Offset from the first space to the start of the tweet text.  Each value
	# is len(" <label> ") + 1, so one extra character after the label is
	# skipped, and the slice below also drops the final character.
	# NOTE(review): presumably the tweet is wrapped in quotes -- confirm
	# against the training.txt format.
	textoffsets = {'Sports': 9, 'Politics': 11}
	tweetprocessor = TweetProcessor()
	collected = {'Sports': [], 'Politics': []}

	with open(r'training.txt', 'r') as handle:
		trainingset = handle.read().splitlines()

	for line in trainingset:
		fields = line.split()
		# Blank or label-less lines would otherwise raise IndexError.
		if len(fields) < 2 or fields[1] not in textoffsets:
			continue
		label = fields[1]
		start = line.find(" ") + textoffsets[label]
		collected[label].append(tweetprocessor.processTweet(line[start:-1]))

	# Text mode: json.dump writes str, which a binary handle rejects on
	# Python 3.
	with open("sportstrainingset", 'w') as handle:
		json.dump(collected['Sports'], handle, indent=True)
	with open("politicstrainingset", 'w') as handle:
		json.dump(collected['Politics'], handle, indent=True)
Ejemplo n.º 4
0
def cleanUpTrainingSet(a,b,c,cat):
	"""Drop tweets from one training set that score higher for the other category.

	Every token of every tweet in ``<cat>trainingset`` is looked up in the
	sports and politics frequency tables stored as JSON in the working
	directory.  Each token contributes its add-one-smoothed share of the two
	counts, scaled by a per-token-kind multiplier, to a sports total and a
	politics total.  Tweets whose totals favour the other category are
	removed and the file is rewritten in place.

	This is meant to be run once, after the frequency tables have been built.

	Args:
		a: multiplier for hashtag tokens (tokens starting with ``#``).
		b: multiplier for mention tokens (tokens starting with ``@``).
		c: multiplier for every other token.
		cat: ``"sports"`` or ``"politics"``; selects the training-set file and
			which tweets are dropped.  Any other value drops nothing.
	"""
	def load(path):
		# Close each file as soon as it has been parsed.
		with open(path) as handle:
			return json.load(handle)

	tweetprocessor = TweetProcessor()
	sportswords = load("sportswords")
	politicswords = load("politicswords")
	sportshashtags = load("sportshashtags")
	politicshashtags = load("politicshashtags")
	sportsmentions = load("sportsmentions")
	politicsmentions = load("politicsmentions")
	tweets = load(cat + "trainingset")

	# Collect survivors in a new list: removing from ``tweets`` while
	# iterating over it would skip the tweet following each removal.
	kept = []
	for actualtweet in tweets:
		tweet = tweetprocessor.processTweet(actualtweet)
		totalsportsweight = 0.0
		totalpoliticsweight = 0.0
		for word in tweetprocessor.getwords(tweet):
			if word == '':
				continue
			singletoken = len(word.split()) < 2
			if word[0] == '#' and singletoken:
				sportstable, politicstable, factor = sportshashtags, politicshashtags, a
			elif word[0] == '@' and singletoken:
				sportstable, politicstable, factor = sportsmentions, politicsmentions, b
			else:
				sportstable, politicstable, factor = sportswords, politicswords, c
			# Add-one smoothing so unseen tokens split evenly.
			sportwordweight = sportstable.get(word, 0.0) + 1.0
			politicwordweight = politicstable.get(word, 0.0) + 1.0
			combined = sportwordweight + politicwordweight
			if combined != 0:
				totalsportsweight += factor * (sportwordweight / combined)
				totalpoliticsweight += factor * (politicwordweight / combined)

		misfiled = (
			(cat == "politics" and totalsportsweight > totalpoliticsweight)
			or (cat == "sports" and totalsportsweight < totalpoliticsweight))
		if not misfiled:
			kept.append(actualtweet)

	# Text mode: json.dump writes str, which a binary handle rejects on
	# Python 3.
	with open(cat + "trainingset", 'w') as handle:
		json.dump(kept, handle, indent=True)
Ejemplo n.º 5
0
	def __init__(self):
		"""Load seed lists and training sets, then build the frequency tables.

		Reads four newline-separated seed files and the two JSON training
		sets from the working directory, then calls ``categorisetweets``.
		"""
		def read_lines(path):
			# Context manager so the handle is closed right after reading.
			with open(path, 'r') as handle:
				return handle.read().splitlines()

		def read_json(path):
			with open(path) as handle:
				return json.load(handle)

		self.tweetprocessor = TweetProcessor()
		self.topsportswords = read_lines(r'topsportswords')
		self.toppoliticswords = read_lines(r'toppoliticswords')
		self.topsportshashtags = read_lines(r'topsportshashtags')
		self.toppoliticshashtags = read_lines(r'toppoliticshashtags')
		self.sportstweets = read_json("sportstrainingset")
		self.politicstweets = read_json("politicstrainingset")
		self.categorisetweets()
Ejemplo n.º 6
0
def addTrainingToTweets():
    """Split ``training.txt`` into per-category JSON training sets.

    Each line's second whitespace-separated field is its category label.
    Lines labelled ``Sports`` or ``Politics`` have their tweet text cut out,
    passed through ``TweetProcessor.processTweet`` and appended to the
    matching list.  The lists are written to ``sportstrainingset`` and
    ``politicstrainingset``.  Lines with fewer than two fields or another
    label are skipped.
    """
    # Offset from the first space to the start of the tweet text.  Each value
    # is len(" <label> ") + 1, so one extra character after the label is
    # skipped, and the slice below also drops the final character.
    # NOTE(review): presumably the tweet is wrapped in quotes -- confirm
    # against the training.txt format.
    textoffsets = {'Sports': 9, 'Politics': 11}
    tweetprocessor = TweetProcessor()
    collected = {'Sports': [], 'Politics': []}

    with open(r'training.txt', 'r') as handle:
        trainingset = handle.read().splitlines()

    for line in trainingset:
        fields = line.split()
        # Blank or label-less lines would otherwise raise IndexError.
        if len(fields) < 2 or fields[1] not in textoffsets:
            continue
        label = fields[1]
        start = line.find(" ") + textoffsets[label]
        collected[label].append(tweetprocessor.processTweet(line[start:-1]))

    # Text mode: json.dump writes str, which a binary handle rejects on
    # Python 3.
    with open("sportstrainingset", 'w') as handle:
        json.dump(collected['Sports'], handle, indent=True)
    with open("politicstrainingset", 'w') as handle:
        json.dump(collected['Politics'], handle, indent=True)
Ejemplo n.º 7
0
	def __init__(self):
		"""Load the six saved JSON tables and create a tweet processor.

		The sports/politics word, hashtag and mention tables are read from
		files of the same names in the working directory.
		"""
		def read_json(path):
			# Context manager so the handle is closed right after parsing.
			with open(path) as handle:
				return json.load(handle)

		self.sportswords = read_json("sportswords")
		self.politicswords = read_json("politicswords")

		self.sportshashtags = read_json("sportshashtags")
		self.politicshashtags = read_json("politicshashtags")

		self.sportsmentions = read_json("sportsmentions")
		self.politicsmentions = read_json("politicsmentions")
		self.tweetprocessor = TweetProcessor()
Ejemplo n.º 8
0
class Classifier:
	"""Builds sports/politics token frequency tables from the training sets.

	Constructing an instance reads the seed lists and training sets from the
	working directory and immediately writes six JSON tables (words, hashtags
	and mentions for each category) via ``categorisetweets``.
	"""

	def __init__(self):
		"""Load the seed lists and training sets, then build the tables."""
		self.tweetprocessor = TweetProcessor()
		self.topsportswords = self._read_lines(r'topsportswords')
		self.toppoliticswords = self._read_lines(r'toppoliticswords')
		self.topsportshashtags = self._read_lines(r'topsportshashtags')
		self.toppoliticshashtags = self._read_lines(r'toppoliticshashtags')
		self.sportstweets = self._load_json("sportstrainingset")
		self.politicstweets = self._load_json("politicstrainingset")
		self.categorisetweets()

	def _read_lines(self, path):
		"""Return the lines of a text file, closing it afterwards."""
		with open(path, 'r') as handle:
			return handle.read().splitlines()

	def _load_json(self, path):
		"""Return the parsed content of a JSON file, closing it afterwards."""
		with open(path) as handle:
			return json.load(handle)

	def _dump_json(self, table, path):
		"""Write ``table`` as JSON to ``path``."""
		# Text mode: json.dump writes str, which a binary handle rejects on
		# Python 3.
		with open(path, 'w') as handle:
			json.dump(table, handle)

	def _tally(self, tweets, words, hashtags, mentions):
		"""Count every token of ``tweets`` into the table matching its kind."""
		for tweet in tweets:
			tweet = self.tweetprocessor.processTweet(tweet)
			for word in self.tweetprocessor.getwords(tweet):
				if not word:
					# Empty tokens have no first character to inspect.
					continue
				if word[0] == '#':
					table = hashtags
				elif word[0] == '@':
					table = mentions
				else:
					table = words
				table[word] = table.get(word, 0) + 1

	def categorisetweets(self):
		"""Build the six frequency tables and save them as JSON files.

		Seed words start at a count of 10 and seed hashtags at 5.  Tokens
		from the training tweets then add 1 per occurrence to the table for
		their category and kind (``#`` hashtag, ``@`` mention, other word).
		"""
		# Filling in most common words and hash tags
		sportswords = dict.fromkeys(self.topsportswords, 10)
		sportshashtags = dict.fromkeys(self.topsportshashtags, 5)
		politicswords = dict.fromkeys(self.toppoliticswords, 10)
		politicshashtags = dict.fromkeys(self.toppoliticshashtags, 5)
		sportsmentions = {}
		politicsmentions = {}

		# Analyzing the tweets
		self._tally(self.sportstweets, sportswords, sportshashtags, sportsmentions)
		self._tally(self.politicstweets, politicswords, politicshashtags, politicsmentions)

		# Saving the categorised tweets
		self._dump_json(sportswords, "sportswords")
		self._dump_json(sportshashtags, "sportshashtags")

		self._dump_json(politicswords, "politicswords")
		self._dump_json(politicshashtags, "politicshashtags")

		self._dump_json(politicsmentions, "politicsmentions")
		self._dump_json(sportsmentions, "sportsmentions")
Ejemplo n.º 9
0
class Classifier:
    """Builds sports/politics token frequency tables from the training sets.

    Constructing an instance reads the seed lists and training sets from the
    working directory and immediately writes six JSON tables (words, hashtags
    and mentions for each category) via ``categorisetweets``.
    """

    def __init__(self):
        """Load the seed lists and training sets, then build the tables."""
        self.tweetprocessor = TweetProcessor()
        self.topsportswords = self._read_lines(r'topsportswords')
        self.toppoliticswords = self._read_lines(r'toppoliticswords')
        self.topsportshashtags = self._read_lines(r'topsportshashtags')
        self.toppoliticshashtags = self._read_lines(r'toppoliticshashtags')
        self.sportstweets = self._load_json("sportstrainingset")
        self.politicstweets = self._load_json("politicstrainingset")
        self.categorisetweets()

    def _read_lines(self, path):
        """Return the lines of a text file, closing it afterwards."""
        with open(path, 'r') as handle:
            return handle.read().splitlines()

    def _load_json(self, path):
        """Return the parsed content of a JSON file, closing it afterwards."""
        with open(path) as handle:
            return json.load(handle)

    def _dump_json(self, table, path):
        """Write ``table`` as JSON to ``path``."""
        # Text mode: json.dump writes str, which a binary handle rejects on
        # Python 3.
        with open(path, 'w') as handle:
            json.dump(table, handle)

    def _tally(self, tweets, words, hashtags, mentions):
        """Count every token of ``tweets`` into the table matching its kind."""
        for tweet in tweets:
            tweet = self.tweetprocessor.processTweet(tweet)
            for word in self.tweetprocessor.getwords(tweet):
                if not word:
                    # Empty tokens have no first character to inspect.
                    continue
                if word[0] == '#':
                    table = hashtags
                elif word[0] == '@':
                    table = mentions
                else:
                    table = words
                table[word] = table.get(word, 0) + 1

    def categorisetweets(self):
        """Build the six frequency tables and save them as JSON files.

        Seed words start at a count of 10 and seed hashtags at 5.  Tokens
        from the training tweets then add 1 per occurrence to the table for
        their category and kind (``#`` hashtag, ``@`` mention, other word).
        """
        # Filling in most common words and hash tags
        sportswords = dict.fromkeys(self.topsportswords, 10)
        sportshashtags = dict.fromkeys(self.topsportshashtags, 5)
        politicswords = dict.fromkeys(self.toppoliticswords, 10)
        politicshashtags = dict.fromkeys(self.toppoliticshashtags, 5)
        sportsmentions = {}
        politicsmentions = {}

        # Analyzing the tweets
        self._tally(self.sportstweets, sportswords, sportshashtags,
                    sportsmentions)
        self._tally(self.politicstweets, politicswords, politicshashtags,
                    politicsmentions)

        # Saving the categorised tweets
        self._dump_json(sportswords, "sportswords")
        self._dump_json(sportshashtags, "sportshashtags")

        self._dump_json(politicswords, "politicswords")
        self._dump_json(politicshashtags, "politicshashtags")

        self._dump_json(politicsmentions, "politicsmentions")
        self._dump_json(sportsmentions, "sportsmentions")