def normalize(self, text):
    """
    Make some word improvements before feeding to the sentence tokenizer.
    """
    rr = RepeatReplacer(self.lexicon)
    normalized_text = []
    try:
        for word in text.split():
            # Collapse repeated characters, working on the lowercased word.
            normal = rr.replace(word.lower())
            # Preserve the original word's leading capitalization.
            if word[0].isupper():
                normal = normal[0].upper() + normal[1:]
            normalized_text.append(normal)
        final = " ".join(normalized_text)
    except Exception:
        # If anything goes wrong, fall back to the untouched input.
        final = text

    return final
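RepeatReplacer is assumed here to be a helper in the style of the NLTK cookbook's replacers module, built around a lexicon of known words; a minimal sketch of such a class (an assumption, not the original implementation) looks like this:

import re

class RepeatReplacer(object):
    """Collapse repeated characters ("looooove" -> "love"), stopping as soon
    as the word appears in the supplied lexicon."""

    def __init__(self, lexicon):
        self.lexicon = set(lexicon)
        self.repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
        self.repl = r'\1\2\3'

    def replace(self, word):
        # Known word: leave it alone.
        if word in self.lexicon:
            return word
        # Drop one repeated character and try again until nothing changes.
        repl_word = self.repeat_regexp.sub(self.repl, word)
        if repl_word != word:
            return self.replace(repl_word)
        return repl_word

With a replacer like this (and "love" in the lexicon), normalize("I loooove cricket") would return "I love cricket", keeping the capital "I".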
Example #3
import os

from nltk.corpus import wordnet

# `file`, `tokenizer`, `rep`, `tweets`, `specific_tweets` and `sports_word`
# are assumed to be defined earlier in the original script.
if not os.path.exists(file):
    print("No such file found")
else:
    # Tokenize every line of the file into a list of words.
    for line in open(file):
        a = tokenizer.tokenize(line)
        tweets.append(a)

    sports_synset = wordnet.synset('sport.n.01')
    cricket_synset = wordnet.synset('cricket.n.02')
    i = 0

    for line in tweets:
        out = False
        for word in line:
            # Normalize repeated characters before the WordNet lookup.
            word = rep.replace(word)
            if not wordnet.synsets(word):
                continue
            syn = wordnet.synsets(word)[0]

            if word == "cricket":
                i += 1
                specific_tweets.append(line)
                break

            # Walk the hypernym paths; if the word is a kind of sport,
            # keep the tweet.
            for element in syn.hypernym_paths():
                for word1 in element:
                    if word1 == sports_synset:
                        sports_word.append(word)
                        specific_tweets.append(line)
                        out = True
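# Standalone illustration (not part of the original script) of why the
# hypernym_paths() walk above flags sports vocabulary: WordNet places
# concrete games such as football under sport.n.01.
from nltk.corpus import wordnet

sports_synset = wordnet.synset('sport.n.01')
football = wordnet.synset('football.n.01')

# hypernym_paths() returns lists of synsets from the WordNet root down to the
# synset itself; if sport.n.01 sits on any of those paths, the word denotes a
# kind of sport.
print(any(sports_synset in path for path in football.hypernym_paths()))  # expected: True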