def normalize(self, text):
    """Make some word improvements before feeding to the sentence tokenizer."""
    rr = RepeatReplacer(self.lexicon)
    normalized_text = []
    try:
        for word in text.split():
            # Collapse repeated characters, then restore the original casing.
            normal = rr.replace(word.lower())
            if word[0].isupper():
                normal = normal[0].upper() + normal[1:]
            normalized_text.append(normal)
        return " ".join(normalized_text)
    except Exception:
        # If anything goes wrong, fall back to the untouched text.
        return text
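# RepeatReplacer is not defined in this snippet; below is a minimal sketch
# following the common NLTK-cookbook pattern, assuming `lexicon` is a
# container of known words used to stop collapsing early.
import re
from nltk.corpus import wordnet

class RepeatReplacer(object):
    """Collapse repeated characters ("looooove" -> "love") until the word
    is a known word or no repeats remain."""

    def __init__(self, lexicon=None):
        self.repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
        self.repl = r'\1\2\3'
        self.lexicon = lexicon

    def replace(self, word):
        # A word already in the lexicon or in WordNet is left alone.
        if self.lexicon is not None and word in self.lexicon:
            return word
        if wordnet.synsets(word):
            return word
        repl_word = self.repeat_regexp.sub(self.repl, word)
        # Recurse while the substitution keeps removing characters.
        if repl_word != word:
            return self.replace(repl_word)
        return repl_word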
import os
from nltk.corpus import wordnet

if not os.path.exists(file):
    print "No such file found"
else:
    for line in open(file):
        a = tokenizer.tokenize(line)
        tweets.append(a)

# Reference synsets: a tweet is kept if a word's hypernym path reaches 'sport'.
sports_synset = wordnet.synset('sport.n.01')
cricket_synset = wordnet.synset('cricket.n.02')

i = 0
for line in tweets:
    out = False
    for word in line:
        word = rep.replace(word)  # normalize repeated characters before lookup
        if not wordnet.synsets(word):
            continue
        syn = wordnet.synsets(word)[0]
        if word == "cricket":
            i += 1
            specific_tweets.append(line)
            break
        # Climb every hypernym path of the word's first sense; a hit on
        # the 'sport' synset marks the tweet as sports-related.
        for element in syn.hypernym_paths():
            for word1 in element:
                if word1 == sports_synset:
                    sports_word.append(word)
                    specific_tweets.append(line)
                    out = True
                    break
            if out:
                break
        # Stop scanning this tweet once it has been classified.
        if out:
            break
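# A quick sanity check of the hypernym logic above (a minimal sketch, not
# part of the original script): in WordNet, the sport sense of "cricket"
# reaches sport.n.01 on its way to the root, so the path test succeeds.
from nltk.corpus import wordnet

cricket = wordnet.synset('cricket.n.02')
sport = wordnet.synset('sport.n.01')
print any(sport in path for path in cricket.hypernym_paths())  # True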