def is_tweet_valid(self, tweet):
    """Decide whether a tweet payload should be kept.

    The checks run in order and the first one that fails wins:

    1. the payload is falsy or contains a 'delete' key;
    2. the 'lang' key is missing or is not 'en';
    3. the 'text' key is missing or the text starts with 'RT';
    4. after ``TwitterMixin.word_map`` the text contains both the
       '__h__' and the '__s__' tokens (presumably the happy / sad
       emoticon placeholders -- confirm against ``word_map``).

    Each rejection is reported through ``logger.debug``.

    :param tweet: mapping-like tweet payload (may be empty or None).
    :return: True when the tweet passes every check, False otherwise.
    """
    is_empty = not tweet or 'delete' in tweet
    if is_empty:
        logger.debug('Empty tweet - skipping')
        return False

    is_foreign = 'lang' not in tweet or tweet['lang'] != 'en'
    if is_foreign:
        logger.debug('Non EN - skipping')
        return False

    is_retweet = 'text' not in tweet or tweet['text'].startswith('RT')
    if is_retweet:
        logger.debug('RE-Tweet found - skipping')
        return False

    # Tokenise the mapped text once, then look for both markers in it.
    tokens = TwitterMixin.word_map(tweet['text']).split()
    if all(marker in tokens for marker in ('__h__', '__s__')):
        logger.debug('Tweet with double emoicons found - skipping')
        return False

    return True
def is_tweet_valid(self, tweet):
    """Return True when a tweet payload passes all filtering rules.

    A tweet is rejected (False is returned and a debug message is
    logged) as soon as one of the following holds, tested in this order:

    * the payload is falsy or has a "delete" key;
    * it has no "lang" key, or its "lang" is not "en";
    * it has no "text" key, or its text starts with "RT";
    * its text, once passed through ``TwitterMixin.word_map`` and split
      on whitespace, contains both "__h__" and "__s__" (presumably the
      two opposite emoticon placeholders -- confirm against
      ``word_map``).

    :param tweet: mapping-like tweet payload (may be empty or None).
    :return: bool, True if the tweet should be kept.
    """
    if not tweet or "delete" in tweet:
        logger.debug("Empty tweet - skipping")
        return False

    wrong_language = "lang" not in tweet or tweet["lang"] != "en"
    if wrong_language:
        logger.debug("Non EN - skipping")
        return False

    looks_like_retweet = (
        "text" not in tweet or tweet["text"].startswith("RT")
    )
    if looks_like_retweet:
        logger.debug("RE-Tweet found - skipping")
        return False

    mapped_words = TwitterMixin.word_map(tweet["text"]).split()
    has_first_marker = "__h__" in mapped_words
    # Only look for the second marker when the first one is present.
    if has_first_marker and "__s__" in mapped_words:
        logger.debug("Tweet with double emoicons found - skipping")
        return False

    return True
def test_remove_usernames(self):
    """remove_usernames should drop the '@username' token."""
    cleaned = TwitterMixin.remove_usernames("hello @username xxx")
    # Compare word lists so surrounding whitespace does not matter.
    self.assertEqual(["hello", "xxx"], cleaned.split())
def test_word_map(self):
    """word_map should replace ':)', ':(' and 'not' with placeholders."""
    mapped = TwitterMixin.word_map("hello :) :( not xxx :)")
    wanted = ["hello", "__h__", "__s__", "__not__", "xxx", "__h__"]
    # Compare word lists so surrounding whitespace does not matter.
    self.assertEqual(wanted, mapped.split())
def test_char_fold(self):
    """char_fold should shrink 'loooooooool' to 'lool'."""
    folded = TwitterMixin.char_fold("hello loooooooool")
    # Compare word lists so surrounding whitespace does not matter.
    self.assertEqual(["hello", "lool"], folded.split())
def test_remove_numbers(self):
    """remove_numbers should drop the '12456' token."""
    cleaned = TwitterMixin.remove_numbers("hello 12456 xxx")
    # Compare word lists so surrounding whitespace does not matter.
    self.assertEqual(["hello", "xxx"], cleaned.split())
def test_remove_urls(self):
    """remove_urls should drop the 'http://cyhex.com' token."""
    cleaned = TwitterMixin.remove_urls("hello http://cyhex.com xxx")
    # Compare word lists so surrounding whitespace does not matter.
    self.assertEqual(["hello", "xxx"], cleaned.split())
def test_remove_hashtags(self):
    """remove_hashtags should drop the '#hashtag' token."""
    cleaned = TwitterMixin.remove_hashtags("hello #hashtag xxx")
    # Compare word lists so surrounding whitespace does not matter.
    self.assertEqual(["hello", "xxx"], cleaned.split())
return (feature_extractor(row.text), row.get_label()) if args.source: featureset = [] f = io.open(args.source) c = 0 for l in f.readlines(): pos, id, posScore, negScore, synsetTerm, gloss = l.split('\t') c += 1 if c == 1: continue gloss = TwitterMixin.make_plain(gloss) print negScore negScore = float(negScore) posScore = float(posScore) if posScore > negScore: label = labels.positive elif posScore < negScore: label = labels.negative else: continue featureset.append((tokenizer.getFeatures(gloss), label)) else: