class Test_Data(unittest.TestCase):
    """Exercise ``TweetData`` against a throwaway sqlite database file."""

    def setUp(self):
        self.tweets_data_path = 'test/sample.json'
        self.db = './test.sqlite'
        self.feels_db = TweetData(self.db)

    def tearDown(self):
        # Remove the database file so every test starts from a clean slate.
        os.remove(self.db)

    def test_file_creation(self):
        # The database file must exist once setUp has built TweetData.
        self.assertTrue(os.path.exists(self.db))

    def test_fields(self):
        fields = self.feels_db.fields
        self.assertIsInstance(fields, tuple)
        self.assertGreaterEqual(len(fields), 11)

    def test_scrub(self):
        data = {'a': 1, 'b': 2}
        scrubbed = self.feels_db.scrub(data)
        self.assertIsInstance(scrubbed, str)

    def test_data_operation(self):
        twt = {
            'created_at': 'Sun Feb 19 19:14:18 +0000 2017',
            'id_str': '833394296418082817',
            'text': 'All the feels!',
        }
        t = Tweet(twt)
        self.assertEqual(len(t.keys()), 3)

        # A freshly inserted tweet shows up in the queue.
        self.feels_db.insert_tweet(t)
        df = self.feels_db.queue
        self.assertEqual(len(df), 1)

        # Item assignment is the unambiguous way to write a DataFrame column;
        # attribute assignment only works when the column already exists.
        df['sentiment'] = 0.9
        for row in df.itertuples():
            self.feels_db.update_tweet(
                {'id_str': row.id_str, 'sentiment': row.sentiment})

        # After the update the queue is empty while the tweet is still stored.
        self.assertEqual(len(self.feels_db.queue), 0)
        self.assertEqual(len(self.feels_db.all), 1)
class Test_Data(unittest.TestCase):
    """Exercise ``TweetData`` storage, date queries and time-binned fetches."""

    def setUp(self):
        self.tweets_data_path = 'test/sample.json'
        self.db = './test.sqlite'
        self.feels_db = TweetData(self.db)
        self.tweets = [
            {
                'created_at': 'Sun Feb 19 19:14:18 +0000 2017',
                'id_str': '833394296418082817',
                'text': 'Tweetfeels is tremendous! Believe me. I know.',
                'user': {
                    'followers_count': '100',
                    'friends_count': '200',
                    'location': None
                }
            },  # sentiment value = 0
            {
                'created_at': 'Sun Feb 20 19:14:19 +0000 2017',
                'id_str': '833394296418082818',
                'text': 'Fake news. Sad!',
                'user': {
                    'followers_count': '200',
                    'friends_count': '200',
                    'location': None
                }
            },  # sentiment value = -0.7351
            {
                'created_at': 'Sun Feb 21 19:14:20 +0000 2017',
                'id_str': '833394296418082819',
                'text': 'I hate it.',
                'user': {
                    'followers_count': '200',
                    'friends_count': '200',
                    'location': None
                }
            }  # sentiment value = -0.5719
        ]
        self.mock_tweets = [Tweet(t) for t in self.tweets]

    def tearDown(self):
        # Remove the database file so every test starts from a clean slate.
        os.remove(self.db)

    def _load_sample_tweets(self):
        # Build a Tweet from every non-blank line of the sample file.
        tweets = []
        with open(self.tweets_data_path) as tweets_file:
            for raw in tweets_file:
                line = raw.rstrip()
                if not line:
                    continue
                try:
                    tweets.append(Tweet(json.loads(line)))
                except KeyError:
                    # Deliberate best-effort: records for which Tweet raises
                    # KeyError are skipped rather than failing the test.
                    pass
        return tweets

    def test_file_creation(self):
        # The database file must exist once setUp has built TweetData.
        self.assertTrue(os.path.exists(self.db))

    def test_fields(self):
        fields = self.feels_db.fields
        self.assertIsInstance(fields, tuple)
        self.assertGreaterEqual(len(fields), 11)

    def test_start(self):
        self.assertIsInstance(self.feels_db.start, datetime)

    def test_dates(self):
        for t in self.mock_tweets:
            self.feels_db.insert_tweet(t)
        self.assertEqual(len(self.feels_db.tweet_dates), 3)

        for t in self._load_sample_tweets():
            self.feels_db.insert_tweet(t)
        self.assertEqual(len(self.feels_db.tweet_dates), 105)

        df = self.feels_db.tweet_dates
        timebox = timedelta(seconds=60)
        second = timedelta(seconds=1)
        # pd.Grouper replaces pd.TimeGrouper, which pandas 1.0 removed.
        df = df.groupby(pd.Grouper(freq=f'{int(timebox/second)}S')).size()
        df = df[df != 0]  # keep only the non-empty 60 second bins
        self.assertEqual(len(df), 3)
        self.assertEqual(df.iloc[0], 103)

    def test_fetch(self):
        for t in self._load_sample_tweets():
            self.feels_db.insert_tweet(t)
        for t in self.mock_tweets:
            self.feels_db.insert_tweet(t)

        it = self.feels_db.fetchbin(binsize=timedelta(minutes=30))
        cur = next(it)
        self.assertEqual(cur.end - cur.start, timedelta(minutes=30))
        self.assertEqual(len(cur), 103)
        cur = next(it)
        self.assertEqual(len(cur), 1)
        cur = next(it)
        self.assertEqual(len(cur), 1)

    def test_empty(self):
        for t in self.mock_tweets:
            self.feels_db.insert_tweet(t)
        it = self.feels_db.fetchbin(binsize=timedelta(hours=12), empty=True)
        cur = next(it)
        self.assertEqual(len(cur), 1)
        cur = next(it)
        self.assertEqual(len(cur), 0)
        cur = next(it)
        self.assertEqual(len(cur), 1)
        next(it)  # the fourth bin is fetched but not checked
        cur = next(it)
        self.assertEqual(len(cur), 1)

    def test_bin(self):
        for t in self.mock_tweets:
            self.feels_db.insert_tweet(t)
        it = self.feels_db.fetchbin(binsize=timedelta(hours=12), empty=True)
        cur = next(it)
        self.assertEqual(cur.influence, 300)
        cur = next(it)
        self.assertEqual(cur.influence, 0)
        cur = next(it)
        self.assertEqual(cur.influence, 400)
        next(it)  # the fourth bin is fetched but not checked
        cur = next(it)
        self.assertEqual(cur.influence, 400)

    def test_data_operation(self):
        twt = {
            'created_at': 'Sun Feb 19 19:14:18 +0000 2017',
            'id_str': '833394296418082817',
            'text': 'All the feels!',
        }
        t = Tweet(twt)
        self.assertEqual(len(t.keys()), 7)
        self.feels_db.insert_tweet(t)

        b = self.feels_db.tweets_since(datetime.now())
        self.assertEqual(len(b), 0)
        b = self.feels_db.tweets_since(0)
        self.assertEqual(len(b), 1)

        # Item assignment is the unambiguous way to write a DataFrame column;
        # attribute assignment only works when the column already exists.
        b.df['sentiment'] = 0.9
        for row in b.df.itertuples():
            self.feels_db.update_tweet(
                {'id_str': row.id_str, 'sentiment': row.sentiment})

        start = datetime(2017, 2, 17, 0, 0, 0)
        before = datetime(2017, 2, 18, 0, 0, 0)
        after = datetime(2017, 2, 20, 0, 0, 0)
        # The tweet is dated 2017-02-19, so only the window that ends on
        # 2017-02-20 contains it.
        b = self.feels_db.tweets_between(start, before)
        self.assertEqual(len(b), 0)
        b = self.feels_db.tweets_between(start, after)
        self.assertEqual(len(b), 1)
class TweetFeels(object):
    """
    The controller.

    :param credentials: A list of your 4 credential components.
    :param tracking: A list of keywords to track. Defaults to an empty list.
    :param db: A sqlite database to store data. Will be created if it doesn't
        already exist. Will append if it exists.
    :ivar calc_every_n: Won't calculate new sentiment until there are more
        than n records in the queue.
    :ivar lang: A list of languages to include in tweet gathering.
    """

    def __init__(self, credentials, tracking=None, db='feels.sqlite'):
        self._listener = TweetListener(self.on_data, self.on_error)
        self._feels = TweetData(db)
        _auth = OAuthHandler(credentials[0], credentials[1])
        _auth.set_access_token(credentials[2], credentials[3])
        self._stream = Stream(_auth, self._listener)
        # Give every instance its own list: a literal ``[]`` default would be
        # one shared object mutated by all instances.
        self.tracking = [] if tracking is None else tracking
        self.lang = ['en']
        self._sentiment = 0
        self._filter_level = 'low'
        self.calc_every_n = 10

    def start(self, seconds=None):
        """
        Start streaming tweets that match ``self.tracking``.

        :param seconds: If given, a background thread sleeps this many seconds
            and then calls :meth:`stop`.
        """
        def delayed_stop():
            time.sleep(seconds)
            print('Timer completed. Disconnecting now...')
            self.stop()

        if not self.tracking:
            print('Nothing to track!')
        else:
            self._start_stream()

        if seconds is not None:
            Thread(target=delayed_stop).start()

    def _start_stream(self):
        """Open the filtered stream in tweepy's background (async) mode."""
        # NOTE: ``filter_level`` is intentionally not passed here because of
        # an upstream bug in tweepy 3.5.0 (fixed by
        # https://github.com/tweepy/tweepy/pull/783); ``on_data`` applies the
        # filter level instead.
        options = {'track': self.tracking, 'languages': self.lang}
        try:
            # tweepy >= 3.7 spells the flag ``is_async`` because ``async``
            # became a reserved word in Python 3.7.
            self._stream.filter(is_async=True, **options)
        except TypeError:
            # Older tweepy only knows ``async``; pass it through a dict so
            # this module still parses on Python 3.7+.
            options['async'] = True
            self._stream.filter(**options)

    def stop(self):
        """Disconnect the stream."""
        self._stream.disconnect()

    def on_data(self, data):
        """
        Store ``data`` if its filter level is at least ``self._filter_level``.

        Note: Due to upstream bug in tweepy for python3, it cannot handle the
        `filter_level` parameter in the `Stream.filter` function. Therefore,
        we'll take care of it here. The problem has been identified and fixed
        by the tweepy team here: https://github.com/tweepy/tweepy/pull/783

        :param data: A mapping with a ``filter_level`` key whose value is one
            of ``'none'``, ``'low'`` or ``'medium'``.
        :raises KeyError: If ``filter_level`` is missing or not one of the
            three known levels.
        """
        filter_value = {'none': 0, 'low': 1, 'medium': 2}
        value = filter_value[data['filter_level']]
        if value >= filter_value[self._filter_level]:
            self._feels.insert_tweet(data)

    def on_error(self, status):
        """Ignore stream errors (deliberate no-op)."""
        pass

    def _intensity(self, tweet):
        """Return the compound polarity score of the cleaned tweet text."""
        # Build the analyzer once and reuse it; this method runs once per
        # queued tweet from the ``sentiment`` property.
        analyzer = getattr(self, '_analyzer', None)
        if analyzer is None:
            analyzer = self._analyzer = SentimentIntensityAnalyzer()
        return analyzer.polarity_scores(clean(tweet))['compound']

    @property
    def sentiment(self):
        """
        The running sentiment value.

        When the queue holds more than ``calc_every_n`` tweets, each queued
        tweet is scored and written back to the database. The non-zero scores
        are then averaged per ``created_at`` value, weighted by
        ``followers_count``, and folded into the running value in index order
        as ``old * 0.99 + new * 0.01``. Otherwise the previous value is
        returned unchanged.
        """
        df = self._feels.queue
        if len(df) > self.calc_every_n:
            df['sentiment'] = df.text.apply(self._intensity)
            for row in df.itertuples():
                self._feels.update_tweet(
                    {'id_str': row.id_str, 'sentiment': row.sentiment})

            scored = df.loc[df.sentiment != 0]  # drop rows having 0 sentiment
            if scored.empty:
                # Nothing but neutral tweets: leave the running value alone
                # instead of folding an empty frame into it.
                return self._sentiment

            # NOTE(review): np.average raises ZeroDivisionError when the
            # weights of a group sum to zero - confirm followers_count can
            # never be all zero for one timestamp.
            averages = scored.groupby('created_at').apply(
                lambda x: np.average(x.sentiment, weights=x.followers_count))
            for _, value in averages.sort_index().items():
                self._sentiment = self._sentiment * 0.99 + value * 0.01
        return self._sentiment
class Test_Feels(unittest.TestCase):
    """Exercise the ``TweetFeels`` controller with mocked tweepy pieces."""

    def setUp(self):
        # NOTE(review): these class attributes are replaced for the whole
        # process and never restored - confirm no other test needs the real
        # factories.
        TweetFeels._db_factory = (lambda db: MagicMock())
        TweetFeels._auth_factory = (lambda cred: MagicMock())
        TweetFeels._listener_factory = (lambda ctrl: MagicMock())
        TweetFeels._stream_factory = (lambda auth, listener: MagicMock())
        self.tweets_data_path = 'test/sample.json'
        self.tweets = [
            {'created_at': 'Sun Feb 19 09:14:18 +0000 2017',
             'id_str': '833394296418082817',
             'text': 'Tweetfeels is tremendous! Believe me. I know.',
             'user': {'followers_count': '100', 'friends_count': '200',
                      'location': None}
             },  # sentiment value = 0
            {'created_at': 'Sun Feb 21 18:14:19 +0000 2017',
             'id_str': '833394296418082818',
             'text': 'Fake news. Sad!',
             'user': {'followers_count': '100', 'friends_count': '200',
                      'location': None}
             },  # sentiment value = -0.7351
            {'created_at': 'Sun Feb 21 19:14:20 +0000 2017',
             'id_str': '833394296418082819',
             'text': 'I hate it.',
             'user': {'followers_count': '100', 'friends_count': '200',
                      'location': None}
             }  # sentiment value = -0.5719
        ]
        self.mock_feels = TweetFeels('abcd')
        # Swap the mocked database for a real one holding the three fixtures.
        self.feels_db = TweetData(file='./test/db.sqlite')
        self.mock_feels._feels = self.feels_db
        self.mock_tweets = [Tweet(t) for t in self.tweets]
        for t in self.mock_tweets:
            self.feels_db.insert_tweet(t)
        self.mock_feels.clear_buffer()

    def tearDown(self):
        os.remove('./test/db.sqlite')

    def test_start(self):
        mock_feels = TweetFeels("abcd")
        mock_feels.tracking = []
        mock_feels.start(selfupdate=0)
        mock_feels._stream.filter.assert_not_called()
        mock_feels.tracking = ['tsla']
        mock_feels.start(selfupdate=0)
        mock_feels._stream.filter.assert_called_once()

    def test_stop(self):
        mock_feels = TweetFeels("abcd")
        mock_feels.stop()
        mock_feels._stream.disconnect.assert_called_once()

    def test_on_data(self):
        mock_feels = TweetFeels("abcd")
        mock_feels.buffer_limit = 0
        data = {'filter_level': 'low', 'text': 'test data'}
        mock_feels.on_data(data)
        mock_feels._feels.insert_tweet.assert_called_once()

        # test filtering levels
        mock_feels2 = TweetFeels("abcd")
        mock_feels2._filter_level = 'medium'
        mock_feels2.on_data(data)
        mock_feels2._feels.insert_tweet.assert_not_called()

        # test buffer limit. no inserts until we are over limit
        mock_feels2.buffer_limit = 2
        # NOTE(review): this writes `filter_level`, whereas the level set
        # above is `_filter_level`; presumably the underscore name was meant.
        # Confirm before changing, since the assertions below depend on it.
        mock_feels2.filter_level = 'low'
        mock_feels2.on_data(data)
        mock_feels2._feels.insert_tweet.assert_not_called()
        mock_feels2.on_data(data)
        mock_feels2.on_data(data)
        # NOTE(review): this checks `mock_feels`, not `mock_feels2` - confirm
        # which controller the final assertion is meant to cover.
        mock_feels._feels.insert_tweet.assert_called_once()

    def test_sentiment(self):
        mock_feels = TweetFeels("abcd")
        mock_feels._feels.tweets_since = MagicMock(return_value=[])
        mock_feels._sentiment = Sentiment(0.5, 0, 0, 0)
        mock_feels._latest_calc = datetime(2017, 1, 1, 0, 0, 0)
        mock_feels._feels.start = datetime(2017, 1, 1, 0, 0, 0)
        mock_feels._feels.end = datetime(2017, 1, 1, 0, 0, 0)
        self.assertEqual(mock_feels.sentiment.value, 0.5)

    def test_buffer(self):
        mock_feels = TweetFeels('abcd')
        mock_feels.buffer_limit = 5
        feels_db = TweetData(file='sample.sqlite')
        try:
            mock_feels._feels = feels_db
            with open(self.tweets_data_path) as tweets_file:
                stripped = (raw.rstrip() for raw in tweets_file)
                lines = [line for line in stripped if line]

            for line in lines[0:3]:
                mock_feels.on_data(Tweet(json.loads(line)))
            self.assertEqual(len(mock_feels._tweet_buffer), 3)

            for line in lines[3:6]:
                mock_feels.on_data(Tweet(json.loads(line)))
            time.sleep(1)  # this waits for items to finish popping off the buffer
            self.assertEqual(len(mock_feels._tweet_buffer), 0)

            dfs = list(mock_feels._feels.all)
            self.assertEqual(len(dfs[0]), 6)
        finally:
            # Remove the scratch database even when an assertion above fails.
            os.remove('sample.sqlite')

    def test_sentiment_comprehensive(self):
        sentiment = 0.0
        for t in self.mock_tweets:
            if t['sentiment'] != 0:
                sentiment = 0.99*sentiment + 0.01*t['sentiment']
        # calc = 0*0.99**2 + 0.01*0.99*-0.7351 + 0.01*-0.5719
        #      = -0.01299649
        self.mock_feels._latest_calc = self.mock_feels._feels.start
        self.assertTrue(np.isclose(self.mock_feels.sentiment.value, sentiment))
        # first observation is at 2017-2-19 09:14:18 and we are using default
        # 60 second bins, therefore the observation at 2017-2-21 19:14:20 will
        # never get saved but will always be recalculated.
        self.assertEqual(self.mock_feels._latest_calc,
                         datetime(2017, 2, 21, 19, 14, 0))

        # repeat the calculation, nothing changes
        self.assertTrue(np.isclose(self.mock_feels.sentiment.value, sentiment))
        self.assertEqual(self.mock_feels._latest_calc,
                         datetime(2017, 2, 21, 19, 14, 0))
        self.assertEqual(self.mock_feels.sentiment.start,
                         self.mock_feels._latest_calc)

    def test_sentiment_factor(self):
        sentiment = 0.0
        self.mock_feels.factor = 0.75
        for t in self.mock_tweets:
            if t['sentiment'] != 0:
                sentiment = 0.75*sentiment + 0.25*t['sentiment']
        # calc = 0*0.75**2 + 0.25*0.75*-0.7351 + 0.25*-0.5719
        #      = -0.28080625
        mock_sentiment = self.mock_feels.sentiment.value
        self.assertTrue(np.isclose(mock_sentiment, sentiment))

    def test_sentiment_binsize(self):
        T = self.mock_tweets
        A = T[1]['sentiment']
        B = T[2]['sentiment']
        # The expected value averages the two non-zero scores as one bin.
        sentiment = 0.75*0 + 0.25*(A+B)/2

        self.mock_feels.factor = 0.75
        self.mock_feels.binsize = timedelta(days=2.5)
        mock_sentiment = self.mock_feels.sentiment.value
        self.assertTrue(np.isclose(mock_sentiment, sentiment))

    def test_nans(self):
        sentiments = self.mock_feels.sentiments(
            delta_time=timedelta(hours=24), nans=True)
        s = next(sentiments)
        self.assertEqual(s.value, 0)
        s = next(sentiments)
        self.assertTrue(np.isnan(s.value))  # can return nans
        # does not affect current sentiment
        self.assertEqual(self.mock_feels._sentiment.value, 0)
        s = next(sentiments)
        self.assertLess(s.value, 0)

    def test_sentiments(self):
        start = datetime(2017, 2, 19, 0, 0, 0)
        dt = timedelta(minutes=30)
        sentiment = self.mock_feels.sentiments(strt=start, delta_time=dt)
        self.assertTrue(np.isclose(next(sentiment).value, 0))
        self.assertTrue(np.isclose(next(sentiment).value, -0.007351))
        self.assertTrue(np.isclose(next(sentiment).value, -0.01299649))
        # Drain whatever the generator still yields before checking
        # _latest_calc.
        for _ in sentiment:
            pass
        # we are starting at 2017-2-19 00:00:00 and using bins with length 30
        # minutes, therefore our latest calc will be just prior to the final
        # observation.
        self.assertEqual(self.mock_feels._latest_calc,
                         datetime(2017, 2, 21, 19, 0, 0))