def test_get_hashtags(self):
    self.assertEqual(Tweet(self.tweets[0]).get_hashtags(), ["spark", "apache"])
    self.assertEqual(Tweet(self.tweets[1]).get_hashtags(), [])
    self.assertEqual(Tweet(self.tweets[2]).get_hashtags(), ["spark\"\/", "apache storm"])
from os import environ
import time

def main():
    TIME = 60 * 60  # 60 min * 60 s --> s

    # Twitter API credentials from the environment
    CONSUMER_KEY = environ['CONSUMER_KEY']
    CONSUMER_SECRET = environ['CONSUMER_SECRET']
    ACCESS_TOKEN = environ['ACCESS_KEY']
    ACCESS_TOKEN_SECRET = environ['ACCESS_SECRET']

    # init bot
    bot = Bot(CONSUMER_KEY=CONSUMER_KEY,
              CONSUMER_SECRET=CONSUMER_SECRET,
              ACCESS_TOKEN=ACCESS_TOKEN,
              ACCESS_TOKEN_SECRET=ACCESS_TOKEN_SECRET)

    # init tracker (database API call)
    tracker = Tracker()

    # initial tweet
    tweet = Tweet(totalDeaths=tracker.getTotalDeaths(),
                  totalInfected=tracker.getTotalInfected())

    while True:
        # Get latest data from Tracker
        tracker.update()

        # Generate tweet with latest data
        tweet.update(totalDeaths=tracker.totalDeaths,
                     totalInfected=tracker.totalInfected)

        # Get old tweets
        oldTweets = bot.getOldTweets()

        # Post only if the tweet is not a duplicate
        if not tweet.isDuplicated(oldTweets=oldTweets):
            bot.postTweet(text=tweet.text)

        time.sleep(TIME)  # s
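A minimal way to exercise main() locally is to populate the environment before the entry point runs. The variable names below follow the lookups above (note that the bot reads ACCESS_KEY/ACCESS_SECRET, not ACCESS_TOKEN/ACCESS_TOKEN_SECRET); the placeholder values are assumptions for illustration only.

import os

# Hypothetical local setup -- real values would come from the shell
# environment or a secrets manager, never from source control.
os.environ.setdefault('CONSUMER_KEY', '<your-consumer-key>')
os.environ.setdefault('CONSUMER_SECRET', '<your-consumer-secret>')
os.environ.setdefault('ACCESS_KEY', '<your-access-token>')
os.environ.setdefault('ACCESS_SECRET', '<your-access-token-secret>')

if __name__ == '__main__':
    main()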
def test_clean_text(self):
    self.assertEqual(Tweet(self.tweets[0]).get_clean_text(),
                     "Spark Summit East this week! #Spark #Apache")
    self.assertEqual(Tweet(self.tweets[1]).get_clean_text(),
                     "I'm at Terminal de Integrao do Varadouro in Joo Pessoa, \PB https://t.co/HOl34REL1a")
def test_get_last_post_invalid_creds():
    access_keys = {
        'TWEEPY_CONSUMER_KEY': None,
        'TWEEPY_CONSUMER_SECRET': None,
        'TWEEPY_ACCESS_TOKEN': None,
        'TWEEPY_ACCESS_TOKEN_SECRET': None
    }
    new_tweet = Tweet(access_keys)
    result = new_tweet.get_last_post()
    assert result.response is None
def test_get_last_post_invalid_creds2():
    access_keys = {
        'TWEEPY_CONSUMER_KEY': 'None',
        'TWEEPY_CONSUMER_SECRET': 'None',
        'TWEEPY_ACCESS_TOKEN': 'None',
        'TWEEPY_ACCESS_TOKEN_SECRET': 'None'
    }
    new_tweet = Tweet(access_keys)
    result = new_tweet.get_last_post()
    assert result.args[0][0]['code'] == 89
    assert result.args[0][0]['message'] == 'Invalid or expired token.'
def __init__(self, file, stem=False, **kwargs):
    df = pd.read_csv(file)
    tweets = []
    for index, row in df.iterrows():
        tweets.append(Tweet(row['tweet']))

    if 'threshold' not in kwargs:
        p = Preprocessor(stem=stem)
    else:
        p = Preprocessor(stem=stem, treshold=kwargs['threshold'] / len(tweets))
    p.process(tweets)
    del p

    # Remove blank tweets
    tweets = [t for t in tweets if len(t.text) > 0]

    # Change n-grams to strings
    for i in range(len(tweets)):
        tweets[i].n_grams = [" ".join(x) for x in tweets[i].n_grams]

    # Vocabulary over words and n-grams, plus a map back to the surface form
    self.__word_list = set()
    actual_word = dict()
    for tw in tweets:
        self.__word_list = self.__word_list.union(
            list(map(lambda a: a[0], tw.text)) + tw.n_grams)
        for w in tw.text:
            actual_word[w[0]] = w[1]

    print('tweet-size = ', len(tweets))
    print('word-size = ', len(self.__word_list))
    self.__word_list = list(self.__word_list)
    print('Sample words: ', self.__word_list[:5])

    # Per-tweet term counts over words and n-grams
    term_count = defaultdict(lambda: defaultdict(lambda: 0))
    for i, tw in enumerate(tweets):
        for w in list(map(lambda a: a[0], tw.text)) + tw.n_grams:
            term_count[i][w] += 1

    self.__tweet_count = len(tweets)
    self.actual_words = []
    for w in self.__word_list:
        if w in actual_word:
            self.actual_words.append(actual_word[w])

    # Dense term-frequency matrix: one row per tweet, one column per term
    tf = []
    for i in range(self.__tweet_count):
        tmp = []
        for w in self.__word_list:
            tmp.append(term_count[i][w])
        del term_count[i]
        tf.append(tmp)
    self.__term_freq = tf

    self.__results = list(df['sentiment'])  # Sentiment 1 if positive, -1 if negative
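Assuming this __init__ belongs to the repo's corpus/feature-matrix class (called TweetCorpus below purely for illustration), construction might look like the following. The CSV path and the 'tweet'/'sentiment' column schema match the reads above; the class name and the threshold value are assumptions.

# Hypothetical usage sketch -- only the constructor signature and the CSV
# schema are taken from the code above; the class name is a guess.
corpus = TweetCorpus('dataset/sample_sts.txt', stem=True, threshold=5)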
def load_tweets_from_file(tweets_file_name):
    # Read and close the file promptly; one tweet per line
    with codecs.open(tweets_file_name, mode='r', encoding='utf-8') as file:
        content = [x.strip() for x in file.readlines()]

    tweets = []
    for current_line_count, line in enumerate(content, start=1):
        try:
            tweets.append(Tweet(line))
        except InvalidTweetLine:
            logger.error('Invalid tweet at line %d' % current_line_count)
    return tweets
def send(tweet_json):
    try:
        tweet = Tweet(tweet_json)
        if tweet.media is not None:
            if len(tweet.media) == 1:
                # Single attachment: send it with the tweet text as caption
                media_obj = tweet.media[0]
                media_obj.update({"caption": tweet.text})
                return TelegramMediaProxy.send_media(media_obj.get("type"), media_obj)
            elif len(tweet.media) > 1:
                # Album: the caption goes on the first item of the media group
                tweet.media[0].update({"caption": tweet.text})
                TelegramMediaProxy.send_media("group_media", tweet.media)
        else:
            # No media: send the tweet text as a plain message
            TelegramMediaProxy.send_media("message", {"text": tweet.text})
    except Exception as e:
        logger.error("can't send tweet to telegram, error:\n\t{}".format(e))
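The exact shape of tweet_json depends on the Tweet class; from the attribute reads above (tweet.text, and tweet.media as a list of dicts with a "type" key), a plausible payload is sketched below. Every key other than "text", "media", and "type" is an illustrative guess.

# Hypothetical payload -- only "text" and "media" (with a "type" key per item)
# are implied by the code above; the "url" key is an assumption.
sample_tweet_json = {
    "text": "Spark Summit East this week! #Spark #Apache",
    "media": [
        {"type": "photo", "url": "https://example.com/image.jpg"},
    ],
}
send(sample_tweet_json)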
def getSentiment(self, tweet):
    tw = Tweet(tweet)
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tw.text + tw.emoji if w not in stop_words]
    if self.stemmer:
        tokens = [self.stemmer.stem(w) for w in tokens]
    if self.ngrams:
        tokens += [" ".join(z)
                   for y in tw.n_grams
                   for z in y[random.randrange(len(y))]]

    sentiment = 0
    if len(tokens) == 0:
        return sentiment
    for t in tokens:
        sentiment += self.word_sentiment[t]
    return sentiment / len(tokens)
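Assuming getSentiment lives on this module's analyzer class (named SentimentAnalyzer below only for illustration), scoring a raw tweet might look like this. The class name and constructor arguments are assumptions; only the getSentiment(tweet) call matches the method above.

# Hypothetical usage -- the constructor is a guess based on the attributes
# the method reads (self.stemmer, self.ngrams, self.word_sentiment).
analyzer = SentimentAnalyzer(stemmer=None, ngrams=False)
score = analyzer.getSentiment("Spark Summit East this week! #Spark #Apache")
print(score)  # average per-token sentiment; 0 if no tokens survive filtering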
def test_is_tweet_ascii(self):
    self.assertEqual(Tweet(self.tweets[0]).is_tweet_ascii(), True)
    self.assertEqual(Tweet(self.tweets[1]).is_tweet_ascii(), False)
def test_default_post():
    access_keys = ''
    new_tweet = Tweet(access_keys)
    result = new_tweet.get_default_post()
    assert result == "Thank you for following MaikuOnline! 毎日頑張りましょう!"
from src.tweet import Tweet
from src.preprocessing import Preprocessor
import pandas as pd

df = pd.read_csv('dataset/sample_sts.txt')
tweets = []
for index, row in df.iterrows():
    tweets.append(Tweet(row['tweet']))

p = Preprocessor()
p.process(tweets)

with open('dataset/processed_sample_sts.txt', 'w') as fp:
    for tweet in tweets:
        fp.write(str(tweet.text) + '\n')
def setUp(self):
    self.format = "%a %b %d %H:%M:%S +0000 %Y"
    self.tweets_json = [
        """{"created_at":"Thu Oct 29 17:51:01 +0000 2015", "text":"Spark Summit East this week! #Spark #Apache", "entities":{"hashtags":[{"text":"Spark","indices":[29,35]},{"text":"Apache","indices":[36,43]}],"urls":[],"user_mentions":[],"symbols":[]}}""",
        """{"created_at":"Thu Oct 29 17:51:30 +0000 2015", "text":"Just saw a great post on Insight Data Engineering #Apache #Hadoop #Storm", "entities":{"hashtags":[{"text":"Storm","indices":[29,35]},{"text":"Apache","indices":[36,43]},{"text":"Hadoop","indices":[37,48]}],"urls":[],"user_mentions":[],"symbols":[]}}""",
        """{"created_at":"Thu Oct 29 17:51:55 +0000 2015", "text":"Doing great work #Apache", "entities":{"hashtags":[{"text":"Apache","indices":[29,35]}],"urls":[],"user_mentions":[],"symbols":[]}}""",
        """{"created_at":"Thu Oct 29 17:51:56 +0000 2015", "text":"Excellent post on #Flink and #Spark", "entities":{"hashtags":[{"text":"Flink","indices":[29,35]},{"text":"Spark","indices":[29,35]}],"urls":[],"user_mentions":[],"symbols":[]}}""",
        """{"created_at":"Thu Oct 29 17:51:59 +0000 2015", "text":"New and improved #HBase connector for #Spark", "entities":{"hashtags":[{"text":"HBase","indices":[29,35]},{"text":"Spark","indices":[29,35]}],"urls":[],"user_mentions":[],"symbols":[]}}""",
        """{"created_at":"Thu Oct 29 17:52:05 +0000 2015", "text":"New 2.7.1 version update for #Hadoop #Apache", "entities":{"hashtags":[{"text":"Hadoop","indices":[29,35]},{"text":"Apache","indices":[29,35]}],"urls":[],"user_mentions":[],"symbols":[]}}""",
        """{"created_at":"Thu Oct 29 17:52:31 +0000 2015", "text":"Try to evict the storm vertex", "entities":{"hashtags":[],"urls":[],"user_mentions":[],"symbols":[]}}""",
        """{"created_at":"Thu Oct 29 17:52:56 +0000 2015", "text":"New 2.7.1 version update for #Unrelated", "entities":{"hashtags":[{"text":"Unrelated","indices":[29,35]}],"urls":[],"user_mentions":[],"symbols":[]}}""",
        """{"created_at":"Thu Oct 29 17:52:57 +0000 2015", "text":"Excellent post on #Flink and #Spark", "entities":{"hashtags":[{"text":"Flink","indices":[29,35]},{"text":"Spark","indices":[29,35]}],"urls":[],"user_mentions":[],"symbols":[]}}""",
        """{"created_at":"Thu Oct 29 17:54:57 +0000 2015", "text":"Another Excellent post on #Flink", "entities":{"hashtags":[{"text":"Flink","indices":[29,35]}],"urls":[],"user_mentions":[],"symbols":[]}}""",
    ]
    self.tweets = [Tweet(json.loads(cur_tweet_json))
                   for cur_tweet_json in self.tweets_json]
    self.timestamps = [datetime.strptime(tweet.get_timestamp(), self.format)
                       for tweet in self.tweets]