def fetch_many(self, query_strings: List[str],
               config: FetchConfig = FetchConfig()) -> Dict[str, List[str]]:
    # fetch each query independently and key the results by query string
    ret = {}
    for q in query_strings:
        ret[q] = self.fetch(q, config)
    return ret
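# The methods in this file all take a FetchConfig. For reference, a minimal
# sketch of the fields used here, written as a plain data class -- this is an
# illustration, not the actual bix.twitter.fetch.fetch_config implementation:

from dataclasses import dataclass, field
from datetime import date


@dataclass
class FetchConfig:
    from_date: date = field(default_factory=date.today)  # first day to fetch (inclusive)
    to_date: date = field(default_factory=date.today)    # end of the range (exclusive)
    max_tweets_per_fetch: int = 0                        # 0 means "use the backend default"
    lang: str = 'en'                                     # tweet language filter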
def fetch(self, query_string: str,
          config: FetchConfig = FetchConfig()) -> List[str]:
    ret = []
    # walk the configured date range from newest day to oldest and
    # concatenate the per-day results
    for date_ in reversed(list(daterange(config.from_date, config.to_date))):
        ret.extend(self._fetch_impl(query_string, date_, config))
    return ret
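# daterange is imported from elsewhere in the package. A minimal sketch of the
# generator this code appears to expect, yielding every day in
# [from_date, to_date) to match the "to_date is exclusive" convention used in
# the example scripts below -- an assumption, not the package's definition:

from datetime import date, timedelta
from typing import Iterator


def daterange(from_date: date, to_date: date) -> Iterator[date]:
    current = from_date
    while current < to_date:
        yield current
        current += timedelta(days=1)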
def _fetch_impl(self, query_string: str, date_: date,
                config: FetchConfig = FetchConfig()) -> List[str]:
    results = []
    max_tweets = config.max_tweets_per_fetch
    if max_tweets == 0:
        max_tweets = 20000
    tweet_amount = max_tweets
    last_max_id = None
    while tweet_amount > 0:
        try:
            # 'until' is exclusive in the search API, so a one-day window is
            # [date_, date_ + 1); the API also caps 'count' at 100 per request
            statuses = self.api.GetSearch(term='#' + query_string,
                                          until=date_ + timedelta(days=1),
                                          since=date_,
                                          count=min(tweet_amount, 100),
                                          lang=config.lang,
                                          max_id=last_max_id,
                                          result_type='recent')
        except TwitterError as e:
            if e.message[0]['code'] == 88:  # rate limit exceeded
                print(f"Warning: aborting pulling tweets for query '{query_string}' "
                      f"after {max_tweets - tweet_amount} tweets, "
                      f"because of error: '{e.message[0]['message']}'")
                break
            raise
        if not statuses:
            print(f"Warning: aborting pulling tweets for query '{query_string}' "
                  f"after {max_tweets - tweet_amount} tweets, "
                  f"because there are no more tweets available")
            break  # out of results
        results.extend([t.text for t in statuses])
        tweet_amount -= len(statuses)
        # max_id is inclusive, so subtract 1 to avoid re-fetching the oldest
        # tweet of this page on the next iteration
        last_max_id = int(statuses[-1].id_str) - 1
    results.reverse()  # oldest tweet first
    for i, text in enumerate(results):
        # strip the 'RT' prefix from retweets
        if text.startswith('RT'):
            results[i] = text[2:]
    self.save_tweets(results, query_string, date_)
    return results
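# Both backends persist the day's results via self.save_tweets(...). The real
# implementation lives elsewhere in bix.twitter; a standalone sketch of what
# such a hook could do (hypothetical: the pickle format and per-day file
# naming are assumptions, not the package's actual storage layout):

import pickle
from datetime import date
from pathlib import Path
from typing import List


def save_tweets(tweets: List[str], query_string: str, date_: date) -> None:
    # one file per query and day, e.g. tweets_brexit_2019-10-01.pickle
    path = Path(f"tweets_{query_string}_{date_:%Y-%m-%d}.pickle")
    with path.open('wb') as f:
        pickle.dump(tweets, f)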
def _fetch_impl(self, query_string: str, date_: date,
                config: FetchConfig = FetchConfig()) -> List[str]:
    c = twint.Config()
    c.Search = '#' + query_string
    c.Lang = config.lang
    c.Store_object = True
    # cover exactly one day by setting Until to the following day
    c.Since = "{:%Y-%m-%d}".format(date_)
    c.Until = "{:%Y-%m-%d}".format(date_ + timedelta(days=1))
    c.Limit = config.max_tweets_per_fetch if config.max_tweets_per_fetch != 0 else None
    # twint collects results into a module-level list; clear it first so
    # repeated fetches do not accumulate tweets from earlier runs
    twint.output.tweets_list = []
    twint.run.Search(c)
    tweets = [t.tweet for t in twint.output.tweets_list]
    self.save_tweets(tweets, query_string, date_)
    return tweets
def _fetch_impl(self, query_string: str, date_: date,
                config: FetchConfig = FetchConfig()) -> List[str]:
    # backend-specific fetch for a single day; implemented by subclasses
    raise NotImplementedError()
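# fetch()/fetch_many() drive this hook in a template-method style: the base
# class owns the date loop and a backend only supplies _fetch_impl. A toy
# subclass, useful e.g. for tests -- the 'Fetcher' base class name and import
# path are assumptions, as is the whole class:

from datetime import date
from typing import List

from bix.twitter.fetch.fetch_config import FetchConfig
# assuming: from bix.twitter.fetch.fetcher import Fetcher (path is a guess)


class StaticFetcher(Fetcher):
    """Backend that returns one canned tweet per day; handy for unit tests."""

    def _fetch_impl(self, query_string: str, date_: date,
                    config: FetchConfig = FetchConfig()) -> List[str]:
        return [f"sample tweet about #{query_string} from {date_:%Y-%m-%d}"]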
from datetime import date, timedelta

from bix.twitter.fetch.fetch_config import FetchConfig
from bix.twitter.fetch.twint_api.twint_fetcher import TwintFetcher

if __name__ == '__main__':
    hashtags = ['brexit']

    config = FetchConfig()
    config.from_date = date.today()                    # fetch all tweets from today
    config.to_date = date.today() + timedelta(days=1)  # to_date is exclusive

    tf = TwintFetcher()
    tf.fetch_many(hashtags, config)
from datetime import date, timedelta

from bix.twitter.analysis.analyse_sentiment_conv import predict_model_convolutional
from bix.twitter.base.utils import load_model_mat
from bix.twitter.fetch.download_tweets import download_tweets_twint
from bix.twitter.fetch.fetch_config import FetchConfig
from bix.twitter.preprocessing.preprocess import preprocess, tokenize_cleaned_tweets

# before running this script, set its working directory to the folder where
# the tokenizer and the model are saved

if __name__ == '__main__':
    # step 1: fetch tweets (for categorizing hashtags)
    hashtags = ['brexit', 'lol']  # these should match the hashtags the model was trained with

    config = FetchConfig()
    config.from_date = date.today()                    # fetch all tweets from today
    config.to_date = date.today() + timedelta(days=1)  # to_date is exclusive
    config.max_tweets_per_fetch = 10

    tweets = download_tweets_twint(hashtags, config)

    # step 2: cleanup
    cleaned_tweets = preprocess(tweets)

    # step 3: tokenization (using the tokenizer created in the build_model.py script)
    tokenized_tweets = tokenize_cleaned_tweets(cleaned_tweets)

    encoded_categories = {h: i for i, h in enumerate(hashtags)}
    y = []  # one category label per tweet, e.g. 0,0,0,1,1,1,1
    for hashtag, hashtag_tweets in tokenized_tweets.items():
        for _ in hashtag_tweets:
            y.append(encoded_categories[hashtag])