Example #1
 def fetch_many(
     self, query_strings: List[str], config: Optional[FetchConfig] = None
 ) -> Dict[str, List[str]]:
     """Fetch tweets for every query string.

     Args:
         query_strings: search terms, each fetched individually via ``fetch``.
         config: fetch settings; a fresh ``FetchConfig`` is created when omitted.

     Returns:
         Mapping from each query string to its list of fetched tweets.
     """
     # Create the default per call: the old ``config=FetchConfig()`` default
     # was evaluated once at definition time, so every call shared (and could
     # mutate) the same instance.
     if config is None:
         config = FetchConfig()
     return {query: self.fetch(query, config) for query in query_strings}
Example #2
 def fetch(
     self, query_string: str,
     config: Optional[FetchConfig] = None) -> List[str]:
     """Fetch tweets for ``query_string`` over the configured date range.

     Iterates the days from ``config.from_date`` to ``config.to_date``
     newest-first and delegates each day to ``_fetch_impl``.

     Args:
         query_string: search term to fetch.
         config: fetch settings; a fresh ``FetchConfig`` is created when omitted.

     Returns:
         All fetched tweet texts, concatenated across days.
     """
     # Per-call default instead of a shared mutable default argument.
     if config is None:
         config = FetchConfig()
     tweets: List[str] = []
     for date_ in reversed(list(daterange(config.from_date,
                                          config.to_date))):
         # extend() is O(k) per day; the old ``ret = ret + ...`` rebuilt the
         # whole list each iteration (quadratic overall).
         tweets.extend(self._fetch_impl(query_string, date_, config))
     return tweets
Example #3
    def _fetch_impl(self,
                    query_string: str,
                    date_: date,
                    config: Optional[FetchConfig] = None) -> List[str]:
        """Fetch tweets for ``'#' + query_string`` on ``date_`` via the
        python-twitter search API.

        Pages backwards through results with ``max_id`` until the requested
        amount is collected, the rate limit is hit, or the API returns no
        further tweets.  Leading ``'RT'`` markers are stripped, the tweets
        are persisted via ``save_tweets`` and returned oldest-first.

        Args:
            query_string: hashtag to search for (without the ``#``).
            date_: day to search; passed as both ``since`` and ``until``.
            config: fetch settings; a fresh ``FetchConfig`` is used when omitted.

        Returns:
            List of tweet texts in chronological order.
        """
        # Per-call default instead of a shared mutable default argument.
        if config is None:
            config = FetchConfig()

        results = []
        max_tweets = config.max_tweets_per_fetch
        if max_tweets == 0:
            # 0 means "no explicit limit"; cap at 20000 as a safety net.
            max_tweets = 20000
        tweet_amount = max_tweets

        last_max_id = None
        while tweet_amount > 0:
            statuses = None
            try:
                statuses = self.api.GetSearch(term='#' + query_string,
                                              until=date_,
                                              since=date_,
                                              count=tweet_amount,
                                              lang=config.lang,
                                              max_id=last_max_id,
                                              result_type='recent')
            except TwitterError as e:
                if e.message[0]['code'] == 88:  # Rate limit exceeded
                    print(
                        f"Warning: aborting pulling tweets for query '{query_string}' after {int(max_tweets - tweet_amount)} "
                        f"tweets, because of error: '{e.message[0]['message']}'"
                    )
                    break
                else:
                    raise e
            # NOTE(review): presumably <= 1 (not == 0) because the tweet at
            # ``last_max_id`` is returned again on the next page, so a single
            # result means no new tweets — confirm against the API behavior.
            if len(statuses) <= 1:
                print(
                    f"Warning: aborting pulling tweets for query '{query_string}' after {int(max_tweets - tweet_amount)} tweets, "
                    f"because there are no more tweets available")
                break  # out of results
            results.extend([t.text for t in statuses])
            tweet_amount = tweet_amount - len(statuses)
            last_max_id = statuses[-1].id_str

        # result_type='recent' yields newest first; flip to chronological order.
        results.reverse()

        #results = self.remove_duplicates(results)
        for i, e in enumerate(results):  # remove 'RT' from tweets (retweets)
            if e.startswith('RT'):
                results[i] = e[2:]

        self.save_tweets(results, query_string, date_)

        return results
Example #4
    def _fetch_impl(self,
                    query_string: str,
                    date_: date,
                    config: Optional[FetchConfig] = None) -> List[str]:
        """Fetch tweets for ``'#' + query_string`` on ``date_`` using twint.

        Args:
            query_string: hashtag to search for (without the ``#``).
            date_: day to search.
            config: fetch settings; a fresh ``FetchConfig`` is used when omitted.

        Returns:
            List of tweet texts; also persisted via ``save_tweets``.
        """
        # Per-call default instead of a shared mutable default argument.
        if config is None:
            config = FetchConfig()

        c = twint.Config()

        c.Search = '#' + query_string
        c.Lang = config.lang
        c.Store_object = True
        # NOTE(review): assumes twint's Since is inclusive and Until exclusive,
        # so this window covers exactly the single day ``date_`` — confirm.
        c.Since = "{:%Y-%m-%d}".format(date_)
        c.Until = "{:%Y-%m-%d}".format(date_ + timedelta(days=1))
        # Caller-side 0 means "no limit"; twint expects None for that.
        c.Limit = config.max_tweets_per_fetch if config.max_tweets_per_fetch != 0 else None

        # ``twint.output.tweets_list`` is a module-level accumulator; reset it
        # so results from earlier fetches do not leak into this call.
        twint.output.tweets_list = []
        twint.run.Search(c)
        result = twint.output.tweets_list
        tweets = [t.tweet for t in result]
        self.save_tweets(tweets, query_string, date_)
        return tweets
Example #5
 def _fetch_impl(
     self,
     query_string: str,
     date_: date,
     config: Optional[FetchConfig] = None) -> List[str]:
     """Fetch tweets for ``query_string`` on the single day ``date_``.

     Abstract hook: concrete fetcher subclasses must override this.
     The mutable ``FetchConfig()`` default was replaced by ``None`` so the
     signature matches the per-call-default pattern of the implementations.

     Raises:
         NotImplementedError: always, in this base implementation.
     """
     raise NotImplementedError()
Example #6
from datetime import date, timedelta

from bix.twitter.fetch.fetch_config import FetchConfig
from bix.twitter.fetch.twint_api.twint_fetcher import TwintFetcher

if __name__ == '__main__':
    # Pull today's tweets for a fixed set of hashtags via the twint backend.
    search_terms = ['brexit']

    fetch_config = FetchConfig()
    fetch_config.from_date = date.today()
    fetch_config.to_date = date.today() + timedelta(days=1)

    fetcher = TwintFetcher()
    fetcher.fetch_many(search_terms, fetch_config)
Example #7
from datetime import date, timedelta

from bix.twitter.analysis.analyse_sentiment_conv import predict_model_convolutional
from bix.twitter.base.utils import load_model_mat
from bix.twitter.fetch.download_tweets import download_tweets_twint
from bix.twitter.fetch.fetch_config import FetchConfig
from bix.twitter.preprocessing.preprocess import preprocess, tokenize_cleaned_tweets

# before you run this script, set the execution folder for this script to where the tokenizer and the model are saved
if __name__ == '__main__':

    # step 1: fetch tweets (for categorizing hashtags)
    hashtags = [
        'brexit', 'lol'
    ]  # these should match the hashtags, the model was created with
    config = FetchConfig()
    config.from_date = date.today()  # fetch all tweets from today
    config.to_date = date.today() + timedelta(days=1)  # to_date is exclusive
    config.max_tweets_per_fetch = 10
    tweets = download_tweets_twint(hashtags, config)

    # step 2: cleanup
    cleaned_tweets = preprocess(tweets)

    # step 3: tokenization (using the tokenizer created in the build_model.py script)
    tokenized_tweets = tokenize_cleaned_tweets(cleaned_tweets)
    encoded_categories = {tag: idx for idx, tag in enumerate(hashtags)}
    # Build one label per tokenized tweet, e.g. 0,0,0,1,1,1,1.
    # The inner loop variable no longer shadows ``tweets`` from step 1.
    y = [
        encoded_categories[hashtag]
        for hashtag, hashtag_tweets in tokenized_tweets.items()
        for _ in hashtag_tweets
    ]