Example #1
def collect_tweets():
    '''
    Collects tweets from Twitter matching the module-level keywords.
    Pages through all results, requesting 100 tweets per page, the
    maximum allowed by the Search API.
    '''

    tweets = []

    try:
        # build the search query
        tso = TwitterSearch.TwitterSearchOrder()
        tso.set_keywords(keywords)
        tso.set_language('en')
        tso.set_include_entities(False)
        tso.set_count(100)  # set_count() accepts at most 100 per page

        # authenticate against the Twitter API
        ts = TwitterSearch.TwitterSearch(
            consumer_key=configs['consumer_key'],
            consumer_secret=configs['consumer_secret'],
            access_token=configs['access_token'],
            access_token_secret=configs['access_token_secret'])

        # collect a (creation date, text) pair for every matching tweet
        for tweet in ts.search_tweets_iterable(tso):
            tweets.append((tweet['created_at'], tweet['text']))

    except TwitterSearch.TwitterSearchException as e:
        print(e)  # report the error instead of silently swallowing it

    return tweets
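Both keywords and configs are assumed to be defined at module level. A minimal sketch of loading the credentials dict from a JSON file (the filename and file layout are assumptions):

import json

# hypothetical config file holding the four OAuth strings, e.g.
# {"consumer_key": "...", "consumer_secret": "...",
#  "access_token": "...", "access_token_secret": "..."}
with open('twitter_config.json') as f:
    configs = json.load(f)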
Example #2
def searchTwitter(keywords):
    # assumes the module imports the library as: import TwitterSearch as tw
    myReturnString = ""
    try:
        tso = tw.TwitterSearchOrder()
        tso.set_keywords(keywords)
        tso.set_language('en')
        tso.set_include_entities(False)

        # Twitter credentials (keys committed in plain text like this
        # should be regenerated and loaded from configuration instead)
        ts = tw.TwitterSearch(
            consumer_key='w8xDxzVeKgw7dKLmZLgzsmKD4',
            consumer_secret='qXg4b6B4loG1wTLrXCinmzILQyK3HFg8EP8mDhXQATb8PGhNIA',
            access_token='1112704711-Lp9k0REFZmzI4ODBa6dVhyIYfrREqLjOb15dUDh',
            access_token_secret='kBVXbCqKPnYvJWduRUQ2Q5H0gRFocZ4duWcoDrx8DqGLb'
        )

        count = 0
        for tweet in ts.search_tweets_iterable(tso):
            if count >= 1:  # keep only the first matching tweet
                break
            myString = '@%s tweeted: %s' % (tweet['user']['screen_name'],
                                            tweet['text'])
            print(myString)
            count += 1
            myReturnString += myString

    except tw.TwitterSearchException as e:
        print(e)

    return myReturnString
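The credentials above are committed in plain text, so they should be treated as compromised and regenerated. A safer variant, assuming the same import TwitterSearch as tw alias and four hypothetical environment variable names:

import os

ts = tw.TwitterSearch(
    consumer_key=os.environ['TWITTER_CONSUMER_KEY'],
    consumer_secret=os.environ['TWITTER_CONSUMER_SECRET'],
    access_token=os.environ['TWITTER_ACCESS_TOKEN'],
    access_token_secret=os.environ['TWITTER_ACCESS_TOKEN_SECRET'])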
Example #3
def main(argv=None):
    # assumed imports: TwitterSearch as ts, and datetime from the
    # datetime module; create_parser() and my_callback_closure() are
    # defined elsewhere in the module
    parser = create_parser()
    args = parser.parse_args(argv)

    api_search = None
    if args.consumer_key and args.consumer_secret and args.access_token and args.access_token_secret:
        api_search = ts.TwitterSearch(
            consumer_key=args.consumer_key,
            consumer_secret=args.consumer_secret,
            access_token=args.access_token,
            access_token_secret=args.access_token_secret)
    else:
        api_search = ts.TwitterSearch()

    tso = ts.TwitterSearchOrder()

    if args.language:
        tso.set_language(args.language)

    if args.count:
        tso.set_count(args.count)

    if args.enable_entities:
        tso.set_include_entities(args.enable_entities)

    if args.latitude and args.longitude:
        # search within a 30 km radius (imperial_metric=False selects
        # metric units)
        tso.set_geocode(float(args.latitude),
                        float(args.longitude),
                        30,
                        imperial_metric=False)
        #print("LON:{0}, LAT:{1}".format(float(args.latitude), float(args.longitude)), file=sys.stderr)

    if args.keywords is None:
        tso.set_keywords(['*'])
    else:
        tso.set_keywords(args.keywords)

    num_tweets = 0
    for tweet in api_search.search_tweets_iterable(
            tso, callback=my_callback_closure):
        num_tweets += 1

        # parse the creation date (e.g. 'Mon Sep 24 03:35:21 +0000 2018')
        # and reformat it as YYYYMMDD_HHMMSS
        fecha_parsed = datetime.strptime(tweet['created_at'],
                                         '%a %b %d %H:%M:%S %z %Y')
        fecha = fecha_parsed.strftime("%Y%m%d_%H%M%S")
        text = tweet['text'].replace("\n", " ")
        #print( '%s;%s;%s' % ( tweet['user']['screen_name'], fecha, text ) )
        print(text)
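create_parser() is not shown in this snippet. A plausible sketch, reconstructed purely from the args.* attributes the function reads (every option name here is an assumption):

import argparse

def create_parser():
    parser = argparse.ArgumentParser(description='Search tweets and print their text')
    parser.add_argument('--consumer-key')
    parser.add_argument('--consumer-secret')
    parser.add_argument('--access-token')
    parser.add_argument('--access-token-secret')
    parser.add_argument('--language')
    parser.add_argument('--count', type=int)
    parser.add_argument('--enable-entities', action='store_true')
    parser.add_argument('--latitude')
    parser.add_argument('--longitude')
    parser.add_argument('--keywords', nargs='+')
    return parser

argparse turns the dashed option names into the underscored attributes (args.consumer_key and so on) automatically.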
Example #4
    def count_response(self, con_list):
        """Count unique tweets matching each keyword in con_list, per city."""

        get = gd.get_data()
        mod = md.modify_data()
        tso = ts.TwitterSearchOrder()
        # request untruncated tweets so 'full_text' is available
        tso.arguments.update({'tweet_mode': 'extended'})
        api = get.api()
        coordinates = get.coordinates()
        con_count = 0
        respo_list = []
        respo_loc = []

        for con in con_list:
            print('\tCounting ' + con + '...')
            tso.set_keywords([con])

            for coordinate in coordinates:
                # search within a 5 km radius of each city
                tso.set_geocode(coordinate['lat'], coordinate['long'], 5, False)

                for tweet in api.search_tweets_iterable(tso, callback=self.avoid_rate_limit):
                    # prefer the original tweet's full text when this is a retweet
                    try:
                        tweet_text = tweet['retweeted_status']['full_text']
                    except KeyError:
                        tweet_text = tweet['full_text']

                    cleaned_tweet = mod.clean_tweet(tweet_text)
                    temp_res = cleaned_tweet + ' --- ' + tweet['id_str']
                    if temp_res not in respo_list:
                        respo_list.append(temp_res)
                        respo_loc.append(coordinate['city'])
                        con_count += 1

        with open('raw/response.txt', 'a') as res:
            print('Total: ' + str(con_count))
            res.write(con_list[0] + ': ' + str(con_count) + '\n')
            for i in range(con_count):
                response = respo_list[i] + ' (' + respo_loc[i] + ')'
                res.write(response + '\n')
            res.write('\n')

        return con_count
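avoid_rate_limit is not defined in this snippet. search_tweets_iterable() calls its callback with the current TwitterSearch instance after each underlying API query, so a simple throttle can be built on get_statistics(); a sketch modeled on the callback example in the TwitterSearch documentation, written here as a free function (the query interval and sleep time are assumptions):

import time

def avoid_rate_limit(current_ts_instance):
    # called after every API query; pause after every 5th query to
    # stay well under Twitter's rate limit
    queries, tweets_seen = current_ts_instance.get_statistics()
    if queries > 0 and (queries % 5) == 0:
        time.sleep(60)

The my_callback_closure used in example #3 plays the same role.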
Example #5
def search():

    try:
        tso = TwitterSearch.TwitterSearchOrder()  # create a TwitterSearchOrder object
        tso.set_keywords(['Guttenberg', 'Doktorarbeit'])  # let's define all words we would like to have a look for
        tso.set_language('de')  # we want to see German tweets only
        tso.set_include_entities(False)  # and don't give us all those entity information

        # it's about time to create a TwitterSearch object with our secret tokens
        ts = TwitterSearch.TwitterSearch(
            consumer_key='aaabbb',
            consumer_secret='cccddd',
            access_token='111222',
            access_token_secret='333444'
        )

        # this is where the fun actually starts :)
        for tweet in ts.search_tweets_iterable(tso):
            print('@%s tweeted: %s' % (tweet['user']['screen_name'], tweet['text']))

    except TwitterSearch.TwitterSearchException as e:  # take care of all those ugly errors if there are some
        print(e)
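Note that set_keywords() combines multiple keywords with a logical AND by default; to match tweets containing any of the words instead, pass or_operator=True (as example #9 below does). The second keyword here is purely illustrative:

tso.set_keywords(['Doktorarbeit', 'Dissertation'], or_operator=True)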
Example #6
def tweet_search(keywords, tweet_lang):
    # load yaml file with secrets to dictionary
    credentials = yaml.safe_load(open("./credentials.yml"))

    try:
        tso = TwitterSearch.TwitterSearchOrder()  # create a TwitterSearchOrder object
        tso.set_keywords(keywords)  # define the words to search for in a tweet
        tso.set_language(tweet_lang)  # set the language of tweets to search for
        tso.set_include_entities(False)  # no entity information

        # create a TwitterSearch object with our secret tokens
        ts = TwitterSearch.TwitterSearch(
            consumer_key=credentials['database']['consumer_key'],
            consumer_secret=credentials['database']['consumer_secret'],
            access_token=credentials['database']['access_token'],
            access_token_secret=credentials['database']['access_token_secret'])

        # Save all tweets in a nested dict
        # twitty{"id"}
        #          |- {date} -> tweet creation date
        #          |- {text} -> tweet text
        twitty = {}
        for tweet in ts.search_tweets_iterable(tso):
            # Dict based on tweet ID, assign a new dict as value
            twitty[tweet["id"]] = {}
            # Key is date and value "created at"
            twitty[tweet["id"]]["date"] = tweet["created_at"]
            # Key is text and value is the tweet
            twitty[tweet["id"]]["text"] = tweet["text"]

        return twitty

    except TwitterSearch.TwitterSearchException as e:
        print(e)
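The four tokens are read from credentials.yml under a database key, so yaml.safe_load must return a dict of this shape (placeholder values):

credentials = {
    'database': {
        'consumer_key': '...',
        'consumer_secret': '...',
        'access_token': '...',
        'access_token_secret': '...',
    }
}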
Example #7
def twitter_tag_search(ck, cs, at, ats, tag, count, lang, proxy=None):
    """ function for twitter search on hashtags and keywords """
    tso = TwitterSearch.TwitterSearchOrder()
    tso.set_keywords(tag)
    if lang in ('en', 'nl'):
        tso.set_language(lang)

    tso.set_result_type('recent')
    if proxy:
        ts = TwitterSearch.TwitterSearch(ck, cs, at, ats, proxy=proxy)
    else:
        ts = TwitterSearch.TwitterSearch(ck, cs, at, ats)

    tweetcount = 0

    for tweet in ts.search_tweets_iterable(tso):
        if tweetcount < count:
            print(f"@{tweet['user']['screen_name']} - {tweet['created_at']}")
            print(f"{tweet['text']}")
            print()
            tweetcount += 1
        else:
            break
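A hypothetical call, assuming the four OAuth strings are already at hand; tag is a list because set_keywords() expects one:

twitter_tag_search(ck, cs, at, ats, ['#python'], count=5, lang='en')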
Example #8
    def get_sources(self, meme, number):
        """Fetches a list of Sources from Twitter that match the given meme"""
        #stdout available through Heroku logs. TODO: syslog
        print(" ".join([meme.get_body(), meme.get_exceptions()]))

        sources = []
        try:
            # NOTE: this snippet uses the camelCase method names of
            # TwitterSearch releases before 1.0 (setSearchURL, searchTweets,
            # ...); current releases use snake_case equivalents.
            tso = TwitterSearch.TwitterSearchOrder()  # create a TwitterSearchOrder object
            tso.setSearchURL(self._format_query(meme))
            tso.setLocale('en')
            tso.setCount(number)  # smallest request that might work
            tso.setIncludeEntities(False)

            twitter_search = TwitterSearch.TwitterSearch(
                consumer_key=Secrets.consumer_key,
                consumer_secret=Secrets.consumer_secret,
                access_token=Secrets.access_token,
                access_token_secret=Secrets.access_token_secret)

            tweets = twitter_search.searchTweets(tso)
            retries = 0
            while len(sources) < number and retries < 5:
                for tweet in tweets['content']['statuses']:
                    sources.append(
                        Source(tweet['user']['name'], tweet['text'],
                               tweet['id_str']))
                    #print(tweet['text'])
                    #there's a lot of strange characters coming in here
                tweets = twitter_search.searchNextResults()
                retries += 1

        except TwitterSearch.TwitterSearchException as exception:
            print(exception)  #TODO: syslog

        return sources
Example #9
def generate_tso(keywords, db_file):
    '''
    Generate TwitterSearchOrders by combining the least frequent keywords.
    '''

    # Get info from db on keywords
    # (keyword, count, max_id)
    with sqlite3.connect(db_file) as conn:
        c = conn.cursor()
        c.execute('SELECT keyword, count, max_id FROM exp_averages')
        latest = c.fetchall()
        c.close()

    # merge keywords data with sql data
    latest_df = pd.DataFrame(latest, columns=['keyword', 'count', 'max_id'])
    df = pd.DataFrame(keywords, columns=['keyword'])
    df = df[df['keyword'] != '$OR']  # 'OR' is a Twitter query operator,
    df = df[df['keyword'] != 'OR']   # not a searchable keyword
    df = pd.merge(df, latest_df, how='left', on='keyword')

    thresholds = [
        {'count': 3, 'combine': 50},
        {'count': 10, 'combine': 10},
        {'count': 20, 'combine': 4},
        {'count': 40, 'combine': 2},
        {'count': None, 'combine': 1},
    ]

    for threshold in thresholds:
        # Select a section from the df, truncating the df as it goes
        # through the thresholds. New keywords have None as their count,
        # so they meet no threshold and are processed one at a time.
        if threshold['count']:
            section = df[df['count'] < threshold['count']]
        else:
            section = df[:]

        df.drop(section.index, inplace=True)

        # Generate tsos
        while len(section) > 0:
            # determine the right number of keywords to combine
            try_n = threshold['combine']
            too_long = True
            while too_long:
                subsection = section[:try_n]
                combine = list(subsection.keyword)

                # use the smallest of the max_id values because between
                # min(max_id) and max(max_id) there might have been tweets
                # for keywords other than the one of max(max_id)
                max_id = subsection['max_id'].min()
                tso = TwitterSearch.TwitterSearchOrder()
                tso.set_include_entities(True)
                tso.set_result_type('recent')
                tso.set_keywords(combine, or_operator=True)
                if not pd.isnull(max_id):
                    tso.set_since_id(int(max_id))

                url = tso.create_search_url()

                if len(url) < 450 or try_n == 1:
                    # exit clause
                    too_long = False
                    logging.debug('Number of tickers combined: {}'.format(
                        len(combine)))
                    logging.debug(combine)
                    logging.debug(url)
                else:
                    try_n -= 1

            yield tso
            section = section.iloc[try_n:]
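A hypothetical consumer of this generator, pairing each yielded TwitterSearchOrder with an authenticated client (credentials and per-tweet handling are placeholders):

api = TwitterSearch.TwitterSearch(consumer_key='...', consumer_secret='...',
                                  access_token='...', access_token_secret='...')
for tso in generate_tso(keywords, 'tweets.db'):
    for tweet in api.search_tweets_iterable(tso):
        print(tweet['text'])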
Example #10
    def __init__(self):

        print('Gathering tweets with political context...')
        get = gd.get_data()
        mod = md.modify_data()
        api = get.api()
        tso = ts.TwitterSearchOrder()
        # request untruncated tweets so 'full_text' is available
        tso.arguments.update({'tweet_mode': 'extended'})
        res_list = []
        res_dict = {}
        json_data = {}
        senators = get.senators()
        concerns = get.concerns()
        coordinates = get.coordinates()

        for senator in senators:
            json_data[senator] = {}
            print('Gathering tweets mentioning ' + senator + '...')

            for concern in concerns:
                json_data[senator][concern] = []
                # a concern entry holds two or three comma-separated
                # keyword variants
                con_en = concern.split(',')[0]
                try:
                    con_tl = concern.split(', ')[1]
                    con_cb = concern.split(', ')[2]
                    con_list = [con_en, con_tl, con_cb]
                except IndexError:
                    con_tl = concern.split(', ')[1]
                    con_cb = None
                    con_list = [con_en, con_tl]
                print('\t' + concern + '...')

                for con_item in con_list:
                    tso.set_keywords([senator, con_item])

                    for coordinate in coordinates:
                        tso.set_geocode(coordinate['lat'], coordinate['long'], 5, False)

                        for tweet in api.search_tweets_iterable(tso, callback=self.avoid_rate_limit):
                            # prefer the original tweet's full text for retweets
                            try:
                                tweet_text = tweet['retweeted_status']['full_text']
                                is_retweet = True
                            except KeyError:
                                tweet_text = tweet['full_text']
                                is_retweet = False

                            res_text = tweet['id_str'] + ': ' + tweet_text
                            if res_text not in res_list:
                                res_list.append(res_text)

                                if tweet['is_quote_status']:
                                    if is_retweet:
                                        quote_text = tweet['retweeted_status']['quoted_status']['full_text']
                                    else:
                                        quote_text = tweet['quoted_status']['full_text']
                                else:
                                    quote_text = None

                                tweet_text2 = mod.clean_tweet(tweet_text)
                                tweet_text2 = mod.translate(tweet_text2)

                                if tweet_text2 is None:
                                    continue

                                if quote_text is not None:
                                    quote_text2 = mod.clean_tweet(quote_text)
                                    quote_text2 = mod.translate(quote_text2)
                                else:
                                    quote_text2 = None

                                json_data[senator][concern].append({
                                    'tweet_text': tweet_text,
                                    'tweet_text2': tweet_text2,
                                    'is_retweet': is_retweet,
                                    'quote_text': quote_text,
                                    'quote_text2': quote_text2,
                                    'tweet_id': tweet['id'],
                                    'rt_count': tweet['retweet_count'],
                                    'tweet_created': tweet['created_at'],
                                    'tweet_loc': coordinate['city'],
                                    'user_id': tweet['user']['id'],
                                    'user_created': tweet['user']['created_at'],
                                    'user_verified': tweet['user']['verified'],
                                    'user_follower': tweet['user']['followers_count'],
                                    'user_total_tweet': tweet['user']['statuses_count'],
                                    'user_loc': tweet['user']['location']
                                })

                                res_tweet = mod.remove_stopwords(tweet_text2)
                                if quote_text2 is not None:
                                    res_dict = self.initialize_triangulation(
                                        res_dict, res_tweet + ' ' + quote_text2 + ' ' + coordinate['city'])
                                else:
                                    res_dict = self.initialize_triangulation(
                                        res_dict, res_tweet + ' ' + coordinate['city'])

        print('Saving collected tweets into \"gathered_tweets.json\" file...')
        self.save_tweet(json_data)
        self.save_cleaned_tweet(res_dict)
        print('Finished gathering tweets with political context...')
Example #11
def twitter_search(db_file, output_dir, keywords_file):
    # assumed imports: TwitterSearch, time, sqlite3, logging, pause,
    # pandas as pd and tqdm, plus the local twitter_keys and helpers modules

    ts = TwitterSearch.TwitterSearch(
        consumer_key=twitter_keys.consumer_key,
        consumer_secret=twitter_keys.consumer_secret,
        access_token=twitter_keys.access_token,
        access_token_secret=twitter_keys.access_token_secret)

    start = time.time()
    window_count = 1
    conn = sqlite3.connect(db_file)
    c = conn.cursor()

    if keywords_file:
        keywords = helpers.get_keywords_file(keywords_file)
    else:
        keywords = helpers.get_keywords_sql(db_file)

    pbar = tqdm(keywords)
    for keyword in pbar:
        logging.debug('Getting: ' + keyword)
        # keyword = keyword.replace('/','_')
        pbar.set_description("Processing {:10}".format(keyword))
        pbar.refresh()

        tso = TwitterSearch.TwitterSearchOrder()
        tso.set_include_entities(True)
        tso.set_result_type('recent')
        tso.set_keywords([keyword])

        # only look for tweets since the last search
        c.execute('SELECT max_id FROM latest_search WHERE keyword=?',
                  [keyword])
        fetched = c.fetchone()
        since_id = fetched[0] if fetched is not None else None
        if since_id:
            tso.set_since_id(since_id)

        ts.search_tweets(tso)

        max_id = []
        max_date = []
        min_date = []
        count = []

        try_next = True
        while try_next:
            # parse response
            meta = ts.get_metadata()
            remaining_limit = int(meta.get('x-rate-limit-remaining', 0))
            num_tweets = ts.get_amount_of_tweets()

            tweets = ts.get_tweets().get('statuses', [])
            helpers.write_tweets(tweets, output_dir)

            if num_tweets != 0:
                max_id.append(max([tweet['id'] for tweet in tweets]))
                max_date.append(
                    max([
                        pd.to_datetime(tweet['created_at'], utc=True)
                        for tweet in tweets
                    ]))
                min_date.append(
                    min([
                        pd.to_datetime(tweet['created_at'], utc=True)
                        for tweet in tweets
                    ]))
                count.append(num_tweets)

            if remaining_limit == 0:
                try:
                    # fall back to a full 15-minute window; add 10 extra
                    # seconds to be on the safe side
                    limit_reset = int(meta.get('x-rate-limit-reset',
                                               time.time() + 15 * 60)) + 10
                    # convert to correct datetime
                    limit_reset_dt = pd.to_datetime(limit_reset,
                                                    unit='s',
                                                    utc=True)
                    limit_reset_dt = limit_reset_dt.tz_convert('Europe/London')
                    pbar.set_description(
                        'Sleeping until {:%H:%M:%S}'.format(limit_reset_dt))
                    pbar.refresh()
                    pause.until(limit_reset)
                    pbar.set_description("Processing %s" % keyword)
                    pbar.refresh()
                    window_count += 1
                except Exception as e:
                    logging.warning('limit_reset ERROR: ' + keyword)
                    logging.warning(str(e))
                    logging.warning('Sleep for 15min...')
                    # wait the maximum time until the next window...
                    pbar.set_description("Sleeping for 15 min.")
                    pbar.refresh()

                    pause.minutes(15)

                    pbar.set_description("Processing {:10}".format(keyword))
                    pbar.refresh()
                    window_count += 1

            # check if there is a next page for this search
            try:
                try_next = ts.search_next_results()
            except TwitterSearch.TwitterSearchException:
                # no further pages available
                try_next = False

        # stats and logging for current keyword
        max_id = max(max_id) if len(max_id) != 0 else since_id
        max_date = max(max_date) if len(max_date) != 0 else None
        min_date = min(min_date) if len(min_date) != 0 else None
        count = sum(count)

        search_stats = {
            'keyword': keyword,
            'count': count,
            'min_date': min_date.strftime('%Y-%m-%d %H:%M:%S')
                        if min_date is not None else None,
            'max_date': max_date.strftime('%Y-%m-%d %H:%M:%S')
                        if max_date is not None else None,
            'max_id': max_id,
            # pd.Timestamp.now() avoids pd.datetime, which modern pandas removed
            'search_date': pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S'),
        }

        helpers.dict_to_sqlite(search_stats, 'latest_search', db_file)

    # stats and logging for iteration
    end = time.time()
    total_time = round((end - start) / 60)
    iteration_stats = {
        'start_time': pd.to_datetime(start,
                                     unit='s').strftime('%Y-%m-%d %H:%M:%S'),
        'duration_min': total_time,
        'keywords': len(keywords),
        'tweets_got': ts.get_statistics()[1],
        'queries_submitted': ts.get_statistics()[0],
        'windows_used': window_count,
    }
    helpers.dict_to_sqlite(iteration_stats, 'iterations', db_file)

    logging.info('Total number of windows: ' + str(window_count))
    logging.info('Total time (min): ' + str(total_time))
    logging.info('Total tweets got: ' + str(ts.get_statistics()[1]))

    # close db file
    c.close()
    conn.close()
Example #12
import TwitterSearch as ts

try:
    # create a TwitterSearchOrder object
    tso = ts.TwitterSearchOrder()
    # let's define all words we would like to have a look for
    tso.set_keywords(['Trump'])
    # we want to see English tweets only
    tso.set_language('en')
    # and don't give us all those entity information
    tso.set_include_entities(False)

    # it's about time to create a TwitterSearch object with our secret
    # tokens; bind it to a new name so it does not shadow the module
    # alias needed in the except clause below
    search = ts.TwitterSearch(
        consumer_key='z0JO1aunGAWu0xgxtpOMiw2qx',
        consumer_secret='8vwUONvjOAfBcnNU9X1mtg9YJGPvDLjGZsZnbgs0CWhbOxYZDc',
        access_token='3021210887-iKtdExGlsNC6JNGqsgKdSTgjaKVjyTDLMDLiXKM',
        access_token_secret='GdeRJ504DoANMZqDuE02vO4XFPJcux4pUzzqTCa3Gg6Oj')

    # this is where the fun actually starts :)
    for tweet in search.search_tweets_iterable(tso):
        # print('@%s tweeted: %s' % (
        #     tweet['user']['screen_name'],
        #     tweet['text']))
        print(tweet)

except ts.TwitterSearchException as e:
    # take care of all those ugly errors if there are some
    print(e)