Example #1
from twitter import Api  # python-twitter


class StreamWorker:

    def __init__(self, config, socketio):
        self.config = config
        self.api = Api(
            config["consumer_key"],
            config["consumer_secret"],
            config["access_token"],
            config["access_token_secret"]
        )
        self.socketio = socketio

    def run(self):
        for data in self.api.GetStreamFilter(track=self.config["keywords"],
                                             languages=self.config["languages"]):
            self.process(data)

    def process(self, data):
        keyword = None
        for k in self.config["keywords"]:
            if k in data["text"].lower():
                keyword = k
        if not keyword or data["user"]["lang"] != "en":
            return
        tweet = {'name': data['user']['screen_name'],
                 'text': data['text'],
                 'url': 'https://twitter.com/statuses/' + str(data['id']),
                 'time': data['created_at'],
                 'favorites': data['favorite_count'],
                 'retweets': data['retweet_count'],
                 'keyword': keyword}
        print(tweet['time'])
        print('@%s: %s' % (data['user']['screen_name'],
                           data['text'].encode('ascii', 'ignore').decode('ascii')))
        # broadcast the tweet to all connected clients
        self.socketio.emit('new_tweet', tweet)
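A sketch of how this worker might be wired into a web app. Flask-SocketIO, the module layout, and the placeholder config values are assumptions on top of the example above, not part of the original:

# Hypothetical wiring for StreamWorker; Flask-SocketIO and the config
# values below are assumptions, not part of the original example.
from flask import Flask
from flask_socketio import SocketIO

app = Flask(__name__)
socketio = SocketIO(app)

config = {
    "consumer_key": "...",  # placeholder credentials
    "consumer_secret": "...",
    "access_token": "...",
    "access_token_secret": "...",
    "keywords": ["python"],
    "languages": ["en"],
}

worker = StreamWorker(config, socketio)
# run the blocking stream loop in a background task so the
# web server stays responsive
socketio.start_background_task(worker.run)
socketio.run(app)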
Example #2
import json
import logging

from kafka import KafkaProducer
from twitter import Api

# CONSUMER, CONSUMER_SECRET, ACCESS_TOKEN and ACCESS_TOKEN_SECRET are
# credential constants defined elsewhere in the source module.


class TwitterProducer:
    # logging.basicConfig(level=logging.NOTSET)

    def __init__(self):
        self.api = Api(CONSUMER, CONSUMER_SECRET, ACCESS_TOKEN,
                       ACCESS_TOKEN_SECRET)
        self._producer = KafkaProducer(bootstrap_servers='localhost:9092',
                                       max_in_flight_requests_per_connection=5,
                                       acks='all',
                                       api_version=(0, 10),
                                       retries=100000000000,
                                       compression_type='snappy',
                                       linger_ms=20,
                                       batch_size=32 * 1024)

    def get_timeline(self):
        for msg in self.api.GetStreamFilter(track=['bioinformatics', 'trump']):
            print(msg)
            key = bytes(str(msg.get('id', 'id')), encoding='utf-8')
            info = {
                'text': msg.get('text', 'text'),
                'followers': msg['user']['followers_count']
            }
            val = json.dumps(info, indent=2).encode('utf-8')
            logging.info(val)
            # print(val)
            # send(topic, value=..., key=...): pass both by keyword so the
            # key and value are not swapped
            self._producer.send('twitter_topics', key=key, value=val)
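The producer above is only half of the pipeline. A minimal consumer for the same topic might look like the sketch below; the topic name and JSON encoding mirror the producer, while the group id and offset policy are assumptions:

# Minimal consumer for the 'twitter_topics' topic written above;
# group id and offset policy are assumptions.
import json
from kafka import KafkaConsumer

consumer = KafkaConsumer(
    'twitter_topics',
    bootstrap_servers='localhost:9092',
    group_id='tweet-readers',  # assumed group id
    auto_offset_reset='earliest',
    value_deserializer=lambda v: json.loads(v.decode('utf-8')),
)

for record in consumer:
    info = record.value  # dict with 'text' and 'followers'
    print(record.key, info['followers'], info['text'][:80])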
Example #3
class TwitterProducer:
    # logging.basicConfig(level=logging.NOTSET)

    def __init__(self):
        self.api = Api(CONSUMER,
                       CONSUMER_SECRET,
                       ACCESS_TOKEN,
                       ACCESS_TOKEN_SECRET)
        self._producer = KafkaProducer(bootstrap_servers='localhost:9092',
                                       max_in_flight_requests_per_connection=5,
                                       acks='all',
                                       api_version=(0, 10),
                                       retries=100000000000,
                                       compression_type='snappy',
                                       linger_ms=20,
                                       batch_size=32 * 1024)

    def get_timeline(self):
        for msg in self.api.GetStreamFilter(track=['Cleopatra', 'Potus', 'Trump', 'Pelosi']):
            # key = bytes(str(msg.get('id', 'id')), encoding='utf-8')
            val = bytes(str(msg.get('text', 'text')), encoding='utf-8')
            # logging.info(val)
            self._producer.send('twitter_home', val)
Example #4
class TwitterProducer:
    logging.basicConfig(filename='tweets.log', level=logging.NOTSET)

    def __init__(self):
        self.api = Api(CONSUMER, CONSUMER_SECRET, ACCESS_TOKEN,
                       ACCESS_TOKEN_SECRET)
        self._producer = KafkaProducer(bootstrap_servers='localhost:9092',
                                       max_in_flight_requests_per_connection=5,
                                       acks='all',
                                       api_version=(0, 10),
                                       retries=100000000000)

    def run(self):
        self.stream_mentions()

    def stream_mentions(self):
        with open('output.txt', 'a') as f:
            for msg in self.api.GetStreamFilter(track=['@tolumide_ng'],
                                                languages=['en']):
                f.write(json.dumps(msg))
                logging.info(msg)
                # serialize the dict; KafkaProducer values must be bytes
                self._producer.send('twitter_mentions',
                                    value=json.dumps(msg).encode('utf-8'))
                f.write('\n')

    def stream_timeline(self, user):
        with open('timeline.txt', 'a') as f:
            statuses = self.api.GetUserTimeline(user_id=user)
            # print([s.text for s in statuses])
            for s in statuses:
                # f.write(json.dumps(s.text))
                val = bytes(s.text, encoding='utf-8')
                print(val)
                self._producer.send('twitter_timelines', val)
                f.write('\n')

    def get_followers(self, user):
        # note: GetFriends returns the accounts `user` follows;
        # GetFollowers would return the user's followers
        users = self.api.GetFriends(user)
        logging.info([u.name for u in users])
Example #5
    def handle(self, *args, **options):
        track = options['track']
        tracks = [Track.objects.get_or_create(text=t)[0] for t in track]

        CONSUMER_KEY = settings.TWITTER_CONSUMER_KEY
        CONSUMER_SECRET = settings.TWITTER_CONSUMER_SECRET
        # keep credentials out of source control; the settings names below
        # follow the CONSUMER_* pattern above but are assumptions
        ACCESS_TOKEN = settings.TWITTER_ACCESS_TOKEN
        ACCESS_TOKEN_SECRET = settings.TWITTER_ACCESS_TOKEN_SECRET
        # ACCESS_TOKEN, ACCESS_TOKEN_SECRET = get_access_token(CONSUMER_KEY, CONSUMER_SECRET)

        api = Api(CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN, ACCESS_TOKEN_SECRET)

        iterator = api.GetStreamFilter(track=track)

        self.stdout.write('Streaming...', ending='\r')
        i = 0
        for tweet in iterator:
            i += 1
            q = TweetQueue(json=json.dumps(tweet), tracks=','.join(track))
            q.save()
            self.stdout.write('Streaming...{0}'.format(str(i)), ending='\r')
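Tweets land in TweetQueue as raw JSON. A companion command to drain the queue could be sketched as below; the TweetQueue fields come from the example above, but the import path, the processing step, and deleting rows after use are all assumptions:

# Hypothetical companion management command that drains TweetQueue.
import json

from django.core.management.base import BaseCommand

from myapp.models import TweetQueue  # import path is an assumption


class Command(BaseCommand):
    def handle(self, *args, **options):
        for q in TweetQueue.objects.iterator():
            tweet = json.loads(q.json)
            self.stdout.write(tweet.get('text', ''))
            q.delete()  # remove once processed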
Example #6
import json
import time

from twitter import Api

# fill in your own credentials below
consumer_key = None
consumer_secret = None
resource_owner_key = None
resource_owner_secret = None

api = Api(consumer_key=consumer_key,
          consumer_secret=consumer_secret,
          access_token_key=resource_owner_key,
          access_token_secret=resource_owner_secret)

if __name__ == '__main__':
    while True:
        try:
            print('=== Scraper Launched ===')
            # Bounding Box for Switzerland
            Switzerland_LOCATIONS = [
                "5.9559113", "45.817994999999996", "10.4922941", "47.8084648"
            ]
            for tweet in api.GetStreamFilter(locations=Switzerland_LOCATIONS,
                                             stall_warnings=True):
                if tweet and tweet.get('created_at') is not None:
                        with open('data.json', 'a') as outfile:
                            json.dump(tweet, outfile)
                            outfile.write('\n')
                            time.sleep(6)

        except Exception as e:
            print('error (Switzerland): ' + str(e))
            time.sleep(300)
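GetStreamFilter's locations parameter is a flat list of longitude/latitude values, southwest corner first, then northeast, and python-twitter joins them with commas, so each value must be a string. A small helper (the corner names are mine) makes that ordering explicit:

# Sketch: build a GetStreamFilter bounding box from two corners.
# python-twitter expects [sw_lng, sw_lat, ne_lng, ne_lat] as strings.
def bounding_box(sw_lng, sw_lat, ne_lng, ne_lat):
    return [str(sw_lng), str(sw_lat), str(ne_lng), str(ne_lat)]

SWITZERLAND = bounding_box(5.9559113, 45.817994999999996,
                           10.4922941, 47.8084648)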
Example #7
import json
import os
import sys
from datetime import date, datetime

# getLocation, loadConfig, write_to_file, make_sure_path_exists and the
# credential/COUNTRIES/DAY_CYCLE constants are defined elsewhere in the
# source module.


def main():
    arglen = len(sys.argv)
    USING_TWITTER = False
    if arglen == 3:
        directory = sys.argv[1]
        country_code = sys.argv[2]
        LOCATIONS, selected = getLocation(country_code)
        USING_TWITTER = True
    elif arglen == 2:
        directory = sys.argv[1]
    else:
        print('Usage: directory name, plus an optional country code '
              '{US, UK, AU, NZ, SEA, AF}')
        return
    if directory != '':
        directory = directory + '/'
    if USING_TWITTER:
        loadConfig('config_secret.json')
        # Since we're going to be using a streaming endpoint, there is no need to worry
        # about rate limits.
        api = Api(CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN,
                  ACCESS_TOKEN_SECRET)
        # api.GetStreamFilter will return a generator that yields one status
        # message (i.e., Tweet) at a time as a JSON dictionary.
    try:
        today = date.today()
        if USING_TWITTER:
            count_day = 0
            counter = 0
            count_thousands = 0
            print(country_code)
            print(today)
            str_out = ''
            while True:
                for line in api.GetStreamFilter(locations=LOCATIONS):
                    # warning: "limit"
                    try:
                        if date.today() != today:
                            # Change day
                            today = date.today()
                            try:
                                print('[{0}] Processed {1:,} tweets'.format(
                                    str(datetime.now()),
                                    count_thousands * 1000 + counter))
                                print('--- End of the day ---')
                            except Exception:
                                pass
                            counter = 0
                            count_thousands = 0
                            count_day += 1
                            print(today)
                            # Write remaining data into file
                            if str_out != '':
                                write_to_file(f_complete, str_out)
                            str_out = ''
                            if count_day == DAY_CYCLE:
                                count_day = 0
                                # Change the countries
                                selected = (selected + 1) % len(COUNTRIES)
                                country_code = COUNTRIES[selected]
                                LOCATIONS, selected = getLocation(country_code)
                                print(country_code)
                                break
                        # Write json to file
                        f_complete = '{0}/logs/log_{1}_{2}.txt'.format(
                            directory, country_code, today)
                        # print(json.dumps(line))
                        str_out = '{0}{1}\n'.format(str_out, json.dumps(line))
                        # Counter
                        counter = counter + 1
                        if counter % 25 == 0:
                            if str_out != '':
                                write_to_file(f_complete, str_out)
                            str_out = ''
                        if counter % 1000 == 0 and counter > 0:
                            counter = 0
                            count_thousands = count_thousands + 1
                            print('[{0}] Processed {1},000 tweets'.format(
                                str(datetime.now()), count_thousands))
                    except Exception as ex:
                        f_error = '{0}/logs/error_{1}.txt'.format(
                            directory, str(today))
                        with open(f_error, 'a') as fw:
                            fw.write('[{0}] Line Exception {1}\n'.format(
                                str(datetime.now()), ex))
                            fw.write('[{0}] {1}\n'.format(
                                str(datetime.now()), line))
        else:
            # Loop through os files
            # and create similar filename but using csv
            # Extract json and write into csv file
            for subdir, dirs, files in os.walk(directory):
                for file in files:
                    if file.startswith('log'):
                        print('[{0}] Processing file : {1}'.format(
                            str(datetime.now()), file))
                        with open(directory + file, 'r') as fin:
                            for line in fin:
                                try:
                                    extract_line(directory, today, line)
                                except Exception:
                                    pass
            pass
        print('Program finished ')
    except Exception as ex:
        f_error = '{0}/logs/error_{1}.txt'.format(directory, str(today))
        make_sure_path_exists(directory + '/logs')
        write_to_file(
            f_error,
            '[{0}] Outer Exception {1}\n'.format(str(datetime.now()), ex))
Example #8
import os
import memcache
from twitter import Api

MAX_TWEETS = 10
TAGS = ['#vldc', '#gdgvl']

if __name__ == '__main__':

    # default server address is an assumption; Client([None]) would fail
    MEMCACHE_SERVER = os.getenv("MEMCACHE_SERVER", "127.0.0.1:11211")
    mc = memcache.Client([MEMCACHE_SERVER])
    api = Api(os.environ["CONSUMER_KEY"], os.environ["CONSUMER_SECRET"],
              os.environ["ACCESS_TOKEN"], os.environ["ACCESS_TOKEN_SECRET"])

    for tweet in api.GetStreamFilter(track=TAGS):
        tweets = mc.get("last_tweets") or []
        # note: created_at is Twitter's string timestamp, so this sort is
        # lexicographic rather than strictly chronological
        tweets = sorted(tweets, key=lambda x: x["created_at"],
                        reverse=True)[:MAX_TWEETS - 1]
        tweets = [tweet] + tweets
        mc.set("last_tweets", tweets)
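Reading the rolling list back out of memcache is symmetric. A display sketch (the server address handling mirrors the example; the formatting is mine):

# Sketch: read the rolling tweet list stored by the loop above.
import os
import memcache

mc = memcache.Client([os.getenv("MEMCACHE_SERVER", "127.0.0.1:11211")])
for tweet in mc.get("last_tweets") or []:
    print(tweet["created_at"], tweet["text"])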
Example #9
import os
import json

from twitter import Api

api = Api(consumer_key='',
          consumer_secret='',
          access_token_key='',
          access_token_secret='')

filter1 = [
    '#iphone7', '#iphone6s', '#iphone6splus', '#googlepixel', '#pixel',
    '#iphone6', '#iphone6plus', '#galaxys7', '#lgg5', '#pixel', '#samsungs7',
    '#iphone7plus'
]

if __name__ == '__main__':
    # append tweets to the file as they stream in
    with open("tweets.txt", "a") as f:
        for line in api.GetStreamFilter(track=filter1):
            print(str(line['id']) + "," + line['text'])
            f.write(str(line['id']) + "," + line['text'] + "\n")
    keywords = [
        '#iphone7'
    ]  #, '#iphone7plus', '#iphone6s', '#iphone6splus', '#iphone6', '#iphone6plus', '#galaxys7', '#lgg5', '#googlepixel', '#googlepixelxl']
Example #10
import logging
import re

from OmegaExpansion import onionI2C
from twitter import Api

# load_config and the config.ini keys are defined elsewhere in the
# source module.

if __name__ == '__main__':

    logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)

    i2c = onionI2C.OnionI2C()

    k = load_config("config.ini")
    api = Api(k.get("twitter", "consumer_key"),
              k.get("twitter", "consumer_secret"),
              k.get("twitter", "access_token"),
              k.get("twitter", "access_token_secret"))

    while True:
        try:
            # GetStreamFilter expects a list of user ids; the config value
            # is assumed to be a comma-separated string
            user_ids = k.get("twitter", "user_ids").split(',')
            for tweet in api.GetStreamFilter(follow=user_ids):
                text = tweet.get("text")
                if text is None:
                    continue
                text = text.lower()
                text = re.sub(r'https?:\S+', '', text)  # remove urls
                text = re.sub('#', '', text)  # remove hashtags
                text = re.sub(r'\s{2,}', ' ', text).strip()  # collapse runs of spaces
                # ignore replies and retweets
                if tweet.get("in_reply_to_screen_name") is not None or \
                   tweet.get("in_reply_to_status_id") is not None or \
                   tweet.get("retweeted_status") is not None or \
                   len(text) < 5:
                    continue
Example #11
# `config`, `pymysql`, and the authenticated `api` object come from
# earlier notebook cells that are not shown here.
rds_host = config.db_endpoint
name = config.db_username
password = config.db_password
db_name = config.db_name
port = 3306

USERS = ['@mlstylephoto']

while True:
    try:
        for line in api.GetStreamFilter(track=USERS):
            tweet = line
            media = tweet.get('extended_entities', {}).get('media', [])
            user = tweet.get('user', {}).get('screen_name', [])
            time = str(tweet.get('created_at'))
            tweet_id = tweet.get('id')
            complete = 0
            print(tweet_id)
            if media:
                pic = [item['media_url'] for item in media]
                url_1 = pic[0]
                # original code assumed at least two media items; guard
                # against single-image tweets
                url_2 = pic[1] if len(pic) > 1 else None
            try:
                # the original snippet breaks off inside this call; the
                # keyword arguments and the except clause below are a
                # plausible completion using the variables defined above
                conn = pymysql.connect(rds_host, user=name,
                                       passwd=password, db=db_name,
                                       port=port)
            except pymysql.MySQLError as e:
                print(e)
Example #12
    # the opening lines of this snippet (loading a secrets JSON file) are
    # missing; the file name here is an assumption
    with open('secrets.json') as secrets_file:
        secrets = json.load(secrets_file)
    db = dataset.connect('sqlite:///tweets.db')
    tweets_table = db['tweets']
    twitter_secrets = secrets['twitter']
    CONSUMER_KEY = twitter_secrets['key']
    CONSUMER_SECRET = twitter_secrets['secret']
    ACCESS_TOKEN = twitter_secrets['token']
    ACCESS_TOKEN_SECRET = twitter_secrets['token_secret']

    api = Api(CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN, ACCESS_TOKEN_SECRET)

    cache = []

    while True:
        try:
            test_iter = api.GetStreamFilter(
                track=['bitcoin', 'BTC', 'cryptocurrency', 'crypto'])
            for tweet in test_iter:
                tweet_tuples = []
                tweet_tuples, missing_flag = filter_dict(
                    tweet, tweet_field_matches)
                tweet_dict = dict(tweet_tuples)
                if missing_flag:
                    print(tweet)
                    continue
                cache.append(tweet_dict)
                if len(cache) > 400:
                    tweets_table.insert_many(cache)
                    print(f"{pd.to_datetime('now')}: Added {len(cache)} to db")
                    del cache[:]
        except json.decoder.JSONDecodeError as ex:
            print(f"Got Decoder Exception {ex}")