# Streams tweets matching configured keywords and broadcasts each match
# to connected Socket.IO clients.
from twitter import Api


class StreamWorker:
    def __init__(self, config, socketio):
        self.config = config
        self.api = Api(
            config["consumer_key"],
            config["consumer_secret"],
            config["access_token"],
            config["access_token_secret"]
        )
        self.socketio = socketio

    def run(self):
        for data in self.api.GetStreamFilter(track=self.config["keywords"],
                                             languages=self.config["languages"]):
            self.process(data)

    def process(self, data):
        # Find which tracked keyword matched this tweet.
        keyword = None
        for k in self.config["keywords"]:
            if k in data["text"].lower():
                keyword = k
        if not keyword or data["user"]["lang"] != "en":
            return
        tweet = {
            'name': data['user']['screen_name'],
            'text': data['text'],
            'url': 'https://twitter.com/statuses/' + str(data['id']),
            'time': data['created_at'],
            'favorites': data['favorite_count'],
            'retweets': data['retweet_count'],
            'keyword': keyword
        }
        print(tweet['time'])
        print('@%s: %s' % (data['user']['screen_name'],
                           data['text'].encode('ascii', 'ignore')))
        # Broadcast the tweet to all connected clients.
        self.socketio.emit('new_tweet', tweet)
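A minimal sketch of the receiving side, assuming the server above is Flask-SocketIO listening on localhost:5000 (the host, the port, and the use of the python-socketio client package are assumptions, not part of the original):

# Hypothetical client for the 'new_tweet' events broadcast above.
import socketio

sio = socketio.Client()


@sio.on('new_tweet')
def on_new_tweet(tweet):
    # tweet is the dict built in StreamWorker.process().
    print('@%s on %s: %s' % (tweet['name'], tweet['keyword'], tweet['text']))


sio.connect('http://localhost:5000')
sio.wait()  # block and keep receiving events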
# Publishes tweet text and follower counts to the 'twitter_topics'
# Kafka topic, keyed by tweet id. CONSUMER, CONSUMER_SECRET,
# ACCESS_TOKEN and ACCESS_TOKEN_SECRET are expected to be defined
# elsewhere in the module.
import json
import logging

from kafka import KafkaProducer
from twitter import Api


class TwitterProducer:
    # logging.basicConfig(level=logging.NOTSET)

    def __init__(self):
        self.api = Api(CONSUMER, CONSUMER_SECRET, ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
        self._producer = KafkaProducer(bootstrap_servers='localhost:9092',
                                       max_in_flight_requests_per_connection=5,
                                       acks='all',
                                       api_version=(0, 10),
                                       retries=100000000000,  # effectively retry forever
                                       compression_type='snappy',
                                       linger_ms=20,
                                       batch_size=32 * 1024)

    def get_timeline(self):
        for msg in self.api.GetStreamFilter(track=['bioinformatics', 'trump']):
            print(msg)
            key = bytes(str(msg.get('id', 'id')), encoding='utf-8')
            info = {
                'text': msg.get('text', 'text'),
                'followers': msg['user']['followers_count']
            }
            val = json.dumps(info, indent=2).encode('utf-8')
            logging.info(val)
            # KafkaProducer.send() takes value before key, so pass both
            # by keyword; the original positional call swapped them.
            self._producer.send('twitter_topics', key=key, value=val)
# Variant of the producer above that publishes only the raw tweet text
# to the 'twitter_home' topic.
import logging

from kafka import KafkaProducer
from twitter import Api


class TwitterProducer:
    # logging.basicConfig(level=logging.NOTSET)

    def __init__(self):
        self.api = Api(CONSUMER, CONSUMER_SECRET, ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
        self._producer = KafkaProducer(
            bootstrap_servers='localhost:9092',
            max_in_flight_requests_per_connection=5,
            acks='all',
            api_version=(0, 10),
            retries=100000000000,
            compression_type='snappy',
            linger_ms=20,
            batch_size=32 * 1024)

    def get_timeline(self):
        for msg in self.api.GetStreamFilter(track=['Cleopatra', 'Potus', 'Trump', 'Pelosi']):
            # key = bytes(str(msg.get('id', 'id')), encoding='utf-8')
            val = bytes(str(msg.get('text', 'text')), encoding='utf-8')
            # logging.info(val)
            self._producer.send('twitter_home', val)
# Streams mentions of @tolumide_ng into Kafka and a local file, and can
# also replay a user's timeline into a separate topic.
import json
import logging

from kafka import KafkaProducer
from twitter import Api


class TwitterProducer:
    # The original logged to 'tweets.py', which would clobber a source file.
    logging.basicConfig(filename='tweets.log', level=logging.NOTSET)

    def run(self):
        self.stream_mentions()

    # def _producer(self):
    #     pass

    def __init__(self):
        self.api = Api(CONSUMER, CONSUMER_SECRET, ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
        self._producer = KafkaProducer(bootstrap_servers='localhost:9092',
                                       max_in_flight_requests_per_connection=5,
                                       acks='all',
                                       api_version=(0, 10),
                                       retries=100000000000)

    def stream_mentions(self):
        with open('output.txt', 'a') as f:
            for msg in self.api.GetStreamFilter(track=['@tolumide_ng'], languages=['en']):
                f.write(json.dumps(msg))
                logging.info(msg)
                # Kafka values must be bytes; the original passed the raw
                # dict (and in the key position).
                self._producer.send('twitter_mentions',
                                    json.dumps(msg).encode('utf-8'))
                f.write('\n')

    def stream_timeline(self, user):
        with open('timeline.txt', 'a') as f:
            statuses = self.api.GetUserTimeline(user_id=user)
            for s in statuses:
                val = bytes(s.text, encoding='utf-8')
                print(val)
                self._producer.send('twitter_timelines', val)
                f.write('\n')
            # No explicit close needed: the with-block closes the file.

    def get_followers(self, user):
        users = self.api.GetFriends(user)
        logging.info([u.name for u in users])
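The three producers above only publish. A minimal consumer sketch for checking that records arrive, assuming the same localhost broker and the 'twitter_mentions' topic used above:

# Hypothetical consumer for the topics written by the producers above.
from kafka import KafkaConsumer

consumer = KafkaConsumer('twitter_mentions',
                         bootstrap_servers='localhost:9092',
                         auto_offset_reset='earliest')
for record in consumer:
    # record.value holds the bytes payload sent by the producer.
    print(record.value.decode('utf-8'))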
# Django management-command handler: stream tweets matching the tracked
# terms and queue them for later processing.
def handle(self, *args, **options):
    track = options['track']
    # Ensure a Track row exists for each tracked term.
    tracks = [Track.objects.get_or_create(text=t)[0] for t in track]

    CONSUMER_KEY = settings.TWITTER_CONSUMER_KEY
    CONSUMER_SECRET = settings.TWITTER_CONSUMER_SECRET
    # Note: these tokens are hardcoded in the original; they belong in
    # settings alongside the consumer key/secret above.
    ACCESS_TOKEN = '2163588176-vrKoKmhLRwanWjEXUw7Zx6KhMMZrEy7jGa1MH3S'
    ACCESS_TOKEN_SECRET = 'asQYfiYM4YermUgoswAdkCjcSDS31kwji8APRZ18Zgu57'
    # ACCESS_TOKEN, ACCESS_TOKEN_SECRET = get_access_token(CONSUMER_KEY, CONSUMER_SECRET)

    api = Api(CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
    iterator = api.GetStreamFilter(track=track)

    self.stdout.write('Streaming...', ending='\r')
    i = 0
    for tweet in iterator:
        i += 1
        q = TweetQueue(json=json.dumps(tweet), tracks=','.join(track))
        q.save()
        self.stdout.write('Streaming...{0}'.format(str(i)), ending='\r')
# Continuously streams geotagged tweets from a bounding box covering
# Switzerland and appends them to data.json, restarting on any error.
import json
import time

from twitter import Api

consumer_key = None
consumer_secret = None
resource_owner_key = None
resource_owner_secret = None

api = Api(consumer_key=consumer_key,
          consumer_secret=consumer_secret,
          access_token_key=resource_owner_key,
          access_token_secret=resource_owner_secret)

if __name__ == '__main__':
    while True:
        try:
            print('=== Scrapper Launched ===')
            # Bounding box for Switzerland
            Switzerland_LOCATIONS = [
                "5.9559113", "45.817994999999996",
                "10.4922941", "47.8084648"
            ]
            for tweet in api.GetStreamFilter(locations=Switzerland_LOCATIONS,
                                             stall_warnings=True):
                if tweet and tweet.get('created_at') is not None:
                    with open('data.json', 'a') as outfile:
                        json.dump(tweet, outfile)
                        outfile.write('\n')
                    time.sleep(6)
        except Exception as e:
            print('error (Switzerland): ' + str(e))
            time.sleep(300)
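GetStreamFilter's locations parameter takes each bounding box as four values ordered south-west longitude, south-west latitude, north-east longitude, north-east latitude. A small sketch for sanity-checking returned coordinates against that box (the helper name is ours; Twitter also matches tweets whose place merely overlaps the box):

# Hypothetical helper: is a (longitude, latitude) point inside a
# bounding box given as [sw_lng, sw_lat, ne_lng, ne_lat]?
def in_bbox(lng, lat, box):
    sw_lng, sw_lat, ne_lng, ne_lat = map(float, box)
    return sw_lng <= lng <= ne_lng and sw_lat <= lat <= ne_lat


box = ["5.9559113", "45.817994999999996", "10.4922941", "47.8084648"]
print(in_bbox(7.4474, 46.9480, box))   # True: Bern
print(in_bbox(2.3522, 48.8566, box))   # False: Paris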
# Streams geotagged tweets for a rotating list of countries, writing the
# raw JSON to daily log files; run without a country code, it instead
# post-processes existing log files. getLocation, loadConfig,
# write_to_file, extract_line, make_sure_path_exists, COUNTRIES and
# DAY_CYCLE are defined elsewhere in the module.
import json
import os
import sys
from datetime import date, datetime

from twitter import Api


def main():
    arglen = len(sys.argv)
    USING_TWITTER = False
    if arglen == 3:
        directory = sys.argv[1]
        country_code = sys.argv[2]
        LOCATIONS, selected = getLocation(country_code)
        USING_TWITTER = True
    elif arglen == 2:
        directory = sys.argv[1]
    else:
        print('Please give two inputs: directory name and country code '
              '{US, UK, AU, NZ, SEA, AF}')
        return

    if directory != '':
        directory = directory + '/'

    if USING_TWITTER:
        loadConfig('config_secret.json')
        # Since we're going to be using a streaming endpoint, there is no
        # need to worry about rate limits.
        api = Api(CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN, ACCESS_TOKEN_SECRET)

    # api.GetStreamFilter will return a generator that yields one status
    # message (i.e., Tweet) at a time as a JSON dictionary.
    try:
        today = date.today()
        if USING_TWITTER:
            count_day = 0
            counter = 0
            count_thousands = 0
            print(country_code)
            print(today)
            str_out = ''
            while True:
                for line in api.GetStreamFilter(locations=LOCATIONS):  # warning: "limit"
                    try:
                        if date.today() != today:
                            # Change day
                            today = date.today()
                            try:
                                print('[{0}] Processed {1:,} tweets'.format(
                                    str(datetime.now()),
                                    count_thousands * 1000 + counter))
                                print('--- End of the day ---')
                            except:
                                pass
                            counter = 0
                            count_thousands = 0
                            count_day += 1
                            print(today)
                            # Write remaining data into file
                            if str_out != '':
                                write_to_file(f_complete, str_out)
                                str_out = ''
                            if count_day == DAY_CYCLE:
                                count_day = 0
                                # Change the countries
                                selected = (selected + 1) % len(COUNTRIES)
                                country_code = COUNTRIES[selected]
                                LOCATIONS, selected = getLocation(country_code)
                                print(country_code)
                                break

                        # Write json to file
                        f_complete = '{0}/logs/log_{1}_{2}.txt'.format(
                            directory, country_code, today)
                        # print(json.dumps(line))
                        str_out = '{0}{1}\n'.format(str_out, json.dumps(line))

                        # Counter: flush every 25 tweets, report every 1,000.
                        counter = counter + 1
                        if counter % 25 == 0:
                            if str_out != '':
                                write_to_file(f_complete, str_out)
                                str_out = ''
                        if counter % 1000 == 0 and counter > 0:
                            counter = 0
                            count_thousands = count_thousands + 1
                            print('[{0}] Processed {1},000 tweets'.format(
                                str(datetime.now()), count_thousands))
                    except Exception as ex:
                        f_error = '{0}/logs/error_{1}.txt'.format(
                            directory, str(today))
                        with open(f_error, 'a') as fw:
                            fw.write('[{0}] Line Exception {1}\n'.format(
                                str(datetime.now()), ex))
                            fw.write('[{0}] {1}\n'.format(
                                str(datetime.now()), line))
        else:
            # Loop through the log files in the directory, extract the
            # JSON and write it into CSV files.
            for subdir, dirs, files in os.walk(directory):
                for file in files:
                    if file.startswith('log'):
                        print('[{0}] Processing file : {1}'.format(
                            str(datetime.now()), file))
                        with open(directory + file, 'r') as fin:
                            for line in fin:
                                try:
                                    extract_line(directory, today, line)
                                except:
                                    pass
            print('Program finished ')
    except Exception as ex:
        f_error = '{0}/logs/error_{1}.txt'.format(directory, str(today))
        make_sure_path_exists(directory + '/logs')
        write_to_file(
            f_error,
            '[{0}] Outer Exception {1}\n'.format(str(datetime.now()), ex))
# Keeps the most recent tweets for the tracked hashtags in memcache.
import os

import memcache
from twitter import Api

MAX_TWEETS = 10
TAGS = ['#vldc', '#gdgvl']

if __name__ == '__main__':
    MEMCACHE_SERVER = os.getenv("MEMCACHE_SERVER", None)
    mc = memcache.Client([MEMCACHE_SERVER])

    api = Api(os.environ["CONSUMER_KEY"],
              os.environ["CONSUMER_SECRET"],
              os.environ["ACCESS_TOKEN"],
              os.environ["ACCESS_TOKEN_SECRET"])

    for tweet in api.GetStreamFilter(track=TAGS):
        # Prepend the new tweet, keeping the MAX_TWEETS - 1 newest of
        # the cached ones.
        tweets = mc.get("last_tweets") or []
        tweets = sorted(tweets, key=lambda x: x["created_at"],
                        reverse=True)[:MAX_TWEETS - 1]
        tweets = [tweet] + tweets
        mc.set("last_tweets", tweets)
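A matching sketch for the read side, assuming the same "last_tweets" key; the default server address here is an assumption:

# Hypothetical read side: fetch the list cached above.
import os

import memcache

mc = memcache.Client([os.getenv("MEMCACHE_SERVER", "127.0.0.1:11211")])
for tweet in (mc.get("last_tweets") or []):
    print(tweet["created_at"], tweet["text"])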
# Streams tweets for a set of phone-related hashtags, printing each one
# as "id,text" (converted from the original Python 2 print statement).
import json

from twitter import Api

api = Api(consumer_key='',
          consumer_secret='',
          access_token_key='',
          access_token_secret='')

filter1 = [
    '#iphone7', '#iphone6s', '#iphone6splus', '#googlepixel', '#pixel',
    '#iphone6', '#iphone6plus', '#galaxys7', '#lgg5', '#pixel',
    '#samsungs7', '#iphone7plus'
]

if __name__ == '__main__':
    f = open("tweets.txt", "a")
    for line in api.GetStreamFilter(track=filter1):
        out = str(line['id']) + "," + line['text'] + "\n"
        print(out)
        # The original opened tweets.txt but never wrote to it; appending
        # each line is presumably the intent (assumption).
        f.write(out)

keywords = ['#iphone7']
#, '#iphone7plus', '#iphone6s', '#iphone6splus', '#iphone6', '#iphone6plus', '#galaxys7', '#lgg5', '#googlepixel', '#googlepixelxl']
# Watches a set of user ids and normalizes each tweet's text, skipping
# replies, retweets and very short tweets. load_config is defined
# elsewhere and returns a configparser-style object; onionI2C comes
# from Onion's OmegaExpansion package (assumption).
import logging
import re

from OmegaExpansion import onionI2C
from twitter import Api

if __name__ == '__main__':
    logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)
    i2c = onionI2C.OnionI2C()
    k = load_config("config.ini")
    api = Api(k.get("twitter", "consumer_key"),
              k.get("twitter", "consumer_secret"),
              k.get("twitter", "access_token"),
              k.get("twitter", "access_token_secret"))
    while True:
        try:
            for tweet in api.GetStreamFilter(
                    follow=k.get("twitter", "user_ids")):
                text = tweet.get("text")
                if text is None:
                    continue
                text = text.lower()
                text = re.sub(r'http(s)?:[^\s]+', '', text)  # remove urls
                text = re.sub('#', '', text)  # remove hashtags
                text = re.sub(r'[\s]{2}', ' ', text).strip()  # trim extra spaces
                # ignore replies and retweets
                if tweet.get("in_reply_to_screen_name") is not None or \
                        tweet.get("in_reply_to_status_id") is not None or \
                        tweet.get("retweeted_status") is not None or \
                        len(text) < 5:
                    continue
                # (snippet truncated in the original)
# Streams tweets mentioning a user and records attached media URLs in a
# MySQL database; the snippet is truncated mid-connection below.
rds_host = config.db_endpoint
name = config.db_username
password = config.db_password
db_name = config.db_name
port = 3306

USERS = ['@mlstylephoto']

while True:
    try:
        for line in api.GetStreamFilter(track=USERS):
            tweet = line
            media = tweet.get('extended_entities', {}).get('media', [])
            user = tweet.get('user', {}).get('screen_name', [])
            time = str(tweet.get('created_at'))
            tweet_id = tweet.get('id')
            complete = 0
            print(tweet_id)
            if len(media) == 0:
                pass
            else:
                pic = [item['media_url'] for item in media]
                url_1 = str(pic[0].encode("utf-8"))
                # Note: pic[1] raises IndexError for single-image tweets;
                # this assumes at least two media items.
                url_2 = str(pic[1].encode("utf-8"))
                try:
                    conn = pymysql.connect(rds_host,
# Batches crypto-related tweets and inserts them into a local SQLite
# database 400 at a time. filter_dict and tweet_field_matches are
# defined elsewhere and map raw tweet fields to the row to store;
# secrets_file is a file object opened earlier.
import json

import dataset
import pandas as pd
from twitter import Api

secrets = json.load(secrets_file)

db = dataset.connect('sqlite:///tweets.db')
tweets_table = db['tweets']

twitter_secrets = secrets['twitter']
CONSUMER_KEY = twitter_secrets['key']
CONSUMER_SECRET = twitter_secrets['secret']
ACCESS_TOKEN = twitter_secrets['token']
ACCESS_TOKEN_SECRET = twitter_secrets['token_secret']

api = Api(CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN, ACCESS_TOKEN_SECRET)

cache = []
while True:
    try:
        test_iter = api.GetStreamFilter(
            track=['bitcoin', 'BTC', 'cryptocurrency', 'crypto'])
        for tweet in test_iter:
            tweet_tuples, missing_flag = filter_dict(tweet, tweet_field_matches)
            tweet_dict = dict(tweet_tuples)
            if missing_flag:
                print(tweet)
                continue
            cache.append(tweet_dict)
            # Flush to the database in batches of 400.
            if len(cache) > 400:
                tweets_table.insert_many(cache)
                print(f"{pd.to_datetime('now')}: Added {len(cache)} to db")
                del cache[:]
    except json.decoder.JSONDecodeError as ex:
        print(f"Got Decoder Exception {ex}")
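To inspect what landed in the database, a minimal read-back sketch using the same dataset library (the row columns depend on whatever filter_dict produced):

# Hypothetical read-back for tweets.db.
import dataset

db = dataset.connect('sqlite:///tweets.db')
tweets_table = db['tweets']
print(len(tweets_table))          # rows inserted so far
for row in tweets_table.find(_limit=3):
    print(row)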