def query_user_friends_ids(output, id_list, auth_file):
    '''
    Queries Twitter for the friend ids of each user id in id_list and
    writes one JSON object per friend id to the output file.

    output -- path of the file to write results to
    id_list -- iterable of Twitter user ids (strings); empty ids are skipped
    auth_file -- path to the credentials file used to build the API pool
    '''
    logger = logging.getLogger(__name__)
    num_inputs_queried = 0
    # create the api pool
    api_pool = TweepyPool(auth_file)
    # with-statement guarantees the output file is closed even if an
    # exception other than TweepError escapes the loop (the original
    # leaked the handle in that case)
    with open(output, 'w+') as write_fd:
        for userid in id_list:
            num_inputs_queried = num_inputs_queried + 1
            if userid:
                count = 0
                try:
                    for item in Cursor(api_pool.friends_ids, id=userid).items():
                        logger.debug('user id: {}'.format(item))
                        count = count + 1
                        tweet_item = {
                            'id': item,
                            'smapp_original_user_id': userid,
                            'smapp_timestamp': datetime.datetime.utcnow()
                            .strftime('%Y-%m-%d %H:%M:%S +0000'),
                        }
                        write_fd.write(json.dumps(tweet_item))
                        write_fd.write('\n')
                except TweepError as e:
                    # best-effort: log the API failure and move on to the
                    # next input rather than aborting the whole run
                    logger.info('tweepy error: %s', e)
                logger.info('counted %s objects for input %s', count, userid)
                logger.info('number of inputs queried so far: %s',
                            num_inputs_queried)
def twitter_query(context):
    '''
    Gets user ids, and feeds them into a function to query twitter.

    context -- dict with keys 'input', 'auth', 'start_idx_input',
               'start_idx_api', 'log', 's3_log' (per the reads below);
               results for each user are written to a local file and
               shipped to s3, then the local copy is removed.
    '''
    input_file = context['input']
    auth_file = context['auth']
    id_list = get_id_list(input_file)
    offset = context['start_idx_input']
    # NOTE(review): start_idx_api is read but never used in this
    # function — confirm whether it is still needed
    start_idx = context['start_idx_api']
    log('Creating oauth pool...')
    api_pool = TweepyPool(auth_file)
    for i, user_id in enumerate(id_list[offset:]):
        filename, s3_filename = get_user_id_file(user_id, context)
        # skip users already queried and uploaded in a previous run
        if not s3.file_exists(s3_filename):
            # fixed: the original format string had one placeholder but
            # two arguments, so the destination filename was silently
            # dropped from the log message
            log('writing user id: {} here: {}'.format(user_id, filename))
            with open(filename, 'w+') as write_fd:
                for item in Cursor(api_pool.friends, id=user_id,
                                   count=5000).items():
                    # round-trip through json to get a plain dict copy
                    tweet_item = json.loads(json.dumps(item._json))
                    tweet_item['smapp_original_user_id'] = user_id
                    tweet_item['smapp_timestamp'] = datetime.datetime.utcnow(
                    ).strftime('%Y-%m-%d %H:%M:%S +0000')
                    write_fd.write(json.dumps(tweet_item) + '\n')
            log('Sending file to s3: {}'.format(s3_filename))
            s3.disk_2_s3(filename, s3_filename)
            s3.disk_2_s3(context['log'], context['s3_log'])
            os.remove(filename)
        else:
            log('{} already queried!!!'.format(user_id))
        log('>>> {} out of {}'.format(i + offset, len(id_list)))
        # small pause between users to be gentle on the rate limits
        time.sleep(1)
def query_user_tweets(output, id_list, auth_file):
    '''
    Queries Twitter for the timeline tweets of each user id in id_list
    and writes one JSON object per tweet to the output file.

    output -- path of the file to write results to
    id_list -- iterable of Twitter user ids (strings); empty ids are skipped
    auth_file -- path to the credentials file used to build the API pool
    '''
    logger = logging.getLogger(__name__)
    num_inputs_queried = 0
    # create the api pool
    api_pool = TweepyPool(auth_file)
    # with-statement guarantees the output file is closed even if an
    # exception other than TweepError escapes the loop (the original
    # leaked the handle in that case)
    with open(output, 'w+') as write_fd:
        for userid in id_list:
            num_inputs_queried = num_inputs_queried + 1
            # even though the count is 200 we can cycle through 3200 items.
            # if you put a count variable in this cursor it will iterate up
            # to about 3200
            if userid:
                count = 0
                try:
                    for item in Cursor(api_pool.user_timeline,
                                       user_id=userid, count=200).items():
                        logger.debug('tweet text: %s', item.text)
                        count = count + 1
                        # round-trip through json to get a plain dict copy
                        tweet_item = json.loads(json.dumps(item._json))
                        tweet_item['smapp_timestamp'] = datetime.datetime.utcnow(
                        ).strftime('%Y-%m-%d %H:%M:%S +0000')
                        write_fd.write(json.dumps(tweet_item))
                        write_fd.write('\n')
                except TweepError as e:
                    # best-effort: log the API failure and continue with
                    # the next user
                    logger.info('tweepy error: %s', e)
                logger.info('counted %s objects for input %s', count, userid)
                logger.info('number of inputs queried so far: %s',
                            num_inputs_queried)
def query_search_tweets(output, terms_list, auth_file):
    '''
    Runs a Twitter search for each term in terms_list and writes one
    JSON object per matching tweet to the output file.

    output -- path of the file to write results to
    terms_list -- iterable of search terms (strings); empty terms are skipped
    auth_file -- path to the credentials file used to build the API pool
    '''
    logger = logging.getLogger(__name__)
    num_inputs_queried = 0
    # create the api pool
    api_pool = TweepyPool(auth_file)
    # with-statement guarantees the output file is closed even if an
    # exception other than TweepError escapes the loop (the original
    # leaked the handle in that case)
    with open(output, 'w+') as write_fd:
        for term in terms_list:
            num_inputs_queried = num_inputs_queried + 1
            count = 0
            if term:
                try:
                    # terms are url-quoted before being handed to the
                    # search endpoint
                    for item in Cursor(api_pool.search,
                                       q=urllib.parse.quote(term)).items():
                        logger.debug('tweet text: %s', item.text)
                        count = count + 1
                        # round-trip through json to get a plain dict copy
                        tweet_item = json.loads(json.dumps(item._json))
                        tweet_item['smapp_term'] = term
                        tweet_item['smapp_count'] = count
                        tweet_item['smapp_timestamp'] = datetime.datetime.utcnow(
                        ).strftime('%Y-%m-%d %H:%M:%S +0000')
                        write_fd.write(json.dumps(tweet_item))
                        write_fd.write('\n')
                except TweepError as e:
                    # best-effort: log the API failure and continue with
                    # the next term
                    logger.info('tweepy error: %s', e)
                logger.info('counted %s objects for input %s', count, term)
                logger.info('number of inputs queried so far: %s',
                            num_inputs_queried)
def query_search_tweets(output, terms_list, auth_file):
    '''
    Hydrates tweets via the statuses_lookup endpoint in batches of up
    to 100 ids and writes one JSON object per returned tweet to the
    output file.

    output -- path of the file to write results to
    terms_list -- iterable of tweet ids (strings); empty entries are skipped
    auth_file -- path to the credentials file used to build the API pool

    NOTE(review): this module defines another query_search_tweets above;
    the later definition shadows the earlier one — confirm which is the
    intended export.
    '''
    logger = logging.getLogger(__name__)
    num_inputs_queried = 0
    # create the api pool
    api_pool = TweepyPool(auth_file)

    def _lookup_batch(write_fd, batch):
        # query one batch of <= 100 ids and write each hydrated tweet
        count = 0
        try:
            for item in api_pool.statuses_lookup(batch):
                logger.debug('tweet text: %s', item.text)
                count = count + 1
                # round-trip through json to get a plain dict copy
                tweet_item = json.loads(json.dumps(item._json))
                # fixed: the original tagged every tweet with the
                # unrelated 101st (overflow) term; tag each tweet with
                # its own id so rows can be traced back to the input
                tweet_item['smapp_term'] = tweet_item.get('id_str')
                tweet_item['smapp_count'] = count
                tweet_item['smapp_timestamp'] = datetime.datetime.utcnow(
                ).strftime('%Y-%m-%d %H:%M:%S +0000')
                write_fd.write(json.dumps(tweet_item))
                write_fd.write('\n')
        except TweepError as e:
            # best-effort: log the API failure and continue
            logger.info('tweepy error: %s', e)
        logger.info('counted %s objects for batch of %s inputs',
                    count, len(batch))

    # with-statement guarantees the output file is closed on any exit
    with open(output, 'w+') as write_fd:
        batch = []
        for term in terms_list:
            if term == '':
                continue
            batch.append(term)
            num_inputs_queried = num_inputs_queried + 1
            if len(batch) == 100:
                _lookup_batch(write_fd, batch)
                logger.info('number of inputs queried so far: %s',
                            num_inputs_queried)
                batch = []
        # fixed: the original never queried the final partial batch —
        # any remainder of fewer than 100 ids was silently dropped
        if batch:
            _lookup_batch(write_fd, batch)
            logger.info('number of inputs queried so far: %s',
                        num_inputs_queried)
def query_user_tweets(output, id_list, auth_file, max_id=-1, since_id=-1,
                      context=None):
    '''
    queries twitter for users from id_list and authentication from auth_file.

    output -- path of the file results are appended to
    id_list -- iterable of Twitter user ids (strings); empty ids are skipped
    auth_file -- path to the credentials file used to build the API pool
    max_id -- only fetch tweets with ids <= max_id; -1 or None means unset
    since_id -- only fetch tweets with ids > since_id; -1 or None means unset
    context -- optional dict with 'log' and 's3_log' keys used to ship the
               log file to s3 after each user. The original body read a
               `context` name that was not a parameter (NameError unless
               a module-level global existed); when None, the upload is
               skipped.
    '''
    num_inputs_queried = 0
    api_pool = TweepyPool(auth_file)
    # with-statement guarantees the output file is closed even if an
    # exception other than TweepError escapes the loop
    with open(output, 'a+') as write_fd:
        for userid in id_list:
            num_inputs_queried = num_inputs_queried + 1
            # even though the count is 200 we can cycle through 3200 items.
            # if you put a count variable in this cursor it will iterate up
            # to about 3200
            if userid:
                count = 0
                # fixed: the sentinel -1 is truthy, so with the default
                # arguments the original took the "both set" branch and
                # passed max_id=-1 / since_id=-1 straight to the API;
                # treat -1 (and None) as "unset" instead
                cursor_kwargs = {'user_id': userid, 'count': 200,
                                 'tweet_mode': 'extended'}
                if max_id not in (None, -1):
                    cursor_kwargs['max_id'] = max_id
                if since_id not in (None, -1):
                    cursor_kwargs['since_id'] = since_id
                try:
                    for item in Cursor(api_pool.user_timeline,
                                       **cursor_kwargs).items():
                        count = count + 1
                        # round-trip through json to get a plain dict copy
                        tweet_item = json.loads(json.dumps(item._json))
                        tweet_item['smapp_timestamp'] = (
                            datetime.datetime.utcnow()
                            .strftime('%Y-%m-%d %H:%M:%S +0000'))
                        write_fd.write(json.dumps(tweet_item))
                        write_fd.write('\n')
                except TweepError as e:
                    # best-effort: log the API failure and continue
                    log('tweepy error: {}'.format(e))
                log('counted {} objects for input {}'.format(count, userid))
                log('number of inputs queried so far: {}'.format(
                    num_inputs_queried))
                if context is not None:
                    s3.disk_2_s3(context['log'], context['s3_log'])
def twitter_query(context):
    '''
    Gets user ids, and feeds them into a function to query twitter.
    '''
    output = context['output']
    input_file = context['input']
    auth_file = context['auth']
    log('creating oauth pool...')
    id_list = get_id_list(input_file)
    log('creating oauth pool...')
    api_pool = TweepyPool(auth_file)
    log('starting query...')
    num_queried = 0
    with open(output, 'w+') as out_fd:
        for user_id in id_list:
            num_queried += 1
            if user_id != '':
                n_followers = 0
                try:
                    cursor = Cursor(api_pool.followers, id=user_id,
                                    count=5000)
                    for follower in cursor.items():
                        log('user id: {}, and screen_name {}'.format(
                            follower.id, follower.screen_name))
                        n_followers += 1
                        # plain-dict copy of the raw API payload
                        record = json.loads(json.dumps(follower._json))
                        record['smapp_original_user_id'] = user_id
                        stamp = datetime.datetime.utcnow().strftime(
                            '%Y-%m-%d %H:%M:%S +0000')
                        record['smapp_timestamp'] = stamp
                        out_fd.write(json.dumps(record) + '\n')
                except TweepError as e:
                    log('tweepy error: {}'.format(e))
                # update the logs and send to s3
                log('counted {} objects for input {}'.format(
                    n_followers, user_id))
                s3.disk_2_s3(context['log'], context['s3_log'])
            log('number of inputs queried so far: {}'.format(num_queried))
    # final log shipment once every input has been processed
    s3.disk_2_s3(context['log'], context['s3_log'])
# Load the list of inputs (user ids or screen names) from args.input,
# dispatching on the file extension, then run the requested operation.
input_list = []
_, file_extension = os.path.splitext(args.input)
if file_extension == '.json':
    logger.info('trying json...')
    # with-statement closes the input file (the original leaked the handle)
    with open(args.input) as id_fd:
        input_list = json.load(id_fd)
    logger.info('loaded input_list as json')
elif file_extension == '.csv':
    logger.info('is not json, trying csv')
    with open(args.input) as csvhandle:
        csvreader = csv.reader(csvhandle)
        # skip the header row (the original did this with a manual counter)
        next(csvreader, None)
        input_list = [row[0] for row in csvreader]
    logger.info('loaded input_list as csv')

#create the api pool
api = TweepyPool(args.auth)

if args.operation == 'ids_users':
    ids_to_usernames(input_list, args.output, api)
elif args.operation == 'users_ids':
    usernames_to_ids(input_list, args.output, api)
'''
author @yvan
tweepy docs here : https://github.com/tweepy/tweepy/blob/master/tweepy/api.py#L146
'''