def RunMLCollector(month, day):
    filename = config.GetSetsFileName(month, day, collection_type)
    sets = config.ReadJSON(filename)

    # Flatten all scheduled runtimes so the sets can be processed in time order
    times = []
    for set_n in sets:
        for time in set_n['runtimes']:
            times.append(time)
    sorted_times = sorted(times)

    for sorted_time in sorted_times:
        for set_n in sets:
            if sorted_time in set_n['runtimes']:
                timex = datetime.datetime.strptime(sorted_time,
                                                   '%Y-%m-%d %H:%M:%S')
                if timex > datetime.datetime.now():
                    # Sleep until the scheduled runtime
                    delta = int(
                        (timex - datetime.datetime.now()).total_seconds())
                    print('Waiting', delta, 'seconds. Next Up', set_n['name'],
                          set_n['call'], timex)
                    sleeper.sleep(delta)

                    # Set API calls: the first call streams new tweets, later
                    # calls re-fetch the collected statuses for updates
                    if set_n['call'] == 1:
                        set_n['call'] = collection.FilterStatusByLocation(set_n)
                    else:
                        set_n['call'] = collection.GetUpdatedStatuses(set_n)

                    calltime = datetime.datetime.now()
                    set_n['call_times'].append(calltime)
                    config.WriteJSON(sets, filename)
                else:
                    print('Skipping old runtime.')

def GetUpdatedStatuses(params):
    print('Updating tweets')
    month_num = params['month']
    day_num = params['day']
    set_name = params['set']
    call_num = params['call']

    # Read the tweet ids collected by the previous call for this set
    filename = config.GetTweetFileName(month_num, day_num, set_name,
                                       call_num - 1)
    tweet_ids = config.GetTweetIds(filename)

    # Pass each id to the REST API to get its current status
    twit_api = GetTwitterRest()
    statuses = []
    for tweet_id in tweet_ids:
        try:
            status = twit_api.statuses.show(id=tweet_id)
            statuses.append(status)
        except Exception as e:
            print(e, file=sys.stderr)
            print('Skipped tweet id:', tweet_id)
            continue

    # Update call count and write updated statuses to file
    filename = config.GetTweetFileName(month_num, day_num, set_name, call_num)
    call_num += 1
    config.WriteJSON(statuses, filename)
    print('Saved', len(statuses), 'statuses to file.')

    # Do I need to return this value or is it saved like streaming?
    return call_num

def GetFollowers(userid):
    print('Getting followers of', userid)
    twit_api = GetTwitterRest()
    filename = config.GetUserFileName(userid)
    user = config.ReadJSON(filename)

    pageCount = 0
    followers = []
    next_cursor = -1
    # Page through up to 5 pages of followers (200 per page)
    while next_cursor != 0 and pageCount < 5:
        if twit_api.application.rate_limit_status(
        )['resources']['followers']['/followers/list']['remaining'] > 0:
            page = twit_api.followers.list(user_id=userid,
                                           count=200,
                                           cursor=next_cursor)
            for follower in page['users']:
                # Weighted influence score; config.GetWeights() supplies the
                # 'followers_count' and 'listed_count' weights
                influence_score = (follower['followers_count'] *
                                   config.GetWeights()['followers_count']) + (
                                       follower['listed_count'] *
                                       config.GetWeights()['listed_count'])
                follower['influence_score'] = influence_score
            followers.extend(page['users'])
            next_cursor = page['next_cursor']
            pageCount += 1
        else:
            # Rate limited: wait out the 15 minute window
            print('Sleeping')
            delta = 15 * 60
            sleeper.sleep(delta)

    user['followers'] = followers
    config.WriteJSON(user, filename)

def GetUser(user_id):
    print('Getting user', user_id)
    twit_api = GetTwitterRest()
    user = twit_api.users.show(user_id=user_id)
    filename = config.GetUserFileName(user_id)
    config.WriteJSON(user, filename)

def ReadTweets(list_ofQueries, search_location):
    twit_api = GetTwitterRest()
    for query in list_ofQueries:
        # Search recent tweets matching the query near the given location
        search_results = twit_api.search.tweets(q=query,
                                                geocode=search_location,
                                                count=100)
        filename = config.GetTweetsFolder() + query + '.json'
        config.WriteJSON(search_results, filename)

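# Example (illustrative sketch, not part of the collector): ReadTweets expects
# a list of query strings and a geocode string in the Twitter search API's
# 'latitude,longitude,radius' form. The queries and the approximate Erie, PA
# coordinates below are placeholders only.
def _example_read_tweets():
    queries = ['flooding', 'power outage']
    ReadTweets(queries, '42.1292,-80.0851,25mi')
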
def FilterStatusByLocation(params):
    print('Getting new tweets by location')
    month_num = params['month']
    day_num = params['day']
    set_name = params['set']
    call_num = params['call']
    search_box = params['boundaries']

    if call_num == 1:
        max_tweets = 100
        twit_stream = GetTwitterStream()
        try:
            # Stream statuses that fall inside the bounding box
            stream = twit_stream.statuses.filter(locations=search_box)

            # Load tweets to list
            statuses = []
            for status in stream:
                statuses.append(status)
                if len(statuses) == max_tweets:
                    break

            # Write tweets to file
            filename = config.GetTweetFileName(month_num, day_num, set_name,
                                               call_num)
            config.WriteJSON(statuses, filename)
            call_num += 1
            print('Saved', len(statuses), 'statuses to file.')
        except Exception as e:
            print(e, file=sys.stderr)
            print('Could not get statuses.')
        return call_num
    else:
        print('Error. Check call number.')

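# Example (illustrative sketch): the params dict a set carries when
# FilterStatusByLocation runs, matching the keys built by MakeSets below.
# The Twitter streaming 'locations' filter takes a bounding box as
# west_lon,south_lat,east_lon,north_lat (southwest corner first); the exact
# value type expected here depends on how 'boundaries' is stored in the set
# files, and the Erie, PA box below is approximate and for demonstration only.
def _example_filter_params():
    erie_box = '-80.30,41.95,-79.90,42.20'
    params = {
        'month': 3,
        'day': 1,
        'set': 1,
        'call': 1,
        'boundaries': erie_box,
    }
    return FilterStatusByLocation(params)
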
def MakeSets(year, month, day, collection_interval, interval_identifier,
             user_schedule, collection_type, boundaries):
    for interval in range(1, collection_interval + 1):
        # Get start times from schedule
        num_sets = len(user_schedule.columns)
        num_calls = len(user_schedule.index)
        cols = list(user_schedule.columns)
        all_times = []
        for col in cols:
            set_runtimes = user_schedule[col]
            for set_runtime in set_runtimes:
                # Create the starting date for the day. Runtimes between
                # midnight and 6 AM run on the following calendar day but are
                # recorded under the set's start day.
                a = datetime.time(hour=6)
                if set_runtime < a:
                    StartDate = datetime.datetime(year, month, day + 1)
                else:
                    StartDate = datetime.datetime(year, month, day)
                runtime = datetime.datetime.combine(StartDate, set_runtime)
                all_times.append(runtime)

        # Create sets obj: one dict per set, holding its slice of the runtimes
        sets = []
        j = 0
        for i in range(1, num_sets + 1):
            k = j + num_calls
            name = 'set_' + str(i)
            runs = all_times[j:k]
            set_x = {
                'interval_name': interval_identifier,
                'total_intervals': collection_interval,
                'collection_type': collection_type,
                'interval': interval,
                'year': year,
                'month': month,
                'day': day,
                'name': name,
                'set': i,
                'call': 1,
                'runtimes': runs,
                'call_times': [],
                'boundaries': boundaries
            }
            sets.append(set_x)
            j += num_calls

        # Add month and day folders if needed
        APP_ROOT = os.path.dirname(os.path.abspath(__file__))
        tweets_folder = config.GetTweetsFolder().replace('../', '')
        MONTH_ROOT = APP_ROOT.replace(
            'scripts',
            tweets_folder + '/' + collection_type + '/M' + str(month))
        if not os.path.isdir(MONTH_ROOT):
            os.mkdir(MONTH_ROOT)
        DAY_ROOT = MONTH_ROOT + '/D' + str(day)
        if not os.path.isdir(DAY_ROOT):
            os.mkdir(DAY_ROOT)

        config.WriteJSON(sets,
                         config.GetSetsFileName(month, day, collection_type))

        # Increment the day. If the new day rolls into the next month,
        # increment the month and reset the day to 1.
        # TODO: add year rollover by EOY!
        day += 1
        try:
            datetime.datetime(year, month, day)
        except ValueError as e:
            if str(e) == 'day is out of range for month':
                month += 1
                day = 1
                try:
                    datetime.datetime(year, month, day)
                except ValueError as e:
                    if str(e) == 'month must be in 1..12':
                        month = 1

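# Example (illustrative sketch): MakeSets reads the schedule through .columns
# and .index, so user_schedule is assumed to be a pandas DataFrame with one
# column per set and one row per call, holding datetime.time values. The
# dates, times, bounding box, and the 'NW' collection_type below are
# placeholders that mirror the collector names above, not confirmed values.
def _example_make_sets():
    import pandas as pd
    schedule = pd.DataFrame({
        'set_1': [datetime.time(8, 0), datetime.time(14, 0), datetime.time(20, 0)],
        'set_2': [datetime.time(9, 0), datetime.time(15, 0), datetime.time(21, 0)],
    })
    erie_box = '-80.30,41.95,-79.90,42.20'  # approximate Erie, PA bounding box
    MakeSets(2018, 3, 1,
             collection_interval=3,
             interval_identifier='I1',
             user_schedule=schedule,
             collection_type='NW',
             boundaries=erie_box)
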
def RunNWCollector(month, day, localPlaces):
    filename = config.GetSetsFileName(month, day, collection_type)
    sets = config.ReadJSON(filename)
    # localPlaces = ['Erie, PA', 'Wesleyville, PA', 'Harborcreek, PA', 'Lawrence Park, PA']

    # Flatten all scheduled runtimes so the sets can be processed in time order
    times = []
    for set_n in sets:
        for time in set_n['runtimes']:
            times.append(time)
    sorted_times = sorted(times)

    for sorted_time in sorted_times:
        for set_n in sets:
            if sorted_time in set_n['runtimes']:
                timex = datetime.datetime.strptime(sorted_time,
                                                   '%Y-%m-%d %H:%M:%S')
                if timex > datetime.datetime.now():
                    # Sleep until the scheduled runtime
                    delta = int(
                        (timex - datetime.datetime.now()).total_seconds())
                    print('Waiting', delta, 'seconds. Next Up', set_n['name'],
                          set_n['call'], timex)
                    sleeper.sleep(delta)

                    # Get tweets
                    collection.FilterStatusByLocation(set_n)

                    # Update sets
                    calltime = datetime.datetime.now()
                    set_n['call_times'].append(calltime)
                    config.WriteJSON(sets, filename)

                    # Check each tweet; if its place is one of the local
                    # places, score the user and queue them for collection
                    tweets = config.ReadJSON(
                        config.GetTweetFileName(set_n['month'], set_n['day'],
                                                set_n['set'], set_n['call']))
                    newUsers = []
                    for tweet in tweets:
                        if tweet['place'] and tweet['place'][
                                'full_name'] in localPlaces:
                            influence_score = (
                                tweet['user']['followers_count'] *
                                config.GetWeights()['followers_count']) + (
                                    tweet['user']['listed_count'] *
                                    config.GetWeights()['listed_count'])
                            tweet['user']['influence_score'] = influence_score
                            newUsers.append(tweet['user'])

                    # Write each new user to file, then get friends/followers
                    for user in newUsers:
                        user_filename = config.GetUserFileName(user['id_str'])
                        user_file = Path(user_filename)
                        if user_file.is_file():
                            # Update this - need the most current version but
                            # not if the user is in this set
                            print('User', user['id_str'],
                                  'already exists. Skipping for now.')
                        else:
                            print('Writing user', user['id_str'])
                            config.WriteJSON(user, user_filename)
                            collection.GetFriends(user['id_str'])
                            collection.GetFollowers(user['id_str'])
                else:
                    print('Skipping old runtime.')

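# Example (illustrative sketch): a minimal driver for one day of the
# neighborhood collector. Both collectors read the module-level
# `collection_type`, which is assumed to be set elsewhere in this script;
# the date is a placeholder and the place list mirrors the commented-out
# list inside RunNWCollector.
def _example_run_nw_day():
    erie_places = ['Erie, PA', 'Wesleyville, PA', 'Harborcreek, PA', 'Lawrence Park, PA']
    RunNWCollector(3, 1, erie_places)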