def messages_all():
    """Render all messages for an authorized user, or an error page otherwise.

    Reads the current username from the Flask session, checks authorization
    via ``get_auth``, and renders ``messages.html`` on success.
    """
    auth_data = get_auth(session['username'])
    if auth_data['status'] == 'authorized':
        messages = get_all_messages()
        return render_template('messages.html', messages=messages)
    # Any non-authorized status (including 'unauthorized') gets the error page;
    # the original elif/else branches were identical, so a single fallback suffices.
    return render_template('error.html')
def get_data_by_block(index_key):
    """Download friends lists for one block of users and persist them in batches.

    The worker indexed by ``index_key`` authenticates with its own key file,
    takes its slice of the module-level ``users`` collection, and flushes
    accumulated results to a compressed JSON file every ``cutoff`` users
    (and when the last user of the block is reached).

    Parameters
    ----------
    index_key : int
        Index of this worker's credential file and user block.

    Returns
    -------
    int
        Always 0, as a completion marker.
    """
    # Create Access For Block of Users
    api = get_auth(key_files[index_key])
    # Select Block of Users
    users_block = np.array_split(users, len(key_files))[index_key]
    # Initialize Output File ID
    output_id = str(uuid.uuid4())
    # Initialize DataFrame
    users_friends = pd.DataFrame()
    # Initialize Downloaded User List
    downloaded_ids = []
    counter_ids = 0
    for user_id in users_block:
        # Try Downloading Friends
        friends = friends_ids(api, user_id, path_to_friends)
        # `is None` instead of `== None`: identity is the intended check, and
        # `==` can broadcast element-wise on array-like returns.
        if friends is None:
            print('Error:', user_id)
            continue
        # Append
        users_friends = pd.concat(
            [users_friends,
             pd.DataFrame([(user_id, friends)], columns=['user_id', 'friends'])],
            sort=False)
        downloaded_ids.append(user_id)
        # Save after <cutoff> friends lists or when reaching the last user.
        # FIX: compare against users_block[-1] (the last user id itself), as in
        # download_timelines; the previous `users_block[-1][0]` compared against
        # the first element of the last id and could skip the final flush.
        if len(downloaded_ids) == cutoff or user_id == users_block[-1]:
            counter_ids += len(downloaded_ids)
            filename = \
                'friends-' + \
                str(SLURM_JOB_ID) + '-' + \
                str(SLURM_ARRAY_TASK_ID) + '-' + \
                str(index_key) + '-' + \
                str(len(downloaded_ids)) + '-' + \
                output_id + '.json.bz2'
            print('Process', index_key, 'downloaded', counter_ids,
                  'friends list with most recent output file:',
                  os.path.join(path_to_friends, filename))
            # Save as list of dict discarding index
            users_friends.to_json(os.path.join(path_to_friends, filename),
                                  orient='records')
            # Save User Id and File In Which Its Timeline Was Saved
            with open(os.path.join(path_to_friends, 'success'), 'a',
                      encoding='utf-8') as file:
                for downloaded_id in downloaded_ids:
                    file.write(downloaded_id + '\t' + filename + '\n')
            # Reset Output File ID, Data, and Downloaded Users
            # (rebinding drops the old objects; the explicit `del` was redundant)
            output_id = str(uuid.uuid4())
            users_friends = pd.DataFrame()
            downloaded_ids = []
    return 0
# Build and create the output directories for this country's friends download.
path_to_locations = os.path.join(path_to_data, 'locations', 'profiles')
path_to_friends = os.path.join(path_to_data, 'friends', 'API', country_code)
os.makedirs(path_to_friends, exist_ok=True)
print(path_to_keys)
print(path_to_users)
print(path_to_locations)
print(path_to_friends)
# # Credentials
key_files = get_key_files(SLURM_ARRAY_TASK_ID, SLURM_ARRAY_TASK_COUNT, SLURM_JOB_CPUS_PER_NODE, path_to_keys)
print('\n'.join(key_files))
# Sanity-check every credential file (in random order) before any download starts.
for key_file in np.random.permutation(glob(os.path.join(path_to_keys, '*.json'))):
    get_auth(key_file)
print('Credentials Checked!')
# # Users List
print('Import Users By Account Locations')
start = timer()
l = []
for filename in sorted(glob(os.path.join(path_to_users, 'user-ids-by-account-location-verified/*.json'))):
    try:
        df = pd.read_json(filename, lines=True)
        l.append(df)
    # pd.read_json raises ValueError on malformed JSON and OSError on I/O
    # failure; narrowing the bare `except:` keeps Ctrl-C / SystemExit working.
    except (ValueError, OSError):
        print('error importing', filename)
def download_timelines(index_key, country_code):
    """Download timelines for one block of users of a country, saving in batches.

    The worker indexed by ``index_key`` authenticates with its own key file,
    takes its slice of ``users_by_country[country_code]``, and flushes the
    accumulated timelines to a compressed JSON file every ``cutoff`` users
    (and when the last user of the block is reached).

    Parameters
    ----------
    index_key : int
        Index of this worker's credential file and user block.
    country_code : str
        Country whose user list is processed; also the output subdirectory.

    Returns
    -------
    int
        Always 0, as a completion marker.
    """
    # Create Access For Block of Users
    api = get_auth(key_files[index_key])
    # Select Block of Users
    users_block = np.array_split(users_by_country[country_code], len(key_files))[index_key]
    # Initialize Output File ID
    output_id = str(uuid.uuid4())
    # Initialize DataFrame
    timelines = pd.DataFrame()
    # Initialize Downloaded User List
    downloaded_ids = []
    for user_index, user_id in enumerate(users_block):
        # Try Downloading Timeline
        timeline, error = get_timeline(user_id, api)
        # `is not None` is the correct identity test; `!=` could misbehave on
        # error objects with custom equality.
        if error is not None:
            continue
        # Append
        timelines = pd.concat([timelines, timeline], sort=False)
        downloaded_ids.append(user_id)
        # Save after <cutoff> timelines or when reaching last user
        if len(downloaded_ids) == cutoff or user_id == users_block[-1]:
            filename = \
                'timelines-' + \
                str(SLURM_JOB_ID) + '-' + \
                str(SLURM_ARRAY_TASK_ID) + '-' + \
                str(index_key) + '-' + \
                str(len(downloaded_ids)) + '-' + \
                output_id + '.json.bz2'
            print('Process', index_key, 'processed', user_index,
                  'timelines with latest output file:',
                  os.path.join(path_to_timelines, country_code, filename))
            # Save as list of dict discarding index
            timelines.to_json(os.path.join(path_to_timelines, country_code, filename),
                              orient='records',
                              force_ascii=False,
                              date_format=None,
                              double_precision=15)
            # Save User Id and File In Which Its Timeline Was Saved
            with open(os.path.join(path_to_timelines, country_code, 'success'), 'a',
                      encoding='utf-8') as file:
                for downloaded_id in downloaded_ids:
                    file.write(downloaded_id + '\t' + filename + '\n')
            # Reset Output File ID, Data, and Downloaded Users
            # (rebinding drops the old objects; the explicit `del` was redundant)
            output_id = str(uuid.uuid4())
            timelines = pd.DataFrame()
            downloaded_ids = []
    return 0
def __init__(self):
    """Authenticate once and pin the fixed start date for this instance."""
    self.auth = get_auth()
    # Parse the hard-coded ISO date string into a datetime object.
    start_string = '2020-11-27'
    self.start_date = datetime.strptime(start_string, '%Y-%m-%d')
# Resolve key/timeline paths and create this batch's output directory up front.
path_to_keys = os.path.join(path_to_data, 'keys', 'twitter')
path_to_timelines = os.path.join(path_to_data, 'timelines')
os.makedirs(os.path.join(path_to_timelines, this_batch, 'API', country_code), exist_ok=True)
print(path_to_keys)
print(path_to_timelines)
# # Credentials
# One key file per worker slot, derived from the SLURM array/CPU layout.
key_files = get_key_files(SLURM_ARRAY_TASK_ID, SLURM_ARRAY_TASK_COUNT, SLURM_JOB_CPUS_PER_NODE, path_to_keys)
print('\n'.join(key_files))
# Sanity-check each credential file before any download starts.
for key_file in key_files:
    api = get_auth(key_file)
print('Credentials Checked!')
# # User List
start = timer()
print('Select Users...')
# Select most recent id across pulls
# Reads every country-level parquet under */most_recent_id/ into one DataFrame.
users = pq.ParquetDataset(
    glob(
        os.path.join(path_to_timelines, '*', 'most_recent_id', country_code, '*.parquet'))).read().to_pandas()
# Keep the most recent tweets for each user
# NOTE(review): statement continues past this chunk boundary (sort_values call is truncated here).
users = users.sort_values(['user_id', 'created_at'],