# Beispiel #1 (scraped snippet marker; score: 0)
def messages_all():
    """Render the full message list for an authorized user, or an error page.

    Looks up the session user's authorization status via ``get_auth``; only
    a status of 'authorized' shows the messages. Every other status
    (including 'unauthorized') gets the generic error page.
    """
    auth_data = get_auth(session['username'])
    if auth_data['status'] == 'authorized':
        messages = get_all_messages()
        return render_template('messages.html', messages=messages)
    # The original elif ('unauthorized') and else branches returned the exact
    # same template, so they are merged into a single fallthrough.
    return render_template('error.html')
def get_data_by_block(index_key):
    """Download friends lists for one block of users and persist them.

    Splits the module-level ``users`` into ``len(key_files)`` blocks,
    authenticates with ``key_files[index_key]``, and downloads each user's
    friends list. Results are flushed to a bz2-compressed JSON file every
    ``cutoff`` successful downloads, and once more at the end for any
    remainder; each saved user id is appended to a ``success`` manifest.

    Relies on module-level globals: ``key_files``, ``users``, ``cutoff``,
    ``path_to_friends``, ``SLURM_JOB_ID``, ``SLURM_ARRAY_TASK_ID``.

    Returns:
        0 on completion.
    """
    # Create access for this block of users
    api = get_auth(key_files[index_key])

    # Select this process's block of users
    users_block = np.array_split(users, len(key_files))[index_key]

    # Identifier making each output filename unique
    output_id = str(uuid.uuid4())

    users_friends = pd.DataFrame()
    downloaded_ids = []
    counter_ids = 0

    def _flush():
        """Persist accumulated friends lists plus the success manifest, then reset."""
        nonlocal users_friends, downloaded_ids, counter_ids, output_id
        counter_ids += len(downloaded_ids)

        filename = ('friends-' + str(SLURM_JOB_ID) + '-' +
                    str(SLURM_ARRAY_TASK_ID) + '-' + str(index_key) + '-' +
                    str(len(downloaded_ids)) + '-' + output_id + '.json.bz2')

        print('Process', index_key, 'downloaded', counter_ids,
              'friends list with most recent output file:',
              os.path.join(path_to_friends, filename))

        # Save as list of dicts, discarding the index
        users_friends.to_json(os.path.join(path_to_friends, filename),
                              orient='records')

        # Record which file each user's friends list was saved in
        with open(os.path.join(path_to_friends, 'success'), 'a',
                  encoding='utf-8') as file:
            for downloaded_id in downloaded_ids:
                file.write(downloaded_id + '\t' + filename + '\n')

        # Reset accumulators and pick a fresh output id for the next file
        users_friends = pd.DataFrame()
        downloaded_ids = []
        output_id = str(uuid.uuid4())

    for user_id in users_block:
        # Try downloading this user's friends list
        friends = friends_ids(api, user_id, path_to_friends)

        # BUG FIX: use `is None` — `== None` can trigger an elementwise
        # comparison when `friends` is array-like, and is non-idiomatic.
        if friends is None:
            print('Error:', user_id)
            continue

        users_friends = pd.concat(
            [users_friends,
             pd.DataFrame([(user_id, friends)],
                          columns=['user_id', 'friends'])],
            sort=False)
        downloaded_ids.append(user_id)

        # Flush every <cutoff> successful downloads
        if len(downloaded_ids) == cutoff:
            _flush()

    # BUG FIX: the original flushed the final partial batch only when
    # `user_id == users_block[-1][0]` — for string ids that compares a full
    # id to its first character (essentially never true), and a failed last
    # user (`continue`) skipped the flush anyway, losing the tail of the
    # block. Flush any remainder unconditionally instead.
    if downloaded_ids:
        _flush()

    return 0
    # NOTE(review): everything from here to the end of this function follows
    # the `return 0` above and is UNREACHABLE dead code — it looks like
    # module-level script code accidentally pasted inside the function body.
    path_to_locations = os.path.join(path_to_data,'locations','profiles')
    path_to_friends = os.path.join(path_to_data,'friends','API',country_code)
    os.makedirs(path_to_friends, exist_ok=True)
    print(path_to_keys)
    print(path_to_users)
    print(path_to_locations)
    print(path_to_friends)

    # # Credentials
    # Pick this task's share of credential files based on the SLURM layout.
    key_files = get_key_files(SLURM_ARRAY_TASK_ID,SLURM_ARRAY_TASK_COUNT,SLURM_JOB_CPUS_PER_NODE, path_to_keys)
    print('\n'.join(key_files))



    # Authenticate once with every key file (in random order), presumably to
    # validate the credentials up front — TODO confirm get_auth's side effects.
    for key_file in np.random.permutation(glob(os.path.join(path_to_keys,'*.json'))):
        get_auth(key_file)
    print('Credentials Checked!')

    # # Users List

    print('Import Users By Account Locations')
    start = timer()

    # Accumulate one DataFrame per JSON-lines file; files that fail to parse
    # are reported and skipped.
    l = []
    for filename in sorted(glob(os.path.join(path_to_users,'user-ids-by-account-location-verified/*.json'))):
        try:
            df = pd.read_json(filename,lines=True)
            l.append(df)
        except:
            # NOTE(review): bare `except:` swallows everything, including
            # KeyboardInterrupt/SystemExit — should be narrowed (e.g.
            # `except ValueError:`) if this code is ever made reachable.
            print('error importing', filename)
# Beispiel #4 (scraped snippet marker; score: 0)
def download_timelines(index_key, country_code):
    """Download timelines for one block of ``country_code`` users and persist them.

    Splits ``users_by_country[country_code]`` into ``len(key_files)`` blocks,
    authenticates with ``key_files[index_key]``, and downloads each user's
    timeline via ``get_timeline``. Timelines are flushed to a bz2-compressed
    JSON file every ``cutoff`` successful downloads, and once more at the end
    for any remainder; each saved user id is appended to a ``success``
    manifest.

    Relies on module-level globals: ``key_files``, ``users_by_country``,
    ``cutoff``, ``path_to_timelines``, ``SLURM_JOB_ID``,
    ``SLURM_ARRAY_TASK_ID``.

    Returns:
        0 on completion.
    """
    # Create access for this block of users
    api = get_auth(key_files[index_key])

    # Select this process's block of users
    users_block = np.array_split(users_by_country[country_code],
                                 len(key_files))[index_key]

    # Identifier making each output filename unique
    output_id = str(uuid.uuid4())

    timelines = pd.DataFrame()
    downloaded_ids = []

    def _flush(user_index):
        """Persist accumulated timelines plus the success manifest, then reset."""
        nonlocal timelines, downloaded_ids, output_id

        filename = ('timelines-' + str(SLURM_JOB_ID) + '-' +
                    str(SLURM_ARRAY_TASK_ID) + '-' + str(index_key) + '-' +
                    str(len(downloaded_ids)) + '-' + output_id + '.json.bz2')

        print('Process', index_key, 'processed', user_index,
              'timelines with latest output file:',
              os.path.join(path_to_timelines, country_code, filename))

        # Save as list of dicts, discarding the index; keep raw unicode and
        # full float precision for downstream consumers.
        timelines.to_json(os.path.join(path_to_timelines, country_code,
                                       filename),
                          orient='records',
                          force_ascii=False,
                          date_format=None,
                          double_precision=15)

        # Record which file each user's timeline was saved in
        with open(os.path.join(path_to_timelines, country_code, 'success'),
                  'a',
                  encoding='utf-8') as file:
            for downloaded_id in downloaded_ids:
                file.write(downloaded_id + '\t' + filename + '\n')

        # Reset accumulators and pick a fresh output id for the next file
        timelines = pd.DataFrame()
        downloaded_ids = []
        output_id = str(uuid.uuid4())

    for user_index, user_id in enumerate(users_block):
        # Try downloading this user's timeline
        timeline, error = get_timeline(user_id, api)

        # BUG FIX: `is not None` instead of `!= None` (PEP 8; avoids
        # surprising equality semantics on rich error objects).
        if error is not None:
            continue

        timelines = pd.concat([timelines, timeline], sort=False)
        downloaded_ids.append(user_id)

        # Flush every <cutoff> successful downloads
        if len(downloaded_ids) == cutoff:
            _flush(user_index)

    # BUG FIX: the original flushed the final partial batch only when the
    # very last user downloaded successfully — if that user errored
    # (`continue`), the tail of the block was silently lost. Flush any
    # remainder unconditionally instead.
    if downloaded_ids:
        _flush(len(users_block) - 1)

    return 0
 def __init__(self):
     """Authenticate and pin the hard-coded collection start date (2020-11-27)."""
     # NOTE(review): get_auth() is called with no key file here, unlike the
     # get_auth(key_file) calls elsewhere in this file — confirm it has a
     # sensible default credential source. Resulting datetime is naive
     # (no timezone attached).
     self.auth = get_auth()
     self.start_date = datetime.strptime('2020-11-27', '%Y-%m-%d')
    # NOTE(review): the lines below read like module-level script code glued
    # after the __init__ fragment above by the scraper; the indentation does
    # not match any visible enclosing scope.
    path_to_keys = os.path.join(path_to_data, 'keys', 'twitter')
    path_to_timelines = os.path.join(path_to_data, 'timelines')
    os.makedirs(os.path.join(path_to_timelines, this_batch, 'API',
                             country_code),
                exist_ok=True)
    print(path_to_keys)
    print(path_to_timelines)

    # # Credentials

    # Pick this task's share of credential files based on the SLURM layout.
    key_files = get_key_files(SLURM_ARRAY_TASK_ID, SLURM_ARRAY_TASK_COUNT,
                              SLURM_JOB_CPUS_PER_NODE, path_to_keys)
    print('\n'.join(key_files))

    # Authenticate with each key file in turn — presumably a validity check;
    # only the handle from the LAST key file is kept in `api`.
    for key_file in key_files:
        api = get_auth(key_file)
    print('Credentials Checked!')

    # # User List

    start = timer()
    print('Select Users...')

    # Select most recent id across pulls
    users = pq.ParquetDataset(
        glob(
            os.path.join(path_to_timelines, '*', 'most_recent_id',
                         country_code, '*.parquet'))).read().to_pandas()

    # Keep the most recent tweets for each user
    users = users.sort_values(['user_id', 'created_at'],