Beispiel #1
0
    def get_records_with_criteria(sql_parts: dict, criteria_values: list,
                                  ft: FusionTableHandler) -> dict:
        """Get a set of records that need analysis

    Returns the collective fusiontables#sqlresponse for the set of queries that need to be performed.
    @params
        sql_parts: dict, contains the respective sql clauses, including the logic for assembly with format_map
        criteria_values: list, the collection that needs to be iterated and included.
        ft: FusionTableHandler, an authenticated service handler
    @return: dict, the fusiontables#sqlresponse to the collective query set.
        """
        if not (isinstance(sql_parts, dict) and isinstance(
                criteria_values, list) and isinstance(ft, FusionTableHandler)):
            raise TypeError('Invalid argument types')
        if not sql_parts or 'assembly' not in sql_parts:
            raise AttributeError(
                'Input sql definition has no reassembly instructions')
        if isinstance(criteria_values[0], list):
            raise NotImplementedError(
                'Paired criteria restriction is not supported.')

        result = {'kind': 'fusiontables#sqlresponse', 'is_complete': False}
        margin = ft.remaining_query_length(
            sql_parts['assembly'].format_map(sql_parts))
        est_size = 0.25  # kB per row estimate
        # Show a progress bar (in case of a slow connection, large query, etc.).
        progress_parameters = {
            'total': len(criteria_values),
            'prefix': 'Member data retrieval: ',
            'length': 50
        }
        ppb(iteration=0, **progress_parameters)
        while criteria_values:
            queried_criteria = []
            where_str = ','.join(queried_criteria)
            while criteria_values and len(where_str) < margin:
                queried_criteria.append(criteria_values.pop())
                where_str = ','.join(queried_criteria)
            sql_parts['where_values'] = where_str
            response = ft.get_query_result(
                sql_parts['assembly'].format_map(sql_parts),
                kb_row_size=est_size)
            ppb(progress_parameters['total'] - len(criteria_values),
                **progress_parameters)
            if not response:  # HttpError, so an API issue or other. Already retried the query twice.
                return result
            if 'columns' not in result:  # Write columns once.
                result['columns'] = response['columns']
            if 'rows' not in response or not response['rows']:
                print('Warning: no results for query "{}"'.format(
                    sql_parts['assembly'].format_map(sql_parts)))
            else:
                result.setdefault('rows', []).extend(response['rows'])
                est_size = get_size_estimate(result['rows']) * 2048 / len(
                    result['rows'])

        # Send the dict back to the callee.
        result['is_complete'] = True
        return result
Beispiel #2
0
def authorize(local_keys: dict) -> 'Dict[str, GoogleService]':
    '''Authenticate the requested Google API scopes for a single user.
    '''
    def save_credentials(credentials, keys):
        '''Save the given access and refresh tokens to the local disk.
        '''
        keys['access_token'] = credentials.token
        keys['refresh_token'] = credentials.refresh_token
        with open('auth.txt', 'w', newline='') as f:
            writer = csv.writer(f, quoting=csv.QUOTE_ALL)
            for key, value in keys.items():
                writer.writerow([key, value])

    print('Checking authorization status...', end='')
    creds = None
    try:
        creds = Credentials(
            local_keys['access_token'],
            refresh_token=local_keys['refresh_token'],
            token_uri="https://accounts.google.com/o/oauth2/token",
            client_id=local_keys['client_id'],
            client_secret=local_keys['client_secret'],
            scopes=SCOPES)
    except KeyError:
        pass
    iapp_flow: InstalledAppFlow = InstalledAppFlow.from_client_secrets_file(
        "client_secret_MHCC.json", SCOPES)
    if not creds or creds.expired or not creds.valid:
        iapp_flow.run_local_server(
            authorization_prompt_message='opening browser for OAuth flow.')
        creds = iapp_flow.credentials
        save_credentials(creds, local_keys)
    else:
        print('... Credentials OK!')

    drive = DriveHandler(creds)
    print(
        '\nVerifying Drive access by requesting storage quota and user object.'
    )
    drive.verify_drive_service()

    fusiontables = FusionTableHandler(creds)
    print(
        '\nVerifying FusionTables access by requesting tables you\'ve accessed.'
    )
    fusiontables.verify_ft_service(export=True)

    bq = BigQueryHandler(local_keys['bq_project'], creds)
    print('\nVerifying BigQuery access by requesting project information.')
    # TODO

    print('Authorization & service verification completed successfully.')
    return {'FusionTables': fusiontables, 'Drive': drive, 'BigQuery': bq}
Beispiel #3
0
def get_table_data(service: FusionTableHandler, tableId: str, sql: str):
    '''Obtain annotated table data as determined from the input SQL'''
    try:
        data = service.get_query_result(query=sql, kb_row_size=0.2)
    except HttpError:
        print('Unable to obtain ROWIDs in bulk query')
        byte_data = service.query.sqlGet_media(sql=sql).execute()
        data = service.bytestring_to_queryresult(byte_data)
    # Convert list of list to list of dicts
    headers = data['columns']
    output = (dict(zip(headers, x)) for x in data['rows'])
    return coerce_to_typed_info(tableId, output)
Beispiel #4
0
def _perform_deletion(service: FusionTableHandler, tableId: str,
                      target_rowids: list):
    ''' Delete the rows with the given ROWIDs '''
    num_rows = service.count_rows(tableId)
    target_rows = num_rows - len(target_rowids)
    deleted = service.delete_records_by_rowid(tableId, target_rowids)
    new_count = service.count_rows(tableId)
    if new_count >= num_rows:
        print('Table {tableId} does not have fewer rows', new_count, num_rows)
    if new_count != target_rows:
        print(
            f'Expected table {tableId} to have {target_rows}, but it has {new_count}'
        )
    print(f'Deleted {deleted} rows from {tableId}')
Beispiel #5
0
def download_table_data(ft: FusionTableHandler, tableId: str,
                        table: bigquery.Table) -> list:
    """Download the data from the given FusionTable and process it to match the given schema"""
    data: dict = ft.get_query_result(f'select * from {tableId}')
    if 'rows' in data:
        transform_table_data(data['rows'], table)
        return data['rows']
Beispiel #6
0
def clean_regressions(service: FusionTableHandler,
                      time_start='2018-11-29T12:00:00.000000+0000',
                      time_end='2019-11-30T12:00:00.000000+0000'):
    from regression_fixer import clean_rank_regression, clean_crown_regression
    uids = [x[1] for x in service.get_user_batch()]
    args = (service, uids, time_start, time_end)
    clean_rank_regression(*args, tableId=TABLE_LIST['MHCC Rank DB'])
    clean_crown_regression(*args, tableId=TABLE_LIST['MHCC Crowns DB'])
Beispiel #7
0
def keep_interesting_records(tableId: str):
    '''
    Removes duplicated crown records, keeping each member's records which
    have a new LastSeen value, or a new Rank value.

    @params:
        tableId: str
            The FusionTable ID which should have extraneous records removed.
    '''
    task_start_time = time.perf_counter()
    if not tableId or not isinstance(tableId, str) or len(tableId) != 41:
        raise ValueError('Invalid table id')
    uids = [row[1] for row in get_user_batch(0, 100000)]
    if not uids:
        print('No members returned')
        return
    num_rows = get_total_row_count(tableId)
    rowids = identify_desirable_records(uids, tableId)
    if not rowids:
        print('No rowids received')
        return
    if len(rowids) == num_rows:
        print("All records are interesting")
        return
    local_filename = FusionTableHandler.get_filename_for_table(tableId)
    records_to_keep = ([] if not FusionTableHandler.can_use_local_data(
        tableId, local_filename) else
                       FusionTableHandler.read_local_data(local_filename))
    have_valid_data = (records_to_keep and len(records_to_keep) == len(rowids)
                       and validate_retrieved_records(records_to_keep,
                                                      tableId))
    if not have_valid_data:
        # Local data cannot be used, so get the row data associated with rowids.
        records_to_keep = retrieve_whole_records(rowids, tableId)
        # Validate this downloaded data.
        have_valid_data = validate_retrieved_records(records_to_keep, tableId)

    if have_valid_data:
        # Back up the table before we do anything crazy.
        backup_table(tableId)
        # Do something crazy.
        replace_table(tableId, records_to_keep)
    print('Completed "KeepInterestingRecords" task in %s sec.' %
          time.perf_counter() - task_start_time)
Beispiel #8
0
def prune_ranks(tableId: str, ft: FusionTableHandler):
    """Routine which prunes out boring Rank DB data.
    # [Member, UID, LastSeen, RankTime, Rank, MHCC]
    Multiple approaches are possible:
        1) Keep the first and last Ranks for a given LastSeen
        2) Keep the first and last records at a given chronologically observed Rank
        3) Keep the first record for each LastSeen-Rank pairing.
    These approaches differ data removal, and also for whom they target.
        Option 1 will prune many Rank DB records from users who rarely refresh their profile data.
        Option 2 will prune many records from members with relatively fixed rank positions, even
            if they frequently refresh their crown counts
        Option 3 is not as aggressive as options 1 and 2, and is the one implemented.
    """
    def get_records_with_criteria(sql_parts: dict, criteria_values: list,
                                  ft: FusionTableHandler) -> dict:
        """Get a set of records that need analysis

    Returns the collective fusiontables#sqlresponse for the set of queries that need to be performed.
    @params
        sql_parts: dict, contains the respective sql clauses, including the logic for assembly with format_map
        criteria_values: list, the collection that needs to be iterated and included.
        ft: FusionTableHandler, an authenticated service handler
    @return: dict, the fusiontables#sqlresponse to the collective query set.
        """
        if not (isinstance(sql_parts, dict) and isinstance(
                criteria_values, list) and isinstance(ft, FusionTableHandler)):
            raise TypeError('Invalid argument types')
        if not sql_parts or 'assembly' not in sql_parts:
            raise AttributeError(
                'Input sql definition has no reassembly instructions')
        if isinstance(criteria_values[0], list):
            raise NotImplementedError(
                'Paired criteria restriction is not supported.')

        result = {'kind': 'fusiontables#sqlresponse', 'is_complete': False}
        margin = ft.remaining_query_length(
            sql_parts['assembly'].format_map(sql_parts))
        est_size = 0.25  # kB per row estimate
        # Show a progress bar (in case of a slow connection, large query, etc.).
        progress_parameters = {
            'total': len(criteria_values),
            'prefix': 'Member data retrieval: ',
            'length': 50
        }
        ppb(iteration=0, **progress_parameters)
        while criteria_values:
            queried_criteria = []
            where_str = ','.join(queried_criteria)
            while criteria_values and len(where_str) < margin:
                queried_criteria.append(criteria_values.pop())
                where_str = ','.join(queried_criteria)
            sql_parts['where_values'] = where_str
            response = ft.get_query_result(
                sql_parts['assembly'].format_map(sql_parts),
                kb_row_size=est_size)
            ppb(progress_parameters['total'] - len(criteria_values),
                **progress_parameters)
            if not response:  # HttpError, so an API issue or other. Already retried the query twice.
                return result
            if 'columns' not in result:  # Write columns once.
                result['columns'] = response['columns']
            if 'rows' not in response or not response['rows']:
                print('Warning: no results for query "{}"'.format(
                    sql_parts['assembly'].format_map(sql_parts)))
            else:
                result.setdefault('rows', []).extend(response['rows'])
                est_size = get_size_estimate(result['rows']) * 2048 / len(
                    result['rows'])

        # Send the dict back to the callee.
        result['is_complete'] = True
        return result

    def select_interesting_rank_records(records: list, rowids: list,
                                        indices: dict, tracker: dict) -> list:
        """Add the rowid of interesting records into input, and return a copy with only the interesting records"""
        kept_records = []
        for record in records:
            uid = str(record[indices['uid']])
            try:
                ls = str(int(record[indices['ls']]))
            except ValueError:
                continue
            rank = str(int(record[indices['rank']]))
            rt = str(int(record[indices['rt']]))
            if uid not in tracker:
                tracker[uid] = dict([(ls, {rank})])
            elif ls not in tracker[uid]:
                tracker[uid][ls] = {rank}
            elif rank not in tracker[uid][ls]:
                tracker[uid][ls].add(rank)
            else:
                continue
            rowids.append(str(record[indices['rowid']]))
            kept_records.append(record[:])
        return kept_records

    def validate_retained_rank_records(tableId: str, records: list,
                                       members: list) -> bool:
        """Ensure that the input records do not delete all of any members' data
        """

        if not (isinstance(records, list) and isinstance(records[0], list)):
            return False

        # Get metadata from the target table.
        remote_columns = ft.get_all_columns(tableId)
        remote_row_counts = ft.get_query_result(
            f'SELECT UID, COUNT() FROM {tableId} GROUP BY UID', 0.05)['rows']
        if len(records) < len(remote_row_counts):
            return False

        # Some UIDs may have .0 at the end from the initial column type being Number.
        # Coerce the remote UID columns where this is true to the proper representation.
        for row in remote_row_counts:
            if '.' in row[0]:
                row[0] = row[0].__str__().partition('.')[0]
        assert not [x for x in remote_row_counts if '.' in x[0]]

        mhcc_members = set(x[1] for x in members)
        local_row_counts = {}
        rows_with_errors = []
        coerced = set()
        for row in records:
            # Check that the row is well-formed.
            if len(row) != remote_columns['total']:
                rows_with_errors.append(row)
            # Coerce the UID columns to not have :
            if '.' in row[1]:
                coerced.add(row[1])
                row[1] = row[1].__str__().partition('.')[0]

            # Increment the member's row count.
            try:
                local_row_counts[str(row[1])] += 1
            except KeyError:
                local_row_counts[str(row[1])] = 1

        if rows_with_errors:
            print(
                f'{len(rows_with_errors)} rows had incorrect column lengths:')
            print(rows_with_errors)
        if coerced:
            print(
                f'{len(coerced)} UIDs were repartitioned to remove decimals.')
            print(coerced)

        # Inspect the remote table data and ensure each remaining member is represented.
        has_valid_dataset = True
        for row in remote_row_counts:
            if row[0] not in mhcc_members:
                continue
            elif row[0] not in local_row_counts:
                print(
                    f'Unable to find member with UID=\'{row[0]}\' in data to upload'
                )
                has_valid_dataset = False
            elif int(row[1]) < local_row_counts[row[0]]:
                print(
                    f'More rows in upload data than source data for member UID=\'{row[0]}\''
                )
                has_valid_dataset = False
        revalidation = select_interesting_rank_records(records,
                                                       rowids=[],
                                                       tracker={},
                                                       indices={
                                                           'uid': 1,
                                                           'ls': 2,
                                                           'rt': 3,
                                                           'rank': 4,
                                                           'rowid': 0
                                                       })
        if len(revalidation) != len(records):
            print(
                f'Reanalysis of upload data yielded {len(records) - len(revalidation)} non-interesting rows.'
            )
            has_valid_dataset = False
        return has_valid_dataset

    if not tableId or not isinstance(tableId, str):
        return

    # Download only the columns necessary to pick records to keep.
    criteria_sql = {
        'assembly':
        '{select} {from} {where_start}{where_values}{where_end} {order}',
        'select': "SELECT ROWID, UID, Rank, LastSeen, RankTime",
        'from': "FROM " + tableId,
        'where_start': "WHERE UID IN (",
        'where_values': '',
        'where_end': ")",
        'order': "ORDER BY UID ASC, LastSeen ASC, RankTime ASC"
    }
    members = ft.get_user_batch()

    # Always requery the target table for rowid data.
    print('Pruning ranks for {} members'.format(len(members)))
    uids = [x[1] for x in members]
    criteria_result = get_records_with_criteria(criteria_sql, uids, ft)
    if not criteria_result['is_complete']:
        print(
            'Criteria querying exited prior to full retrieval. Aborting prune...'
        )
        return
    try:
        criteria_records = criteria_result['rows']
    except KeyError:
        print('No records matching criteria')
        return

    # Analyse the records to get the desired rowids
    rowids = []
    seen = {}
    # Index the columns of the criteria query.
    criteria_indices = {'rowid': 0, 'uid': 1, 'rank': 2, 'ls': 3, 'rt': 4}
    print('Selecting records of interest...')
    interesting_records = select_interesting_rank_records(
        criteria_records, rowids, criteria_indices, seen)
    if len(interesting_records) == len(criteria_records):
        print('No redundant data detected.')
        return
    print('Found {:,} records to remove from {:,} total records.'.format(
        len(criteria_records) - len(interesting_records),
        len(criteria_records)))

    # Download the records to be kept.
    local_filename = ft.get_filename_for_table(tableId)
    table_data = ([] if not ft.can_use_local_data(tableId, local_filename,
                                                  handlers['Drive']) else
                  ft.read_local_data(local_filename))
    # Verify record validity
    data_is_valid = (table_data and len(table_data) == len(rowids)
                     and validate_retained_rank_records(
                         tableId, table_data, members))
    if not data_is_valid:
        print('Downloading full records...')
        table_data = ft.get_records_by_rowid(rowids, tableId)
        data_is_valid = validate_retained_rank_records(tableId, table_data,
                                                       members)

    if not data_is_valid:
        print('Unable to obtain validated data')
        save(table_data, 'invalid_rank_data_snapshot.csv')
        return

    backup = ft.backup_table(tableId, await_clone=True)
    if not backup:
        print('Failed to create table backup. Aborting prune...')
        return

    # Do the actual replacement.
    ft.replace_rows(tableId, table_data)
    print('Ranks have been successfully pruned.')