from copy import deepcopy

from numpy import array, mean, quantile, std
from numpy.random import randint


def bootstrap_profiles(profiles, alpha=0.05, repetitions=1000,
                       randfunc=randint):
    """Performs bootstrapping over the sample 'profiles'

    Inputs:
        profiles: list of profiles
        alpha: defines the confidence interval as 1 - alpha
        repetitions: number of bootstrap iterations
        randfunc: random function used to generate the bootstrap samples

    Returns:
        profile: the bootstrapped profile of the profiles list
        sample_mean: the bootstrap mean of the amount shared
        sample_stdev: the bootstrap standard deviation of the amount shared
        ci: the confidence interval for the bootstrap mean
    """
    length = len(profiles)
    normalize_profiles(profiles)

    boot_shared = []
    boot_profiles = []
    for i in range(repetitions):
        # Construct the bootstrap sample by resampling with replacement
        resample = [profiles[randfunc(0, length)] for j in range(length)]
        profile = compare_profiles(resample)
        # Store the amount shared
        boot_shared.append(1.0 - profile['not_shared'])
        # Store the result profile
        boot_profiles.append(profile)

    # Convert data to a numpy array
    boot_shared = array(boot_shared)

    # Get the mean and the standard deviation of the shared data
    sample_mean = mean(boot_shared)
    sample_stdev = std(boot_shared)

    # Compute the confidence interval for the bootstrapped data
    # using the bootstrap percentile interval
    ci = quantile(boot_shared, [alpha / 2, 1 - (alpha / 2)])

    # Compute the bootstrapped profile of the profiles list
    profile = compare_profiles(profiles)

    return profile, sample_mean, sample_stdev, ci
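# A minimal, self-contained sketch of the percentile-interval bootstrap used
# above, applied to plain numeric data instead of profiles (it reuses the
# numpy imports at the top of this module). The function name and the sample
# values in the docstring are hypothetical and exist only for illustration;
# they are not part of this module's API.
def demo_bootstrap_ci(values, alpha=0.05, repetitions=1000):
    """Return the bootstrap mean and the (1 - alpha) percentile interval.

    Example (illustrative only):
        est, ci = demo_bootstrap_ci([0.2, 0.3, 0.35, 0.4, 0.45, 0.5])
    """
    values = array(values)
    n = len(values)
    # Resample with replacement and record each replicate's mean; randint
    # draws n indices in [0, n) so every bootstrap sample keeps the original
    # sample size
    boot_means = array([mean(values[randint(0, n, n)])
                        for _ in range(repetitions)])
    # The percentile interval reads the alpha/2 and 1 - alpha/2 quantiles
    # straight off the bootstrap distribution
    return mean(boot_means), quantile(boot_means, [alpha / 2, 1 - alpha / 2])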
def add_alpha_diversity_values_to_mapping_file(metrics, alpha_sample_ids,
                                               alpha_data,
                                               mapping_file_headers,
                                               mapping_file_data, bins,
                                               method='equal',
                                               missing_value_name='N/A'):
    """Add three columns to the mapping file for the alpha diversity data

    Inputs:
        metrics: list of alpha diversity metrics
        alpha_sample_ids: list of sample identifiers in the alpha diversity
            data
        alpha_data: alpha diversity data, with as many columns as metrics and
            as many rows as elements in alpha_sample_ids
        mapping_file_headers: list of headers for the metadata mapping file
        mapping_file_data: metadata mapping file data
        bins: number of bins used to classify the alpha diversity data
        method: binning method selection; the options are 'equal' and
            'quantile'. 'equal' will give you equally spaced limits and
            'quantile' will assign the limits using quantiles, based on the
            selected number of bins.
        missing_value_name: string to use for sample ids that are in the
            mapping file but not in the alpha diversity data

    Outputs:
        new_mapping_file_data: extended data including the alpha diversity
            values, normalized values and bin labels
        new_mapping_file_headers: modified mapping file headers, including
            three new columns per metric, i.e. for chao1 the new fields would
            be: 'chao1_alpha', 'chao1_normalized_alpha' and
            'chao1_alpha_label'
    """
    norm = lambda x, x_min, x_max: (x - x_min) / (x_max - x_min)

    # data will be modified and returned so get your own copy
    new_mapping_file_data = deepcopy(mapping_file_data)
    new_mapping_file_headers = deepcopy(mapping_file_headers)

    # regular levels assigned based on equally spaced bins
    overall_probs = [i / bins for i in range(1, bins)]

    # if we are using the 'equal' method the levels are equal to the probs
    # list
    if method == 'equal':
        levels = overall_probs

    for index, metric in enumerate(metrics):
        # get the alpha diversity values for the metric being evaluated
        data = [[row[index]] for row in alpha_data]
        metric_max = max(data)[0]
        metric_min = min(data)[0]

        # add the three headers for each metric
        new_mapping_file_headers.append('{0}_alpha'.format(metric))
        new_mapping_file_headers.append('{0}_normalized_alpha'.format(metric))
        new_mapping_file_headers.append('{0}_alpha_label'.format(metric))

        # when using the quantile method the levels change depending on the
        # metric being used; hence the calculation and normalization of the
        # data
        if method == 'quantile':
            levels = quantile([norm(element[0], metric_min, metric_max)
                               for element in data], overall_probs)

        # get the normalized value of diversity and the bin label for each
        # value
        for value in data:
            norm_value = norm(value[0], metric_min, metric_max)
            value.append(norm_value)
            value.append(_get_level(norm_value, levels, 'bin'))

        # iterate over the mapping file instead of the alpha diversity data
        # because, more often than not, you will have more samples in the
        # mapping file than in your downstream analysis data
        for row in new_mapping_file_data:
            try:
                data_index = alpha_sample_ids.index(row[0])
                # data fields should be strings
                row.extend(map(str, data[data_index]))
            except ValueError:
                row.extend([missing_value_name, missing_value_name,
                            missing_value_name])

    return new_mapping_file_data, new_mapping_file_headers
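# A small sketch contrasting the 'equal' and 'quantile' limits computed
# above (again reusing numpy's quantile imported at the top of the module).
# demo_binning_levels and demo_level are hypothetical helpers written only
# for illustration; in particular, demo_level is a plausible stand-in for
# this module's _get_level, not its actual implementation.
def demo_binning_levels(values, bins=4):
    """Return (equal limits, quantile limits) for already-normalized values.

    With values clustered near 0, e.g. [0.0, 0.1, 0.15, 0.4, 0.9], the
    'equal' limits stay at [0.25, 0.5, 0.75] while the 'quantile' limits
    shift toward where the data actually sits.
    """
    probs = [i / bins for i in range(1, bins)]
    return probs, list(quantile(values, probs))


def demo_level(value, levels):
    """Count how many limits the value exceeds, giving a 0-based bin index;
    a hypothetical stand-in for _get_level."""
    return sum(value > limit for limit in levels)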