Example no. 1
from numpy import array, mean, std, quantile
from numpy.random import randint

# normalize_profiles and compare_profiles are helpers defined elsewhere in
# the same module


def bootstrap_profiles(profiles,
                       alpha=0.05,
                       repetitions=1000,
                       randfunc=randint):
    """Performs bootstrapping over the sample 'profiles'

    Inputs:
        profiles: list of profiles
        alpha: defines the confidence interval as 1 - alpha
        repetitions: number of bootstrap iterations
        randfunc: random function used to generate the bootstrap samples

    Returns:
        profile: the profile obtained by comparing the full profiles list
        sample_mean: the bootstrap mean of the amount shared
        sample_stdev: the bootstrap standard deviation of the amount shared
        ci: the confidence interval for the bootstrap mean
    """
    length = len(profiles)
    normalize_profiles(profiles)
    boot_shared = []
    for i in range(repetitions):
        # Construct the bootstrap sample; randfunc's upper bound is
        # exclusive, as in numpy.random.randint
        resample = [profiles[randfunc(0, length)] for j in range(length)]
        profile = compare_profiles(resample)
        # Store the amount shared
        boot_shared.append(1.0 - profile['not_shared'])
    # Convert data to a numpy array
    boot_shared = array(boot_shared)
    # Get the mean and the standard deviation of the shared data
    sample_mean = mean(boot_shared)
    sample_stdev = std(boot_shared)
    # Compute the confidence interval for the bootstrapped data
    # using bootstrap percentile interval
    ci = quantile(boot_shared, [alpha / 2, 1 - (alpha / 2)])
    # Compute the profile of the full (non-resampled) profiles list
    profile = compare_profiles(profiles)

    return profile, sample_mean, sample_stdev, ci
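
Below is a minimal usage sketch. Neither normalize_profiles nor compare_profiles is shown above, so both stubs are hypothetical stand-ins, just detailed enough to make the call run; swap in the real helpers from the source module.

from numpy.random import seed

def normalize_profiles(profiles):
    """Hypothetical stub: the real helper normalizes each profile in place."""
    pass

def compare_profiles(profiles):
    """Hypothetical stub: treat the mean 'taxon_a' fraction as shared."""
    shared = sum(p['taxon_a'] for p in profiles) / len(profiles)
    return {'not_shared': 1.0 - shared}

seed(42)  # make the bootstrap resampling reproducible

# twenty toy profiles with a slowly increasing 'taxon_a' fraction
profiles = [{'taxon_a': 0.5 + 0.01 * i} for i in range(20)]

profile, sample_mean, sample_stdev, ci = bootstrap_profiles(profiles)
print('mean: {0:.3f} +/- {1:.3f}'.format(sample_mean, sample_stdev))
print('95% CI: [{0:.3f}, {1:.3f}]'.format(ci[0], ci[1]))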
Example no. 2
from copy import deepcopy

from numpy import quantile

# _get_level is a helper defined elsewhere in the same module


def add_alpha_diversity_values_to_mapping_file(metrics, alpha_sample_ids,
                                               alpha_data,
                                               mapping_file_headers,
                                               mapping_file_data, bins,
                                               method='equal',
                                               missing_value_name='N/A'):
    """add 3 columns in the mapping file representing the alpha diversity data

    Inputs:
    metrics: list of alpha diversity metrics
    alpha_sample_ids: list of sample identifiers in the alpha diversity data
    alpha_data: alpha diversity data, as many columns as metrics and as many
    rows as elements in alpha_sample_ids
    mapping_file_headers: list of headers for the metadata mapping file
    mapping_file_data: metadata mapping file data
    bins: bins to classify the alpha diversity data
    method: binning method; the options are 'equal' and 'quantile'. 'equal'
    gives equally spaced limits, while 'quantile' assigns the limits using
    quantiles over the selected number of bins.

    missing_value_name: string used for sample ids present in the mapping
    file but not in the alpha diversity data

    Output:
    mapping_file_data: extended data including the alpha diversity values,
    normalized values and bins
    mapping_file_headers: modified mapping file headers, including three new
    columns per metric, e.g. for chao1 the new fields are 'chao1_alpha',
    'chao1_normalized_alpha' and 'chao1_alpha_label'

    """
    # min-max normalization; assumes x_max != x_min for each metric
    norm = lambda x, x_min, x_max: (x - x_min) / (x_max - x_min)

    # data will be modified and returned so get your own copy
    new_mapping_file_data = deepcopy(mapping_file_data)
    new_mapping_file_headers = deepcopy(mapping_file_headers)

    # bin limits from equally spaced probabilities; true division, so under
    # Python 2 this needs 'from __future__ import division'
    overall_probs = [i / bins for i in range(1, bins)]

    # with the 'equal' method the levels are simply these probabilities
    if method == 'equal':
        levels = overall_probs
    elif method != 'quantile':
        raise ValueError("Unknown binning method '{0}'; the options are "
                         "'equal' and 'quantile'".format(method))

    for index, metric in enumerate(metrics):
        # get the alpha diversity value for the metric being evaluated
        data = [[row[index]] for row in alpha_data]
        metric_max = max(data)[0]
        metric_min = min(data)[0]

        # add headers for each metric
        new_mapping_file_headers.append('{0}_alpha'.format(metric))
        new_mapping_file_headers.append('{0}_normalized_alpha'.format(metric))
        new_mapping_file_headers.append('{0}_alpha_label'.format(metric))

        # when using the quantile method the levels change depending on the
        # metric being used; hence the calculation and normalization of the data
        if method == 'quantile':
            levels = quantile([norm(element[0], metric_min, metric_max)
                               for element in data], overall_probs)

        # get the normalized value of diversity and the tag for each value
        for value in data:
            norm_value = norm(value[0], metric_min, metric_max)
            value.append(norm_value)
            value.append(_get_level(norm_value, levels, 'bin'))

        # iterate over the mapping file instead of the alpha diversity data
        # because, more often than you would like, the mapping file has more
        # samples than your downstream analysis data
        for row in new_mapping_file_data:
            try:
                data_index = alpha_sample_ids.index(row[0])

                # data fields should be strings
                row.extend(map(str, data[data_index]))
            except ValueError:
                # sample id not present in the alpha diversity data
                row.extend([missing_value_name] * 3)

    return new_mapping_file_data, new_mapping_file_headers
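
Below is a minimal usage sketch. The _get_level helper is not shown above, so the stand-in here (bin labels via numpy.searchsorted) is only an assumption about its behavior.

from numpy import searchsorted

def _get_level(value, levels, prefix):
    """Hypothetical stub: label a normalized value with its 1-based bin."""
    return '{0}_{1}'.format(prefix, searchsorted(levels, value) + 1)

metrics = ['chao1']
alpha_sample_ids = ['S1', 'S2', 'S3']
alpha_data = [[10.0], [20.0], [30.0]]  # one column per metric
mapping_file_headers = ['SampleID', 'Treatment']
mapping_file_data = [['S1', 'Control'], ['S2', 'Fast'], ['S4', 'Fast']]

data, headers = add_alpha_diversity_values_to_mapping_file(
    metrics, alpha_sample_ids, alpha_data, mapping_file_headers,
    mapping_file_data, bins=4)

# S1 and S2 gain raw, normalized and binned chao1 columns; S4 is absent from
# the alpha diversity data, so its new columns are filled with 'N/A'
print(headers)
for row in data:
    print(row)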