def main():
    """Run a Mantel test on every pair of input distance matrices.

    Reads the comma-separated distance matrix filepaths from the command
    line options, and writes one tab-separated result row per pair
    (filepaths, number of shared entries, formatted Mantel p-value) to the
    output filepath. Pairs with fewer than two shared samples get a
    'Too few samples' row instead of a p-value.
    """
    option_parser, opts, args = \
        parse_command_line_parameters(**script_info)

    sample_id_map_fp = opts.sample_id_map_fp
    if sample_id_map_fp:
        # Only the first field of each mapping entry is used as the new ID.
        with open(sample_id_map_fp, "U") as sample_id_map_f:
            sample_id_map = dict(
                [(k, v[0])
                 for k, v in fields_to_dict(sample_id_map_f).items()])
    else:
        sample_id_map = None

    input_dm_fps = opts.input_dms.split(',')
    num_iterations = opts.num_iterations

    # The context manager guarantees the results file is flushed and closed
    # even if parsing or the Mantel test raises part-way through.
    with open(opts.output_fp, 'w') as output_f:
        output_f.write(comment)
        output_f.write('DM1\tDM2\tNumber of entries\tMantel p-value\n')

        for i, fp1 in enumerate(input_dm_fps):
            for fp2 in input_dm_fps[i + 1:]:
                # Parse inside 'with' blocks so the input handles are
                # released as soon as each matrix has been read.
                with open(fp1, 'U') as dm1_f:
                    parsed_dm1 = parse_distmat(dm1_f)
                with open(fp2, 'U') as dm2_f:
                    parsed_dm2 = parse_distmat(dm2_f)

                (dm1_labels, dm1), (dm2_labels, dm2) = \
                    make_compatible_distance_matrices(
                        parsed_dm1, parsed_dm2, lookup=sample_id_map)

                if len(dm1_labels) < 2:
                    output_f.write('%s\t%s\t%d\tToo few samples\n' %
                                   (fp1, fp2, len(dm1_labels)))
                    continue

                p = mantel(dm1, dm2, n=num_iterations)
                p_str = format_p_value_for_num_iters(p, num_iterations)
                output_f.write('%s\t%s\t%d\t%s\n' %
                               (fp1, fp2, len(dm1_labels), p_str))
def compute_ordination_correlation(map_f, coord_f, category, axis=1,
                                   correlation_type='pearson',
                                   num_permutations=999):
    """Correlate one ordination axis with a numeric mapping file category.

    Arguments:
        map_f - mapping file input, handed to parse_mapping_file_to_dict
        coord_f - coordinates file input, handed to parse_coords
        category - mapping file category whose states form the gradient;
            every state must be convertible to float
        axis - 1-based index of the ordination axis to use
        correlation_type - must be a member of CORRELATION_TYPES
        num_permutations - number of permutations passed to
            correlation_test; must be >= 0

    Returns a tuple (correlation coefficient, parametric p-value,
    nonparametric p-value). The nonparametric p-value is a formatted string
    when num_permutations > 0, otherwise the string 'N/A'.

    Raises ValueError for an invalid correlation type, a negative number of
    permutations, an out-of-range axis, a category missing from the mapping
    data, or a category state that is not numeric.
    """
    if correlation_type not in CORRELATION_TYPES:
        raise ValueError("Invalid correlation type '%s'. Must be one of %r." %
                         (correlation_type, CORRELATION_TYPES))

    if num_permutations < 0:
        raise ValueError("Invalid number of permutations: %d. Must be greater "
                         "than or equal to zero." % num_permutations)

    sample_ids, coord_matrix, _, _ = parse_coords(coord_f)

    num_axes = len(coord_matrix[0])
    if axis < 1 or axis > num_axes:
        raise ValueError("Invalid axis number %d. Must be greater than zero "
                         "and less than or equal to the number of axes in the "
                         "input coordinates file (found %d axes)." %
                         (axis, num_axes))

    # axis is 1-based for callers; shift to a 0-based column index.
    axis_values = coord_matrix[:, axis - 1]

    metadata, _ = parse_mapping_file_to_dict(map_f)

    # Collect the gradient in the same sample order as the coordinates.
    gradient_values = []
    for sample_id in sample_ids:
        sample_metadata = metadata[sample_id]
        if category not in sample_metadata:
            raise ValueError("Category '%s' does not exist in the input "
                             "mapping file." % category)

        raw_state = sample_metadata[category]
        try:
            numeric_state = float(raw_state)
        except ValueError:
            raise ValueError("The category state '%s' could not be converted "
                             "to a number. All states in the '%s' category "
                             "must be numeric." % (raw_state, category))
        gradient_values.append(numeric_state)

    test_results = correlation_test(axis_values, gradient_values,
                                    method=correlation_type,
                                    permutations=num_permutations)
    corr_coeff, param_p_val, _, nonparam_p_val, _ = test_results

    if num_permutations > 0:
        nonparam_p_str = format_p_value_for_num_iters(nonparam_p_val,
                                                      num_permutations)
    else:
        nonparam_p_str = 'N/A'

    return corr_coeff, param_p_val, nonparam_p_str
def test_format_p_value_for_num_iters(self):
    """format_p_value_for_num_iters functions as expected"""
    p_value = 0.119123123123

    # (num_iters, expected formatted string)
    expectations = [
        (100, "0.12"),
        (250, "0.12"),
        (1000, "0.119"),
        # num_iters too low still returns a string (this can be the last
        # step of a long process, so we don't want to fail)
        (9, "Too few iters to compute p-value (num_iters=9)"),
        (1, "Too few iters to compute p-value (num_iters=1)"),
        (0, "Too few iters to compute p-value (num_iters=0)"),
    ]

    for num_iters, expected in expectations:
        observed = format_p_value_for_num_iters(p_value, num_iters)
        self.assertEqual(observed, expected)
def test_format_p_value_for_num_iters(self):
    """format_p_value_for_num_iters functions as expected"""
    p_value = 0.119123123123

    # Enough iterations: the p-value is formatted as a short decimal string.
    formatted_cases = (
        (100, "0.12"),
        (250, "0.12"),
        (1000, "0.119"),
    )
    for num_iters, expected in formatted_cases:
        self.assertEqual(
            format_p_value_for_num_iters(p_value, num_iters), expected)

    # num_iters too low still returns a string (this can be the last step
    # of a long process, so we don't want to fail)
    too_few_cases = (
        (9, "Too few iters to compute p-value (num_iters=9)"),
        (1, "Too few iters to compute p-value (num_iters=1)"),
        (0, "Too few iters to compute p-value (num_iters=0)"),
    )
    for num_iters, expected in too_few_cases:
        self.assertEqual(
            format_p_value_for_num_iters(p_value, num_iters), expected)
def compare_alpha_diversities(rarefaction_lines, mapping_lines, category,
                              depth=None, test_type='nonparametric',
                              num_permutations=999):
    """Compares alpha diversity values for differences per category treatment.

    Returns a tuple (ttest_results, ad_avgs):
        ttest_results - dict keyed by treatment pair; each value is a tuple
            (obs_t, p_val), or (None, None) when the test could not be run
            (one sample per treatment, NaN diversity values, or no p-value
            returned by the nonparametric test).
        ad_avgs - dict keyed by treatment; each value is a tuple
            (mean, standard deviation) of that treatment's per-sample
            average alpha diversities.

    Inputs:
     rarefaction_lines - list of lines, result of multiple rarefactions.
     mapping_lines - list of lines, mapping file lines.
     category - str, the category to be compared, eg 'Treatment' or 'Age'.
     depth - int, depth of the rarefaction file to use. Passed through to
      get_per_sample_average_diversities.
     test_type - str, the type of t-test to perform. Must be either
      'parametric' or 'nonparametric'.
     num_permutations - int, the number of Monte Carlo permutations to use if
      test_type is 'nonparametric'.

    Raises ValueError if test_type is 'nonparametric' and num_permutations
    is less than one, or if an unknown test_type reaches the testing step.
    """
    if test_type == 'nonparametric' and num_permutations < 1:
        raise ValueError("Invalid number of permutations: %d. Must be greater "
                         "than zero." % num_permutations)

    rarefaction_data = parse_rarefaction(rarefaction_lines)
    mapping_data = parse_mapping_file_to_dict(mapping_lines)[0]
    # samid_pairs, treatment_pairs are in the same order
    samid_pairs, treatment_pairs = sampleId_pairs(mapping_data,
                                                  rarefaction_data, category)

    ps_avg_div = get_per_sample_average_diversities(rarefaction_data, depth)

    ttest_results, ad_avgs = {}, {}
    for sid_pair, treatment_pair in zip(samid_pairs, treatment_pairs):
        # if there is only 1 sample for each treatment in a comparison, the
        # mc method will error (e.g. mc_t_two_sample([1],[1])).
        if len(sid_pair[0]) == 1 and len(sid_pair[1]) == 1:
            ttest_results[treatment_pair] = (None, None)
            # With a single sample per treatment the average is that
            # sample's diversity value and the std is 0. sid_pair holds
            # sample IDs, so the value must be looked up in ps_avg_div
            # (storing the ID itself would put a sample name where callers
            # expect a number).
            ad_avgs[treatment_pair[0]] = (ps_avg_div[sid_pair[0][0]], 0.)
            ad_avgs[treatment_pair[1]] = (ps_avg_div[sid_pair[1][0]], 0.)
        else:
            i = array([ps_avg_div[x] for x in sid_pair[0]])
            j = array([ps_avg_div[x] for x in sid_pair[1]])
            # add alpha diversity averages and standard deviations.
            ad_avgs[treatment_pair[0]] = (i.mean(), i.std())
            ad_avgs[treatment_pair[1]] = (j.mean(), j.std())
            # A NaN minimum means at least one value is NaN; skip the test.
            if isnan(np_min(i)) or isnan(np_min(j)):
                ttest_results[treatment_pair] = (None, None)
                continue
            if test_type == 'parametric':
                obs_t, p_val = t_two_sample(i, j)
            elif test_type == 'nonparametric':
                obs_t, _, _, p_val = mc_t_two_sample(
                    i, j, permutations=num_permutations)
                if p_val is not None:
                    # NOTE(review): float() raises ValueError if
                    # format_p_value_for_num_iters returns a non-numeric
                    # message (it appears to for very small num_iters) --
                    # confirm the minimum supported num_permutations.
                    p_val = float(
                        format_p_value_for_num_iters(
                            p_val, num_iters=num_permutations))
                else:
                    # None would error in format_p_value_for_num_iters.
                    obs_t, p_val = None, None
            else:
                raise ValueError("Invalid test type '%s'." % test_type)
            ttest_results[treatment_pair] = (obs_t, p_val)

    return ttest_results, ad_avgs
def compare_alpha_diversities(rarefaction_lines, mapping_lines, category,
                              depth=None, test_type='nonparametric',
                              num_permutations=999):
    """Compares alpha diversity values for differences per category treatment.

    Returns a tuple (ttest_results, alphadiv_avgs):
        ttest_results - dict keyed by treatment pair; each value is a tuple
            (obs_t, p_val), or (None, None) when the test could not be run
            (one sample per treatment, NaN diversity values, or no p-value
            returned by the nonparametric test).
        alphadiv_avgs - dict keyed by treatment; each value is a tuple
            (mean, standard deviation) of the averaged alpha diversity
            values of that treatment's samples.

    Inputs:
     rarefaction_lines - list of lines, result of multiple rarefactions.
     mapping_lines - list of lines, mapping file lines.
     category - str, the category to be compared, eg 'Treatment' or 'Age'.
     depth - int, depth of the rarefaction file to use. if None, then will use
      the deepest available in the file.
     test_type - str, the type of t-test to perform. Must be either
      'parametric' or 'nonparametric'.
     num_permutations - int, the number of Monte Carlo permutations to use if
      test_type is 'nonparametric'.

    Raises ValueError if test_type is 'nonparametric' and num_permutations
    is less than one, or if an unknown test_type reaches the testing step.
    """
    if test_type == 'nonparametric' and num_permutations < 1:
        raise ValueError("Invalid number of permutations: %d. Must be greater "
                         "than zero." % num_permutations)

    rarefaction_data = parse_rarefaction(rarefaction_lines)
    mapping_data = parse_mapping_file_to_dict(mapping_lines)[0]
    # samid_pairs, treatment_pairs are in the same order
    samid_pairs, treatment_pairs = sampleId_pairs(mapping_data,
                                                  rarefaction_data, category)

    # extract only rows of the rarefaction data that are at the given depth
    # if depth is not given default to the deepest rarefaction available
    # rarefaction file is not guaranteed to be in order of rarefaction depth
    if depth is None:
        depth = array(rarefaction_data[3])[:, 0].max()

    rare_mat = array([row for row in rarefaction_data[3] if row[0] == depth])

    # Average each col of the rarefaction mtx. Computing t test on averages
    # over all iterations. Avoids more comps which kills significance.
    # The first two columns (depth, iteration) are dropped.
    rare_mat = (rare_mat.sum(0) / rare_mat.shape[0])[2:]
    sids = rarefaction_data[0][3:]  # 0-2 are header strings

    ttest_results = {}
    for sid_pair, treatment_pair in zip(samid_pairs, treatment_pairs):
        # if there is only 1 sample for each treatment in a comparison, the
        # mc method will error (e.g. mc_t_two_sample([1],[1])).
        if len(sid_pair[0]) == 1 and len(sid_pair[1]) == 1:
            ttest_results[treatment_pair] = (None, None)
            continue

        pair0_indices = [sids.index(sid) for sid in sid_pair[0]]
        pair1_indices = [sids.index(sid) for sid in sid_pair[1]]
        i = rare_mat.take(pair0_indices)
        j = rare_mat.take(pair1_indices)
        # A NaN minimum means at least one value is NaN; skip the test.
        # http://stackoverflow.com/questions/6736590/fast-check-for-nan-in-numpy
        if isnan(np_min(i)) or isnan(np_min(j)):
            ttest_results[treatment_pair] = (None, None)
            continue
        if test_type == 'parametric':
            obs_t, p_val = t_two_sample(i, j)
        elif test_type == 'nonparametric':
            obs_t, _, _, p_val = mc_t_two_sample(
                i, j, permutations=num_permutations)
            if p_val is not None:
                p_val = float(
                    format_p_value_for_num_iters(
                        p_val, num_iters=num_permutations))
            else:
                # None would error in format_p_value_for_num_iters.
                obs_t, p_val = None, None
        else:
            raise ValueError("Invalid test type '%s'." % test_type)
        ttest_results[treatment_pair] = (obs_t, p_val)

    # create dict of average alpha diversity values
    alphadiv_avgs = {}
    for sid_pair, treatment_pair in zip(samid_pairs, treatment_pairs):
        # calculate the alpha diversity average, std vals. choosing only the
        # first treatment of each pair doesn't guarantee full coverage, so
        # both members must be looked at.
        for sid_list, treatment_str in zip(sid_pair, treatment_pair):
            # skip treatments that were already computed and added
            if treatment_str not in alphadiv_avgs:
                alphadiv_vals = \
                    rare_mat.take([sids.index(sid) for sid in sid_list])
                alphadiv_avgs[treatment_str] = (alphadiv_vals.mean(),
                                                alphadiv_vals.std())

    return ttest_results, alphadiv_avgs
def main():
    """Run a Procrustes analysis on two coordinate files.

    Writes the two transformed coordinate matrices to
    pc1_transformed.txt and pc2_transformed.txt in the output directory.
    When a number of random trials is given, also runs the Monte Carlo
    analysis and writes a one-row summary (filepaths, included dimensions,
    formatted p-value, count of better trials, M^2) to
    <fp1 basename>_<fp2 basename>_procrustes_results.txt.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    random_trials = opts.random_trials
    if random_trials is not None and random_trials < 10:
        option_parser.error(
            'Must perform >= 10 trails for Monte Carlo analysis.')

    output_dir = opts.output_dir
    sample_id_map_fp = opts.sample_id_map_fp
    num_dimensions = opts.num_dimensions

    if not exists(output_dir):
        makedirs(output_dir)

    if opts.store_trial_details:
        trial_output_dir = '%s/trial_details/' % output_dir
    else:
        trial_output_dir = None

    input_fp1 = opts.input_fps[0]
    input_fp2 = opts.input_fps[1]
    # Only the extension-less file names are needed, for the summary name.
    input_fp1_basename = splitext(split(input_fp1)[1])[0]
    input_fp2_basename = splitext(split(input_fp2)[1])[0]
    output_summary_fp = '%s/%s_%s_procrustes_results.txt' %\
        (output_dir, input_fp1_basename, input_fp2_basename)
    output_matrix1_fp = '%s/pc1_transformed.txt' % output_dir
    output_matrix2_fp = '%s/pc2_transformed.txt' % output_dir

    if sample_id_map_fp:
        # Only the first field of each mapping entry is used as the new ID.
        with open(sample_id_map_fp, "U") as sample_id_map_f:
            sample_id_map = dict(
                [(k, v[0])
                 for k, v in fields_to_dict(sample_id_map_f).items()])
    else:
        sample_id_map = None

    # Context managers release the input handles once the coordinates have
    # been processed, instead of leaking them until interpreter exit.
    with open(input_fp1, 'U') as coords_in1, \
            open(input_fp2, 'U') as coords_in2:
        transformed_coords1, transformed_coords2, _m_squared, _randomized =\
            get_procrustes_results(coords_in1,
                                   coords_in2,
                                   sample_id_map=sample_id_map,
                                   randomize=False,
                                   max_dimensions=num_dimensions)

    with open(output_matrix1_fp, 'w') as output_matrix1_f:
        output_matrix1_f.write(transformed_coords1)
    with open(output_matrix2_fp, 'w') as output_matrix2_f:
        output_matrix2_f.write(transformed_coords2)

    if random_trials:
        summary_file_lines = [
            'FP1 FP2 Included_dimensions MC_p_value Count_better M^2']
        with open(input_fp1, 'U') as coords_in1:
            coords_f1 = list(coords_in1)
        with open(input_fp2, 'U') as coords_in2:
            coords_f2 = list(coords_in2)
        actual_m_squared, trial_m_squareds, count_better, mc_p_value =\
            procrustes_monte_carlo(coords_f1,
                                   coords_f2,
                                   trials=random_trials,
                                   max_dimensions=num_dimensions,
                                   sample_id_map=sample_id_map,
                                   trial_output_dir=trial_output_dir)
        # truncate the p-value to the correct number of significant digits
        mc_p_value_str = format_p_value_for_num_iters(mc_p_value,
                                                      random_trials)
        # A falsy num_dimensions is reported as 'alldim' in the summary.
        max_dims_str = str(num_dimensions or 'alldim')
        summary_file_lines.append('%s %s %s %s %d %1.3f' %
                                  (input_fp1, input_fp2, max_dims_str,
                                   mc_p_value_str, count_better,
                                   actual_m_squared))
        with open(output_summary_fp, 'w') as summary_f:
            summary_f.write('\n'.join(summary_file_lines))
            summary_f.write('\n')
def compare_alpha_diversities(rarefaction_lines, mapping_lines, category,
                              depth=None, test_type='nonparametric',
                              num_permutations=999):
    """Compares alpha diversity values for differences per category treatment.

    Returns a tuple (ttest_results, alphadiv_avgs):
        ttest_results - dict keyed by treatment pair; each value is a tuple
            (obs_t, p_val), or (None, None) when the test could not be run
            (one sample per treatment, NaN diversity values, or no p-value
            returned by the nonparametric test).
        alphadiv_avgs - dict keyed by treatment; each value is a tuple
            (mean, standard deviation) of the averaged alpha diversity
            values of that treatment's samples.

    Inputs:
     rarefaction_lines - list of lines, result of multiple rarefactions.
     mapping_lines - list of lines, mapping file lines.
     category - str, the category to be compared, eg 'Treatment' or 'Age'.
     depth - int, depth of the rarefaction file to use. if None, then will use
      the deepest available in the file.
     test_type - str, the type of t-test to perform. Must be either
      'parametric' or 'nonparametric'.
     num_permutations - int, the number of Monte Carlo permutations to use if
      test_type is 'nonparametric'.

    Raises ValueError if test_type is 'nonparametric' and num_permutations
    is less than one, or if an unknown test_type reaches the testing step.
    """
    if test_type == 'nonparametric' and num_permutations < 1:
        raise ValueError("Invalid number of permutations: %d. Must be greater "
                         "than zero." % num_permutations)

    rarefaction_data = parse_rarefaction(rarefaction_lines)
    mapping_data = parse_mapping_file_to_dict(mapping_lines)[0]
    # samid_pairs, treatment_pairs are in the same order
    samid_pairs, treatment_pairs = sampleId_pairs(mapping_data,
                                                  rarefaction_data, category)

    # extract only rows of the rarefaction data that are at the given depth
    # if depth is not given default to the deepest rarefaction available
    # rarefaction file is not guaranteed to be in order of rarefaction depth
    if depth is None:
        depth = array(rarefaction_data[3])[:, 0].max()

    rare_mat = array([row for row in rarefaction_data[3] if row[0] == depth])

    # Average each col of the rarefaction mtx. Computing t test on averages
    # over all iterations. Avoids more comps which kills significance.
    # The first two columns (depth, iteration) are dropped.
    rare_mat = (rare_mat.sum(0) / rare_mat.shape[0])[2:]
    sids = rarefaction_data[0][3:]  # 0-2 are header strings

    ttest_results = {}
    for sid_pair, treatment_pair in zip(samid_pairs, treatment_pairs):
        # if there is only 1 sample for each treatment in a comparison, the
        # mc method will error (e.g. mc_t_two_sample([1],[1])).
        if len(sid_pair[0]) == 1 and len(sid_pair[1]) == 1:
            ttest_results[treatment_pair] = (None, None)
            continue

        pair0_indices = [sids.index(sid) for sid in sid_pair[0]]
        pair1_indices = [sids.index(sid) for sid in sid_pair[1]]
        i = rare_mat.take(pair0_indices)
        j = rare_mat.take(pair1_indices)
        # A NaN minimum means at least one value is NaN; skip the test.
        # http://stackoverflow.com/questions/6736590/fast-check-for-nan-in-numpy
        if isnan(np_min(i)) or isnan(np_min(j)):
            ttest_results[treatment_pair] = (None, None)
            continue
        if test_type == 'parametric':
            obs_t, p_val = t_two_sample(i, j)
        elif test_type == 'nonparametric':
            obs_t, _, _, p_val = mc_t_two_sample(
                i, j, permutations=num_permutations)
            if p_val is not None:
                p_val = float(
                    format_p_value_for_num_iters(
                        p_val, num_iters=num_permutations))
            else:
                # None would error in format_p_value_for_num_iters.
                obs_t, p_val = None, None
        else:
            raise ValueError("Invalid test type '%s'." % test_type)
        ttest_results[treatment_pair] = (obs_t, p_val)

    # create dict of average alpha diversity values
    alphadiv_avgs = {}
    for sid_pair, treatment_pair in zip(samid_pairs, treatment_pairs):
        # calculate the alpha diversity average, std vals. choosing only the
        # first treatment of each pair doesn't guarantee full coverage, so
        # both members must be looked at.
        for sid_list, treatment_str in zip(sid_pair, treatment_pair):
            # skip treatments that were already computed and added
            if treatment_str not in alphadiv_avgs:
                alphadiv_vals = \
                    rare_mat.take([sids.index(sid) for sid in sid_list])
                alphadiv_avgs[treatment_str] = (alphadiv_vals.mean(),
                                                alphadiv_vals.std())

    return ttest_results, alphadiv_avgs
def run_mantel_correlogram(fps, distmats, num_perms, comment, alpha,
                           sample_id_map=None,
                           variable_size_distance_classes=False):
    """Runs a Mantel correlogram analysis on all pairs of distance matrices.

    Returns a string suitable for writing out to a file containing the results
    of the test, a list of correlogram filepath names, and a list of matplotlib
    Figure objects representing each correlogram. The correlogram filepaths can
    have an extension string appended to the end of them and then be used to
    save each of the correlogram Figures to a file. Each correlogram filepath
    will be a combination of the two distance matrix filepaths that were used
    to create it.

    WARNING: Only symmetric, hollow distance matrices may be used as input.
    Asymmetric distance matrices, such as those obtained by the UniFrac Gain
    metric (i.e. beta_diversity.py -m unifrac_g), should not be used as input.

    Arguments:
        fps - list of filepaths of the distance matrices
        distmats - list of tuples containing dm labels and dm data (i.e. the
            output of parse_distmat)
        num_perms - the number of permutations to use to calculate the
            p-value(s)
        comment - comment string to add to the beginning of the results string
            (None is treated as an empty string)
        alpha - the alpha value to use to determine significance in the
            correlogram plots
        sample_id_map - dict mapping sample IDs (i.e. what is expected by
            make_compatible_distance_matrices)
        variable_size_distance_classes - create distance classes that vary in
            size (i.e. width) but have the same number of distances in each
            class

    Raises ValueError if fps and distmats differ in length.
    """
    if len(fps) != len(distmats):
        raise ValueError("Must provide the same number of filepaths as there "
                         "are distance matrices.")
    if comment is None:
        comment = ''

    # Rows are collected in a list and joined once at the end.
    result_parts = [comment + 'DM1\tDM2\tNumber of entries\t' +
                    'Number of permutations\tClass index\t' +
                    'Number of distances\tMantel r statistic\t' +
                    'p-value\tp-value (Bonferroni corrected)\tTail type\n']
    correlogram_fps = []
    correlograms = []

    # Materialize the pairs once: slicing a bare zip() only works on
    # Python 2, where zip returns a list.
    named_dms = list(zip(fps, distmats))

    # Loop over all pairs of dms.
    for i, (fp1, (full_dm1_labels, full_dm1_data)) in enumerate(named_dms):
        for fp2, (full_dm2_labels, full_dm2_data) in named_dms[i + 1:]:
            # Make the current pair of distance matrices compatible by only
            # keeping samples that match between them, and ordering them by
            # the same sample IDs. The filtered results get their own names
            # so the untouched first matrix is reused for every partner
            # (rebinding the outer loop variables would carry one pair's
            # filtering into the next pair).
            (dm1_labels, dm1_data), (dm2_labels, dm2_data) = \
                make_compatible_distance_matrices(
                    (full_dm1_labels, full_dm1_data),
                    (full_dm2_labels, full_dm2_data),
                    lookup=sample_id_map)
            if len(dm1_labels) < 3:
                result_parts.append('%s\t%s\t%d\tToo few samples\n' % (
                    fp1, fp2, len(dm1_labels)))
                continue

            dm1 = SymmetricDistanceMatrix(dm1_data, dm1_labels)
            dm2 = SymmetricDistanceMatrix(dm2_data, dm2_labels)

            # Create an instance of our Mantel correlogram test and run it
            # with the specified number of permutations.
            mc = MantelCorrelogram(
                dm1, dm2, alpha=alpha,
                variable_size_distance_classes=variable_size_distance_classes)
            results = mc(num_perms)

            # Generate a name for the current correlogram and save it and the
            # correlogram itself. The name ends in '.' so callers only need
            # to append a file extension.
            dm1_name = path.basename(fp1)
            dm2_name = path.basename(fp2)
            correlogram_fps.append('_'.join((dm1_name, 'AND', dm2_name,
                                             'mantel_correlogram')) + '.')
            correlograms.append(results['correlogram_plot'])

            # Iterate over the results and write them to the text file. Only
            # the first row of each pair repeats the pair-level columns.
            first_time = True
            for class_idx, num_dist, r, p, p_corr in zip(
                    results['class_index'], results['num_dist'],
                    results['mantel_r'], results['mantel_p'],
                    results['mantel_p_corr']):
                # Format p-values and figure out which tail type we have
                # based on the sign of r.
                p_str = None
                if p is not None:
                    p_str = format_p_value_for_num_iters(p, num_perms)
                p_corr_str = None
                if p_corr is not None:
                    p_corr_str = format_p_value_for_num_iters(
                        p_corr, num_perms)
                if r is None:
                    tail_type = None
                elif r < 0:
                    tail_type = 'less'
                else:
                    tail_type = 'greater'

                if first_time:
                    result_parts.append(
                        '%s\t%s\t%d\t%d\t%s\t%d\t%s\t%s\t%s\t%s\n' % (
                            fp1, fp2, len(dm1_labels), num_perms, class_idx,
                            num_dist, r, p_str, p_corr_str, tail_type))
                    first_time = False
                else:
                    result_parts.append(
                        '\t\t\t\t%s\t%d\t%s\t%s\t%s\t%s\n' % (
                            class_idx, num_dist, r, p_str, p_corr_str,
                            tail_type))

    return ''.join(result_parts), correlogram_fps, correlograms
def run_mantel_test(method, fps, distmats, num_perms, tail_type, comment,
                    control_dm_fp=None, control_dm=None,
                    sample_id_map=None):
    """Runs a Mantel test on all pairs of distance matrices.

    Returns a string suitable for writing out to a file containing the results
    of the test.

    WARNING: Only symmetric, hollow distance matrices may be used as input.
    Asymmetric distance matrices, such as those obtained by the UniFrac Gain
    metric (i.e. beta_diversity.py -m unifrac_g), should not be used as input.

    Arguments:
        method - which Mantel test to run (either 'mantel' or 'partial_mantel')
        fps - list of filepaths of the distance matrices
        distmats - list of tuples containing dm labels and dm data (i.e. the
            output of parse_distmat)
        num_perms - the number of permutations to use to calculate the
            p-value(s)
        tail_type - the type of tail test to use when calculating the
            p-value(s). Can be 'two sided', 'greater', or 'less'. Only applies
            when method is mantel
        comment - comment string to add to the beginning of the results string
            (None is treated as an empty string)
        control_dm_fp - filepath of the control distance matrix. Only applies
            when method is partial_mantel (it is required then)
        control_dm - tuple containing control distance matrix labels and matrix
            data. Only applies when method is partial_mantel (it is required
            then)
        sample_id_map - dict mapping sample IDs (i.e. what is expected by
            make_compatible_distance_matrices)

    Raises ValueError if fps and distmats differ in length, if method is not
    recognized, or if method is 'partial_mantel' and the control matrix or
    its filepath is missing.
    """
    if len(fps) != len(distmats):
        raise ValueError("Must provide the same number of filepaths as there "
                         "are distance matrices.")
    if comment is None:
        comment = ''

    # Rows are collected in a list and joined once at the end.
    result_parts = [comment]

    if method == 'mantel':
        result_parts.append(
            'DM1\tDM2\tNumber of entries\tMantel r statistic\t' +
            'p-value\tNumber of permutations\tTail type\n')
    elif method == 'partial_mantel':
        if not control_dm_fp or not control_dm:
            raise ValueError("You must provide a control matrix filepath and "
                             "control matrix when running the partial Mantel "
                             "test.")
        result_parts.append(
            'DM1\tDM2\tCDM\tNumber of entries\t' +
            'Mantel r statistic\tp-value\tNumber of permutations\t' +
            'Tail type\n')
    else:
        raise ValueError("Invalid method '%s'. Must be either 'mantel' or "
                         "'partial_mantel'." % method)

    # Materialize the pairs once: slicing a bare zip() only works on
    # Python 2, where zip returns a list.
    named_dms = list(zip(fps, distmats))

    # Loop over all pairs of dms.
    for i, (fp1, (full_dm1_labels, full_dm1_data)) in enumerate(named_dms):
        for fp2, (full_dm2_labels, full_dm2_data) in named_dms[i + 1:]:
            # Make the current pair of distance matrices compatible by only
            # keeping samples that match between them, and ordering them by
            # the same sample IDs. The filtered results get their own names
            # so the untouched first matrix is reused for every partner
            # (rebinding the outer loop variables would carry one pair's
            # filtering into the next pair).
            (dm1_labels, dm1_data), (dm2_labels, dm2_data) = \
                make_compatible_distance_matrices(
                    (full_dm1_labels, full_dm1_data),
                    (full_dm2_labels, full_dm2_data),
                    lookup=sample_id_map)
            if method == 'partial_mantel':
                # We need to intersect three sets (three matrices): first
                # restrict dm1 to the control's samples, then re-align dm2
                # to the reduced dm1.
                (dm1_labels, dm1_data), (cdm_labels, cdm_data) = \
                    make_compatible_distance_matrices(
                        (dm1_labels, dm1_data), control_dm,
                        lookup=sample_id_map)
                (dm1_labels, dm1_data), (dm2_labels, dm2_data) = \
                    make_compatible_distance_matrices(
                        (dm1_labels, dm1_data), (dm2_labels, dm2_data),
                        lookup=sample_id_map)
                if len(dm1_labels) < 3:
                    result_parts.append(
                        '%s\t%s\t%s\t%d\tToo few samples\n' % (
                            fp1, fp2, control_dm_fp, len(dm1_labels)))
                    continue
            elif len(dm1_labels) < 3:
                result_parts.append('%s\t%s\t%d\tToo few samples\n' % (
                    fp1, fp2, len(dm1_labels)))
                continue

            dm1 = SymmetricDistanceMatrix(dm1_data, dm1_labels)
            dm2 = SymmetricDistanceMatrix(dm2_data, dm2_labels)

            # Create an instance of our correlation test and run it with
            # the specified number of permutations.
            if method == 'mantel':
                results = Mantel(dm1, dm2, tail_type)(num_perms)
                p_str = format_p_value_for_num_iters(results['p_value'],
                                                     num_perms)
                result_parts.append(
                    "%s\t%s\t%d\t%.5f\t%s\t%d\t%s\n" % (
                        fp1, fp2, len(dm1_labels), results['r_value'],
                        p_str, num_perms, tail_type))
            elif method == 'partial_mantel':
                cdm = SymmetricDistanceMatrix(cdm_data, cdm_labels)
                results = PartialMantel(dm1, dm2, cdm)(num_perms)
                p_str = format_p_value_for_num_iters(results['mantel_p'],
                                                     num_perms)
                # The partial Mantel rows always report 'greater' as the
                # tail type; the tail_type argument is not used here.
                result_parts.append(
                    "%s\t%s\t%s\t%d\t%.5f\t%s\t%d\t%s\n" % (
                        fp1, fp2, control_dm_fp, len(dm1_labels),
                        results['mantel_r'], p_str, num_perms, 'greater'))

    return ''.join(result_parts)
def run_mantel_correlogram(fps, distmats, num_perms, comment, alpha,
                           sample_id_map=None,
                           variable_size_distance_classes=False):
    """Runs a Mantel correlogram analysis on all pairs of distance matrices.

    Returns a string suitable for writing out to a file containing the results
    of the test, a list of correlogram filepath names, and a list of matplotlib
    Figure objects representing each correlogram. The correlogram filepaths can
    have an extension string appended to the end of them and then be used to
    save each of the correlogram Figures to a file. Each correlogram filepath
    will be a combination of the two distance matrix filepaths that were used
    to create it.

    WARNING: Only symmetric, hollow distance matrices may be used as input.
    Asymmetric distance matrices, such as those obtained by the UniFrac Gain
    metric (i.e. beta_diversity.py -m unifrac_g), should not be used as input.

    Arguments:
        fps - list of filepaths of the distance matrices
        distmats - list of tuples containing dm labels and dm data (i.e. the
            output of parse_distmat)
        num_perms - the number of permutations to use to calculate the
            p-value(s)
        comment - comment string to add to the beginning of the results string
            (None is treated as an empty string)
        alpha - the alpha value to use to determine significance in the
            correlogram plots
        sample_id_map - dict mapping sample IDs (i.e. what is expected by
            make_compatible_distance_matrices)
        variable_size_distance_classes - create distance classes that vary in
            size (i.e. width) but have the same number of distances in each
            class

    Raises ValueError if fps and distmats differ in length.
    """
    if len(fps) != len(distmats):
        raise ValueError("Must provide the same number of filepaths as there "
                         "are distance matrices.")
    if comment is None:
        comment = ''

    # Rows are collected in a list and joined once at the end.
    result_parts = [comment + 'DM1\tDM2\tNumber of entries\t' +
                    'Number of permutations\tClass index\t' +
                    'Number of distances\tMantel r statistic\t' +
                    'p-value\tp-value (Bonferroni corrected)\tTail type\n']
    correlogram_fps = []
    correlograms = []

    # Materialize the pairs once: slicing a bare zip() only works on
    # Python 2, where zip returns a list.
    named_dms = list(zip(fps, distmats))

    # Loop over all pairs of dms.
    for i, (fp1, (full_dm1_labels, full_dm1_data)) in enumerate(named_dms):
        for fp2, (full_dm2_labels, full_dm2_data) in named_dms[i + 1:]:
            # Make the current pair of distance matrices compatible by only
            # keeping samples that match between them, and ordering them by
            # the same sample IDs. The filtered results get their own names
            # so the untouched first matrix is reused for every partner
            # (rebinding the outer loop variables would carry one pair's
            # filtering into the next pair).
            (dm1_labels, dm1_data), (dm2_labels, dm2_data) = \
                make_compatible_distance_matrices(
                    (full_dm1_labels, full_dm1_data),
                    (full_dm2_labels, full_dm2_data),
                    lookup=sample_id_map)
            if len(dm1_labels) < 3:
                result_parts.append('%s\t%s\t%d\tToo few samples\n' % (
                    fp1, fp2, len(dm1_labels)))
                continue

            dm1 = SymmetricDistanceMatrix(dm1_data, dm1_labels)
            dm2 = SymmetricDistanceMatrix(dm2_data, dm2_labels)

            # Create an instance of our Mantel correlogram test and run it
            # with the specified number of permutations.
            mc = MantelCorrelogram(
                dm1, dm2, alpha=alpha,
                variable_size_distance_classes=variable_size_distance_classes)
            results = mc(num_perms)

            # Generate a name for the current correlogram and save it and the
            # correlogram itself. The name ends in '.' so callers only need
            # to append a file extension.
            dm1_name = path.basename(fp1)
            dm2_name = path.basename(fp2)
            correlogram_fps.append('_'.join((dm1_name, 'AND', dm2_name,
                                             'mantel_correlogram')) + '.')
            correlograms.append(results['correlogram_plot'])

            # Iterate over the results and write them to the text file. Only
            # the first row of each pair repeats the pair-level columns.
            first_time = True
            for class_idx, num_dist, r, p, p_corr in zip(
                    results['class_index'], results['num_dist'],
                    results['mantel_r'], results['mantel_p'],
                    results['mantel_p_corr']):
                # Format p-values and figure out which tail type we have
                # based on the sign of r.
                p_str = None
                if p is not None:
                    p_str = format_p_value_for_num_iters(p, num_perms)
                p_corr_str = None
                if p_corr is not None:
                    p_corr_str = format_p_value_for_num_iters(
                        p_corr, num_perms)
                if r is None:
                    tail_type = None
                elif r < 0:
                    tail_type = 'less'
                else:
                    tail_type = 'greater'

                if first_time:
                    result_parts.append(
                        '%s\t%s\t%d\t%d\t%s\t%d\t%s\t%s\t%s\t%s\n' % (
                            fp1, fp2, len(dm1_labels), num_perms, class_idx,
                            num_dist, r, p_str, p_corr_str, tail_type))
                    first_time = False
                else:
                    result_parts.append(
                        '\t\t\t\t%s\t%d\t%s\t%s\t%s\t%s\n' % (
                            class_idx, num_dist, r, p_str, p_corr_str,
                            tail_type))

    return ''.join(result_parts), correlogram_fps, correlograms
def compare_alpha_diversities(rarefaction_lines, mapping_lines, category,
                              depth=None, test_type='nonparametric',
                              num_permutations=999):
    """Compares alpha diversity values for differences per category treatment.

    Returns a tuple (ttest_results, ad_avgs):
        ttest_results - dict keyed by treatment pair; each value is a tuple
            (obs_t, p_val), or (None, None) when the test could not be run
            (one sample per treatment, NaN diversity values, or no p-value
            returned by the nonparametric test).
        ad_avgs - dict keyed by treatment; each value is a tuple
            (mean, standard deviation) of that treatment's per-sample
            average alpha diversities.

    Inputs:
     rarefaction_lines - list of lines, result of multiple rarefactions.
     mapping_lines - list of lines, mapping file lines.
     category - str, the category to be compared, eg 'Treatment' or 'Age'.
     depth - int, depth of the rarefaction file to use. Passed through to
      get_per_sample_average_diversities.
     test_type - str, the type of t-test to perform. Must be either
      'parametric' or 'nonparametric'.
     num_permutations - int, the number of Monte Carlo permutations to use if
      test_type is 'nonparametric'.

    Raises ValueError if test_type is 'nonparametric' and num_permutations
    is less than one, or if an unknown test_type reaches the testing step.
    """
    if test_type == 'nonparametric' and num_permutations < 1:
        raise ValueError("Invalid number of permutations: %d. Must be greater "
                         "than zero." % num_permutations)

    rarefaction_data = parse_rarefaction(rarefaction_lines)
    mapping_data = parse_mapping_file_to_dict(mapping_lines)[0]
    # samid_pairs, treatment_pairs are in the same order
    samid_pairs, treatment_pairs = sampleId_pairs(mapping_data,
                                                  rarefaction_data, category)

    ps_avg_div = get_per_sample_average_diversities(rarefaction_data, depth)

    ttest_results, ad_avgs = {}, {}
    for sid_pair, treatment_pair in zip(samid_pairs, treatment_pairs):
        # if there is only 1 sample for each treatment in a comparison, the
        # mc method will error (e.g. mc_t_two_sample([1],[1])).
        if len(sid_pair[0]) == 1 and len(sid_pair[1]) == 1:
            ttest_results[treatment_pair] = (None, None)
            # With a single sample per treatment the average is that
            # sample's diversity value and the std is 0. sid_pair holds
            # sample IDs, so the value must be looked up in ps_avg_div
            # (storing the ID itself would put a sample name where callers
            # expect a number).
            ad_avgs[treatment_pair[0]] = (ps_avg_div[sid_pair[0][0]], 0.)
            ad_avgs[treatment_pair[1]] = (ps_avg_div[sid_pair[1][0]], 0.)
        else:
            i = array([ps_avg_div[x] for x in sid_pair[0]])
            j = array([ps_avg_div[x] for x in sid_pair[1]])
            # add alpha diversity averages and standard deviations.
            ad_avgs[treatment_pair[0]] = (i.mean(), i.std())
            ad_avgs[treatment_pair[1]] = (j.mean(), j.std())
            # A NaN minimum means at least one value is NaN; skip the test.
            if isnan(np_min(i)) or isnan(np_min(j)):
                ttest_results[treatment_pair] = (None, None)
                continue
            if test_type == 'parametric':
                obs_t, p_val = t_two_sample(i, j)
            elif test_type == 'nonparametric':
                obs_t, _, _, p_val = mc_t_two_sample(
                    i, j, permutations=num_permutations)
                if p_val is not None:
                    # NOTE(review): float() raises ValueError if
                    # format_p_value_for_num_iters returns a non-numeric
                    # message (it appears to for very small num_iters) --
                    # confirm the minimum supported num_permutations.
                    p_val = float(
                        format_p_value_for_num_iters(
                            p_val, num_iters=num_permutations))
                else:
                    # None would error in format_p_value_for_num_iters.
                    obs_t, p_val = None, None
            else:
                raise ValueError("Invalid test type '%s'." % test_type)
            ttest_results[treatment_pair] = (obs_t, p_val)

    return ttest_results, ad_avgs
def main():
    """Run the statistical method selected by opts.method on a distance matrix.

    Parses the command line, ensures the output directory exists, loads the
    mapping file and distance matrix, checks that every requested category is
    present in the mapping file and that the distance matrix is symmetric and
    hollow, and then dispatches on opts.method:

     adonis, morans_i, mrpp, permdisp, dbrda - run the R script named
      '<method>.r' through RExecutor.
     anosim, best, permanova - run the corresponding Python class and write a
      tab-separated results file into opts.output_dir.

    Only the first category is used by every method except 'best', which
    receives the full category list. Errors in the inputs are reported through
    option_parser.error.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # Create the output dir if it doesn't already exist. Catch Exception
    # rather than using a bare except so that SystemExit and
    # KeyboardInterrupt are not swallowed.
    try:
        if not path.exists(opts.output_dir):
            create_dir(opts.output_dir)
    except Exception:
        option_parser.error("Could not create or access output directory "
                            "specified with the -o option.")

    # Parse the mapping file and distance matrix, closing each handle as soon
    # as it has been consumed.
    with open(opts.mapping_file, 'U') as map_f:
        md_map = MetadataMap.parseMetadataMap(map_f)
    with open(opts.input_dm, 'U') as dm_f:
        dm = DistanceMatrix.parseDistanceMatrix(dm_f)

    # Separate all categories into a list.
    categories = opts.categories.split(',')

    # Cursory check to make sure all categories passed in are in mapping file.
    with open(opts.mapping_file, 'U') as map_f:
        maps = parse_mapping_file(map_f.readlines())
    # NOTE(review): maps[1] is presumably the header row, with its first entry
    # (the sample ID column) excluded -- confirm against parse_mapping_file.
    known_columns = maps[1][1:]
    for category in categories:
        if category not in known_columns:
            option_parser.error("Category '%s' not found in mapping file "
                                "columns:" % category)

    # Make sure the input distance matrix is symmetric and hollow. Must check
    # here before allowing R to use it, as R will silently ignore the diagonal
    # and upper triangle of the distance matrix.
    if not dm.is_symmetric_and_hollow():
        option_parser.error("The distance matrix must be symmetric and "
                            "hollow.")

    # R-backed methods: method name -> (flag used for the distance matrix,
    # whether the number of permutations is passed with -n).
    r_methods = {
        'adonis': ('-d', True),
        'morans_i': ('-i', False),
        'mrpp': ('-d', True),
        'permdisp': ('-d', True),
        'dbrda': ('-i', True),
    }

    method = opts.method
    if method in r_methods:
        dm_flag, uses_permutations = r_methods[method]
        # NOTE(review): the arguments are joined into one string, so paths
        # containing spaces may not survive -- confirm how RExecutor quotes.
        command = (dm_flag + " " + opts.input_dm +
                   " -m " + opts.mapping_file +
                   " -c " + categories[0] +
                   " -o " + opts.output_dir)
        if uses_permutations:
            command += " -n " + str(opts.num_permutations)
        rex = RExecutor()
        rex([command], method + ".r", output_dir=opts.output_dir)
    elif method == 'anosim':
        anosim = Anosim(md_map, dm, categories[0])
        anosim_results = anosim(opts.num_permutations)

        with open(opts.output_dir + "/" + method + "_results.txt",
                  "w+") as output_file:
            output_file.write("Method Name\tR-value\tP-value")
            output_file.write("\n")
            output_file.write(anosim_results["method_name"] + "\t" +
                              str(anosim_results["r_value"]) + "\t" +
                              str(anosim_results["p_value"]) + "\t")
            output_file.write("\n")
    elif method == 'best':
        bioenv = BioEnv(dm, md_map, categories)
        bioenv_results = bioenv()

        with open(opts.output_dir + "/best_results.txt",
                  'w+') as output_file:
            output_file.write("Method Name:\tNum_Vars:\t")
            output_file.write("\n")
            output_file.write(bioenv_results["method_name"] + "\t" +
                              str(bioenv_results["num_vars"]) + "\t")
            output_file.write("\n")
            output_file.write("Variables:\t")
            output_file.write("\n")
            for variable in bioenv_results["vars"]:
                output_file.write(str(variable) + "\t")
            output_file.write("\n")
            output_file.write("RHO_Values:\t")
            output_file.write("\n")
            for rho_val in bioenv_results["bioenv_rho_vals"]:
                output_file.write(str(rho_val) + "\t")
            output_file.write("\n")
    elif method == 'permanova':
        permanova_plain = Permanova(md_map, dm, categories[0])
        permanova_results = permanova_plain(opts.num_permutations)

        with open(opts.output_dir + "/permanova_results.txt",
                  'w+') as output_file:
            output_file.write("Method Name\tF-value\tP-value")
            output_file.write("\n")
            output_file.write(
                permanova_results["method_name"] + "\t" +
                str(permanova_results["f_value"]) + "\t" +
                format_p_value_for_num_iters(permanova_results["p_value"],
                                             opts.num_permutations) + "\t")
            output_file.write("\n")
def compare_alpha_diversities(rarefaction_lines, mapping_lines, category,
                              depth, test_type='nonparametric',
                              num_permutations=999):
    """Compares alpha diversity values for differences per category treatment.

    Notes:
     Returns a dict keyed by the string 'treatment1,treatment2' for each pair
     of treatments being compared. Each value is an (obs_t, p_val) tuple, or
     (None, None) when the test was not run (one sample per treatment, or NaN
     in either group) or when the nonparametric test yielded no p-value.
    Inputs:
     rarefaction_lines - list of lines, result of multiple rarefactions.
     mapping_lines - list of lines, mapping file lines.
     category - str, the category to be compared, eg 'Treatment' or 'Age'.
     depth - int, depth of the rarefaction file to use.
     test_type - str, the type of t-test to perform. Must be either
      'parametric' or 'nonparametric'.
     num_permutations - int, the number of Monte Carlo permutations to use if
      test_type is 'nonparametric'.
    Raises:
     ValueError - if test_type is 'nonparametric' and num_permutations < 1,
      if the rarefaction data holds no rows at the requested depth, or if an
      unknown test_type reaches the point where a test is run.
    """
    if test_type == 'nonparametric' and num_permutations < 1:
        raise ValueError("Invalid number of permutations: %d. Must be greater "
                         "than zero." % num_permutations)

    rarefaction_data = parse_rarefaction(rarefaction_lines)
    mapping_data = parse_mapping_file_to_dict(mapping_lines)[0]
    # samid_pairs, treatment_pairs are in the same order
    samid_pairs, treatment_pairs = sampleId_pairs(mapping_data,
                                                  rarefaction_data, category)

    # extract only rows of the rarefaction data that are at the given depth
    rare_mat = array([row for row in rarefaction_data[3] if row[0] == depth])
    if rare_mat.shape[0] == 0:
        # Without this guard the averaging below collapses to a scalar and
        # fails with an unhelpful indexing error.
        raise ValueError("No rarefaction data found at depth %s." % depth)

    # Average each col of the rarefaction mtx. Computing t test on averages
    # over all iterations. Avoids more comps which kills significance.
    rare_mat = (rare_mat.sum(0) / rare_mat.shape[0])[2:]  # drop depth,iter cols
    sids = rarefaction_data[0][3:]  # 0-2 are header strings

    results = {}
    for sid_pair, treatment_pair in zip(samid_pairs, treatment_pairs):
        t_key = '%s,%s' % (treatment_pair[0], treatment_pair[1])

        # if there is only 1 sample for each treatment in a comparison, the
        # mc method will error (e.g. mc_t_two_sample([1],[1])), so no test is
        # run for that pair.
        if len(sid_pair[0]) == 1 and len(sid_pair[1]) == 1:
            results[t_key] = (None, None)
            continue

        pair0_indices = [sids.index(sid) for sid in sid_pair[0]]
        pair1_indices = [sids.index(sid) for sid in sid_pair[1]]
        i = rare_mat.take(pair0_indices)
        j = rare_mat.take(pair1_indices)

        # min() of an array containing NaN is NaN, so this is a cheap check
        # for NaN anywhere in either group. See:
        # http://stackoverflow.com/questions/6736590/fast-check-for-nan-in-numpy
        if isnan(np_min(i)) or isnan(np_min(j)):
            results[t_key] = (None, None)
            continue

        if test_type == 'parametric':
            obs_t, p_val = t_two_sample(i, j)
        elif test_type == 'nonparametric':
            obs_t, _, _, p_val = mc_t_two_sample(
                i, j, permutations=num_permutations)
            if p_val is None:
                # None will error in format_p_value_for_num_iters
                obs_t = None
            else:
                p_val = float(format_p_value_for_num_iters(
                    p_val, num_iters=num_permutations))
        else:
            raise ValueError("Invalid test type '%s'." % test_type)

        results[t_key] = (obs_t, p_val)

    return results
def main():
    """Run Procrustes analysis of each query coordinate file vs. the reference.

    The first entry of opts.input_fps is the reference; every other entry is
    a query. For each query the transformed reference and query coordinates
    are written into opts.output_dir, and one line is appended to the summary
    file 'procrustes_results.txt'. When opts.random_trials is set, a Monte
    Carlo test is also run and its p-value, 'count better' and M^2 are
    reported; otherwise the p-value and count columns are 'NA'.

    Invalid option combinations are reported through option_parser.error.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    input_fps = opts.input_fps
    sample_id_map_fps = opts.sample_id_map_fps
    num_dimensions = opts.num_dimensions
    max_dims_str = str(num_dimensions or 'alldim')
    output_dir = opts.output_dir
    random_trials = opts.random_trials

    if random_trials is not None and random_trials < 10:
        option_parser.error(
            'Must perform >= 10 trails for Monte Carlo analysis.')

    if sample_id_map_fps and \
       (len(sample_id_map_fps) + 1) != len(opts.input_fps):
        option_parser.error('If providing sample id maps, there must be exactly'
                            ' one fewer sample id maps than input coordinate'
                            ' matrices.')

    if not exists(output_dir):
        makedirs(output_dir)

    reference_input_fp = input_fps[0]
    reference_input_fp_basename = splitext(split(reference_input_fp)[1])[0]
    output_summary_fp = join(output_dir, 'procrustes_results.txt')
    summary_file_lines = [
        '#FP1\tFP2\tNum included dimensions\tMonte Carlo p-value\t'
        'Count better\tM^2',
        '#Warning: p-values in this file are NOT currently adjusted for '
        'multiple comparisons.']

    for i, query_input_fp in enumerate(input_fps[1:]):
        query_input_fp_basename = splitext(split(query_input_fp)[1])[0]
        # The reference output path is the same for every query, so the file
        # is rewritten on each iteration.
        output_matrix1_fp = join(
            output_dir,
            '%s_transformed_reference.txt' % reference_input_fp_basename)
        output_matrix2_fp = join(
            output_dir,
            '%s_transformed_q%d.txt' % (query_input_fp_basename, i + 1))

        if sample_id_map_fps:
            with open(sample_id_map_fps[i], "U") as map_f:
                sample_id_map = dict(
                    (k, v[0]) for k, v in fields_to_dict(map_f).items())
        else:
            sample_id_map = None

        with open(reference_input_fp, 'U') as ref_f:
            with open(query_input_fp, 'U') as query_f:
                transformed_coords1, transformed_coords2, m_squared, _ = \
                    get_procrustes_results(ref_f,
                                           query_f,
                                           sample_id_map=sample_id_map,
                                           randomize=False,
                                           max_dimensions=num_dimensions)

        with open(output_matrix1_fp, 'w') as output_matrix1_f:
            output_matrix1_f.write(transformed_coords1)
        with open(output_matrix2_fp, 'w') as output_matrix2_f:
            output_matrix2_f.write(transformed_coords2)

        if random_trials:
            if opts.store_trial_details:
                # The parentheses matter: '%' binds tighter than '+', so the
                # unparenthesized form tried to add an int to a str and
                # raised TypeError.
                trial_output_dir = join(output_dir,
                                        'trial_details_%d' % (i + 2))
            else:
                trial_output_dir = None

            with open(reference_input_fp, 'U') as ref_f:
                coords_f1 = list(ref_f)
            with open(query_input_fp, 'U') as query_f:
                coords_f2 = list(query_f)

            actual_m_squared, _, count_better, mc_p_value = \
                procrustes_monte_carlo(coords_f1,
                                       coords_f2,
                                       trials=random_trials,
                                       max_dimensions=num_dimensions,
                                       sample_id_map=sample_id_map,
                                       trial_output_dir=trial_output_dir)
            # truncate the p-value to the correct number of significant
            # digits
            mc_p_value_str = format_p_value_for_num_iters(mc_p_value,
                                                          random_trials)
            summary_file_lines.append(
                '%s\t%s\t%s\t%s\t%d\t%1.3f' %
                (reference_input_fp, query_input_fp, max_dims_str,
                 mc_p_value_str, count_better, actual_m_squared))
        else:
            summary_file_lines.append(
                '%s\t%s\t%s\tNA\tNA\t%1.3f' %
                (reference_input_fp, query_input_fp, max_dims_str, m_squared))

    # Write output summary
    with open(output_summary_fp, 'w') as f:
        f.write('\n'.join(summary_file_lines))
        f.write('\n')
def run_mantel_test(method, fps, distmats, num_perms, tail_type, comment,
                    control_dm_fp=None, control_dm=None,
                    sample_id_map=None):
    """Runs a Mantel test on all pairs of distance matrices.

    Returns a string suitable for writing out to a file containing the results
    of the test: the comment, a tab-separated header line, and one line per
    pair of distance matrices.

    WARNING: Only symmetric, hollow distance matrices may be used as input.
    Asymmetric distance matrices, such as those obtained by the UniFrac Gain
    metric (i.e. beta_diversity.py -m unifrac_g), should not be used as input.

    Arguments:
        method - which Mantel test to run (either 'mantel' or 'partial_mantel')
        fps - list of filepaths of the distance matrices
        distmats - list of tuples containing dm labels and dm data (i.e. the
            output of parse_distmat)
        num_perms - the number of permutations to use to calculate the
            p-value(s)
        tail_type - the type of tail test to use when calculating the
            p-value(s). Can be 'two sided', 'greater', or 'less'. Only applies
            when method is mantel
        comment - comment string to add to the beginning of the results string
            (None is treated as an empty string)
        control_dm_fp - filepath of the control distance matrix. Only applies
            when method is partial_mantel (it is required then)
        control_dm - tuple containing control distance matrix labels and matrix
            data. Only applies when method is partial_mantel (it is required
            then)
        sample_id_map - dict mapping sample IDs (i.e. what is expected by
            make_compatible_distance_matrices)

    Raises:
        ValueError - if fps and distmats differ in length, if method is not
            'mantel' or 'partial_mantel', or if method is 'partial_mantel' and
            the control matrix or its filepath is missing.
    """
    if len(fps) != len(distmats):
        raise ValueError("Must provide the same number of filepaths as there "
                         "are distance matrices.")
    if comment is None:
        comment = ''

    if method == 'mantel':
        header = 'DM1\tDM2\tNumber of entries\tMantel r statistic\t' + \
                 'p-value\tNumber of permutations\tTail type\n'
    elif method == 'partial_mantel':
        if not control_dm_fp or not control_dm:
            raise ValueError("You must provide a control matrix filepath and "
                             "control matrix when running the partial Mantel "
                             "test.")
        header = 'DM1\tDM2\tCDM\tNumber of entries\t' + \
                 'Mantel r statistic\tp-value\tNumber of permutations\t' + \
                 'Tail type\n'
    else:
        raise ValueError("Invalid method '%s'. Must be either 'mantel' or "
                         "'partial_mantel'." % method)

    # Collected pieces of the result; joined once at the end.
    pieces = [comment, header]

    # Materialize the pairs so they can be sliced whether zip returns a list
    # or an iterator.
    fp_dm_pairs = list(zip(fps, distmats))

    # Loop over all pairs of dms.
    for i, (fp1, (full_dm1_labels, full_dm1_data)) in enumerate(fp_dm_pairs):
        for fp2, (full_dm2_labels, full_dm2_data) in fp_dm_pairs[i + 1:]:
            # Make the current pair of distance matrices compatible by only
            # keeping samples that match between them, and ordering them by
            # the same sample IDs. The filtered matrices get their own names
            # so that the unfiltered first matrix is used again for the next
            # fp2; reusing the loop variables made every later comparison
            # start from a matrix already reduced by earlier pairs.
            (dm1_labels, dm1_data), (dm2_labels, dm2_data) = \
                make_compatible_distance_matrices(
                    (full_dm1_labels, full_dm1_data),
                    (full_dm2_labels, full_dm2_data),
                    lookup=sample_id_map)

            if method == 'partial_mantel':
                # We need to intersect three sets (three matrices).
                (dm1_labels, dm1_data), (cdm_labels, cdm_data) = \
                    make_compatible_distance_matrices(
                        (dm1_labels, dm1_data), control_dm,
                        lookup=sample_id_map)
                (dm1_labels, dm1_data), (dm2_labels, dm2_data) = \
                    make_compatible_distance_matrices(
                        (dm1_labels, dm1_data), (dm2_labels, dm2_data),
                        lookup=sample_id_map)

            if len(dm1_labels) < 3:
                if method == 'partial_mantel':
                    pieces.append('%s\t%s\t%s\t%d\tToo few samples\n' % (
                        fp1, fp2, control_dm_fp, len(dm1_labels)))
                else:
                    pieces.append('%s\t%s\t%d\tToo few samples\n' % (
                        fp1, fp2, len(dm1_labels)))
                continue

            dm1 = SymmetricDistanceMatrix(dm1_data, dm1_labels)
            dm2 = SymmetricDistanceMatrix(dm2_data, dm2_labels)

            # Create an instance of our correlation test and run it with
            # the specified number of permutations.
            if method == 'mantel':
                results = Mantel(dm1, dm2, tail_type)(num_perms)
                p_str = format_p_value_for_num_iters(results['p_value'],
                                                     num_perms)
                pieces.append("%s\t%s\t%d\t%.5f\t%s\t%d\t%s\n" % (
                    fp1, fp2, len(dm1_labels), results['r_value'], p_str,
                    num_perms, tail_type))
            else:
                cdm = SymmetricDistanceMatrix(cdm_data, cdm_labels)
                results = PartialMantel(dm1, dm2, cdm)(num_perms)
                p_str = format_p_value_for_num_iters(results['mantel_p'],
                                                     num_perms)
                # The tail type reported for the partial test is always
                # 'greater'; tail_type is not passed to PartialMantel.
                pieces.append("%s\t%s\t%s\t%d\t%.5f\t%s\t%d\t%s\n" % (
                    fp1, fp2, control_dm_fp, len(dm1_labels),
                    results['mantel_r'], p_str, num_perms, 'greater'))

    return ''.join(pieces)
def main():
    """Run Procrustes analysis of each query coordinate file vs. the reference.

    The first entry of opts.input_fps is the reference; every other entry is
    a query. For each query the transformed reference and query coordinates
    are written into opts.output_dir, and one line is appended to the summary
    file 'procrustes_results.txt'. When opts.random_trials is set, a Monte
    Carlo test is also run and its p-value, 'count better' and M^2 are
    reported; otherwise the p-value and count columns are 'NA'.

    Invalid option combinations are reported through option_parser.error.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    input_fps = opts.input_fps
    sample_id_map_fps = opts.sample_id_map_fps
    num_dimensions = opts.num_dimensions
    max_dims_str = str(num_dimensions or 'alldim')
    output_dir = opts.output_dir
    random_trials = opts.random_trials

    if random_trials is not None and random_trials < 10:
        option_parser.error(
            'Must perform >= 10 trails for Monte Carlo analysis.')

    if sample_id_map_fps and \
       (len(sample_id_map_fps) + 1) != len(opts.input_fps):
        option_parser.error('If providing sample id maps, there must be '
                            'exactly one fewer sample id maps than input '
                            'coordinate matrices.')

    if not exists(output_dir):
        makedirs(output_dir)

    reference_input_fp = input_fps[0]
    reference_input_fp_basename = splitext(split(reference_input_fp)[1])[0]
    output_summary_fp = join(output_dir, 'procrustes_results.txt')
    summary_file_lines = [
        '#FP1\tFP2\tNum included dimensions\tMonte Carlo '
        'p-value\tCount better\tM^2',
        '#Warning: p-values in this file are NOT currently '
        'adjusted for multiple comparisons.'
    ]

    for i, query_input_fp in enumerate(input_fps[1:]):
        query_input_fp_basename = splitext(split(query_input_fp)[1])[0]
        # The reference output path is the same for every query, so the file
        # is rewritten on each iteration.
        output_matrix1_fp = join(
            output_dir,
            '%s_transformed_reference.txt' % reference_input_fp_basename)
        output_matrix2_fp = join(
            output_dir,
            '%s_transformed_q%d.txt' % (query_input_fp_basename, i + 1))

        if sample_id_map_fps:
            with open(sample_id_map_fps[i], "U") as f:
                sample_id_map = dict(
                    (k, v[0]) for k, v in fields_to_dict(f).items())
        else:
            sample_id_map = None

        with open(reference_input_fp, 'U') as ref_in_f:
            with open(query_input_fp, 'U') as query_in_f:
                transf_coords1, transf_coords2, m_squared, _ = \
                    get_procrustes_results(ref_in_f, query_in_f,
                                           sample_id_map=sample_id_map,
                                           randomize=False,
                                           max_dimensions=num_dimensions)

        transf_coords1.write(output_matrix1_fp)
        transf_coords2.write(output_matrix2_fp)

        if random_trials:
            if opts.store_trial_details:
                # The parentheses matter: '%' binds tighter than '+', so the
                # unparenthesized form tried to add an int to a str and
                # raised TypeError.
                trial_output_dir = join(output_dir,
                                        'trial_details_%d' % (i + 2))
            else:
                trial_output_dir = None

            # Keep both inputs open only for the duration of the Monte Carlo
            # call so the handles are released afterwards.
            with open(reference_input_fp, 'U') as coords_f1:
                with open(query_input_fp, 'U') as coords_f2:
                    actual_m_squared, _, count_better, mc_p_value = \
                        procrustes_monte_carlo(
                            coords_f1,
                            coords_f2,
                            trials=random_trials,
                            max_dimensions=num_dimensions,
                            sample_id_map=sample_id_map,
                            trial_output_dir=trial_output_dir)

            # truncate the p-value to the correct number of significant
            # digits
            mc_p_value_str = format_p_value_for_num_iters(
                mc_p_value, random_trials)
            summary_file_lines.append(
                '%s\t%s\t%s\t%s\t%d\t%1.3f' %
                (reference_input_fp, query_input_fp, max_dims_str,
                 mc_p_value_str, count_better, actual_m_squared))
        else:
            summary_file_lines.append(
                '%s\t%s\t%s\tNA\tNA\t%1.3f' %
                (reference_input_fp, query_input_fp, max_dims_str, m_squared))

    # Write output summary
    with open(output_summary_fp, 'w') as f:
        f.write('\n'.join(summary_file_lines))
        f.write('\n')
def compare_alpha_diversities(rarefaction_lines, mapping_lines, category,
                              depth, test_type='nonparametric',
                              num_permutations=999):
    """compares alpha diversities

    inputs:
        rarefaction_lines - rarefaction file lines, which give scores for
            various rarefactions and depths
        mapping_lines - mapping file lines, holding ID's and the categories
            that the ID's fall in
        category - the category to be compared, is a string
        depth - the depth of the rarefaction file to use, is an integer
        test_type - the type of t-test to perform, is a string. Must be
            either 'parametric' or 'nonparametric'
        num_permutations - the number of Monte Carlo permutations to use if
            test_type is 'nonparametric', is an integer

    outputs:
        results - a nested dictionary which specifies the category as
            the top level key, and as its value, a dictionary keyed by
            (str(value1), str(value2)) for each pair of values in the
            category, whose values are (obs_t, p_val) tuples. For the
            nonparametric test p_val is the formatted p-value, or both entries
            are None when the test yielded no p-value.

    raises:
        ValueError - if test_type is 'nonparametric' and num_permutations < 1,
            or if an unknown test_type reaches the point where a test is run.
    """
    if test_type == 'nonparametric' and num_permutations < 1:
        raise ValueError("Invalid number of permutations: %d. Must be greater "
                         "than zero." % num_permutations)

    rarefaction_data = parse_rarefaction(rarefaction_lines)
    mapping_data = parse_mapping_file_to_dict(mapping_lines)[0]
    value_pairs = make_value_pairs_from_category(mapping_data, category)

    category_values_Ids = make_category_values_Id_dict(mapping_data,
                                                       category)

    SampleId_pairs = map_category_value_pairs_to_Ids(value_pairs,
                                                     category_values_Ids)

    map_from_Id_to_col = make_SampleIds_rarefaction_columns_dict(
        rarefaction_data)

    reduced_rarefaction_mtx = extract_rarefaction_scores_at_depth(
        depth, rarefaction_data)

    pair_results = {}
    for value_pair, sample_id_pair in zip(value_pairs, SampleId_pairs):
        # Must flatten the matrix because t_two_sample only operates on
        # non-nested sequences (otherwise we'll get the wrong degrees of
        # freedom).
        i = convert_SampleIds_to_rarefaction_mtx(
            sample_id_pair[0], reduced_rarefaction_mtx,
            map_from_Id_to_col).flatten()
        j = convert_SampleIds_to_rarefaction_mtx(
            sample_id_pair[1], reduced_rarefaction_mtx,
            map_from_Id_to_col).flatten()

        if test_type == 'parametric':
            obs_t, p_val = t_two_sample(i, j)
        elif test_type == 'nonparametric':
            obs_t, _, _, p_val = mc_t_two_sample(
                i, j, permutations=num_permutations)
            if p_val is None:
                # A missing p-value cannot be formatted; report the pair as
                # untestable instead of failing.
                obs_t = None
            else:
                p_val = format_p_value_for_num_iters(p_val, num_permutations)
        else:
            raise ValueError("Invalid test type '%s'." % test_type)

        pair_results[(str(value_pair[0]), str(value_pair[1]))] = obs_t, p_val

    return {category: pair_results}