def test_relative_abundance(self):
    """
    Check relative_abundance() of biom_calc.py against hand-computed values
    and verify that unknown sample IDs are rejected.

    :return: Returns OK, if testing goal is achieved, otherwise raises error.
    """
    self.result = bc.relative_abundance(self.biomf)

    # Hand-computed relative abundances, keyed by sample ID, then OTU ID.
    expected = {
        "S1": {"GG_OTU_1": 0.192307692, "GG_OTU_2": 0.076923077,
               "GG_OTU_3": 0.192307692, "GG_OTU_4": 0.346153846,
               "GG_OTU_5": 0.192307692},
        "S10": {"GG_OTU_1": 0.083333333, "GG_OTU_2": 0.125,
                "GG_OTU_3": 0.166666667, "GG_OTU_4": 0.333333333,
                "GG_OTU_5": 0.291666667},
        "S2": {"GG_OTU_1": 0.161290323, "GG_OTU_2": 0.258064516,
               "GG_OTU_3": 0.258064516, "GG_OTU_4": 0.258064516,
               "GG_OTU_5": 0.064516129},
        "S3": {"GG_OTU_1": 0.111111111, "GG_OTU_2": 0.222222222,
               "GG_OTU_3": 0.0, "GG_OTU_4": 0.277777778,
               "GG_OTU_5": 0.388888889},
        "S4": {"GG_OTU_1": 0.181818182, "GG_OTU_2": 0.0,
               "GG_OTU_3": 0.545454545, "GG_OTU_4": 0.272727273,
               "GG_OTU_5": 0.0},
        "S5": {"GG_OTU_1": 0.086956522, "GG_OTU_2": 0.260869565,
               "GG_OTU_3": 0.304347826, "GG_OTU_4": 0.217391304,
               "GG_OTU_5": 0.130434783},
        "S6": {"GG_OTU_1": 0.333333333, "GG_OTU_2": 0.148148148,
               "GG_OTU_3": 0.296296296, "GG_OTU_4": 0.185185185,
               "GG_OTU_5": 0.037037037},
        "S7": {"GG_OTU_1": 0.071428571, "GG_OTU_2": 0.178571429,
               "GG_OTU_3": 0.142857143, "GG_OTU_4": 0.285714286,
               "GG_OTU_5": 0.321428571},
        "S8": {"GG_OTU_1": 0.230769231, "GG_OTU_2": 0.192307692,
               "GG_OTU_3": 0.307692308, "GG_OTU_4": 0.115384615,
               "GG_OTU_5": 0.153846154},
        "S9": {"GG_OTU_1": 0.0, "GG_OTU_2": 0.111111111,
               "GG_OTU_3": 0.444444444, "GG_OTU_4": 0.0,
               "GG_OTU_5": 0.444444444},
    }

    # Walk samples and OTUs in sorted order so that the first failure
    # reported is reproducible from run to run.
    for sample_id in sorted(expected):
        per_otu = expected[sample_id]
        for otu_id in sorted(per_otu):
            self.assertAlmostEqual(
                per_otu[otu_id], self.result[sample_id][otu_id],
                msg="Relative abundances not calculated accurately."
            )

    # Sample IDs that are not passed validation must raise a ValueError
    # whose message matches the pattern below.
    expected_error = (
        "\nError while calculating relative "
        "abundances: The sampleIDs provided do not match "
        "the sampleIDs in biom file. Please double check "
        "the sampleIDs provided.\n"
    )
    with self.assertRaisesRegexp(ValueError, expected_error):
        bc.relative_abundance(self.biomf,
                              sampleIDs=["NS01", "NS02", "NS03"])
def test_MRA(self):
    """
    Testing mean relative abundance calculation, MRA() function of
    biom_calc.py.

    The result of MRA() on the fixture table is compared with
    mean_otu_pct_abundance() applied to the relative abundances of the
    same table for the five listed OTU IDs.

    :return: Returns OK, if testing goal was achieved, otherwise raises
             error.
    """
    self.result = bc.MRA(self.biom)
    self.mean_otu = bc.mean_otu_pct_abundance(
        bc.relative_abundance(self.biom),
        ['GG_OTU_1', 'GG_OTU_2', 'GG_OTU_3', 'GG_OTU_4', 'GG_OTU_5']
    )

    # Compare OTU by OTU. Zipping two separate .values() views pairs the
    # numbers by dict iteration order, which is not guaranteed to line up
    # between two independently built dicts.
    # NOTE(review): assumes both results are dicts keyed by OTU ID - confirm
    # against biom_calc.MRA().
    for otuid in sorted(self.mean_otu):
        self.assertIn(
            otuid, self.result,
            msg='Mean OTU not calculated accurately.'
        )
        self.assertAlmostEqual(
            self.mean_otu[otuid], self.result[otuid],
            msg='Mean OTU not calculated accurately.'
        )
def test_mean_otu_pct_abundance(self):
    """
    Testing mean_otu_pct_abundance() function of biom_calc.py.

    :return: Returns OK, if testing goal was achieved, otherwise raises
             error.
    """
    self.rel_a = bc.relative_abundance(self.biomf)
    self.result = bc.mean_otu_pct_abundance(
        self.rel_a, ["GG_OTU_1", "GG_OTU_2"]
    )

    # Hand calculated mean relative abundance (as a fraction) per OTU.
    # The values are keyed by OTU ID so that the comparison below does not
    # depend on the iteration order of the result's .values().
    hand_calc = {
        "GG_OTU_1": (0 + 0.0153846153846 + 0.0285714285714 + 0.04 + 0.05 +
                     0.0588235294118)/6,
        "GG_OTU_2": (0.1 + 0.107692307692 + 0.114285714286 + 0.12 + 0.125 +
                     0.129411764706)/6,
    }

    # Testing the validity of the calculations of mean_otu_pct_abundance().
    # The function reports percentages, hence the factor of 100.
    for otuid in sorted(hand_calc):
        self.assertAlmostEqual(
            hand_calc[otuid]*100, self.result[otuid],
            msg="Mean OTU not calculated accurately."
        )
def test_relative_abundance(self):
    """
    Testing relative_abundance() function of biom_calc.py.

    :return: Returns OK, if testing goal is achieved, otherwise raises error.
    """
    sample = 'Sample3'
    self.result = bc.relative_abundance(self.biom)

    # List containing manual calculations. All expected values are equal,
    # so the pairing order with the function output does not matter.
    hand_calc = [1/4.0, 1/4.0, 1/4.0, 1/4.0]

    # Relative abundance values calculated by the function for the sample.
    func_calc = self.result[sample].values()

    # Testing the validity of relative_abundance() function.
    for hand, res in zip(hand_calc, func_calc):
        self.assertAlmostEqual(
            hand, res,
            msg='Relative abundances not calculated accurately.'
        )
def test_mean_otu_pct_abundance(self):
    """
    Testing mean_otu_pct_abundance() function of biom_calc.py.

    :return: Returns OK, if testing goal was achieved, otherwise raises
             error.
    """
    self.rel_a = bc.relative_abundance(self.biom)
    self.result = bc.mean_otu_pct_abundance(
        self.rel_a, ['GG_OTU_1', 'GG_OTU_2']
    )

    # Hand calculated mean relative abundance (as a fraction) per OTU.
    # Keyed by OTU ID so the comparison does not depend on the iteration
    # order of the result's .values().
    # NOTE(review): assumes the first expected value belongs to GG_OTU_1 and
    # the second to GG_OTU_2, following the order of the requested IDs.
    hand_calc = {
        'GG_OTU_1': 0.25/6,
        'GG_OTU_2': (1.0+0.3333333333333333+0.25+0.7142857142857143+
                     0.3333333333333333)/6,
    }

    # Testing the validity of the calculations of mean_otu_pct_abundance().
    # The function reports percentages, hence the factor of 100.
    for otuid in sorted(hand_calc):
        self.assertAlmostEqual(
            hand_calc[otuid]*100, self.result[otuid],
            msg='Mean OTU not calculated accurately.'
        )
def test_arcsine_sqrt_transform(self):
    """
    Check arcsine_sqrt_transform() of biom_calc.py against hand-computed
    values for every sample/OTU pair of the fixture table.

    :return: Returns OK if testing goal is achieved, otherwise raises error.
    """
    self.result1 = bc.relative_abundance(self.biomf)
    self.result2 = bc.arcsine_sqrt_transform(self.result1)

    # Hand-computed transformed values, keyed by sample ID, then OTU ID.
    expected = {
        "S1": {"GG_OTU_1": 0.453961252, "GG_OTU_2": 0.281034902,
               "GG_OTU_3": 0.453961252, "GG_OTU_4": 0.629014802,
               "GG_OTU_5": 0.453961252},
        "S10": {"GG_OTU_1": 0.292842772, "GG_OTU_2": 0.361367124,
                "GG_OTU_3": 0.420534335, "GG_OTU_4": 0.615479709,
                "GG_OTU_5": 0.570510448},
        "S2": {"GG_OTU_1": 0.413273808, "GG_OTU_2": 0.532861869,
               "GG_OTU_3": 0.532861869, "GG_OTU_4": 0.532861869,
               "GG_OTU_5": 0.256813917},
        "S3": {"GG_OTU_1": 0.339836909, "GG_OTU_2": 0.490882678,
               "GG_OTU_3": 0, "GG_OTU_4": 0.555121168,
               "GG_OTU_5": 0.673351617},
        "S4": {"GG_OTU_1": 0.440510663, "GG_OTU_2": 0,
               "GG_OTU_3": 0.830915552, "GG_OTU_4": 0.549467245,
               "GG_OTU_5": 0},
        "S5": {"GG_OTU_1": 0.299334026, "GG_OTU_2": 0.53606149,
               "GG_OTU_3": 0.584373897, "GG_OTU_4": 0.485049787,
               "GG_OTU_5": 0.36950894},
        "S6": {"GG_OTU_1": 0.615479709, "GG_OTU_2": 0.395099667,
               "GG_OTU_3": 0.575591472, "GG_OTU_4": 0.444859969,
               "GG_OTU_5": 0.1936583},
        "S7": {"GG_OTU_1": 0.270549763, "GG_OTU_2": 0.436286927,
               "GG_OTU_3": 0.387596687, "GG_OTU_4": 0.563942641,
               "GG_OTU_5": 0.602794553},
        "S8": {"GG_OTU_1": 0.501093013, "GG_OTU_2": 0.453961252,
               "GG_OTU_3": 0.588002604, "GG_OTU_4": 0.346579954,
               "GG_OTU_5": 0.403057074},
        "S9": {"GG_OTU_1": 0, "GG_OTU_2": 0.339836909,
               "GG_OTU_3": 0.729727656, "GG_OTU_4": 0,
               "GG_OTU_5": 0.729727656},
    }

    # Walk samples and OTUs in sorted order so that the first failure
    # reported is reproducible from run to run.
    for sample_id in sorted(expected):
        per_otu = expected[sample_id]
        for otu_id in sorted(per_otu):
            self.assertAlmostEqual(
                per_otu[otu_id], self.result2[sample_id][otu_id],
                msg="Arcsine squareroot transformation was not accurate."
            )
def main():
    """
    Script entry point: load a BIOM table, compute relative abundances
    (optionally arcsine-sqrt transformed) and write them out.

    Exits with an error message if the BIOM file path cannot be opened.
    """
    args = handle_program_options()
    biom_path = args.input_biom_fp

    # Probe the path first so an unreadable file yields a short message
    # instead of a traceback.
    try:
        open(biom_path).close()
    except IOError as ioe:
        sys.exit('\nError in BIOM file path: {}\n'.format(ioe))

    table = biom.load_table(biom_path)
    abundances = bc.relative_abundance(table)

    # The variance-stabilizing transform is opt-in.
    if args.stabilize_variance:
        abundances = bc.arcsine_sqrt_transform(abundances)

    write_relative_abundance(abundances, table, args.output_tsv_fp)
def assign_otu_membership(biomfile):
    """
    Determine which OTUs are present in each sample.

    An OTU counts as present in a sample when its relative abundance in
    that sample is greater than zero.

    :type biomfile: biom.table.Table
    :param biomfile: BIOM table object from the biom-format library.
    :rtype: dict
    :return: Returns a dictionary keyed on Sample ID with sets containing
             the IDs of OTUIDs found in each sample.
    """
    membership = defaultdict(set)
    abundances = bc.relative_abundance(biomfile)

    for sample_id in abundances:
        # Touch the entry up front so a sample without any positive
        # abundance still appears in the result with an empty set.
        present = membership[sample_id]
        for otu_id, rel_abundance in abundances[sample_id].items():
            if rel_abundance > 0:
                present.add(otu_id)

    return membership
def test_arcsine_sqrt_transform(self):
    """
    Testing arcsine_sqrt_transform() function of biom_calc.py.

    :return: Returns OK if testing goal is achieved, otherwise raises error.
    """
    self.result1 = bc.relative_abundance(self.biomf)
    self.result2 = bc.arcsine_sqrt_transform(self.result1)

    # Hand calculated transforms, i.e. arcsin(sqrt(x)) of the relative
    # abundances 0, 0.1, 0.2, 0.3 and 0.4.
    hand_calc = [0, 0.32175055439, 0.463647609, 0.57963974036, 0.684719203]

    # Look the sample up by ID instead of by position in .values():
    # positional indexing only works on Python 2 lists and selects an
    # arbitrary sample that depends on dict ordering.
    # NOTE(review): the expected values are assumed to belong to "Sample1"
    # of the fixture table - confirm against the test data.
    func_calc = sorted(self.result2["Sample1"].values())

    # zip() would silently ignore surplus values, so check the size first.
    self.assertEqual(
        len(hand_calc), len(func_calc),
        msg="Function did not calculate transformation accurately."
    )

    # Testing validity of the transforms. Both sides are sorted so the
    # comparison is independent of the dict's iteration order.
    for hand, func in zip(sorted(hand_calc), func_calc):
        self.assertAlmostEqual(
            hand, func, places=7,
            msg="Function did not calculate transformation accurately."
        )
def test_assign_otu_membership(self):
    """
    Testing assign_otu_membership() function of otu_calc.py.

    NOTE(review): the assertions below only check the relative abundances
    of 'Sample1'; the value returned by assign_otu_membership() is stored
    in self.result but never compared against anything. Consider asserting
    on it directly.

    :return: Returns OK if the test goals were achieved, otherwise raises
             error.
    """
    self.result = oc.assign_otu_membership(self.biom)

    # Values calculated by the library for 'Sample1', looked up by sample
    # ID. Positional indexing into .values() only works on Python 2 lists
    # and depends on dict ordering.
    result1 = bc.relative_abundance(self.biom, ['Sample1'])
    func_calc = sorted(result1['Sample1'].values())

    # Hand calculated relative abundances for 'Sample1'.
    hand_calc = sorted([0.714286, 0.285714])

    # zip() would silently ignore surplus values, so check the size first.
    self.assertEqual(
        len(hand_calc), len(func_calc),
        msg='Error! OTU membership calculations are inaccurate!'
    )

    # Both sides are sorted so the comparison is order independent.
    for hand, func in zip(hand_calc, func_calc):
        self.assertAlmostEqual(
            hand, func, places=5,
            msg='Error! OTU membership calculations are inaccurate!'
        )
def test_arcsine_sqrt_transform(self):
    """
    Testing arcsine_sqrt_transform() function of biom_calc.py.

    :return: Returns OK if testing goal is achieved, otherwise raises error.
    """
    self.result1 = bc.relative_abundance(self.biom)
    self.result2 = bc.arcsine_sqrt_transform(self.result1)

    # Hand calculated transforms, i.e. arcsin(sqrt(x)) of the relative
    # abundances 0.714286 and 0.285714.
    hand_calc = [1.00685369, 0.563942641]

    # Look the sample up by ID instead of by position in .values():
    # positional indexing only works on Python 2 lists and selects an
    # arbitrary sample that depends on dict ordering.
    # NOTE(review): the expected values are assumed to belong to 'Sample1'
    # of the fixture table - confirm against the test data.
    func_calc = sorted(self.result2['Sample1'].values())

    # zip() would silently ignore surplus values, so check the size first.
    self.assertEqual(
        len(hand_calc), len(func_calc),
        msg='Function did not calculate trnasformation accurately.'
    )

    # Testing validity of the transforms. Both sides are sorted so the
    # comparison is independent of the dict's iteration order.
    for hand, func in zip(sorted(hand_calc), func_calc):
        self.assertAlmostEqual(
            hand, func, places=7,
            msg='Function did not calculate trnasformation accurately.'
        )
def test_mean_otu_pct_abundance(self):
    """
    Check mean_otu_pct_abundance() of biom_calc.py for two OTUs against
    hand-computed mean percent abundances.

    :return: Returns OK, if testing goal was achieved, otherwise raises
             error.
    """
    checked_otus = ("GG_OTU_1", "GG_OTU_2")

    self.rel_a = bc.relative_abundance(self.biomf)
    self.result = bc.mean_otu_pct_abundance(self.rel_a, list(checked_otus))

    # Hand-computed mean percent abundance per OTU; only the OTUs listed in
    # checked_otus are compared below.
    expected = {
        "GG_OTU_1": 14.52348298,
        "GG_OTU_2": 15.73217761,
        "GG_OTU_3": 26.58131438,
        "GG_OTU_4": 22.91732137,
        "GG_OTU_5": 20.24570366,
    }

    for otu_id in checked_otus:
        self.assertAlmostEqual(
            expected[otu_id], self.result[otu_id],
            msg="Mean OTU percent abundance not calculated accurately."
        )
def test_relative_abundance(self):
    """
    Testing relative_abundance() function of biom_calc.py.

    :return: Returns OK, if testing goal is achieved, otherwise raises error.
    """
    self.result = bc.relative_abundance(self.biomf)

    # List containing manual calculations for "Sample3".
    hand_calc = [0.02857142857, 0.11428571429, 0.2, 0.28571428571,
                 0.37142857143]

    # Calculated relative abundance values for the same sample. Sorted,
    # because the iteration order of .values() is not guaranteed to follow
    # the order of the hand calculated list.
    func_calc = sorted(self.result["Sample3"].values())

    # zip() would silently ignore surplus values, so check the size first.
    self.assertEqual(
        len(hand_calc), len(func_calc),
        msg="Relative abundances not calculated accurately."
    )

    # Testing the validity of relative_abundance() function.
    for hand, res in zip(sorted(hand_calc), func_calc):
        self.assertAlmostEqual(
            hand, res,
            msg="Relative abundances not calculated accurately."
        )
def main():
    """
    Script entry point: run LDA and plot the result.

    The LDA input is either a distance matrix file (when
    args.dist_matrix_file is set) or the arcsine-sqrt transformed relative
    abundances of a BIOM table. The samples are grouped by the mapping file
    column named in args.group_by, and the result is plotted in 3D when
    args.dimensions == 3, otherwise with plot_LDA's default dimensions.

    Exits with an error message when an input file cannot be read or when
    the grouping column is missing from the mapping file header.
    """
    args = handle_program_options()

    # Parse and read mapping file
    try:
        header, imap = util.parse_map_file(args.map_fp)
    except IOError as ioe:
        err_msg = "\nError in metadata mapping filepath (-m): {}\n"
        sys.exit(err_msg.format(ioe))
    try:
        category_idx = header.index(args.group_by)
    except ValueError:
        msg = "Error: Specified mapping category '{}' not found."
        sys.exit(msg.format(args.group_by))

    # Obtain group colors. A plain conditional is used because assert
    # statements are stripped when Python runs with -O.
    if args.colors is None:
        categories = {v[category_idx] for v in imap.values()}
        color_cycle = cycle(Set3_12.hex_colors)
        class_colors = {c: next(color_cycle) for c in categories}
    else:
        class_colors = util.color_mapping(imap, header, args.group_by,
                                          args.colors)

    if args.dist_matrix_file:
        try:
            lda_input = pd.read_csv(args.dist_matrix_file, sep="\t",
                                    index_col=0)
        except IOError as ioe:
            err_msg = "\nError with unifrac distance matrix file (-d): {}\n"
            sys.exit(err_msg.format(ioe))
        # The index is converted to str before it is used as a key into
        # the mapping data.
        sample_labels = [str(sid) for sid in lda_input.index]
    else:
        # Load biom file and calculate relative abundance
        try:
            biomf = biom.load_table(args.otu_table)
        except IOError as ioe:
            err_msg = "\nError with biom format file (-d): {}\n"
            sys.exit(err_msg.format(ioe))

        # Get normalized relative abundances
        rel_abd = bc.relative_abundance(biomf)
        rel_abd = bc.arcsine_sqrt_transform(rel_abd)
        lda_input = pd.DataFrame(rel_abd).T
        sample_labels = lda_input.index

    # Prepend the group of every sample as the first column.
    lda_input.insert(0, "Condition",
                     [imap[sid][category_idx] for sid in sample_labels])
    sampleids = sample_labels if args.annotate_points else None

    if args.save_lda_input:
        lda_input.to_csv(args.save_lda_input, sep="\t")

    # Run LDA
    X_lda, y_lda, exp_var = run_LDA(lda_input)

    # Plot LDA. The 3D plot takes two extra arguments on top of the ones
    # shared by both plot types.
    plot_kwargs = dict(style=args.ggplot2_style, fig_size=args.figsize,
                       label_pad=args.label_padding,
                       font_size=args.font_size, sids=sampleids,
                       pt_size=args.point_size, out_fp=args.out_fp)
    if args.dimensions == 3:
        plot_kwargs.update(dim=3, zangles=args.z_angles)
    plot_LDA(X_lda, y_lda, class_colors, exp_var, **plot_kwargs)
def main():
    """
    Script entry point: run LDA and plot the result.

    The LDA input is either a distance matrix file (when
    args.dist_matrix_file is set) or the arcsine-sqrt transformed relative
    abundances of a BIOM table. The samples are grouped by the mapping file
    column named in args.group_by, and the result is plotted in 3D when
    args.dimensions == 3, otherwise with plot_LDA's default dimensions.

    Exits with an error message when an input file cannot be read or when
    the grouping column is missing from the mapping file header.
    """
    args = handle_program_options()

    # Parse and read mapping file
    try:
        header, imap = util.parse_map_file(args.map_fp)
    except IOError as ioe:
        err_msg = "\nError in metadata mapping filepath (-m): {}\n"
        sys.exit(err_msg.format(ioe))
    try:
        category_idx = header.index(args.group_by)
    except ValueError:
        msg = "Error: Specified mapping category '{}' not found."
        sys.exit(msg.format(args.group_by))

    # Obtain group colors. A plain conditional is used because assert
    # statements are stripped when Python runs with -O.
    if args.colors is None:
        categories = {v[category_idx] for v in imap.values()}
        color_cycle = cycle(Set3_12.hex_colors)
        class_colors = {c: next(color_cycle) for c in categories}
    else:
        class_colors = util.color_mapping(imap, header, args.group_by,
                                          args.colors)

    if args.dist_matrix_file:
        try:
            lda_input = pd.read_csv(args.dist_matrix_file, sep="\t",
                                    index_col=0)
        except IOError as ioe:
            err_msg = "\nError with unifrac distance matrix file (-d): {}\n"
            sys.exit(err_msg.format(ioe))
        # The index is converted to str before it is used as a key into
        # the mapping data.
        sample_labels = [str(sid) for sid in lda_input.index]
    else:
        # Load biom file and calculate relative abundance
        try:
            biomf = biom.load_table(args.otu_table)
        except IOError as ioe:
            err_msg = "\nError with biom format file (-d): {}\n"
            sys.exit(err_msg.format(ioe))

        # Get normalized relative abundances
        rel_abd = bc.relative_abundance(biomf)
        rel_abd = bc.arcsine_sqrt_transform(rel_abd)
        lda_input = pd.DataFrame(rel_abd).T
        sample_labels = lda_input.index

    # Prepend the group of every sample as the first column.
    lda_input.insert(0, "Condition",
                     [imap[sid][category_idx] for sid in sample_labels])
    sampleids = sample_labels if args.annotate_points else None

    if args.save_lda_input:
        lda_input.to_csv(args.save_lda_input, sep="\t")

    # Run LDA
    X_lda, y_lda, exp_var = run_LDA(lda_input)

    # Plot LDA. The 3D plot takes two extra arguments on top of the ones
    # shared by both plot types.
    plot_kwargs = dict(style=args.ggplot2_style, fig_size=args.figsize,
                       label_pad=args.label_padding,
                       font_size=args.font_size, sids=sampleids,
                       pt_size=args.point_size, out_fp=args.out_fp)
    if args.dimensions == 3:
        plot_kwargs.update(dim=3, zangles=args.z_angles)
    plot_LDA(X_lda, y_lda, class_colors, exp_var, **plot_kwargs)
def main():
    """
    Script entry point: draw one LDA bubble plot per requested OTU.

    Samples are positioned by their LDA coordinates, computed either from a
    distance matrix file or from arcsine-sqrt transformed relative
    abundances. Each bubble is sized by the sample's transformed relative
    abundance of the OTU multiplied by args.scale_by. One figure per OTU is
    saved into args.output_dir.

    Exits with an error message when an input file cannot be read or when
    the grouping column is missing from the mapping file header.
    """
    args = handle_program_options()

    # Parse and read mapping file
    try:
        header, imap = util.parse_map_file(args.map_fp)
    except IOError as ioe:
        err_msg = "\nError in metadata mapping filepath (-m): {}\n"
        sys.exit(err_msg.format(ioe))
    try:
        category_idx = header.index(args.group_by)
    except ValueError:
        msg = "Error: Specified mapping category '{}' not found."
        sys.exit(msg.format(args.group_by))

    # Obtain group colors
    class_colors = util.color_mapping(imap, header, args.group_by,
                                      args.color_by)

    # Get otus for LDA bubble plots
    # NOTE(review): recent pandas releases may reject sep="\n", and read_csv
    # infers dtypes, so purely numeric OTU IDs may not stay strings - verify.
    try:
        bubble_otus = set(pd.read_csv(args.otu_ids_fp, sep="\n",
                                      header=None)[0])
    except IOError as ioe:
        err_msg = "\nError in OTU IDs file (--bubble): {}\n"
        sys.exit(err_msg.format(ioe))

    # Load biom file and calculate relative abundance
    try:
        biomf = biom.load_table(args.otu_table)
    except IOError as ioe:
        err_msg = "\nError with biom format file (-d): {}\n"
        sys.exit(err_msg.format(ioe))

    # Get normalized relative abundances
    rel_abd = bc.relative_abundance(biomf)
    rel_abd = bc.arcsine_sqrt_transform(rel_abd)

    # Bubble sizes for the size legend: five evenly spaced steps between
    # the smallest and largest positive abundance, scaled like the bubbles.
    abd_val = {abd for otu_abds in rel_abd.values()
               for abd in otu_abds.values() if abd > 0}
    bubble_range = np.linspace(min(abd_val), max(abd_val),
                               num=5) * args.scale_by
    # Get abundance to the nearest 50 (the first step is dropped)
    bubble_range = [int(50 * round(float(abd)/50))
                    for abd in bubble_range[1:]]

    # Set up input for LDA calc and get LDA transformed data
    if args.dist_matrix_file:
        try:
            lda_input = pd.read_csv(args.dist_matrix_file, sep="\t",
                                    index_col=0)
        except IOError as ioe:
            err_msg = "\nError with unifrac distance matrix file (-d): {}\n"
            sys.exit(err_msg.format(ioe))
    else:
        lda_input = pd.DataFrame(rel_abd).T
    lda_input.insert(0, "Condition",
                     [imap[sid][category_idx] for sid in lda_input.index])
    sampleids = lda_input.index
    if args.save_lda_input:
        lda_input.to_csv(args.save_lda_input, sep="\t")
    # Run LDA
    X_lda, y_lda, exp_var = run_LDA(lda_input)

    # Calculate position and size of SampleIDs to plot for each OTU
    for otuid in bubble_otus:
        otuname = oc.otu_name(biomf.metadata(otuid,
                                             axis="observation")["taxonomy"])
        plot_data = {cat: {"x": [], "y": [], "size": [], "label": []}
                     for cat in class_colors.keys()}
        for sid, data in zip(sampleids, X_lda):
            category = plot_data[imap[sid][category_idx]]
            try:
                size = rel_abd[sid][otuid] * args.scale_by
            except KeyError as ke:
                print("{} not found in {} sample.".format(ke, sid))
                continue
            category["x"].append(float(data[0]))
            category["y"].append(float(data[1]))
            category["size"].append(size)

        # Plot LDA bubble for each OTU
        fig = plt.figure(figsize=args.figsize)
        ax = fig.add_subplot(111)
        for cat in plot_data:
            plt.scatter(plot_data[cat]["x"], plot_data[cat]["y"],
                        s=plot_data[cat]["size"], label=cat,
                        color=class_colors[cat], alpha=0.85, edgecolors="k")
        if X_lda.shape[1] == 1:
            plt.ylim((0.5, 2.5))
        plt.title(" ".join(otuname.split("_")), style="italic", fontsize=13)

        # Fall back to a plain axis label when the explained variance for
        # that axis cannot be indexed or formatted.
        try:
            xlabel = "LD1 (Percent Explained Variance: {:.3f}%)".format(
                exp_var[0]*100)
        except (IndexError, TypeError, ValueError):
            xlabel = "LD1"
        plt.xlabel(xlabel, fontsize=13, labelpad=15)
        try:
            ylabel = "LD2 (Percent Explained Variance: {:.3f}%)".format(
                exp_var[1]*100)
        except (IndexError, TypeError, ValueError):
            ylabel = "LD2"
        plt.ylabel(ylabel, fontsize=13, labelpad=15)

        lgnd1 = plt.legend(loc="best", scatterpoints=3, fontsize=13)
        # Legend.legendHandles was renamed to legend_handles in newer
        # Matplotlib releases; support both spellings.
        if hasattr(lgnd1, "legend_handles"):
            handles = lgnd1.legend_handles
        else:
            handles = lgnd1.legendHandles
        # Change the legend marker size manually so that the group legend
        # does not inherit the bubble sizes.
        for handle in handles[:len(class_colors)]:
            handle._sizes = [80]

        # Add the legend manually to the current plot, because the second
        # plt.legend() call below would otherwise replace it.
        plt.gca().add_artist(lgnd1)

        c = [plt.scatter([], [], c="w", edgecolors="k", s=s1)
             for s1 in bubble_range]
        plt.legend(c, ["{}".format(s2) for s2 in bubble_range],
                   title="Scaled Bubble\n       Sizes", frameon=True,
                   labelspacing=2, fontsize=13, loc=4, scatterpoints=1,
                   borderpad=1.1)

        # Set style for LDA bubble plots
        if args.ggplot2_style:
            gu.ggplot2_style(ax)
            fc = "0.8"
        else:
            fc = "none"

        # Save LDA bubble plots to output directory
        if args.verbose:
            print("Saving chart for {}".format(" ".join(otuname.split("_"))))
        fig.savefig(pj(args.output_dir, "_".join(otuname.split())) + "." +
                    args.save_as, facecolor=fc, edgecolor="none", dpi=300,
                    bbox_inches="tight", pad_inches=0.2)
        plt.close(fig)
def main():
    """
    Script entry point: draw one PCoA bubble plot per requested OTU.

    Samples are positioned by their first two principal coordinates and
    each bubble is sized by the sample's arcsine-sqrt transformed relative
    abundance of the OTU multiplied by args.scale_by.

    Exits with an error message when an input file cannot be opened or
    when the grouping column is missing from the mapping file header.
    """
    args = handle_program_options()

    # Verify up front that every input file can be opened, so that a bad
    # path yields a short message instead of a traceback later on.
    input_files = (
        (args.otu_table, "BIOM format"),
        (args.pcoa_fp, "principal coordinates"),
        (args.mapping, "mapping"),
        (args.otu_ids_fp, "OTU IDs"),
    )
    for path, label in input_files:
        try:
            with open(path):
                pass
        except IOError as ioe:
            sys.exit("\nError with {} file:{}\n".format(label, ioe))

    # check that the output dir exists, create it if not
    util.ensure_dir(args.output_dir)

    # load the BIOM table
    biomtbl = biom.load_table(args.otu_table)

    # Read unifrac principal coordinates file
    unifrac = util.parse_unifrac(args.pcoa_fp)

    # Read otu data file: one OTU ID per line. Blank lines are skipped so
    # that an empty string is never looked up as an OTU ID. Plain "r" mode
    # is used because the "U" flag was removed in Python 3.11.
    with open(args.otu_ids_fp, "r") as otu_fh:
        otus = {line.strip() for line in otu_fh if line.strip()}

    # Gather categories from mapping file
    header, imap = util.parse_map_file(args.mapping)
    try:
        category_idx = header.index(args.group_by)
    except ValueError:
        msg = "Error: Specified mapping category '{}' not found."
        sys.exit(msg.format(args.group_by))
    category_ids = util.gather_categories(imap, header, [args.group_by])
    color_map = util.color_mapping(imap, header, args.group_by, args.colors)

    rel_abd = bc.relative_abundance(biomtbl)
    rel_abd = bc.arcsine_sqrt_transform(rel_abd)

    # plot samples based on relative abundance of some OTU ID
    for otuid in otus:
        otuname = oc.otu_name(
            biomtbl.metadata(otuid, axis="observation")["taxonomy"])
        cat_data = {cat: {"pc1": [], "pc2": [], "size": []}
                    for cat in category_ids}

        for sid in unifrac["pcd"]:
            category = cat_data[imap[sid][category_idx]]
            try:
                size = rel_abd[sid][otuid] * args.scale_by
            except KeyError as ke:
                # Report the missing key and leave this sample off the plot.
                print("{} not found in {} sample.".format(ke, sid))
                continue
            category["pc1"].append(float(unifrac["pcd"][sid][0]))
            category["pc2"].append(float(unifrac["pcd"][sid][1]))
            category["size"].append(size)

        if args.verbose:
            print("Saving chart for {}".format(" ".join(otuname.split("_"))))
        xr, yr = calculate_xy_range(cat_data)
        plot_PCoA(cat_data, otuname, unifrac, color_map.keys(), color_map,
                  xr, yr, args.output_dir, args.save_as, args.ggplot2_style)
def main():
    """
    Script entry point: draw one PCoA bubble plot per requested OTU.

    Samples are positioned by their first two principal coordinates and
    each bubble is sized by the sample's arcsine-sqrt transformed relative
    abundance of the OTU multiplied by args.scale_by.

    Exits with an error message when an input file cannot be opened or
    when the grouping column is missing from the mapping file header.
    """
    args = handle_program_options()

    # Verify up front that every input file can be opened, so that a bad
    # path yields a short message instead of a traceback later on.
    input_files = (
        (args.otu_table, "BIOM format"),
        (args.pcoa_fp, "principal coordinates"),
        (args.mapping, "mapping"),
        (args.otu_ids_fp, "OTU IDs"),
    )
    for path, label in input_files:
        try:
            with open(path):
                pass
        except IOError as ioe:
            sys.exit("\nError with {} file:{}\n".format(label, ioe))

    # check that the output dir exists, create it if not
    util.ensure_dir(args.output_dir)

    # load the BIOM table
    biomtbl = biom.load_table(args.otu_table)

    # Read unifrac principal coordinates file
    unifrac = util.parse_unifrac(args.pcoa_fp)

    # Read otu data file: one OTU ID per line. Blank lines are skipped so
    # that an empty string is never looked up as an OTU ID. Plain "r" mode
    # is used because the "U" flag was removed in Python 3.11.
    with open(args.otu_ids_fp, "r") as otu_fh:
        otus = {line.strip() for line in otu_fh if line.strip()}

    # Gather categories from mapping file
    header, imap = util.parse_map_file(args.mapping)
    try:
        category_idx = header.index(args.group_by)
    except ValueError:
        msg = "Error: Specified mapping category '{}' not found."
        sys.exit(msg.format(args.group_by))
    category_ids = util.gather_categories(imap, header, [args.group_by])
    color_map = util.color_mapping(imap, header, args.group_by, args.colors)

    rel_abd = bc.relative_abundance(biomtbl)
    rel_abd = bc.arcsine_sqrt_transform(rel_abd)

    # plot samples based on relative abundance of some OTU ID
    for otuid in otus:
        otuname = oc.otu_name(
            biomtbl.metadata(otuid, axis="observation")["taxonomy"])
        cat_data = {cat: {"pc1": [], "pc2": [], "size": []}
                    for cat in category_ids}

        for sid in unifrac["pcd"]:
            category = cat_data[imap[sid][category_idx]]
            try:
                size = rel_abd[sid][otuid] * args.scale_by
            except KeyError as ke:
                # Report the missing key and leave this sample off the plot.
                print("{} not found in {} sample.".format(ke, sid))
                continue
            category["pc1"].append(float(unifrac["pcd"][sid][0]))
            category["pc2"].append(float(unifrac["pcd"][sid][1]))
            category["size"].append(size)

        if args.verbose:
            print("Saving chart for {}".format(" ".join(otuname.split("_"))))
        xr, yr = calculate_xy_range(cat_data)
        plot_PCoA(cat_data, otuname, unifrac, color_map.keys(), color_map,
                  xr, yr, args.output_dir, args.save_as, args.ggplot2_style)
def main():
    """
    Script entry point: compute pairwise OTU correlations and write the
    pairs that survive FDR correction under both correlation measures.

    Relative abundances of the BIOM table are transformed with ast(), every
    pair of OTUs is handed to calc_corr_helper() in chunks through a
    multiprocessing pool, the Spearman and Kendall results are each passed
    through run_fdr(), and the Spearman rows whose (element 2, element 3)
    pair also appears in the Kendall results are written to args.out_fnh as
    tab-separated text.

    Exits with an error message when an input file cannot be read or when
    the correlation calculation fails.
    """
    args = program_options()

    try:
        biomf = biom.load_table(args.in_biomf)
    except IOError as ioe:
        sys.exit("Error with input BIOM format file: {}".format(ioe))
    rel_abd = relative_abundance(biomf)
    ast_rel_abd = ast(rel_abd)
    # Get pairwise combinations of OTUs
    otu_combos = list(combinations(biomf.ids("observation"), 2))

    try:
        mheader, mdata = parse_map_file(args.map_fnh)
    except IOError as ioe:
        sys.exit("Error with input mapping file: {}".format(ioe))
    # Gather sampleID categories
    sid_cat = gather_categories(mdata, mheader, [args.category_column])

    # Create arguments for helper function to be supplied to multiprocessing
    # pool.map(): one tuple per chunk of OTU pairs.
    chunksize = 10000
    jobs = [(otu_combos[x:x + chunksize], sid_cat, ast_rel_abd)
            for x in range(0, len(otu_combos), chunksize)]
    print("{0} jobs created.".format(len(jobs)))

    # Start multiprocessing jobs. res.get() is called inside the try block
    # because exceptions raised in the workers only surface there.
    try:
        print("Starting map_async()...")
        pool = Pool()
        res = pool.map_async(calc_corr_helper, jobs)
        pool.close()
        pool.join()
        chunk_results = res.get()
    except Exception:
        sys.exit("Error while calculating correlations\n{}".format(
            format_exc()))

    # Split the results by correlation measure; the first element of every
    # result names the measure.
    s_rho_calc = []
    k_tau_calc = []
    for r in chunk_results:
        for s in r:
            if s[0] == "Spearman":
                s_rho_calc.append(s)
            else:
                k_tau_calc.append(s)

    # Get FDR corrected correlation results
    print("Running FDR correction on {} Spearman's Rho.".format(
        len(s_rho_calc)))
    fdr_corr_s_rho = run_fdr(s_rho_calc)
    print("Running FDR correction on {} Kendall Tau.".format(
        len(k_tau_calc)))
    fdr_corr_k_tau = run_fdr(k_tau_calc)

    # Consolidate correlation results: keep only the pairs reported by both
    # measures.
    k_kos = {(e[2], e[3]) for e in fdr_corr_k_tau}
    s_kos = {(f[2], f[3]) for f in fdr_corr_s_rho}
    final_kos = s_kos & k_kos
    print(
        "{0} elements from KendallTau\n{1} elements from SpearmanRho\n{2} "
        "elements are common to both.".format(len(k_kos), len(s_kos),
                                              len(final_kos)))
    # The leading measure name is dropped from every row that is kept.
    final_fdr_corr_results = [
        cdata[1:] for cdata in fdr_corr_s_rho
        if (cdata[2], cdata[3]) in final_kos
    ]

    # Write our results to file
    with open(args.out_fnh, "w") as outf:
        outf.write("Category\tVariable\tby Variable\tCorrelation\tp value\n")
        for k in final_fdr_corr_results:
            outf.write("{0}\t{1}\t{2}\t{3}\t{4}\n".format(
                k[0], k[1], k[2], k[3], k[4]))