def get_relative_abundance(biomfile):
    """
    Return arcsine transformed relative abundance from a BIOM format file.

    :type biomfile: BIOM format file
    :param biomfile: BIOM format file used to obtain relative abundances for
                     each OTU in a SampleID, which are used as node sizes in
                     network plots.

    :type return: Dictionary of dictionaries.
    :return: Dictionary keyed on SampleID whose value is a dictionary keyed
             on OTU Name whose value is the arcsine transformed relative
             abundance value for that SampleID-OTU Name pair.
    """
    biomf = biom.load_table(biomfile)
    norm_biomf = biomf.norm(inplace=False)

    # An OTU's display name depends only on its taxonomy, never on the
    # sample, so resolve each name once instead of once per sample.
    otu_names = [
        (
            otuid,
            " ".join(
                oc.otu_name(
                    norm_biomf.metadata(otuid, axis="observation")["taxonomy"]
                ).split("_")
            ),
        )
        for otuid in norm_biomf.ids("observation")
    ]

    rel_abd = {}
    for sid in norm_biomf.ids():
        # OTUs that resolve to the same name overwrite one another; the
        # last one in table order wins.
        rel_abd[sid] = {
            otuname: norm_biomf.get_value_by_ids(otuid, sid)
            for otuid, otuname in otu_names
        }
    return bc.arcsine_sqrt_transform(rel_abd)
def calc_rel_abd(biomf, sampleIDs=None):
    """
    Calculate arcsine transformed relative abundance from a biom table,
    optionally restricted to a subset of samples.

    :type biomf: Biom file format
    :param biomf: Biom table loaded object

    :type sampleIDs: list
    :param sampleIDs: Only calculate relative abundances for these sampleIDs.
                      Default is None, which uses every sample in the table.

    :return type: defaultdict(dict)
    :return: The result of bc.arcsine_sqrt_transform() applied to a dict
             keyed on sampleID whose value is a dict keyed on OTU name
             holding the relative abundance for that [sampleID, OTU] pair.
    """
    norm_biomf = biomf.norm(inplace=False)
    if sampleIDs is None:
        sampleIDs = norm_biomf.ids()

    # The OTU name depends only on the OTU's taxonomy, so look it up once
    # per OTU rather than once per (sample, OTU) pair.
    otu_names = [
        (otu, oc.otu_name(norm_biomf.metadata(otu, "observation")["taxonomy"]))
        for otu in norm_biomf.ids(axis="observation")
    ]

    rel_abd = defaultdict(dict)
    for sample in sampleIDs:
        for otu, otu_name in otu_names:
            # OTUs sharing a name overwrite one another; the last one wins.
            rel_abd[sample][otu_name] = norm_biomf.get_value_by_ids(otu, sample)
    return bc.arcsine_sqrt_transform(rel_abd)
def get_relative_abundance(biomfile):
    """
    Return arcsine transformed relative abundance from a BIOM format file.

    :type biomfile: BIOM format file
    :param biomfile: BIOM format file used to obtain relative abundances for
                     each OTU in a SampleID, which are used as node sizes in
                     network plots.

    :type return: Dictionary of dictionaries.
    :return: Dictionary keyed on SampleID whose value is a dictionary keyed
             on OTU Name whose value is the arcsine transformed relative
             abundance value for that SampleID-OTU Name pair.
    """
    biomf = biom.load_table(biomfile)
    norm_biomf = biomf.norm(inplace=False)

    # An OTU's display name depends only on its taxonomy, never on the
    # sample, so resolve each name once instead of once per sample.
    otu_names = [
        (
            otuid,
            " ".join(
                oc.otu_name(
                    norm_biomf.metadata(otuid, axis="observation")["taxonomy"]
                ).split("_")
            ),
        )
        for otuid in norm_biomf.ids("observation")
    ]

    rel_abd = {}
    for sid in norm_biomf.ids():
        # OTUs that resolve to the same name overwrite one another; the
        # last one in table order wins.
        rel_abd[sid] = {
            otuname: norm_biomf.get_value_by_ids(otuid, sid)
            for otuid, otuname in otu_names
        }
    return bc.arcsine_sqrt_transform(rel_abd)
def test_arcsine_sqrt_transform(self):
    """
    Check bc.arcsine_sqrt_transform() against hand-calculated values for
    every SampleID-OTU pair.

    :return: Returns OK if testing goal is achieved, otherwise raises error.
    """
    self.result1 = bc.relative_abundance(self.biomf)
    self.result2 = bc.arcsine_sqrt_transform(self.result1)

    # Expected transformed values, keyed on SampleID and then on OTU ID.
    hand_calc = {
        "S1": {
            "GG_OTU_1": 0.453961252,
            "GG_OTU_2": 0.281034902,
            "GG_OTU_3": 0.453961252,
            "GG_OTU_4": 0.629014802,
            "GG_OTU_5": 0.453961252,
        },
        "S10": {
            "GG_OTU_1": 0.292842772,
            "GG_OTU_2": 0.361367124,
            "GG_OTU_3": 0.420534335,
            "GG_OTU_4": 0.615479709,
            "GG_OTU_5": 0.570510448,
        },
        "S2": {
            "GG_OTU_1": 0.413273808,
            "GG_OTU_2": 0.532861869,
            "GG_OTU_3": 0.532861869,
            "GG_OTU_4": 0.532861869,
            "GG_OTU_5": 0.256813917,
        },
        "S3": {
            "GG_OTU_1": 0.339836909,
            "GG_OTU_2": 0.490882678,
            "GG_OTU_3": 0,
            "GG_OTU_4": 0.555121168,
            "GG_OTU_5": 0.673351617,
        },
        "S4": {
            "GG_OTU_1": 0.440510663,
            "GG_OTU_2": 0,
            "GG_OTU_3": 0.830915552,
            "GG_OTU_4": 0.549467245,
            "GG_OTU_5": 0,
        },
        "S5": {
            "GG_OTU_1": 0.299334026,
            "GG_OTU_2": 0.53606149,
            "GG_OTU_3": 0.584373897,
            "GG_OTU_4": 0.485049787,
            "GG_OTU_5": 0.36950894,
        },
        "S6": {
            "GG_OTU_1": 0.615479709,
            "GG_OTU_2": 0.395099667,
            "GG_OTU_3": 0.575591472,
            "GG_OTU_4": 0.444859969,
            "GG_OTU_5": 0.1936583,
        },
        "S7": {
            "GG_OTU_1": 0.270549763,
            "GG_OTU_2": 0.436286927,
            "GG_OTU_3": 0.387596687,
            "GG_OTU_4": 0.563942641,
            "GG_OTU_5": 0.602794553,
        },
        "S8": {
            "GG_OTU_1": 0.501093013,
            "GG_OTU_2": 0.453961252,
            "GG_OTU_3": 0.588002604,
            "GG_OTU_4": 0.346579954,
            "GG_OTU_5": 0.403057074,
        },
        "S9": {
            "GG_OTU_1": 0,
            "GG_OTU_2": 0.339836909,
            "GG_OTU_3": 0.729727656,
            "GG_OTU_4": 0,
            "GG_OTU_5": 0.729727656,
        },
    }

    # Walk samples and OTUs in sorted order so the first failure reported
    # is the same on every run.
    for sid in sorted(hand_calc):
        expected_by_otu = hand_calc[sid]
        for otuid, expected in sorted(expected_by_otu.items()):
            self.assertAlmostEqual(
                expected,
                self.result2[sid][otuid],
                msg="Arcsine squareroot transformation was not accurate."
            )
def main():
    """
    Compute the relative abundances of a BIOM table, optionally apply the
    arcsine square-root transform, and pass the result to
    write_relative_abundance() together with the output TSV path.
    """
    args = handle_program_options()
    biom_fp = args.input_biom_fp

    # Probe the input path first so an unreadable file produces a short
    # message instead of a traceback from the BIOM loader.
    try:
        with open(biom_fp):
            pass
    except IOError as ioe:
        sys.exit('\nError in BIOM file path: {}\n'.format(ioe))

    table = biom.load_table(biom_fp)
    abundances = bc.relative_abundance(table)
    if args.stabilize_variance:
        abundances = bc.arcsine_sqrt_transform(abundances)

    write_relative_abundance(abundances, table, args.output_tsv_fp)
def get_relative_abundance(biomfile):
    """
    Return arcsine transformed relative abundance from an OTU table.

    OTUIDs are converted to their genus-species identifier.

    :type biomfile: BIOM format file
    :param biomfile: Path of the BIOM format OTU table to load.

    :type return: Dictionary of dictionaries.
    :return: Result of bc.arcsine_sqrt_transform() applied to a dictionary
             keyed on SampleID whose value is a dictionary keyed on OTU name
             holding the relative abundance for that SampleID-OTU pair.
    """
    biomf = biom.load_table(biomfile)
    norm_biomf = biomf.norm(inplace=False)

    # The OTU name depends only on the OTU's taxonomy, so look it up once
    # per OTU rather than once per (sample, OTU) pair.
    otu_names = [
        (
            otuid,
            oc.otu_name(
                norm_biomf.metadata(otuid, axis="observation")["taxonomy"]
            ),
        )
        for otuid in norm_biomf.ids("observation")
    ]

    rel_abd = {}
    for sid in norm_biomf.ids():
        # OTUs that resolve to the same name overwrite one another; the
        # last one in table order wins.
        rel_abd[sid] = {
            otuname: norm_biomf.get_value_by_ids(otuid, sid)
            for otuid, otuname in otu_names
        }
    return bc.arcsine_sqrt_transform(rel_abd)
def test_arcsine_sqrt_transform(self):
    """
    Testing arcsine_sqrt_transform() function of biom_calc.py.

    :return: Returns OK if testing goal is achieved, otherwise raises error.
    """
    self.result1 = bc.relative_abundance(self.biom)
    self.result2 = bc.arcsine_sqrt_transform(self.result1)

    # Obtaining results to compare.
    hand_calc = [1.00685369, 0.563942641]
    # dict.values() is a non-indexable view on Python 3, so materialize it
    # before picking the fourth entry (a no-op copy on Python 2).
    # NOTE(review): which sample this selects, and the order of its values,
    # depends on dict iteration order -- confirm it lines up with hand_calc.
    func_calc = list(self.result2.values())[3].values()

    # Testing validity of the transforms.
    for hand, func in zip(hand_calc, func_calc):
        self.assertAlmostEqual(
            hand, func, places=7,
            msg='Function did not calculate trnasformation accurately.'
        )
def test_arcsine_sqrt_transform(self):
    """
    Testing arcsine_sqrt_transform() function of biom_calc.py.

    :return: Returns OK if testing goal is achieved, otherwise raises error.
    """
    self.result1 = bc.relative_abundance(self.biomf)
    self.result2 = bc.arcsine_sqrt_transform(self.result1)

    # Obtaining results to compare.
    hand_calc = [0, 0.32175055439, 0.463647609, 0.57963974036, 0.684719203]
    # dict.values() is a non-indexable view on Python 3, so materialize it
    # before picking the fourth entry (a no-op copy on Python 2).
    # NOTE(review): which sample this selects, and the order of its values,
    # depends on dict iteration order -- confirm it lines up with hand_calc.
    func_calc = list(self.result2.values())[3].values()

    # Testing validity of the transforms.
    for hand, func in zip(hand_calc, func_calc):
        self.assertAlmostEqual(
            hand, func, places=7,
            msg="Function did not calculate transformation accurately."
        )
def main():
    """
    Run LDA through run_LDA() and plot the result with plot_LDA().

    The LDA input is the distance matrix in args.dist_matrix_file when that
    option is given; otherwise it is the arcsine transformed relative
    abundances computed from the BIOM table in args.otu_table. In both
    cases a "Condition" column holding each sample's args.group_by category
    from the mapping file is inserted as the first column.

    Exits through sys.exit() with a message when the mapping file, distance
    matrix or BIOM table cannot be read, or when args.group_by is not a
    column of the mapping file.
    """
    args = handle_program_options()

    # Parse and read mapping file
    try:
        header, imap = util.parse_map_file(args.map_fp)
    except IOError as ioe:
        err_msg = "\nError in metadata mapping filepath (-m): {}\n"
        sys.exit(err_msg.format(ioe))
    try:
        category_idx = header.index(args.group_by)
    except ValueError:
        msg = "Error: Specified mapping category '{}' not found."
        sys.exit(msg.format(args.group_by))

    # Obtain group colors. A plain conditional is used rather than
    # assert/except AssertionError, because asserts are stripped when
    # Python runs with -O.
    if args.colors is None:
        categories = {v[category_idx] for v in imap.values()}
        color_cycle = cycle(Set3_12.hex_colors)
        # next(it) works on Python 2.6+ and 3; it.next() is Python 2 only.
        class_colors = {c: next(color_cycle) for c in categories}
    else:
        class_colors = util.color_mapping(imap, header, args.group_by,
                                          args.colors)

    if args.dist_matrix_file:
        try:
            dm_data = pd.read_csv(args.dist_matrix_file, sep="\t",
                                  index_col=0)
        except IOError as ioe:
            err_msg = "\nError with unifrac distance matrix file (-d): {}\n"
            sys.exit(err_msg.format(ioe))
        # The index read from the file is converted to str before being
        # used as a mapping-file key.
        dm_data.insert(0, "Condition",
                       [imap[str(sid)][category_idx] for sid in dm_data.index])
        if args.annotate_points:
            sampleids = [str(sid) for sid in dm_data.index]
        else:
            sampleids = None
        if args.save_lda_input:
            dm_data.to_csv(args.save_lda_input, sep="\t")
        # Run LDA
        X_lda, y_lda, exp_var = run_LDA(dm_data)
    else:
        # Load biom file and calculate relative abundance
        try:
            biomf = biom.load_table(args.otu_table)
        except IOError as ioe:
            err_msg = "\nError with biom format file (-d): {}\n"
            sys.exit(err_msg.format(ioe))
        # Get normalized relative abundances
        rel_abd = bc.relative_abundance(biomf)
        rel_abd = bc.arcsine_sqrt_transform(rel_abd)
        # Transposed so that each row is one sample.
        df_rel_abd = pd.DataFrame(rel_abd).T
        df_rel_abd.insert(0, "Condition",
                          [imap[sid][category_idx] for sid in df_rel_abd.index])
        if args.annotate_points:
            sampleids = df_rel_abd.index
        else:
            sampleids = None
        if args.save_lda_input:
            df_rel_abd.to_csv(args.save_lda_input, sep="\t")
        # Run LDA
        X_lda, y_lda, exp_var = run_LDA(df_rel_abd)

    # Plot LDA. The 2D and 3D calls share every option except the two
    # that only apply in 3D.
    plot_kwargs = dict(
        style=args.ggplot2_style,
        fig_size=args.figsize,
        label_pad=args.label_padding,
        font_size=args.font_size,
        sids=sampleids,
        pt_size=args.point_size,
        out_fp=args.out_fp,
    )
    if args.dimensions == 3:
        plot_kwargs.update(dim=3, zangles=args.z_angles)
    plot_LDA(X_lda, y_lda, class_colors, exp_var, **plot_kwargs)
def main():
    """
    Plot one PCoA bubble chart per OTU listed in args.otu_ids_fp.

    For every OTU ID, each sample found in the parsed principal coordinates
    is placed at its first two coordinates, grouped by its args.group_by
    category from the mapping file, and sized by its arcsine transformed
    relative abundance for that OTU multiplied by args.scale_by. Samples
    with no abundance entry for the OTU are reported and skipped. Drawing
    is delegated to plot_PCoA().

    Exits through sys.exit() with a message when the BIOM table, principal
    coordinates file or mapping file cannot be opened, or when
    args.group_by is not a column of the mapping file.
    """
    args = handle_program_options()

    # Fail early, with a readable message, when a required input cannot be
    # opened.
    required_inputs = (
        (args.otu_table, "\nError with BIOM format file:{}\n"),
        (args.pcoa_fp, "\nError with principal coordinates file:{}\n"),
        (args.mapping, "\nError with mapping file:{}\n"),
    )
    for input_fp, err_msg in required_inputs:
        try:
            with open(input_fp):
                pass
        except IOError as ioe:
            sys.exit(err_msg.format(ioe))

    # check that the output dir exists, create it if not
    util.ensure_dir(args.output_dir)

    # load the BIOM table
    biomtbl = biom.load_table(args.otu_table)

    # Read unifrac principal coordinates file
    unifrac = util.parse_unifrac(args.pcoa_fp)

    # Read otu data file. The "U" open mode is rejected by Python 3.11+;
    # str.splitlines() handles \n, \r\n and \r on both Python 2 and 3.
    with open(args.otu_ids_fp) as otu_fh:
        otus = {line.strip() for line in otu_fh.read().splitlines()}
    # A blank line would otherwise be looked up as an (empty) OTU ID.
    otus.discard("")

    # Gather categories from mapping file
    header, imap = util.parse_map_file(args.mapping)
    try:
        category_idx = header.index(args.group_by)
    except ValueError:
        msg = "Error: Specified mapping category '{}' not found."
        sys.exit(msg.format(args.group_by))
    category_ids = util.gather_categories(imap, header, [args.group_by])
    color_map = util.color_mapping(imap, header, args.group_by, args.colors)
    rel_abd = bc.relative_abundance(biomtbl)
    rel_abd = bc.arcsine_sqrt_transform(rel_abd)

    # plot samples based on relative abundance of some OTU ID
    for otuid in otus:
        otuname = oc.otu_name(
            biomtbl.metadata(otuid, axis="observation")["taxonomy"])
        cat_data = {
            cat: {"pc1": [], "pc2": [], "size": []}
            for cat in category_ids
        }

        for sid in unifrac["pcd"]:
            category = cat_data[imap[sid][category_idx]]
            try:
                size = rel_abd[sid][otuid] * args.scale_by
            except KeyError as ke:
                print("{} not found in {} sample.".format(ke, sid))
                continue
            category["pc1"].append(float(unifrac["pcd"][sid][0]))
            category["pc2"].append(float(unifrac["pcd"][sid][1]))
            category["size"].append(size)

        if args.verbose:
            print("Saving chart for {}".format(" ".join(otuname.split("_"))))
        xr, yr = calculate_xy_range(cat_data)
        plot_PCoA(cat_data, otuname, unifrac, color_map.keys(), color_map,
                  xr, yr, args.output_dir, args.save_as, args.ggplot2_style)
def main():
    """
    Plot one PCoA bubble chart per OTU listed in args.otu_ids_fp.

    For every OTU ID, each sample found in the parsed principal coordinates
    is placed at its first two coordinates, grouped by its args.group_by
    category from the mapping file, and sized by its arcsine transformed
    relative abundance for that OTU multiplied by args.scale_by. Samples
    with no abundance entry for the OTU are reported and skipped. Drawing
    is delegated to plot_PCoA().

    Exits through sys.exit() with a message when the BIOM table, principal
    coordinates file or mapping file cannot be opened, or when
    args.group_by is not a column of the mapping file.
    """
    args = handle_program_options()

    # Fail early, with a readable message, when a required input cannot be
    # opened.
    required_inputs = (
        (args.otu_table, "\nError with BIOM format file:{}\n"),
        (args.pcoa_fp, "\nError with principal coordinates file:{}\n"),
        (args.mapping, "\nError with mapping file:{}\n"),
    )
    for input_fp, err_msg in required_inputs:
        try:
            with open(input_fp):
                pass
        except IOError as ioe:
            sys.exit(err_msg.format(ioe))

    # check that the output dir exists, create it if not
    util.ensure_dir(args.output_dir)

    # load the BIOM table
    biomtbl = biom.load_table(args.otu_table)

    # Read unifrac principal coordinates file
    unifrac = util.parse_unifrac(args.pcoa_fp)

    # Read otu data file. The "U" open mode is rejected by Python 3.11+;
    # str.splitlines() handles \n, \r\n and \r on both Python 2 and 3.
    with open(args.otu_ids_fp) as otu_fh:
        otus = {line.strip() for line in otu_fh.read().splitlines()}
    # A blank line would otherwise be looked up as an (empty) OTU ID.
    otus.discard("")

    # Gather categories from mapping file
    header, imap = util.parse_map_file(args.mapping)
    try:
        category_idx = header.index(args.group_by)
    except ValueError:
        msg = "Error: Specified mapping category '{}' not found."
        sys.exit(msg.format(args.group_by))
    category_ids = util.gather_categories(imap, header, [args.group_by])
    color_map = util.color_mapping(imap, header, args.group_by, args.colors)
    rel_abd = bc.relative_abundance(biomtbl)
    rel_abd = bc.arcsine_sqrt_transform(rel_abd)

    # plot samples based on relative abundance of some OTU ID
    for otuid in otus:
        otuname = oc.otu_name(
            biomtbl.metadata(otuid, axis="observation")["taxonomy"])
        cat_data = {
            cat: {"pc1": [], "pc2": [], "size": []}
            for cat in category_ids
        }

        for sid in unifrac["pcd"]:
            category = cat_data[imap[sid][category_idx]]
            try:
                size = rel_abd[sid][otuid] * args.scale_by
            except KeyError as ke:
                print("{} not found in {} sample.".format(ke, sid))
                continue
            category["pc1"].append(float(unifrac["pcd"][sid][0]))
            category["pc2"].append(float(unifrac["pcd"][sid][1]))
            category["size"].append(size)

        if args.verbose:
            print("Saving chart for {}".format(" ".join(otuname.split("_"))))
        xr, yr = calculate_xy_range(cat_data)
        plot_PCoA(cat_data, otuname, unifrac, color_map.keys(), color_map,
                  xr, yr, args.output_dir, args.save_as, args.ggplot2_style)
def _ld_axis_label(axis_name, exp_var, idx):
    """Return an LD axis label, with explained variance when available."""
    try:
        return "{} (Percent Explained Variance: {:.3f}%)".format(
            axis_name, exp_var[idx]*100)
    except (LookupError, TypeError, ValueError):
        # No usable explained-variance entry for this axis (for example
        # fewer discriminants than axes); fall back to the bare name.
        return axis_name


def main():
    """
    Save one LDA bubble plot per OTU listed in args.otu_ids_fp.

    LDA is run through run_LDA() on the distance matrix in
    args.dist_matrix_file when that option is given, otherwise on the
    arcsine transformed relative abundances of the BIOM table in
    args.otu_table. For every OTU, each sample is drawn at its first two
    LDA coordinates, coloured by its args.group_by category and sized by
    its transformed relative abundance for that OTU multiplied by
    args.scale_by. Samples with no abundance entry for the OTU are
    reported and skipped. Figures are written to args.output_dir.

    Exits through sys.exit() with a message when the mapping file, OTU IDs
    file, BIOM table or distance matrix cannot be read, or when
    args.group_by is not a column of the mapping file.
    """
    args = handle_program_options()

    # Parse and read mapping file
    try:
        header, imap = util.parse_map_file(args.map_fp)
    except IOError as ioe:
        err_msg = "\nError in metadata mapping filepath (-m): {}\n"
        sys.exit(err_msg.format(ioe))
    try:
        category_idx = header.index(args.group_by)
    except ValueError:
        msg = "Error: Specified mapping category '{}' not found."
        sys.exit(msg.format(args.group_by))

    # Obtain group colors
    class_colors = util.color_mapping(imap, header, args.group_by,
                                      args.color_by)

    # Get otus for LDA bubble plots
    try:
        bubble_otus = set(pd.read_csv(args.otu_ids_fp, sep="\n",
                                      header=None)[0])
    except IOError as ioe:
        err_msg = "\nError in OTU IDs file (--bubble): {}\n"
        sys.exit(err_msg.format(ioe))

    # Load biom file and calculate relative abundance
    try:
        biomf = biom.load_table(args.otu_table)
    except IOError as ioe:
        err_msg = "\nError with biom format file (-d): {}\n"
        sys.exit(err_msg.format(ioe))

    # Get normalized relative abundances
    rel_abd = bc.relative_abundance(biomf)
    rel_abd = bc.arcsine_sqrt_transform(rel_abd)

    # Legend bubble sizes: five evenly spaced values between the smallest
    # and largest non-zero abundance, scaled; the first one is dropped.
    abd_val = {abd for abds in rel_abd.values()
               for abd in abds.values() if abd > 0}
    bubble_range = np.linspace(min(abd_val), max(abd_val), num=5) * args.scale_by
    # Get abundance to the nearest 50
    bubble_range = [int(50 * round(float(abd)/50)) for abd in bubble_range[1:]]

    # Set up input for LDA calc and get LDA transformed data
    if args.dist_matrix_file:
        try:
            uf_data = pd.read_csv(args.dist_matrix_file, sep="\t",
                                  index_col=0)
        except IOError as ioe:
            err_msg = "\nError with unifrac distance matrix file (-d): {}\n"
            sys.exit(err_msg.format(ioe))
        uf_data.insert(0, "Condition",
                       [imap[sid][category_idx] for sid in uf_data.index])
        sampleids = uf_data.index
        if args.save_lda_input:
            uf_data.to_csv(args.save_lda_input, sep="\t")
        # Run LDA
        X_lda, y_lda, exp_var = run_LDA(uf_data)
    else:
        # Transposed so that each row is one sample.
        df_rel_abd = pd.DataFrame(rel_abd).T
        df_rel_abd.insert(0, "Condition",
                          [imap[sid][category_idx] for sid in df_rel_abd.index])
        sampleids = df_rel_abd.index
        if args.save_lda_input:
            df_rel_abd.to_csv(args.save_lda_input, sep="\t")
        # Run LDA
        X_lda, y_lda, exp_var = run_LDA(df_rel_abd)

    # Calculate position and size of SampleIDs to plot for each OTU
    for otuid in bubble_otus:
        otuname = oc.otu_name(
            biomf.metadata(otuid, axis="observation")["taxonomy"])
        plot_data = {cat: {"x": [], "y": [], "size": [], "label": []}
                     for cat in class_colors.keys()}
        for sid, data in zip(sampleids, X_lda):
            category = plot_data[imap[sid][category_idx]]
            try:
                size = rel_abd[sid][otuid] * args.scale_by
            except KeyError as ke:
                print("{} not found in {} sample.".format(ke, sid))
                continue
            category["x"].append(float(data[0]))
            category["y"].append(float(data[1]))
            category["size"].append(size)

        # Plot LDA bubble for each OTU
        fig = plt.figure(figsize=args.figsize)
        ax = fig.add_subplot(111)
        for cat in plot_data:
            plt.scatter(plot_data[cat]["x"], plot_data[cat]["y"],
                        s=plot_data[cat]["size"], label=cat,
                        color=class_colors[cat],
                        alpha=0.85, edgecolors="k")
        if X_lda.shape[1] == 1:
            plt.ylim((0.5, 2.5))
        plt.title(" ".join(otuname.split("_")), style="italic", fontsize=13)
        plt.xlabel(_ld_axis_label("LD1", exp_var, 0), fontsize=13, labelpad=15)
        plt.ylabel(_ld_axis_label("LD2", exp_var, 1), fontsize=13, labelpad=15)

        lgnd1 = plt.legend(loc="best", scatterpoints=3, fontsize=13)
        for i in range(len(class_colors)):
            lgnd1.legendHandles[i]._sizes = [80]  # Change the legend marker size manually
        # Add the legend manually to the current plot, because the second
        # plt.legend() call below would otherwise replace it.
        plt.gca().add_artist(lgnd1)

        # Empty scatters act as handles for the bubble-size legend.
        c = [plt.scatter([], [], c="w", edgecolors="k", s=s1)
             for s1 in bubble_range]
        plt.legend(c, ["{}".format(s2) for s2 in bubble_range],
                   title="Scaled Bubble\n       Sizes", frameon=True,
                   labelspacing=2, fontsize=13, loc=4, scatterpoints=1,
                   borderpad=1.1)

        # Set style for LDA bubble plots
        if args.ggplot2_style:
            gu.ggplot2_style(ax)
            fc = "0.8"
        else:
            fc = "none"

        # Save LDA bubble plots to output directory
        if args.verbose:
            print("Saving chart for {}".format(" ".join(otuname.split("_"))))
        fig.savefig(pj(args.output_dir, "_".join(otuname.split())) + "." + args.save_as,
                    facecolor=fc, edgecolor="none", dpi=300,
                    bbox_inches="tight", pad_inches=0.2)
        plt.close(fig)
def main():
    """
    Run LDA through run_LDA() and plot the result with plot_LDA().

    The LDA input is the distance matrix in args.dist_matrix_file when that
    option is given; otherwise it is the arcsine transformed relative
    abundances computed from the BIOM table in args.otu_table. In both
    cases a "Condition" column holding each sample's args.group_by category
    from the mapping file is inserted as the first column.

    Exits through sys.exit() with a message when the mapping file, distance
    matrix or BIOM table cannot be read, or when args.group_by is not a
    column of the mapping file.
    """
    args = handle_program_options()

    # Parse and read mapping file
    try:
        header, imap = util.parse_map_file(args.map_fp)
    except IOError as ioe:
        err_msg = "\nError in metadata mapping filepath (-m): {}\n"
        sys.exit(err_msg.format(ioe))
    try:
        category_idx = header.index(args.group_by)
    except ValueError:
        msg = "Error: Specified mapping category '{}' not found."
        sys.exit(msg.format(args.group_by))

    # Obtain group colors. A plain conditional is used rather than
    # assert/except AssertionError, because asserts are stripped when
    # Python runs with -O.
    if args.colors is None:
        categories = {v[category_idx] for v in imap.values()}
        color_cycle = cycle(Set3_12.hex_colors)
        # next(it) works on Python 2.6+ and 3; it.next() is Python 2 only.
        class_colors = {c: next(color_cycle) for c in categories}
    else:
        class_colors = util.color_mapping(imap, header, args.group_by,
                                          args.colors)

    if args.dist_matrix_file:
        try:
            dm_data = pd.read_csv(args.dist_matrix_file, sep="\t",
                                  index_col=0)
        except IOError as ioe:
            err_msg = "\nError with unifrac distance matrix file (-d): {}\n"
            sys.exit(err_msg.format(ioe))
        # The index read from the file is converted to str before being
        # used as a mapping-file key.
        dm_data.insert(0, "Condition",
                       [imap[str(sid)][category_idx] for sid in dm_data.index])
        if args.annotate_points:
            sampleids = [str(sid) for sid in dm_data.index]
        else:
            sampleids = None
        if args.save_lda_input:
            dm_data.to_csv(args.save_lda_input, sep="\t")
        # Run LDA
        X_lda, y_lda, exp_var = run_LDA(dm_data)
    else:
        # Load biom file and calculate relative abundance
        try:
            biomf = biom.load_table(args.otu_table)
        except IOError as ioe:
            err_msg = "\nError with biom format file (-d): {}\n"
            sys.exit(err_msg.format(ioe))
        # Get normalized relative abundances
        rel_abd = bc.relative_abundance(biomf)
        rel_abd = bc.arcsine_sqrt_transform(rel_abd)
        # Transposed so that each row is one sample.
        df_rel_abd = pd.DataFrame(rel_abd).T
        df_rel_abd.insert(0, "Condition",
                          [imap[sid][category_idx] for sid in df_rel_abd.index])
        if args.annotate_points:
            sampleids = df_rel_abd.index
        else:
            sampleids = None
        if args.save_lda_input:
            df_rel_abd.to_csv(args.save_lda_input, sep="\t")
        # Run LDA
        X_lda, y_lda, exp_var = run_LDA(df_rel_abd)

    # Plot LDA. The 2D and 3D calls share every option except the two
    # that only apply in 3D.
    plot_kwargs = dict(
        style=args.ggplot2_style,
        fig_size=args.figsize,
        label_pad=args.label_padding,
        font_size=args.font_size,
        sids=sampleids,
        pt_size=args.point_size,
        out_fp=args.out_fp,
    )
    if args.dimensions == 3:
        plot_kwargs.update(dim=3, zangles=args.z_angles)
    plot_LDA(X_lda, y_lda, class_colors, exp_var, **plot_kwargs)