def main():
    args = handle_program_options()

    map_header, imap = util.parse_map_file(args.map_fp)
    df = pd.read_csv(args.biom_tsv, sep="\t", index_col=0).T
    # exclude Sample IDs not in the mapping file
    df = df.loc[list(imap.keys())]

    cat_gather = util.gather_categories(imap, map_header, args.group_by)
    if len(cat_gather) < 2:
        sys.stderr.write("ERROR: Only one category value found. Linear "
                         "Discriminant Analysis requires at least two "
                         "categories to compare.\n")
        return

    color_gather = util.gather_categories(imap, map_header, [args.color_by])

    class_map = merge_dicts(*[{sid: cat for sid in cat_gather[cat].sids}
                              for cat in cat_gather])
    class_colors = merge_dicts(*[{class_map[sid]: color
                                  for sid in color_gather[color].sids}
                                 for color in color_gather])

    df.insert(0, "Condition", [class_map[entry] for entry in df.index])
    if args.save_lda_input:
        df.to_csv(args.save_lda_input)

    X_lda, y_lda = run_LDA(df)
    plot_LDA(X_lda, y_lda, class_colors, out_fp=args.out_fp, dpi=args.dpi,
             title=args.plot_title)
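# merge_dicts() is used above but not defined in this file. A minimal sketch
# of the assumed behavior (later dicts win on key collisions); the project's
# actual helper may differ.
def merge_dicts(*dicts):
    """Merge an arbitrary number of dicts into a single dict."""
    merged = {}
    for d in dicts:
        merged.update(d)
    return merged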
def main():
    args = prog_options()

    try:
        biomf = biom.load_table(args.in_biomf)
    except IOError as ioe:
        sys.exit("Error with input BIOM format file: {}".format(ioe))
    else:
        # convert to presence/absence BIOM table
        biomf_pa = biomf.pa(inplace=False)
        obs_ids = biomf_pa.ids("observation")

    try:
        mheader, mdata = parse_map_file(args.map_fnh)
    except IOError as ioe:
        sys.exit("Error with input mapping file: {}".format(ioe))
    else:
        if args.group_by:
            sid_cat = gather_categories(mdata, mheader, [args.group_by])
        else:
            sid_cat = gather_categories(mdata, mheader)

    # Calculate the core for each category: an OTU is "core" if it is present
    # in more than core_pct of that category's samples.
    core_calc = {k: set() for k in sid_cat.keys()}
    for idx in obs_ids:
        for cat, val in sid_cat.items():
            obs_count = 0
            num_of_samples = len(val.sids)
            for sid in val.sids:
                if biomf_pa.get_value_by_ids(idx, sid) == 1:
                    obs_count += 1
            if obs_count > round(args.core_pct * num_of_samples):
                core_calc[cat].add(idx)

    # Check if output directory exists; if not, create it
    if not os.path.exists(os.path.abspath(args.out_fnh)):
        os.makedirs(os.path.abspath(args.out_fnh))

    for k, v in core_calc.items():
        print("{0} core IDs in {1}".format(len(v), k))
        # NB: the "80_pct" tag in these filenames reflects the default
        # core_pct; the actual threshold applied is args.core_pct.
        idx_filename = os.path.join(os.path.abspath(args.out_fnh),
                                    k + "_80_pct_core_ids.txt")
        with open(idx_filename, "w") as of:
            of.write("{0}".format("\n".join(sorted(v))))
        filtered_biomf = biomf.filter(v, axis="observation", inplace=False)
        if args.biom_out:
            biom_filename = os.path.join(os.path.abspath(args.out_fnh),
                                         k + "_80_pct_core.biom")
            with biom_open(biom_filename, "w") as f:
                filtered_biomf.to_hdf5(f, "CORE BIOM")
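# Toy illustration of the core threshold above (hypothetical numbers): with
# core_pct = 0.8 and a category of 10 samples, an OTU must be present in more
# than round(0.8 * 10) = 8 samples, i.e. at least 9, to count as "core".
#
#   presence_counts = {"OTU1": 9, "OTU2": 8}
#   core = {otu for otu, n in presence_counts.items() if n > round(0.8 * 10)}
#   # core == {"OTU1"}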
def color_mapping(sample_map, header, group_column, color_column=None):
    """
    Determine color-category mapping. If color_column is specified, map the
    category names to color values; otherwise, use the brewer colors to
    automatically generate a set of colors for the group values.
    """
    group_colors = {}
    group_gather = putil.gather_categories(sample_map, header, [group_column])

    if color_column is not None:
        color_gather = putil.gather_categories(sample_map, header,
                                               [color_column])
        # match sample IDs between color_gather and group_gather
        for group in group_gather:
            for color in color_gather:
                # allow incomplete assignment of colors: if the group sids
                # overlap at all with the color sids, consider it a match
                if group_gather[group].sids.intersection(color_gather[color].sids):
                    group_colors[group] = color
    else:
        bmap = qualitative.Paired[12]
        bcolors = itertools.cycle(bmap.hex_colors)
        for group in group_gather:
            group_colors[group] = next(bcolors)

    return group_colors
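# Hypothetical usage sketch (column name "Treatment" is made up for
# illustration; sample_map/header as returned by putil.parse_map_file):
#
#   header, sample_map = putil.parse_map_file("mapping.txt")
#   colors = color_mapping(sample_map, header, "Treatment")
#   # e.g. {"Control": "#A6CEE3", "Fast": "#1F78B4"} (palette order may vary)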
def main(): args = handle_program_options() # Read in biom file try: shared_biom = biom.load_table(args.input_biom_fp) except IOError as ie: sys.exit("\nError reading BIOM file: {}\n".format(ie)) norm_shared_biom = shared_biom.norm(axis="sample", inplace=False) # Read in mapping file try: header, imap = parse_map_file(args.map_fp) except IOError as ioe: sys.exit("\nError in metadata mapping filepath: {}\n".format(ioe)) # Samples for each group and get DO values per category try: assert args.group_by is None except AssertionError: data_gather = gather_categories(imap, header, args.group_by.split(",")) sample_list = [ sid for cat in data_gather.keys() for sid in data_gather[cat].sids ] else: sample_list = norm_shared_biom.ids() doc = calc_doc(norm_shared_biom, sample_list) try: assert doc is not None except AssertionError: sys.exit("Error in DOC calculations. Please check the modules.") # Get confidence interval sl_lowess_regr = get_doc_ci(doc, args.frac, args.plot_ci, sample_list, num_of_seqs=args.num_iterations) # Plot the residual figure plot_residplot(sl_lowess_regr, args.residplot, save=args.save_image) # Plot DOC plot_doc(sl_lowess_regr, args.residplot, ci=args.plot_ci, title=args.title, save=args.save_image)
def main():
    args = handle_program_options()

    try:
        # Load BIOM format file
        biomf = biom.load_table(args.input_biom_fp)
    except TypeError as te:
        sys.exit("The data in the path does not appear to be a BIOM format "
                 "table. Error: {}.".format(te))

    # Determine OTUIDs present in each sample
    sample_otus = oc.assign_otu_membership(biomf)

    try:
        # Parse mapping file
        header, imap = util.parse_map_file(args.mapping_file)
    except ValueError as ve:
        sys.exit("Error: {}.".format(ve))

    # Get relevant category information
    group_data = util.gather_categories(imap, header, [args.category_column])

    # Initialize results dict in group_data with {"otuids": set()} for each
    # category
    for group in group_data:
        group_data[group].results["otuids"] = set()

    # Collect all OTUIDs present in each category
    for sid in sample_otus:
        group = sample_group(sid, group_data)
        group_data[group].results["otuids"].update(sample_otus[sid])

    if args.reverse:
        # Get shared OTUIDs and write them out
        shared = shared_otuids(group_data)
        shared_df = pd.DataFrame.from_dict(shared, orient="index").T
        shared_df.to_csv(args.reverse, sep="\t", index=False)

    # Create input for unique_otuids
    group_otuids = {group: group_data[group].results["otuids"]
                    for group in group_data}
    # Write out unique OTUIDs to file
    write_uniques(args.output_dir, args.prefix, unique_otuids(group_otuids))
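# sample_group() is referenced above but not defined in this file. A minimal
# sketch of the assumed behavior (return the name of the category whose
# sample-ID set contains the given sample); the project's helper may differ.
def sample_group(sid, group_data):
    """Return the category name whose sids set contains sample sid."""
    for group in group_data:
        if sid in group_data[group].sids:
            return group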
def main():
    args = handle_program_options()

    try:
        # Load BIOM format file
        biomf = biom.load_table(args.input_biom_fp)
    except TypeError as te:
        sys.exit("The data in the path does not appear to be a BIOM format "
                 "table. Error: {}.".format(te))

    # Determine OTUIDs present in each sample
    sample_otus = assign_otu_membership(biomf)

    try:
        # Parse mapping file
        header, imap = util.parse_map_file(args.mapping_file)
    except ValueError as ve:
        sys.exit("Error: {}.".format(ve))

    # Get relevant category information
    group_data = util.gather_categories(imap, header, [args.category_column])

    # Initialize results dict in group_data with {"otuids": set()} for each
    # category
    for group in group_data:
        group_data[group].results["otuids"] = set()

    # Collect all OTUIDs present in each category
    for sid in sample_otus:
        group = sample_group(sid, group_data)
        group_data[group].results["otuids"].update(sample_otus[sid])

    if args.reverse:
        # Get shared OTUIDs and write them out
        shared = shared_otuids(group_data)
        shared_df = pd.DataFrame.from_dict(shared, orient="index").T
        shared_df.to_csv(args.reverse, sep="\t", index=False)
    else:
        # Create input for unique_otuids
        group_otuids = {group: group_data[group].results["otuids"]
                        for group in group_data}
        # Write out unique OTUIDs to file
        write_uniques(args.output_dir, args.prefix,
                      unique_otuids(group_otuids))
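# unique_otuids() and shared_otuids() are project helpers not shown in this
# file. A hedged sketch of unique_otuids consistent with its use above: for
# each group, keep only the OTUIDs that appear in no other group. The actual
# implementation may differ.
def unique_otuids(group_otuids):
    """Map each group to the OTUIDs found only in that group."""
    uniques = {}
    for group, otuids in group_otuids.items():
        others = set()
        for other, other_ids in group_otuids.items():
            if other != group:
                others.update(other_ids)
        uniques[group] = otuids - others
    return uniques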
def main():
    args = handle_program_options()

    try:
        with open(args.otu_table):
            pass
    except IOError as ioe:
        sys.exit("\nError with BIOM format file:{}\n".format(ioe))

    try:
        with open(args.pcoa_fp):
            pass
    except IOError as ioe:
        sys.exit("\nError with principal coordinates file:{}\n".format(ioe))

    try:
        with open(args.mapping):
            pass
    except IOError as ioe:
        sys.exit("\nError with mapping file:{}\n".format(ioe))

    if not os.path.exists(args.output_dir):
        try:
            os.mkdir(args.output_dir)
        except OSError as oe:
            if oe.errno == 2:  # ENOENT: parent directory missing
                msg = ("One or more directories in the path provided for "
                       "--output-dir ({}) do not exist. If you are specifying "
                       "a new directory for output, please ensure all other "
                       "directories in the path currently exist.")
                sys.exit(msg.format(args.output_dir))
            else:
                msg = ("An error occurred trying to create the output "
                       "directory ({}) with message: {}")
                sys.exit(msg.format(args.output_dir, oe.strerror))

    # load the BIOM table
    biomtbl = biom.load_table(args.otu_table)

    # Read UniFrac principal coordinates file
    unifrac = util.parse_unifrac(args.pcoa_fp)

    # Read OTU data file
    otus = set()
    with open(args.otu_ids_fp) as nciF:
        for line in nciF:
            otus.add(line.strip())

    # Gather categories from mapping file
    header, imap = util.parse_map_file(args.mapping)
    try:
        category_idx = header.index(args.group_by)
    except ValueError:
        msg = "Error: Specified mapping category '{}' not found."
        sys.exit(msg.format(args.group_by))
    category_ids = util.gather_categories(imap, header, [args.group_by])
    color_map = util.color_mapping(imap, header, args.group_by, args.colors)
    rel_abd = get_relative_abundance(biomtbl)

    # plot samples based on relative abundance of some OTU ID
    for otuid in otus:
        otuname = oc.otu_name(biomtbl.metadata(otuid,
                                               axis="observation")["taxonomy"])
        cat_data = {cat: {"pc1": [], "pc2": [], "size": []}
                    for cat in category_ids}

        for sid in unifrac["pcd"]:
            category = cat_data[imap[sid][category_idx]]
            try:
                size = rel_abd[sid][otuid] * args.scale_by
            except KeyError as ke:
                print("{} not found in {} sample.".format(ke, sid))
                continue
            category["pc1"].append(float(unifrac["pcd"][sid][0]))
            category["pc2"].append(float(unifrac["pcd"][sid][1]))
            category["size"].append(size)

        if args.verbose:
            print("Saving chart for {}".format(" ".join(otuname.split("_"))))
        xr, yr = calculate_xy_range(cat_data)
        plot_PCoA(cat_data, otuname, unifrac, color_map.keys(), color_map,
                  xr, yr, args.output_dir, args.save_as, args.ggplot2_style)
def main():
    args = handle_program_options()

    try:
        with open(args.otu_table):
            pass
    except IOError as ioe:
        sys.exit("\nError with BIOM format file:{}\n".format(ioe))

    try:
        with open(args.pcoa_fp):
            pass
    except IOError as ioe:
        sys.exit("\nError with principal coordinates file:{}\n".format(ioe))

    try:
        with open(args.mapping):
            pass
    except IOError as ioe:
        sys.exit("\nError with mapping file:{}\n".format(ioe))

    # check that the output dir exists, create it if not
    util.ensure_dir(args.output_dir)

    # load the BIOM table
    biomtbl = biom.load_table(args.otu_table)

    # Read UniFrac principal coordinates file
    unifrac = util.parse_unifrac(args.pcoa_fp)

    # Read OTU data file
    otus = set()
    with open(args.otu_ids_fp) as nciF:
        for line in nciF:
            otus.add(line.strip())

    # Gather categories from mapping file
    header, imap = util.parse_map_file(args.mapping)
    try:
        category_idx = header.index(args.group_by)
    except ValueError:
        msg = "Error: Specified mapping category '{}' not found."
        sys.exit(msg.format(args.group_by))
    category_ids = util.gather_categories(imap, header, [args.group_by])
    color_map = util.color_mapping(imap, header, args.group_by, args.colors)
    rel_abd = bc.relative_abundance(biomtbl)
    rel_abd = bc.arcsine_sqrt_transform(rel_abd)

    # plot samples based on relative abundance of some OTU ID
    for otuid in otus:
        otuname = oc.otu_name(biomtbl.metadata(otuid,
                                               axis="observation")["taxonomy"])
        cat_data = {cat: {"pc1": [], "pc2": [], "size": []}
                    for cat in category_ids}

        for sid in unifrac["pcd"]:
            category = cat_data[imap[sid][category_idx]]
            try:
                size = rel_abd[sid][otuid] * args.scale_by
            except KeyError as ke:
                print("{} not found in {} sample.".format(ke, sid))
                continue
            category["pc1"].append(float(unifrac["pcd"][sid][0]))
            category["pc2"].append(float(unifrac["pcd"][sid][1]))
            category["size"].append(size)

        if args.verbose:
            print("Saving chart for {}".format(" ".join(otuname.split("_"))))
        xr, yr = calculate_xy_range(cat_data)
        plot_PCoA(cat_data, otuname, unifrac, color_map.keys(), color_map,
                  xr, yr, args.output_dir, args.save_as, args.ggplot2_style)
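# util.ensure_dir() is assumed to create the output directory (including any
# missing parents) when it does not already exist, akin to this sketch; the
# project's actual helper may differ.
def ensure_dir(path):
    """Create path (and intermediate directories) if it does not exist."""
    if not os.path.exists(path):
        os.makedirs(path)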
def main():
    args = handle_program_options()

    try:
        with open(args.coord_fp):
            pass
    except IOError as ioe:
        err_msg = "\nError in input principal coordinates filepath (-i): {}\n"
        sys.exit(err_msg.format(ioe))

    try:
        with open(args.map_fp):
            pass
    except IOError as ioe:
        err_msg = "\nError in input metadata mapping filepath (-m): {}\n"
        sys.exit(err_msg.format(ioe))

    with open(args.coord_fp) as F:
        pcd = F.readlines()
    pcd = [line.split("\t") for line in pcd]

    map_header, imap = util.parse_map_file(args.map_fp)
    data_gather = util.gather_categories(imap, map_header,
                                         args.group_by.split(","))
    categories = OrderedDict([(condition, {"pc1": [], "pc2": [], "pc3": []})
                              for condition in data_gather.keys()])

    bcolors = itertools.cycle(Set3_12.hex_colors)
    if not args.colors:
        colors = [next(bcolors) for _ in categories]
    else:
        colors = util.color_mapping(imap, map_header, args.group_by,
                                    args.colors)
        colors = list(colors.values())

    parsed_unifrac = util.parse_unifrac(args.coord_fp)

    pco = args.pc_order
    if args.dimensions == 3:
        pco.append(3)
    pc1v = parsed_unifrac["varexp"][pco[0] - 1]
    pc2v = parsed_unifrac["varexp"][pco[1] - 1]
    if args.dimensions == 3:
        pc3v = parsed_unifrac["varexp"][pco[2] - 1]

    for sid, points in parsed_unifrac["pcd"].items():
        for condition, dc in data_gather.items():
            if sid in dc.sids:
                cat = condition
                break
        categories[cat]["pc1"].append((sid, points[pco[0] - 1]))
        categories[cat]["pc2"].append((sid, points[pco[1] - 1]))
        if args.dimensions == 3:
            categories[cat]["pc3"].append((sid, points[pco[2] - 1]))

    axis_str = "PC{} (Percent Explained Variance {:.3f}%)"
    # initialize plot
    fig = plt.figure(figsize=args.figsize)
    if args.dimensions == 3:
        ax = fig.add_subplot(111, projection="3d")
        ax.view_init(elev=args.z_angles[1], azim=args.z_angles[0])
        ax.set_zlabel(axis_str.format(3, pc3v), labelpad=args.label_padding)
        if args.z_limits:
            ax.set_zlim(args.z_limits)
    else:
        ax = fig.add_subplot(111)

    # plot data
    for i, cat in enumerate(categories):
        if args.dimensions == 3:
            ax.scatter(xs=[e[1] for e in categories[cat]["pc1"]],
                       ys=[e[1] for e in categories[cat]["pc2"]],
                       zs=[e[1] for e in categories[cat]["pc3"]],
                       zdir="z", c=colors[i], s=args.point_size, label=cat,
                       edgecolors="k")
        else:
            ax.scatter([e[1] for e in categories[cat]["pc1"]],
                       [e[1] for e in categories[cat]["pc2"]],
                       c=colors[i], s=args.point_size, label=cat,
                       edgecolors="k")
        # Annotate PCoA sample points with their sample IDs.
        if args.annotate_points:
            for x, y in zip(categories[cat]["pc1"], categories[cat]["pc2"]):
                ax.annotate(x[0], xy=(x[1], y[1]), xytext=(-10, -15),
                            textcoords="offset points",
                            ha="center", va="center")

    # customize plot options
    if args.x_limits:
        ax.set_xlim(args.x_limits)
    if args.y_limits:
        ax.set_ylim(args.y_limits)
    ax.set_xlabel(axis_str.format(pco[0], float(pc1v)),
                  labelpad=args.label_padding)
    ax.set_ylabel(axis_str.format(pco[1], float(pc2v)),
                  labelpad=args.label_padding)

    leg = plt.legend(loc="best", scatterpoints=3, frameon=True, framealpha=1)
    leg.get_frame().set_edgecolor("k")

    # Set the font characteristics
    font = {"family": "normal", "weight": "bold", "size": args.font_size}
    mpl.rc("font", **font)

    if args.title:
        ax.set_title(args.title)
    if args.ggplot2_style and not args.dimensions == 3:
        gu.ggplot2_style(ax)

    # save or display result
    if args.out_fp:
        fig.savefig(args.out_fp, facecolor="white", edgecolor="none",
                    bbox_inches="tight", pad_inches=0.2)
    else:
        plt.show()
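# Assumed shape of util.parse_unifrac() output, inferred from its use above
# (not verified against the util module):
#
#   {
#       "varexp": [percent variation explained per principal coordinate],
#       "pcd": {sample_id: [coordinate values per principal coordinate]},
#   }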
def main():
    args = handle_program_options()

    try:
        with open(args.otu_table):
            pass
    except IOError as ioe:
        sys.exit("\nError with OTU_Sample abundance data file:{}\n".format(ioe))

    try:
        with open(args.mapping):
            pass
    except IOError as ioe:
        sys.exit("\nError with mapping file:{}\n".format(ioe))

    # input data
    biomf = biom.load_table(args.otu_table)
    map_header, imap = util.parse_map_file(args.mapping)

    # rewrite tree file with OTU names
    if args.input_tree:
        with open(args.input_tree) as treF, open(args.output_tre, "w") as outF:
            tree = treF.readline()
            if "'" in tree:
                tree = tree.replace("'", "")
            outF.write(newick_replace_otuids(tree, biomf))

    oid_rows = {id_: md["taxonomy"]
                for val, id_, md in biomf.iter(axis="observation")}

    # calculate analysis results
    categories = None
    if args.map_categories is not None:
        categories = args.map_categories.split(",")

    # set transform if --stabilize_variance is specified
    tform = bc.arcsine_sqrt_transform if args.stabilize_variance else None

    groups = util.gather_categories(imap, map_header, categories)
    for group in groups.values():
        if args.analysis_metric in ["MRA", "NMRA"]:
            results = bc.MRA(biomf, group.sids, transform=tform)
        elif args.analysis_metric == "raw":
            results = bc.transform_raw_abundance(biomf, sampleIDs=group.sids,
                                                 sample_abd=False)
        group.results.update({oc.otu_name(oid_rows[oid]): results[oid]
                              for oid in results})

    # write iTol data set file
    with open(args.output_itol_table, "w") as itolF:
        if args.analysis_metric == "raw":
            itolF.write("DATASET_GRADIENT\nSEPARATOR TAB\n")
            itolF.write("DATASET_LABEL\tLog Total Abundance\n")
            itolF.write("COLOR\t#000000\n")
            itolF.write("LEGEND_TITLE\tLog Total Abundance\n")
            itolF.write("LEGEND_SHAPES\t1\n")
            itolF.write("LEGEND_COLORS\t#000000\n")
            itolF.write("LEGEND_LABELS\tLog Total Abundance\n")
            itolF.write("COLOR_MIN\t#FFFFFF\n")
            itolF.write("COLOR_MAX\t#000000\n")
        else:
            itolF.write("DATASET_MULTIBAR\nSEPARATOR TAB\n")
            itolF.write("DATASET_LABEL\tNMRA\n")
            itolF.write("FIELD_COLORS\t{}\n".format(
                "\t".join(["#ff0000" for _ in range(len(groups))])))
            itolF.write("FIELD_LABELS\t" + "\t".join(groups.keys()) + "\n")
            itolF.write("LEGEND_TITLE\tNMRA\n")
            itolF.write("LEGEND_SHAPES\t{}\n".format(
                "\t".join(["1" for _ in range(len(groups))])))
            itolF.write("LEGEND_COLORS\t{}\n".format(
                "\t".join(["#ff0000" for _ in range(len(groups))])))
            itolF.write("LEGEND_LABELS\t" + "\t".join(groups.keys()) + "\n")
            itolF.write("WIDTH\t300\n")
        itolF.write("DATA\n")

        all_otus = frozenset({oc.otu_name(md["taxonomy"])
                              for val, id_, md in
                              biomf.iter(axis="observation")})
        for oname in all_otus:
            # each row: OTU name followed by one value per group
            row = ["{name}"]
            row_data = {"name": oname}
            msum = 0
            for name, group in groups.items():
                row.append("{{{}:.5f}}".format(name))
                row_data[name] = group.results.get(oname, 0.0)
                msum += row_data[name]
            # normalize average relative abundance data
            if args.analysis_metric == "NMRA" and msum > 0:
                row_data.update({key: data / msum
                                 for key, data in row_data.items()
                                 if key != "name"})
            itolF.write("\t".join(row).format(**row_data) + "\n")
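# newick_replace_otuids() is used above but not defined in this file. A hedged
# sketch of the assumed behavior (substitute each OTU ID in the Newick string
# with a human-readable name derived from its taxonomy metadata); the
# project's actual helper may differ.
def newick_replace_otuids(tree, biomf):
    """Replace OTU IDs in a Newick tree string with OTU names."""
    for val, id_, md in biomf.iter(axis="observation"):
        tree = tree.replace(id_, oc.otu_name(md["taxonomy"]))
    return tree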
def test_gather_categories(self):
    """
    Testing the gather_categories function from iTol.py. If successful,
    the function will be moved to util.py.

    :return: Returns OK if test goals were achieved, otherwise raises error.
    """
    DataCategory = namedtuple("DataCategory", "sids results")
    result = ut.gather_categories(self.map_data, self.map_header)
    # one category given
    result1 = ut.gather_categories(self.map_data, self.map_header,
                                   ["Treatment"])
    # incorrect condition given
    result2 = ut.gather_categories(self.map_data, self.map_header,
                                   ["Smoking=Control"])
    # correct condition given
    result3 = ut.gather_categories(self.map_data, self.map_header,
                                   ["Treatment=Fast"])
    # two categories given
    result4 = ut.gather_categories(self.map_data, self.map_header,
                                   ["Treatment", "Smoking"])
    # one category, one condition
    result5 = ut.gather_categories(self.map_data, self.map_header,
                                   ["Treatment", "Smoking=Never_Smoker"])
    # two specific conditions
    result6 = ut.gather_categories(self.map_data, self.map_header,
                                   ["Treatment=Control",
                                    "Smoking=Current_Smoker"])
    # two conditions from the same category
    result7 = ut.gather_categories(self.map_data, self.map_header,
                                   ["Smoking=Current_Smoker",
                                    "Smoking=Never_Smoker"])
    # more than two categories/conditions - mix
    result8 = ut.gather_categories(self.map_data, self.map_header,
                                   ["Smoking=Never_Smoker", "Treatment",
                                    "Gender=Female"])
    # invalid category/condition syntax
    result9 = ut.gather_categories(self.map_data, self.map_header,
                                   categories="Nationality:Peru")

    # Testing if the function calculates without any categories.
    manual = {"default": DataCategory({"PC.354", "PC.355", "PC.356", "PC.481",
                                       "PC.593", "PC.607", "PC.634", "PC.635",
                                       "PC.636"}, {})}
    self.assertDictEqual(
        result, manual,
        msg="With no category or condition given, gather_categories() did not "
            "return all SampleIDs as expected.")

    # Testing if the function accurately calculates for one category
    manual1 = {"Control": DataCategory({"PC.355", "PC.356", "PC.354",
                                        "PC.481", "PC.593"}, {}),
               "Fast": DataCategory({"PC.634", "PC.635", "PC.636", "PC.607"},
                                    {})}
    self.assertDictEqual(
        result1, manual1,
        msg="With one category given, gather_categories() did not return per "
            "category SampleIDs as expected.")

    # Testing if the function handles an incorrect condition
    self.assertDictEqual(
        result2, manual,
        msg="With incorrect condition given, gather_categories() did not "
            "return all SampleIDs by default as expected.")

    # Testing if the function accurately calculates for one correct condition
    manual3 = {"Fast": DataCategory({"PC.634", "PC.635", "PC.636", "PC.607"},
                                    {})}
    self.assertDictEqual(
        result3, manual3,
        msg="With one correct condition given, gather_categories() did not "
            "return SampleIDs for the condition given, as expected.")

    # Testing if the function accurately calculates for two categories
    manual4 = {"Control_Current_Smoker": DataCategory({"PC.355", "PC.356"},
                                                      {}),
               "Control_Never_Smoker": DataCategory({"PC.354", "PC.481",
                                                     "PC.593"}, {}),
               "Fast_Current_Smoker": DataCategory({"PC.634", "PC.635",
                                                    "PC.636"}, {}),
               "Fast_Never_Smoker": DataCategory({"PC.607"}, {})}
    self.assertDictEqual(
        result4, manual4,
        msg="With multiple categories given, gather_categories() did not "
            "return SampleIDs for all category combinations, as expected.")

    # Testing if the function accurately calculates for one category and one
    # condition
    manual5 = {"Control_Never_Smoker": DataCategory({"PC.354", "PC.481",
                                                     "PC.593"}, {}),
               "Fast_Never_Smoker": DataCategory({"PC.607"}, {})}
    self.assertDictEqual(
        result5, manual5,
        msg="With one category and one condition given, gather_categories() "
            "did not return SampleIDs for all category-condition "
            "combinations, as expected.")

    # Testing if the function accurately calculates for two specific
    # conditions
    manual6 = {"Control_Current_Smoker": DataCategory({"PC.355", "PC.356"},
                                                      {})}
    self.assertDictEqual(
        result6, manual6,
        msg="With two specific conditions given, gather_categories() did not "
            "return SampleIDs for all condition combinations, as expected.")

    # Testing two conditions drawn from the same category
    manual7 = {"Current_Smoker": DataCategory({"PC.355", "PC.356", "PC.634",
                                               "PC.635", "PC.636"}, {}),
               "Never_Smoker": DataCategory({"PC.354", "PC.481", "PC.593",
                                             "PC.607"}, {})}
    self.assertDictEqual(
        result7, manual7,
        msg="With two conditions from same category given, "
            "gather_categories() did not return SampleIDs for all condition "
            "combinations, as expected.")

    # Testing if function accurately categorizes SampleIDs for a mix of
    # multiple categories and conditions
    manual8 = {"Control_Never_Smoker_Female": DataCategory({"PC.354",
                                                            "PC.593"}, {})}
    self.assertDictEqual(
        result8, manual8,
        msg="With two or more conditions/categories given, "
            "gather_categories() did not return SampleIDs for all condition "
            "combinations, as expected.")

    # Testing that invalid categories or conditions fall back to the default
    manual9 = {"default": DataCategory({"PC.354", "PC.355", "PC.356",
                                        "PC.481", "PC.593", "PC.607",
                                        "PC.634", "PC.635", "PC.636"}, {})}
    self.assertDictEqual(
        result9, manual9,
        msg="With invalid category or condition given, gather_categories() "
            "did not return all SampleIDs as expected.")
def main():
    args = handle_program_options()

    try:
        with open(args.otu_table):
            pass
    except IOError as ioe:
        sys.exit("\nError with OTU_Sample abundance data file:{}\n".format(ioe))

    try:
        with open(args.mapping):
            pass
    except IOError as ioe:
        sys.exit("\nError with mapping file:{}\n".format(ioe))

    # input data
    biomf = biom.load_table(args.otu_table)
    map_header, imap = util.parse_map_file(args.mapping)

    # rewrite tree file with OTU names; skip if keep_otuids specified
    if args.input_tree and not args.keep_otuids:
        with open(args.input_tree) as treF, open(args.output_tre, "w") as outF:
            tree = treF.readline()
            if "'" in tree:
                tree = tree.replace("'", "")
            outF.write(newick_replace_otuids(tree, biomf))

    if not args.keep_otuids:
        oid_rows = {id_: md["taxonomy"]
                    for val, id_, md in biomf.iter(axis="observation")}

    # calculate analysis results
    categories = None
    if args.map_categories is not None and args.analysis_metric != "raw":
        categories = args.map_categories.split(",")

    # set transform if --stabilize_variance is specified
    tform = bc.arcsine_sqrt_transform if args.stabilize_variance else None

    groups = util.gather_categories(imap, map_header, categories)
    for group in groups.values():
        if args.analysis_metric in ["MRA", "NMRA"]:
            results = bc.MRA(biomf, group.sids, transform=tform)
        elif args.analysis_metric == "raw":
            results = bc.transform_raw_abundance(biomf, sampleIDs=group.sids,
                                                 sample_abd=False)
        if args.keep_otuids:
            group.results.update({oid: results[oid] for oid in results})
        else:
            group.results.update({oc.otu_name(oid_rows[oid]): results[oid]
                                  for oid in results})

    # write iTol data set file
    with open(args.output_itol_table, "w") as itolF:
        if args.analysis_metric == "raw":
            itolF.write("DATASET_GRADIENT\nSEPARATOR TAB\n")
            itolF.write("DATASET_LABEL\tLog Total Abundance\n")
            itolF.write("COLOR\t#000000\n")
            itolF.write("LEGEND_TITLE\tLog Total Abundance\n")
            itolF.write("LEGEND_SHAPES\t1\n")
            itolF.write("LEGEND_COLORS\t#000000\n")
            itolF.write("LEGEND_LABELS\tLog Total Abundance\n")
            itolF.write("COLOR_MIN\t#FFFFFF\n")
            itolF.write("COLOR_MAX\t#000000\n")
        else:
            itolF.write("DATASET_MULTIBAR\nSEPARATOR TAB\n")
            itolF.write("DATASET_LABEL\t{}\n".format(args.analysis_metric))
            itolF.write("FIELD_COLORS\t{}\n".format(
                "\t".join(["#ff0000" for _ in range(len(groups))])))
            itolF.write("FIELD_LABELS\t" + "\t".join(groups.keys()) + "\n")
            itolF.write("LEGEND_TITLE\t{}\n".format(args.analysis_metric))
            itolF.write("LEGEND_SHAPES\t{}\n".format(
                "\t".join(["1" for _ in range(len(groups))])))
            itolF.write("LEGEND_COLORS\t{}\n".format(
                "\t".join(["#ff0000" for _ in range(len(groups))])))
            itolF.write("LEGEND_LABELS\t" + "\t".join(groups.keys()) + "\n")
            itolF.write("WIDTH\t300\n")
        itolF.write("DATA\n")

        if args.keep_otuids:
            all_otus = frozenset(biomf.ids(axis="observation"))
        else:
            all_otus = frozenset({oc.otu_name(md["taxonomy"])
                                  for val, id_, md in
                                  biomf.iter(axis="observation")})

        for oname in all_otus:
            # each row: OTU name followed by one value per group
            row = ["{name}"]
            row_data = {"name": oname}
            msum = 0
            for name, group in groups.items():
                row.append("{{{}:.5f}}".format(name))
                row_data[name] = group.results.get(oname, 0.0)
                msum += row_data[name]
            # normalize average relative abundance data
            if args.analysis_metric == "NMRA" and msum > 0:
                row_data.update({key: data / msum
                                 for key, data in row_data.items()
                                 if key != "name"})
            itolF.write("\t".join(row).format(**row_data) + "\n")
def main():
    args = handle_program_options()

    try:
        with open(args.coord_fp):
            pass
    except IOError as ioe:
        err_msg = "\nError in input principal coordinates filepath (-i): {}\n"
        sys.exit(err_msg.format(ioe))

    try:
        with open(args.map_fp):
            pass
    except IOError as ioe:
        err_msg = "\nError in input metadata mapping filepath (-m): {}\n"
        sys.exit(err_msg.format(ioe))

    with open(args.coord_fp) as F:
        pcd = F.readlines()
    pcd = [line.split("\t") for line in pcd]

    map_header, imap = util.parse_map_file(args.map_fp)
    data_gather = util.gather_categories(imap, map_header,
                                         args.colorby.split(","))
    categories = OrderedDict([(condition, {"pc1": [], "pc2": [], "pc3": []})
                              for condition in data_gather.keys()])

    bmap = qualitative.Paired[12]
    bcolors = itertools.cycle(bmap.hex_colors)
    if not args.colors:
        colors = [next(bcolors) for _ in categories]
    else:
        colors = parse_colors(args.colors, categories)

    parsed_unifrac = util.parse_unifrac(args.coord_fp)

    pco = args.pc_order if args.dimensions == 2 else [1, 2, 3]
    # varexp is zero-indexed, so offset the 1-based PC numbers
    pc1v = parsed_unifrac["varexp"][pco[0] - 1]
    pc2v = parsed_unifrac["varexp"][pco[1] - 1]
    if args.dimensions == 3:
        pc3v = parsed_unifrac["varexp"][pco[2] - 1]

    for sid, points in parsed_unifrac["pcd"].items():
        for condition, dc in data_gather.items():
            if sid in dc.sids:
                cat = condition
                break
        categories[cat]["pc1"].append((sid, float(points[pco[0] - 1])))
        categories[cat]["pc2"].append((sid, float(points[pco[1] - 1])))
        if args.dimensions == 3:
            categories[cat]["pc3"].append((sid, float(points[pco[2] - 1])))

    axis_str = "PC{} - Percent variation explained {:.2f}%"
    # initialize plot
    fig = plt.figure(figsize=(14, 8))
    if args.dimensions == 3:
        ax = fig.add_subplot(111, projection="3d")
        ax.view_init(elev=23., azim=-134.5)
        ax.set_zlabel(axis_str.format(pco[2], float(pc3v)))
        if args.z_limits:
            ax.set_zlim(args.z_limits)
    else:
        ax = fig.add_subplot(111)

    # plot data
    for i, cat in enumerate(categories):
        if args.dimensions == 3:
            ax.scatter(xs=[e[1] for e in categories[cat]["pc1"]],
                       ys=[e[1] for e in categories[cat]["pc2"]],
                       zs=[e[1] for e in categories[cat]["pc3"]],
                       zdir="z", c=colors[i], s=args.point_size)
        else:
            ax.scatter([e[1] for e in categories[cat]["pc1"]],
                       [e[1] for e in categories[cat]["pc2"]],
                       c=colors[i], s=args.point_size)
        # Annotate PCoA points with sample IDs (disabled).
        # for x, y in zip(categories[cat]["pc1"], categories[cat]["pc2"]):
        #     ax.annotate(x[0], xy=(x[1], y[1]), xytext=(-10, -15),
        #                 textcoords="offset points", ha="center", va="center")

    # customize plot options
    if args.x_limits:
        ax.set_xlim(args.x_limits)
    if args.y_limits:
        ax.set_ylim(args.y_limits)
    ax.set_xlabel(axis_str.format(pco[0], float(pc1v)))
    ax.set_ylabel(axis_str.format(pco[1], float(pc2v)))

    ax.legend([Rectangle((0, 0), 1, 1, fc=colors[i])
               for i in range(len(categories))], categories.keys(),
              loc="best")

    if args.title:
        ax.set_title(args.title)

    # save or display result
    if args.out_fp:
        fig.savefig(args.out_fp, facecolor="white", edgecolor="none",
                    dpi=args.dpi, bbox_inches="tight", pad_inches=0.2)
    else:
        plt.show()
def main():
    args = handle_program_options()

    try:
        with open(args.otu_table):
            pass
    except IOError as ioe:
        sys.exit("\nError with OTU_Sample abundance data file:{}\n".format(ioe))

    try:
        with open(args.mapping):
            pass
    except IOError as ioe:
        sys.exit("\nError with mapping file:{}\n".format(ioe))

    # input data (BIOM table stored as single-line JSON)
    with open(args.otu_table) as bF:
        biom = json.loads(bF.readline())
    map_header, imap = util.parse_map_file(args.mapping)

    # rewrite tree file with OTU names
    if args.input_tree:
        with open(args.input_tree) as treF, open(args.output_tre, "w") as outF:
            tree = treF.readline()
            if "'" in tree:
                tree = tree.replace("'", "")
            outF.write(newick_replace_otuids(tree, biom))

    oid_rows = {row["id"]: row for row in biom["rows"]}

    # calculate analysis results
    categories = None
    if args.map_categories is not None:
        categories = args.map_categories.split(",")

    # set transform if --stabilize_variance is specified
    tform = bc.arcsine_sqrt_transform if args.stabilize_variance else None

    groups = util.gather_categories(imap, map_header, categories)
    for group in groups.values():
        if args.analysis_metric in ["MRA", "NMRA"]:
            results = bc.MRA(biom, group.sids, transform=tform)
        elif args.analysis_metric == "raw":
            results = bc.transform_raw_abundance(biom, sampleIDs=group.sids,
                                                 sample_abd=False)
        group.results.update({oc.otu_name_biom(oid_rows[oid]): results[oid]
                              for oid in results})

    # write iTol data set file
    with open(args.output_itol_table, "w") as itolF:
        itolF.write("LABELS\t" + "\t".join(groups.keys()) + "\n")
        itolF.write("COLORS\t{}\n".format(
            "\t".join(["#ff0000" for _ in range(len(groups))])))

        all_otus = frozenset({oc.otu_name_biom(row) for row in biom["rows"]})
        for oname in all_otus:
            # each row: OTU name followed by one value per group
            row = ["{name}"]
            row_data = {"name": oname}
            msum = 0
            for name, group in groups.items():
                row.append("{{{}:.5f}}".format(name))
                row_data[name] = group.results.get(oname, 0.0)
                msum += row_data[name]
            # normalize avg relative abundance data
            if args.analysis_metric == "NMRA" and msum > 0:
                row_data.update({key: data / msum
                                 for key, data in row_data.items()
                                 if key != "name"})
            itolF.write("\t".join(row).format(**row_data) + "\n")
def main():
    args = program_options()

    try:
        biomf = biom.load_table(args.in_biomf)
    except IOError as ioe:
        sys.exit("Error with input BIOM format file: {}".format(ioe))
    else:
        rel_abd = relative_abundance(biomf)
        ast_rel_abd = ast(rel_abd)
        # Get pairwise combinations of OTUs
        otu_combos = list(combinations(biomf.ids("observation"), 2))

    try:
        mheader, mdata = parse_map_file(args.map_fnh)
    except IOError as ioe:
        sys.exit("Error with input mapping file: {}".format(ioe))
    else:
        # Gather sampleID categories
        sid_cat = gather_categories(mdata, mheader, [args.category_column])

    # Create argument chunks for the helper function supplied to
    # multiprocessing pool.map()
    chunksize = 10000
    jobs = [(otu_combos[x:x + chunksize], sid_cat, ast_rel_abd)
            for x in range(0, len(otu_combos), chunksize)]
    print("{0} jobs created.".format(len(jobs)))

    # Start multiprocessing jobs
    try:
        print("Starting map_async()...")
        pool = Pool()
        res = pool.map_async(calc_corr_helper, jobs)
        pool.close()
        pool.join()
    except Exception:
        sys.exit("Error while calculating correlations\n{}".format(
            format_exc()))

    s_rho_calc = []
    k_tau_calc = []
    for r in res.get():
        for s in r:
            if s[0] == "Spearman":
                s_rho_calc.append(s)
            else:
                k_tau_calc.append(s)

    # Get FDR-corrected correlation results
    print("Running FDR correction on {} Spearman's Rho.".format(
        len(s_rho_calc)))
    fdr_corr_s_rho = run_fdr(s_rho_calc)
    print("Running FDR correction on {} Kendall Tau.".format(len(k_tau_calc)))
    fdr_corr_k_tau = run_fdr(k_tau_calc)

    # Consolidate correlation results: keep only the OTU pairs that survive
    # FDR correction under both tests.
    k_kos = {(e[2], e[3]) for e in fdr_corr_k_tau}
    s_kos = {(f[2], f[3]) for f in fdr_corr_s_rho}
    final_kos = s_kos & k_kos
    print("{0} elements from KendallTau\n{1} elements from SpearmanRho\n"
          "{2} elements are common to both.".format(len(k_kos), len(s_kos),
                                                    len(final_kos)))
    final_fdr_corr_results = [cdata[1:] for cdata in fdr_corr_s_rho
                              if (cdata[2], cdata[3]) in final_kos]

    # Write results to file
    with open(args.out_fnh, "w") as outf:
        outf.write("Category\tVariable\tby Variable\tCorrelation\tp value\n")
        for k in final_fdr_corr_results:
            outf.write("{0}\t{1}\t{2}\t{3}\t{4}\n".format(k[0], k[1], k[2],
                                                          k[3], k[4]))
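# run_fdr() is a project helper not shown in this file. A hedged sketch of the
# assumed behavior, using statsmodels' Benjamini-Hochberg correction and the
# record layout implied above:
# (test_name, category, var1, var2, correlation, pvalue).
# The actual helper may filter or reshape records differently.
def run_fdr(corr_records, alpha=0.05):
    """Return only the records whose FDR-adjusted p-values pass alpha."""
    from statsmodels.stats.multitest import multipletests
    pvals = [rec[5] for rec in corr_records]
    reject, pvals_corr, _, _ = multipletests(pvals, alpha=alpha,
                                             method="fdr_bh")
    return [rec for rec, keep in zip(corr_records, reject) if keep]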