Example #1
0
def main():
    """Run Linear Discriminant Analysis on a tab-separated BIOM table.

    Reads the mapping and abundance files named by the command-line
    arguments, groups samples by the --group_by category, runs LDA, and
    writes/plots the result. Requires at least two category values.
    """
    args = handle_program_options()

    map_header, imap = util.parse_map_file(args.map_fp)

    # Transpose so that rows are Sample IDs and columns are observations.
    df = pd.read_csv(args.biom_tsv, sep='\t', index_col=0).T
    # exclude Sample IDs not in the mapping file
    df = df.loc[imap.keys()]

    cat_gather = util.gather_categories(imap, map_header, args.group_by)
    if len(cat_gather) < 2:
        # BUG FIX: the original built this message with a backslash line
        # continuation, which embedded the source indentation (a run of
        # spaces) into the user-visible text and omitted the newline.
        sys.stderr.write("ERROR: Only one category value found. Linear "
                         "Discriminant Analysis requires at least two "
                         "categories to compare.\n")
        return

    color_gather = util.gather_categories(imap, map_header, [args.color_by])

    # sample ID -> category value, then category value -> plot color
    class_map = merge_dicts(*[{sid: cat for sid in cat_gather[cat].sids}
                              for cat in cat_gather])
    class_colors = merge_dicts(*[{class_map[sid]: color
                                  for sid in color_gather[color].sids}
                                 for color in color_gather])

    df.insert(0, "Condition", [class_map[entry] for entry in df.index])

    if args.save_lda_input:
        df.to_csv(args.save_lda_input)

    X_lda, y_lda = run_LDA(df)

    plot_LDA(X_lda, y_lda, class_colors, out_fp=args.out_fp, dpi=args.dpi,
             title=args.plot_title)
Example #2
0
def main():
    """Compute per-category "core" OTUs from a BIOM table.

    An observation belongs to a category's core when it is present in
    more than ``core_pct`` of that category's samples.  Writes one ID
    list per category, and optionally a filtered BIOM file each.
    """
    args = prog_options()

    try:
        biomf = biom.load_table(args.in_biomf)
    except IOError as ioe:
        sys.exit("Error with input BIOM format file: {}".format(ioe))
    else:
        # convert to presence/absence BIOM table
        biomf_pa = biomf.pa(inplace=False)
        obs_ids = biomf_pa.ids("observation")

    try:
        mheader, mdata = parse_map_file(args.map_fnh)
    except IOError as ioe:
        sys.exit("Error with input mapping file: {}".format(ioe))
    else:
        if args.group_by:
            sid_cat = gather_categories(mdata, mheader, [args.group_by])
        else:
            sid_cat = gather_categories(mdata, mheader)

    # calculate core: count, per category, the samples in which each
    # observation occurs and keep observations above the threshold.
    # FIX: the original routed this logic through try/assert/except
    # AssertionError; asserts are stripped under ``python -O``, so plain
    # conditionals are used instead (same outcome, -O safe).
    core_calc = {k: set() for k in sid_cat}
    for idx in obs_ids:
        for cat, val in sid_cat.items():
            num_of_samples = len(val.sids)
            obs_count = sum(1 for sid in val.sids
                            if biomf_pa.get_value_by_ids(idx, sid) == 1)
            if obs_count > round(args.core_pct * num_of_samples):
                core_calc[cat].add(idx)

    # Check if output directory exists, if not, create it
    out_dir = os.path.abspath(args.out_fnh)
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    for k, v in core_calc.items():
        print("{0} core IDs in {1}".format(len(v), k))
        # NOTE(review): the output filenames hard-code "80_pct" even
        # though the threshold comes from args.core_pct -- kept as-is
        # for compatibility; confirm whether they should reflect core_pct.
        idx_filename = os.path.join(out_dir, k + "_80_pct_core_ids.txt")
        with open(idx_filename, "w") as of:
            of.write("{0}".format("\n".join(sorted(v))))
        filtered_biomf = biomf.filter(v, axis="observation", inplace=False)
        if args.biom_out:
            biom_filename = os.path.join(out_dir, k + "_80_pct_core.biom")
            with biom_open(biom_filename, "w") as f:
                filtered_biomf.to_hdf5(f, "CORE BIOM")
def gather_sequences(fastaFN, mapFN):
    """Return FASTA records whose sequence begins with a known barcode.

    :param fastaFN: path to the input FASTA file.
    :param mapFN: path to the QIIME mapping file; barcodes are taken
                  from the second field of each sample's row.
    :return: tuple of (list of matching SeqRecords, total records read).
    """
    barcodes = [entry[1] for entry in util.parse_map_file(mapFN)[1].values()]
    # All barcodes are assumed to share the first barcode's length.
    bcodelen = len(barcodes[0])
    # PERF FIX: membership is tested once per FASTA record, so use a set
    # (O(1) lookup) instead of scanning the barcode list (O(n)) each time.
    barcode_set = set(barcodes)
    seqs = []
    count = 0

    for record in SeqIO.parse(fastaFN, "fasta", generic_dna):
        count += 1
        if str(record.seq)[:bcodelen] in barcode_set:
            seqs.append(record)

    return seqs, count
Example #4
0
def gather_sequences(fastaFN, mapFN):
    """Collect FASTA records whose sequence starts with a mapped barcode.

    :param fastaFN: path to the input FASTA file.
    :param mapFN: path to the QIIME mapping file; barcodes are the
                  second field of each sample's mapping row.
    :return: (list of matching SeqRecords, total number of records read)
    """
    barcodes = [entry[1] for entry in util.parse_map_file(mapFN)[1].values()]
    # Barcode length is taken from the first entry; all barcodes are
    # assumed to be the same length.
    bcodelen = len(barcodes[0])
    # PERF FIX: use a set for the per-record membership test -- O(1)
    # instead of O(n) per FASTA record with the original list.
    barcode_set = set(barcodes)
    seqs = []
    count = 0

    for record in SeqIO.parse(fastaFN, "fasta", generic_dna):
        count += 1
        if str(record.seq)[:bcodelen] in barcode_set:
            seqs.append(record)

    return seqs, count
def main():
    """Split a classic (JSON) BIOM table into one table per value of a
    mapping-file category.

    Output files are written as ``<out_basename>_<category value><ext>``
    based on ``--output_biom_fp``.
    """
    args = handle_program_options()

    # Fail fast with a friendly message if either input is unreadable.
    try:
        with open(args.input_biom_fp):
            pass
    except IOError as ioe:
        sys.exit('\nError with input BIOM-format file:{}\n'.format(ioe))

    try:
        with open(args.mapping):
            pass
    except IOError as ioe:
        sys.exit('\nError with mapping file:{}\n'.format(ioe))

    out_fp, ext = osp.splitext(args.output_biom_fp)

    # NOTE(review): only the first line is parsed -- this assumes the
    # whole JSON document is on one line; TODO confirm this holds for
    # pretty-printed BIOM files.
    with open(args.input_biom_fp) as bF:
        biom = json.loads(bF.readline())

    header, mapping = util.parse_map_file(args.mapping)

    try:
        category_id = header.index(args.map_category)
    except ValueError:
        sys.exit('Category {} not found in supplied mapping file.'.format(
            args.map_category))

    # One deep copy of the source table per distinct category value.
    values = {mapping[sid][category_id] for sid in mapping}
    biom_copies = {value: copy.deepcopy(biom) for value in values}
    split_samples = split_by_category(biom['columns'], mapping, category_id)
    for cat_val in biom_copies:
        biom_copies[cat_val]['data'] = []
        # NOTE(review): this tuple assignment appears to transpose the
        # table -- the new 'rows' are this category's sample entries
        # (second items from split_by_category) and the new 'columns'
        # are the original observation rows; confirm against
        # split_by_category().
        biom_copies[cat_val]['rows'], biom_copies[cat_val]['columns'] = [
            item[1] for item in split_samples[cat_val]
        ], biom_copies[cat_val]['rows']
        sample_ids = [item[0] for item in split_samples[cat_val]]

        # Keep only sparse entries belonging to this category's samples,
        # with indices swapped to match the transposed layout above.
        for i in xrange(len(biom['data'])):
            if biom['data'][i][1] in sample_ids:
                row, col, amt = biom['data'][i]
                biom_copies[cat_val]['data'].append(
                    [sample_ids.index(col), row, amt])

        biom_copies[cat_val]['shape'] = [
            len(biom_copies[cat_val]['rows']),
            len(biom_copies[cat_val]['columns'])
        ]

        with open(out_fp + '_' + cat_val + ext, 'w') as outF:
            outF.write(json.dumps(biom_copies[cat_val]))
Example #6
0
def main():
    args = handle_program_options()

    try:
        with open(args.map_file):
            pass
    except IOError as ioe:
            err_msg = '\nError opening QIIME mapping file: {}\n'
            sys.exit(err_msg.format(ioe))

    try:
        with open(args.biom_fp):
            pass
    except IOError as ioe:
            err_msg = '\nError opening BIOM table file: {}\n'
            sys.exit(err_msg.format(ioe))

    header, sample_map = putil.parse_map_file(args.map_file)
    biom_tbl = biom.load_table(args.biom_fp)
    if args.category not in header:
        sys.exit('Category \'{}\' not found'.format(args.category))
    
    cat_idx = header.index(args.category)
    cat_vals = {entry[cat_idx] for entry in sample_map.values()}

    plot_title = args.plot_title

    colors = color_mapping(sample_map, header, args.category, args.color_by)

    # Perform diversity calculations and density plotting
    for method, x_label in zip(args.diversity, args.x_label):
        if method not in alpha.__all__:
            sys.exit("ERROR: Diversity metric not found: " + method)
        metric = eval('alpha.'+method)
        div_calc, sample_ids = calc_diversity(metric, sample_map, biom_tbl, cat_vals, cat_idx)
        
        plot_group_diversity(div_calc, colors, plot_title, x_label,
                             args.out_dir, args.image_type)

        print "Diversity significance testing: {}".format(x_label)
        # calculate and print significance testing results
        if len(cat_vals) == 2:
            print_WilcoxonSRT(*div_calc.values())
        elif len(cat_vals) > 2:
            print_KruskalWallisH(div_calc.values())
        print

        if args.save_calculations:
            prefix = '_'.join(x_label.split())
            write_diversity_metrics(div_calc, sample_ids, osp.join(args.out_dir, args.save_calculations))
Example #7
0
def main():
    """Run Dissimilarity-Overlap Curve (DOC) analysis and plot results.

    Normalizes the BIOM table per sample, selects sample IDs (all, or
    those in the --group_by categories), computes DOC with a lowess
    confidence interval, and saves/plots the residual and DOC figures.
    """
    args = handle_program_options()

    # Read in biom file
    try:
        shared_biom = biom.load_table(args.input_biom_fp)
    except IOError as ie:
        sys.exit("\nError reading BIOM file: {}\n".format(ie))
    norm_shared_biom = shared_biom.norm(axis="sample", inplace=False)

    # Read in mapping file
    try:
        header, imap = parse_map_file(args.map_fp)
    except IOError as ioe:
        sys.exit("\nError in metadata mapping filepath: {}\n".format(ioe))

    # Collect sample IDs to analyze.
    # FIX: the original expressed this choice via try/assert/except
    # AssertionError; asserts vanish under ``python -O``, so a plain
    # conditional is used (identical outcome, -O safe).
    if args.group_by is not None:
        data_gather = gather_categories(imap, header, args.group_by.split(","))
        sample_list = [sid for cat in data_gather
                       for sid in data_gather[cat].sids]
    else:
        sample_list = norm_shared_biom.ids()

    doc = calc_doc(norm_shared_biom, sample_list)
    if doc is None:
        sys.exit("Error in DOC calculations. Please check the modules.")

    # Get confidence interval
    sl_lowess_regr = get_doc_ci(doc,
                                args.frac,
                                args.plot_ci,
                                sample_list,
                                num_of_seqs=args.num_iterations)

    # Plot the residual figure
    plot_residplot(sl_lowess_regr, args.residplot, save=args.save_image)

    # Plot DOC
    plot_doc(sl_lowess_regr,
             args.residplot,
             ci=args.plot_ci,
             title=args.title,
             save=args.save_image)
Example #8
0
def main():
    """Write per-category unique OTU IDs, and shared OTU IDs on request."""
    args = handle_program_options()

    try:
        # Load biom format file
        biomf = biom.load_table(args.input_biom_fp)
    except TypeError as te:
        sys.exit("The data in the path does not appear to be a BIOM format table. "
                 "Error: {}.".format(te))

    # OTU membership for every sample
    sample_otus = oc.assign_otu_membership(biomf)

    try:
        # Parse mapping file
        header, imap = util.parse_map_file(args.mapping_file)
    except ValueError as ve:
        sys.exit("Error: {}.".format(ve))

    # Category information for the requested mapping column
    group_data = util.gather_categories(imap, header, [args.category_column])

    # Seed each category with an empty OTU-ID accumulator, then add the
    # OTUs observed in each of that category's samples.
    for grp in group_data:
        group_data[grp].results["otuids"] = set()
    for sid in sample_otus:
        owner = sample_group(sid, group_data)
        group_data[owner].results["otuids"].update(sample_otus[sid])

    if args.reverse:
        # Tabulate the OTU IDs shared among all groups and save as TSV.
        shared_df = pd.DataFrame.from_dict(shared_otuids(group_data),
                                           orient="index").T
        shared_df.to_csv(args.reverse, sep="\t", index=False)
    # NOTE(review): unlike the sibling implementation in this codebase,
    # uniques are written even when --reverse is given -- confirm this
    # fall-through is intended.
    group_otuids = {grp: group_data[grp].results["otuids"]
                    for grp in group_data}
    write_uniques(args.output_dir, args.prefix, unique_otuids(group_otuids))
def main():
    """Split a classic (JSON) BIOM table into one output table per value
    of a mapping-file category.

    Outputs are named ``<out_basename>_<category value><ext>`` based on
    ``--output_biom_fp``.
    """
    args = handle_program_options()

    # Fail fast with a friendly message if either input is unreadable.
    try:
        with open(args.input_biom_fp):
            pass
    except IOError as ioe:
        sys.exit('\nError with input BIOM-format file:{}\n'.format(ioe))

    try:
        with open(args.mapping):
            pass
    except IOError as ioe:
        sys.exit('\nError with mapping file:{}\n'.format(ioe))

    out_fp, ext = osp.splitext(args.output_biom_fp)

    # NOTE(review): only the first line is parsed -- assumes the whole
    # JSON document sits on one line; TODO confirm for pretty-printed
    # BIOM files.
    with open(args.input_biom_fp) as bF:
        biom = json.loads(bF.readline())

    header, mapping = util.parse_map_file(args.mapping)

    try:
        category_id = header.index(args.map_category)
    except ValueError:
        sys.exit('Category {} not found in supplied mapping file.'.format(args.map_category))

    # One deep copy of the source table per distinct category value.
    values = {mapping[sid][category_id] for sid in mapping}
    biom_copies = {value: copy.deepcopy(biom) for value in values}
    split_samples = split_by_category(biom['columns'], mapping, category_id)
    for cat_val in biom_copies:
        biom_copies[cat_val]['data'] = []
        # NOTE(review): this tuple assignment appears to transpose the
        # table (new 'rows' = this category's sample entries, new
        # 'columns' = original observation rows) -- confirm against
        # split_by_category().
        biom_copies[cat_val]['rows'], biom_copies[cat_val]['columns'] = [item[1] for item in split_samples[cat_val]], biom_copies[cat_val]['rows']
        sample_ids = [item[0] for item in split_samples[cat_val]]

        # Keep only sparse entries for this category's samples, with
        # indices swapped to match the transposed layout above.
        for i in xrange(len(biom['data'])):
            if biom['data'][i][1] in sample_ids:
                row, col, amt = biom['data'][i]
                biom_copies[cat_val]['data'].append([sample_ids.index(col), row, amt])

        biom_copies[cat_val]['shape'] = [len(biom_copies[cat_val]['rows']),
                                         len(biom_copies[cat_val]['columns'])]

        with open(out_fp + '_' + cat_val + ext, 'w') as outF:
            outF.write(json.dumps(biom_copies[cat_val]))
def main():
    """Plot the overlap among each group's core-microbiome OTU set."""
    args = handle_program_options()

    # Parse and read mapping file
    try:
        header, imap = util.parse_map_file(args.map_fp)
        category_idx = header.index(args.group_by)
    except IOError as ioe:
        err_msg = "\nError in metadata mapping filepath (-m): {}\n"
        sys.exit(err_msg.format(ioe))

    # map groups to colors
    class_colors = util.color_mapping(imap, header, args.group_by,
                                      args.color_by)

    # Fall back to TSV-format core files when standard ones are absent.
    tsv = args.core_files is None
    core_files = args.tsv_core_files if tsv else args.core_files

    # map each core file to its matching category in the mapping file
    group_cores = OrderedDict()
    for group, fp in zip(class_colors, core_files):
        if tsv:
            group_cores[group] = load_tsv_core(fp, args.skipheader)
        else:
            core = load_core_file(fp)
            group_cores[group] = [otu.replace("_", " ")
                                  for otu in core.values()
                                  if not otu.startswith("Unclassified")]

    # union of every group's core OTUs
    overlap = set()
    overlap.update(*group_cores.values())

    plot_overlaps(overlap, group_cores, class_colors, out_fp=args.out_fp,
                  fig_size=args.figsize, title=args.title,
                  filter_common=args.filtercommon)
def main():
    """Write OTU IDs unique to each category, or shared IDs with --reverse."""
    args = handle_program_options()

    try:
        # Load biom format file
        biomf = biom.load_table(args.input_biom_fp)
    except TypeError as te:
        sys.exit("The data in the path does not appear to be a BIOM format table. "
                 "Error: {}.".format(te))

    # OTU membership for every sample
    sample_otus = assign_otu_membership(biomf)

    try:
        # Parse mapping file
        header, imap = util.parse_map_file(args.mapping_file)
    except ValueError as ve:
        sys.exit("Error: {}.".format(ve))

    # Category information for the requested mapping column
    group_data = util.gather_categories(imap, header, [args.category_column])

    # Seed each category with an empty accumulator, then add the OTUs
    # observed in each of its samples.
    for grp in group_data:
        group_data[grp].results["otuids"] = set()
    for sid in sample_otus:
        owner = sample_group(sid, group_data)
        group_data[owner].results["otuids"].update(sample_otus[sid])

    if args.reverse:
        # Tabulate the OTU IDs shared among the groups and save as TSV.
        shared_df = pd.DataFrame.from_dict(shared_otuids(group_data),
                                           orient="index").T
        shared_df.to_csv(args.reverse, sep="\t", index=False)
    else:
        # Otherwise report the OTU IDs unique to each group.
        group_otuids = {grp: group_data[grp].results["otuids"]
                        for grp in group_data}
        write_uniques(args.output_dir, args.prefix, unique_otuids(group_otuids))
Example #12
0
def main():
    """Run t-SNE on a precomputed distance matrix and scatter-plot it.

    Samples are colored by the --group_by mapping category and may be
    annotated with their sample IDs.
    """
    args = handle_program_options()

    # Read in the distance data
    try:
        dm_data = pd.read_csv(args.dist_matrix_file, sep="\t", index_col=0)
    except IOError as ioe:
        sys.exit("\nError reading in distance matrix file: {}.".format(ioe))

    # mapping and colors info for plotting
    try:
        header, map_data = util.parse_map_file(args.map_fp)
    except IOError as ioe:
        sys.exit("\nError reading mapping file: {}.".format(ioe))
    y = [map_data[sid][header.index(args.group_by)] for sid in dm_data.index]
    cond_colors = util.color_mapping(map_data, header, args.group_by,
                                     args.color_by)

    # Prep input data for t-SNE.
    # BUG FIX: the original selected columns with dm_data[range(shape[1])],
    # which treats the ints as column *labels* and raises KeyError when
    # the columns are sample IDs; the full numeric matrix is what is
    # wanted here.
    X = dm_data.values
    X_tsne = TSNE(n_components=3, metric="precomputed").fit_transform(X)

    # Plot t-SNE result
    fig = plt.figure(figsize=(14, 8))
    for cond, sid, xy in zip(y, dm_data.index, X_tsne):
        plt.scatter(x=xy[0], y=xy[1], s=150, c=cond_colors[cond], alpha=0.85,
                    edgecolors="k")
        if args.annotate:
            # text passed positionally: the ``s=`` keyword was removed in
            # matplotlib 3.3
            plt.annotate(sid, xy=(xy[0], xy[1]), xytext=(12, 12),
                         textcoords="offset points", ha="center", va="center",
                         alpha=1, style="italic")
    if args.plot_title is not None:
        plt.title(args.plot_title, fontsize=16, weight="bold")
    # proxy artists for the legend, one per condition
    handles = [plt.scatter([], [], c=cond_colors[cond], s=150, edgecolors="k")
               for cond in cond_colors]
    plt.legend(handles, ["{}".format(cond) for cond in cond_colors],
               loc="best", scatterpoints=3, frameon=True, framealpha=1,
               fontsize=14)
    plt.xlabel("t-SNE 1", fontsize=16)
    plt.ylabel("t-SNE 2", fontsize=16)
    plt.xticks(size=12)
    plt.yticks(size=12)
    plt.grid()
    plt.show()
def main():
    """Plot overlap among per-group core microbiome OTU sets."""
    args = handle_program_options()

    # Parse and read mapping file
    try:
        header, imap = util.parse_map_file(args.map_fp)
        category_idx = header.index(args.group_by)
    except IOError as ioe:
        err_msg = "\nError in metadata mapping filepath (-m): {}\n"
        sys.exit(err_msg.format(ioe))

    # map groups to colors
    class_colors = util.color_mapping(imap, header, args.group_by,
                                      args.color_by)

    # Absent standard core files means the TSV variants were supplied.
    use_tsv = args.core_files is None
    core_files = args.tsv_core_files if use_tsv else args.core_files

    # Pair each core file with its category, ordered as in class_colors.
    group_cores = OrderedDict()
    for group, core_fp in zip(class_colors, core_files):
        if use_tsv:
            group_cores[group] = load_tsv_core(core_fp, args.skipheader)
            continue
        core = load_core_file(core_fp)
        classified = (name for name in core.values()
                      if not name.startswith("Unclassified"))
        group_cores[group] = [name.replace("_", " ") for name in classified]

    # Union of every group's core set.
    overlap = set()
    overlap.update(*group_cores.values())

    plot_overlaps(overlap, group_cores, class_colors, out_fp=args.out_fp,
                  fig_size=args.figsize, title=args.title,
                  filter_common=args.filtercommon)
Example #14
0
def main():
    """Drop unmapped samples from a BIOM table, then drop empty OTUs."""
    args = handle_program_options()

    # Error check input file
    try:
        biomf = biom.load_table(args.input_biom_fnh)
    except IOError as ioe:
        sys.exit("\nError in BIOM file path: {}\n".format(ioe))
    try:
        mheader, mapf = util.parse_map_file(args.mapping_fnh)
    except IOError as ioe:
        sys.exit("\nError in mapping file path: {}\n".format(ioe))

    # Keep only the samples that appear in the mapping file.
    sid_filtered_biomf = biomf.filter(mapf.keys(), inplace=False)
    print("\n{} sampleIDs retained from original biom file.".format(
        len(sid_filtered_biomf.ids())))

    # Total abundance per OTU across the retained samples.
    abd_sum = dict(zip(sid_filtered_biomf.ids("observation"),
                       sid_filtered_biomf.sum(axis="observation")))

    # OTUs whose total abundance is zero carry no information; drop them.
    redundant_otuids = [otu for otu, abd in abd_sum.items() if abd == 0]
    otuid_filtered_biomf = sid_filtered_biomf.filter(
        redundant_otuids, "observation", invert=True, inplace=False)
    print("{} IDs filtered out of the original biom file.\n".format(
        len(redundant_otuids)))

    # Write out files
    with bo(args.output_biom_fnh, "w") as biom_out:
        otuid_filtered_biomf.to_hdf5(biom_out, "Filtered OTU Table.")
    with open(args.filter_otuids_fnh, "w") as otu_list_out:
        for otuid in redundant_otuids:
            otu_list_out.write("{}\n".format(otuid))
Example #15
0
def main():
    """Generate an iTOL (Interactive Tree of Life) dataset file of
    per-group abundance metrics (MRA, NMRA, or log raw abundance).

    Optionally rewrites the input Newick tree with taxonomy-derived OTU
    names so the tree's labels match the dataset rows.
    """
    args = handle_program_options()

    # Fail fast with a friendly message if either input is unreadable.
    try:
        with open(args.otu_table):
            pass
    except IOError as ioe:
        sys.exit(
            "\nError with OTU_Sample abundance data file:{}\n".format(ioe))

    try:
        with open(args.mapping):
            pass
    except IOError as ioe:
        sys.exit("\nError with mapping file:{}\n".format(ioe))

    # input data
    biomf = biom.load_table(args.otu_table)
    map_header, imap = util.parse_map_file(args.mapping)

    # rewrite tree file with otu names, skip if keep_otuids specified
    if args.input_tree and not args.keep_otuids:
        with open(args.input_tree) as treF, open(args.output_tre, "w") as outF:
            # NOTE(review): only the first line is read -- assumes a
            # single-line Newick file; TODO confirm.
            tree = treF.readline()
            if "'" in tree:
                # Strip single quotes so name replacement works cleanly.
                tree = tree.replace("'", '')
            outF.write(newick_replace_otuids(tree, biomf))

    # OTU ID -> taxonomy metadata, used to build display names below.
    if not args.keep_otuids:
        oid_rows = {
            id_: md["taxonomy"]
            for val, id_, md in biomf.iter(axis="observation")
        }

    # calculate analysis results
    categories = None
    if args.map_categories is not None and args.analysis_metric != "raw":
        categories = args.map_categories.split(",")

    # set transform if --stabilize_variance is specfied
    tform = bc.arcsine_sqrt_transform if args.stabilize_variance else None

    groups = util.gather_categories(imap, map_header, categories)
    # Compute the requested metric per group, keyed either by raw OTU ID
    # or by taxonomy-derived OTU name.
    for group in groups.values():
        if args.analysis_metric in ["MRA", "NMRA"]:
            results = bc.MRA(biomf, group.sids, transform=tform)
        elif args.analysis_metric == "raw":
            results = bc.transform_raw_abundance(biomf,
                                                 sampleIDs=group.sids,
                                                 sample_abd=False)
        if args.keep_otuids:
            group.results.update({oid: results[oid] for oid in results})
        else:
            group.results.update(
                {oc.otu_name(oid_rows[oid]): results[oid]
                 for oid in results})

    # write iTol data set file: a gradient dataset for raw abundance,
    # otherwise a multi-bar dataset with one field per group.
    with open(args.output_itol_table, "w") as itolF:
        if args.analysis_metric == "raw":
            itolF.write("DATASET_GRADIENT\nSEPARATOR TAB\n")
            itolF.write("DATASET_LABEL\tLog Total Abundance\n")
            itolF.write("COLOR\t#000000\n")
            itolF.write("LEGEND_TITLE\tLog Total Abundance\n")
            itolF.write("LEGEND_SHAPES\t1\n")
            itolF.write("LEGEND_COLORS\t#000000\n")
            itolF.write("LEGEND_LABELS\tLog Total Abundance\n")
            itolF.write("COLOR_MIN\t#FFFFFF\n")
            itolF.write("COLOR_MAX\t#000000\n")
        else:
            itolF.write("DATASET_MULTIBAR\nSEPARATOR TAB\n")
            itolF.write("DATASET_LABEL\t{}\n".format(args.analysis_metric))
            itolF.write("FIELD_COLORS\t{}\n".format("\t".join(
                ["#ff0000" for _ in range(len(groups))])))
            itolF.write("FIELD_LABELS\t" + "\t".join(groups.keys()) + "\n")
            itolF.write("LEGEND_TITLE\t{}\n".format(args.analysis_metric))
            itolF.write("LEGEND_SHAPES\t{}\n".format("\t".join(
                ["1" for _ in range(len(groups))])))
            itolF.write("LEGEND_COLORS\t{}\n".format("\t".join(
                ["#ff0000" for _ in range(len(groups))])))
            itolF.write("LEGEND_LABELS\t" + "\t".join(groups.keys()) + "\n")
            itolF.write("WIDTH\t300\n")
        itolF.write("DATA\n")

        # Full universe of row identifiers: raw OTU IDs, or taxonomy
        # names, matching how group.results was keyed above.
        if args.keep_otuids:
            all_otus = frozenset(
                {id_
                 for id_ in biomf.ids(axis="observation")})
        else:
            all_otus = frozenset({
                oc.otu_name(md["taxonomy"])
                for val, id_, md in biomf.iter(axis="observation")
            })

        # One output row per OTU: its name plus one value per group
        # (0.0 when the OTU is absent from that group's results).
        for oname in all_otus:
            row = ["{name}"]  # \t{s:.2f}\t{ns:.2f}\n"
            row_data = {"name": oname}
            msum = 0
            for name, group in groups.iteritems():
                row.append("{{{}:.5f}}".format(name))
                if oname in group.results:
                    row_data[name] = group.results[oname]
                else:
                    row_data[name] = 0.0
                msum += row_data[name]
            # normalize avg relative abundance data
            if args.analysis_metric == "NMRA" and msum > 0:
                row_data.update({
                    key: data / msum
                    for key, data in row_data.items() if key != "name"
                })
            itolF.write("\t".join(row).format(**row_data) + "\n")
Example #16
0
def main():
    """Produce one PCoA scatter chart per requested OTU, with point sizes
    proportional to that OTU's per-sample relative abundance.

    Abundances are arcsine-sqrt transformed and scaled by ``--scale_by``.
    """
    args = handle_program_options()

    # Fail fast with readable messages if any input file is unreadable.
    try:
        with open(args.otu_table):
            pass
    except IOError as ioe:
        sys.exit("\nError with BIOM format file:{}\n".format(ioe))

    try:
        with open(args.pcoa_fp):
            pass
    except IOError as ioe:
        sys.exit("\nError with principal coordinates file:{}\n".format(ioe))

    try:
        with open(args.mapping):
            pass
    except IOError as ioe:
        sys.exit("\nError with mapping file:{}\n".format(ioe))

    # check that the output dir exists, create it if not
    util.ensure_dir(args.output_dir)

    # load the BIOM table
    biomtbl = biom.load_table(args.otu_table)

    # Read unifrac principal coordinates file
    unifrac = util.parse_unifrac(args.pcoa_fp)

    # Read otu data file (one OTU ID per line)
    otus = set()
    with open(args.otu_ids_fp, "rU") as nciF:
        for line in nciF.readlines():
            line = line.strip()
            otus.add(line)

    # Gather categories from mapping file
    header, imap = util.parse_map_file(args.mapping)
    try:
        category_idx = header.index(args.group_by)
    except ValueError:
        msg = "Error: Specified mapping category '{}' not found."
        sys.exit(msg.format(args.group_by))
    category_ids = util.gather_categories(imap, header, [args.group_by])
    color_map = util.color_mapping(imap, header, args.group_by, args.colors)
    # Variance-stabilized relative abundance, used to size the points.
    rel_abd = bc.relative_abundance(biomtbl)
    rel_abd = bc.arcsine_sqrt_transform(rel_abd)

    # plot samples based on relative abundance of some OTU ID
    for otuid in otus:
        otuname = oc.otu_name(biomtbl.metadata(otuid, axis="observation")["taxonomy"])
        cat_data = {cat: {"pc1": [], "pc2": [], "size": []}
                    for cat in category_ids}

        # Bucket each sample's first two principal coordinates and its
        # point size under the sample's category.
        for sid in unifrac["pcd"]:
            category = cat_data[imap[sid][category_idx]]
            try:
                size = rel_abd[sid][otuid] * args.scale_by
            except KeyError as ke:
                # Sample missing from the abundance table: report & skip.
                print("{} not found in {} sample.".format(ke, sid))
                continue
            category["pc1"].append(float(unifrac["pcd"][sid][0]))
            category["pc2"].append(float(unifrac["pcd"][sid][1]))
            category["size"].append(size)

        if args.verbose:
            print("Saving chart for {}".format(" ".join(otuname.split("_"))))
        xr, yr = calculate_xy_range(cat_data)
        plot_PCoA(cat_data, otuname, unifrac, color_map.keys(),
                  color_map, xr, yr, args.output_dir,
                  args.save_as, args.ggplot2_style)
Example #17
0
 def setUp(self):
     """
     Set up data for testing: parse the bundled test mapping file into
     a header list (self.map_header) and a sample-ID keyed dict of row
     data (self.map_data).
     """
     self.map_header, self.map_data = ut.parse_map_file(
         "phylotoast/test/test_mapping_file.txt")
Example #18
0
def main():
    """Draw a 2-D or 3-D PCoA scatter plot from a QIIME principal
    coordinates file, with points grouped and colored by a mapping-file
    category.
    """
    args = handle_program_options()

    # Fail fast if either input file is unreadable.
    try:
        with open(args.coord_fp):
            pass
    except IOError as ioe:
        err_msg = "\nError in input principal coordinates filepath (-i): {}\n"
        sys.exit(err_msg.format(ioe))

    try:
        with open(args.map_fp):
            pass
    except IOError as ioe:
        err_msg = "\nError in input metadata mapping filepath (-m): {}\n"
        sys.exit(err_msg.format(ioe))

    with open(args.coord_fp) as F:
        pcd = F.readlines()
    # NOTE(review): ``pcd`` is built here but never used again;
    # parse_unifrac() re-reads the file below -- possibly dead code.
    pcd = [line.split("\t") for line in pcd]

    map_header, imap = util.parse_map_file(args.map_fp)

    data_gather = util.gather_categories(imap, map_header,
                                         args.group_by.split(","))
    # One pc1/pc2/pc3 bucket per condition, in insertion order.
    categories = OrderedDict([(condition, {
        "pc1": [],
        "pc2": [],
        "pc3": []
    }) for condition in data_gather.keys()])

    # Colors either cycle through the Set3 palette or come from the
    # user-specified color column of the mapping file.
    bcolors = itertools.cycle(Set3_12.hex_colors)
    if not args.colors:
        colors = [bcolors.next() for _ in categories]
    else:
        colors = util.color_mapping(imap, map_header, args.group_by,
                                    args.colors)
        colors = colors.values()

    parsed_unifrac = util.parse_unifrac(args.coord_fp)

    # Principal-coordinate display order; a third axis is added for 3-D.
    pco = args.pc_order
    if args.dimensions == 3:
        pco.append(3)

    # Percent explained variance for each plotted axis.
    pc1v = parsed_unifrac["varexp"][pco[0] - 1]
    pc2v = parsed_unifrac["varexp"][pco[1] - 1]
    if args.dimensions == 3:
        pc3v = parsed_unifrac["varexp"][pco[2] - 1]

    # Bucket each sample's coordinates under the condition it belongs to.
    for sid, points in parsed_unifrac["pcd"].items():
        for condition, dc in data_gather.items():
            if sid in dc.sids:
                cat = condition
                break
        categories[cat]["pc1"].append((sid, points[pco[0] - 1]))
        categories[cat]["pc2"].append((sid, points[pco[1] - 1]))

        if args.dimensions == 3:
            categories[cat]["pc3"].append((sid, points[pco[2] - 1]))

    axis_str = "PC{} (Percent Explained Variance {:.3f}%)"
    # initialize plot
    fig = plt.figure(figsize=args.figsize)
    if args.dimensions == 3:
        ax = fig.add_subplot(111, projection="3d")
        ax.view_init(elev=args.z_angles[1], azim=args.z_angles[0])
        ax.set_zlabel(axis_str.format(3, pc3v), labelpad=args.label_padding)
        if args.z_limits:
            ax.set_zlim(args.z_limits)
    else:
        ax = fig.add_subplot(111)

    # plot data: one scatter series per condition.
    # NOTE(review): indexing ``colors[i]`` assumes the ordering of
    # ``colors`` matches ``categories`` -- holds only if color_mapping
    # preserves category order; confirm.
    for i, cat in enumerate(categories):
        if args.dimensions == 3:
            ax.scatter(xs=[e[1] for e in categories[cat]["pc1"]],
                       ys=[e[1] for e in categories[cat]["pc2"]],
                       zs=[e[1] for e in categories[cat]["pc3"]],
                       zdir="z",
                       c=colors[i],
                       s=args.point_size,
                       label=cat,
                       edgecolors="k")
        else:
            ax.scatter([e[1] for e in categories[cat]["pc1"]],
                       [e[1] for e in categories[cat]["pc2"]],
                       c=colors[i],
                       s=args.point_size,
                       label=cat,
                       edgecolors="k")

        # Script to annotate PCoA sample points with their sample IDs.
        if args.annotate_points:
            for x, y in zip(categories[cat]["pc1"], categories[cat]["pc2"]):
                ax.annotate(
                    x[0],
                    xy=(x[1], y[1]),
                    xytext=(-10, -15),
                    textcoords="offset points",
                    ha="center",
                    va="center",
                )

    # customize plot options
    if args.x_limits:
        ax.set_xlim(args.x_limits)
    if args.y_limits:
        ax.set_ylim(args.y_limits)

    ax.set_xlabel(axis_str.format(pco[0], float(pc1v)),
                  labelpad=args.label_padding)
    ax.set_ylabel(axis_str.format(pco[1], float(pc2v)),
                  labelpad=args.label_padding)

    leg = plt.legend(loc="best", scatterpoints=3, frameon=True, framealpha=1)
    leg.get_frame().set_edgecolor('k')

    # Set the font characteristics
    font = {"family": "normal", "weight": "bold", "size": args.font_size}
    mpl.rc("font", **font)

    if args.title:
        ax.set_title(args.title)

    if args.ggplot2_style and not args.dimensions == 3:
        gu.ggplot2_style(ax)

    # save or display result
    if args.out_fp:
        fig.savefig(args.out_fp,
                    facecolor="white",
                    edgecolor="none",
                    bbox_inches="tight",
                    pad_inches=0.2)
    else:
        plt.show()
Example #19
0
def main():
    """Run LDA on UniFrac distances or relative abundances and plot the result.

    The LDA scatter plot is always produced; if ``--bubble`` is given, an
    additional per-OTU "bubble" plot is saved for each listed OTU, with each
    sample point scaled by that sample's relative abundance of the OTU.
    """
    args = handle_program_options()

    # Fail early with a clear message if the mapping file is unreadable.
    try:
        with open(args.map_fp):
            pass
    except IOError as ioe:
        err_msg = "\nError in metadata mapping filepath (-m): {}\n"
        sys.exit(err_msg.format(ioe))

    # Parse and read mapping file and obtain group colors
    header, imap = util.parse_map_file(args.map_fp)
    class_colors = util.color_mapping(imap, header, args.group_by, args.color_by)

    if args.input_data_type == "unifrac_dm":
        # Verify the distance-matrix file exists before parsing it.
        try:
            with open(args.unifrac_file):
                pass
        except IOError as ioe:
            err_msg = "\nError with unifrac distance matrix file (-d): {}\n"
            sys.exit(err_msg.format(ioe))
        uf_data = pd.read_csv(args.unifrac_file, sep="\t", index_col=0)
        # Prepend the group label of each sample as the "Condition" column,
        # which run_LDA uses as the class variable.
        uf_data.insert(0, "Condition", [imap[sid][header.index(args.group_by)]
                                        for sid in uf_data.index])
        sampleids = uf_data.index
        if args.save_lda_input:
            uf_data.to_csv(args.save_lda_input, sep="\t")
        # Run LDA
        X_lda, y_lda, exp_var = run_LDA(uf_data)
        # Plot LDA
        plot_LDA(X_lda, y_lda, class_colors, exp_var, style=args.ggplot2_style,
                 out_fp=args.out_fp)
    else:
        # Load biom file and calculate relative abundance
        try:
            rel_abd = get_relative_abundance(args.biom_file)
        except ValueError as ve:
            err_msg = "\nError with biom format file (-d): {}\n"
            sys.exit(err_msg.format(ve))
        # Transpose so rows are samples and columns are OTUs.
        df_rel_abd = pd.DataFrame(rel_abd).T
        df_rel_abd.insert(0, "Condition", [imap[sid][header.index(args.group_by)]
                                           for sid in df_rel_abd.index])
        sampleids = df_rel_abd.index
        if args.save_lda_input:
            df_rel_abd.to_csv(args.save_lda_input, sep="\t")
        # Run LDA
        X_lda, y_lda, exp_var = run_LDA(df_rel_abd)
        # Plot LDA
        plot_LDA(X_lda, y_lda, class_colors, exp_var, style=args.ggplot2_style,
                 out_fp=args.out_fp)

    if args.bubble:
        # Get otus for LDA bubble plots
        try:
            with open(args.bubble) as hojiehr:
                # NOTE(review): each iteration reassigns bubble_otus, so only
                # the LAST line of the file is kept, split on carriage returns.
                # Verify the expected file format is a single CR-separated line.
                for line in hojiehr.readlines():
                    bubble_otus = line.strip().split("\r")
        except IOError as ioe:
            err_msg = "\nError in OTU name list file (--bubble): {}\n"
            sys.exit(err_msg.format(ioe))

        # Load biom file and calculate relative abundance
        try:
            rel_abd = get_relative_abundance(args.biom_file)
        except ValueError as ve:
            err_msg = "\nError with biom format file (-d): {}\n"
            sys.exit(err_msg.format(ve))
        category_idx = header.index(args.group_by)

        # Calculate position and size of SampleIDs to plot for each OTU
        for otuname in bubble_otus:
            # One x/y/size series per group; "label" is allocated but unused.
            plot_data = {cat: {"x": [], "y": [], "size": [], "label": []}
                         for cat in class_colors.keys()}
            for sid, data in zip(sampleids, X_lda):
                category = plot_data[imap[sid][category_idx]]
                try:
                    size = rel_abd[sid][otuname] * args.scale_by
                except KeyError as ke:
                    # Sample lacks this OTU; skip it rather than abort.
                    print "{} not found in {} sample.".format(ke, sid)
                    continue
                category["x"].append(float(data[0]))
                category["y"].append(float(data[1]))
                category["size"].append(size)

            # Plot LDA bubble for each OTU
            fig = plt.figure(figsize=(12, 9))
            ax = fig.add_subplot(111)
            for i, cat in enumerate(plot_data):
                plt.scatter(plot_data[cat]["x"], plot_data[cat]["y"],
                            plot_data[cat]["size"], label=cat,
                            color=class_colors[cat],
                            alpha=0.85, marker="o", edgecolor="k")
            mpl.rc("font", family="Arial")  # define font for figure text
            mpl.rc("xtick", labelsize=12)  # increase X axis ticksize
            mpl.rc("ytick", labelsize=12)  # increase Y axis ticksize
            if X_lda.shape[1] == 1:
                # With a single discriminant all y-values coincide; widen
                # the y-axis so points remain visible.
                plt.ylim((0.5, 2.5))
            plt.title(" ".join(otuname.split("_")), style="italic")
            plt.xlabel("LD1 (Percent Explained Variance: {:.3f}%)".format(exp_var[0]*100),
                       fontsize=12)
            plt.ylabel("LD2 (Percent Explained Variance: {:.3f}%)".format(exp_var[1]*100),
                       fontsize=12)
            lgnd = plt.legend(loc="best", scatterpoints=3, fontsize=12)
            # Change the legend marker size manually
            for i in range(len(class_colors.keys())):
                lgnd.legendHandles[i]._sizes = [75]

            # Set style for LDA bubble plots
            if args.ggplot2_style:
                gu.ggplot2_style(ax)
                fc = "0.8"
            else:
                fc = "none"

            # Save LDA bubble plots to output directory
            print "Saving chart for {}".format(" ".join(otuname.split("_")))
            fig.savefig(os.path.join(args.output_dir, "_".join(otuname.split())) + "." + args.save_as,
                        facecolor=fc, edgecolor="none", dpi=300,
                        bbox_inches="tight", pad_inches=0.2)
            plt.close(fig)
Example #20
0
def main():
    """Compute per-group abundance metrics and write an iTol data-set file.

    Optionally rewrites a Newick tree file with human-readable OTU names.
    """
    args = handle_program_options()

    # Fail early with clear messages if required inputs are unreadable.
    try:
        with open(args.otu_table):
            pass
    except IOError as ioe:
        sys.exit(
            '\nError with OTU_Sample abundance data file:{}\n'
            .format(ioe)
        )

    try:
        with open(args.mapping):
            pass
    except IOError as ioe:
        sys.exit(
            '\nError with mapping file:{}\n'
            .format(ioe)
        )

    # input data
    # The BIOM table is stored as a single JSON line.
    with open(args.otu_table) as bF:
        biom = json.loads(bF.readline())
    map_header, imap = util.parse_map_file(args.mapping)

    # rewrite tree file with otu names
    if args.input_tree:
        with open(args.input_tree) as treF, open(args.output_tre, 'w') as outF:
            tree = treF.readline()
            # Strip quoting so OTU-id substitution matches cleanly.
            if "'" in tree:
                tree = tree.replace("'", '')
            outF.write(newick_replace_otuids(tree, biom))

    # Index BIOM rows by observation (OTU) id for fast lookup below.
    oid_rows = {row['id']: row for row in biom['rows']}

    # calculate analysis results
    categories = None
    if args.map_categories is not None:
        categories = args.map_categories.split(',')

    # set transform if --stabilize_variance is specfied
    tform = bc.arcsine_sqrt_transform if args.stabilize_variance else None

    groups = util.gather_categories(imap, map_header, categories)
    for group in groups.values():
        # NOTE(review): `results` is only bound when the metric is one of
        # MRA/NMRA/raw — other values would raise NameError below; verify
        # the argument parser restricts --analysis-metric to these choices.
        if args.analysis_metric in ['MRA', 'NMRA']:
            results = bc.MRA(biom, group.sids, transform=tform)
        elif args.analysis_metric == 'raw':
            results = bc.transform_raw_abundance(biom, sampleIDs=group.sids,
                                                 sample_abd=False)

        group.results.update({oc.otu_name_biom(oid_rows[oid]): results[oid]
                             for oid in results})

    # write iTol data set file
    with open(args.output_itol_table, 'w') as itolF:
        itolF.write('LABELS\t' + '\t'.join(groups.keys())+'\n')
        itolF.write('COLORS\t{}\n'.format('\t'.join(['#ff0000'
                    for _ in range(len(groups))])))
        all_otus = frozenset({oc.otu_name_biom(row) for row in biom['rows']})

        for oname in all_otus:
            # Build one row template per OTU: name plus one value per group.
            row = ['{name}']        # \t{s:.2f}\t{ns:.2f}\n'
            row_data = {'name': oname}
            msum = 0
            for name, group in groups.iteritems():
                row.append('{{{}:.5f}}'.format(name))
                if oname in group.results:
                    row_data[name] = group.results[oname]
                else:
                    row_data[name] = 0.0
                msum += row_data[name]
            # normalize avg relative abundance data
            if args.analysis_metric == 'NMRA' and msum > 0:
                row_data.update({key: data/msum
                                for key, data in row_data.items()
                                if key != 'name'})

            itolF.write('\t'.join(row).format(**row_data) + '\n')
Example #21
0
def main():
    args = handle_program_options()

    try:
        with open(args.otu_table):
            pass
    except IOError as ioe:
        sys.exit("\nError with BIOM format file:{}\n".format(ioe))

    try:
        with open(args.pcoa_fp):
            pass
    except IOError as ioe:
        sys.exit("\nError with principal coordinates file:{}\n".format(ioe))

    try:
        with open(args.mapping):
            pass
    except IOError as ioe:
        sys.exit("\nError with mapping file:{}\n".format(ioe))

    if not os.path.exists(args.output_dir):
        try:
            os.mkdir(args.output_dir)
        except OSError as oe:
            if os.errno == 2:
                msg = ("One or more directories in the path provided for " +
                       "--output-dir ({}) do not exist. If you are specifying " +
                       "a new directory for output, please ensure all other " +
                       "directories in the path currently exist.")
                sys.exit(msg.format(args.output_dir))
            else:
                msg = ("An error occurred trying to create the output " +
                       "directory ({}) with message: {}")
                sys.exit(msg.format(args.output_dir, oe.strerror))

    # load the BIOM table
    biomtbl = biom.load_table(args.otu_table)

    # Read unifrac principal coordinates file
    unifrac = util.parse_unifrac(args.pcoa_fp)

    # Read otu data file
    otus = set()
    with open(args.otu_ids_fp, "rU") as nciF:
        for line in nciF.readlines():
            line = line.strip()
            otus.add(line)

    # Gather categories from mapping file
    header, imap = util.parse_map_file(args.mapping)
    try:
        category_idx = header.index(args.group_by)
    except ValueError:
        msg = "Error: Specified mapping category '{}' not found."
        sys.exit(msg.format(args.group_by))
    category_ids = util.gather_categories(imap, header, [args.group_by])
    color_map = util.color_mapping(imap, header, args.group_by, args.colors)
    rel_abd = get_relative_abundance(biomtbl)

    # plot samples based on relative abundance of some OTU ID
    for otuid in otus:
        otuname = oc.otu_name(biomtbl.metadata(otuid, axis="observation")["taxonomy"])
        cat_data = {cat: {"pc1": [], "pc2": [], "size": []}
                    for cat in category_ids}

        for sid in unifrac["pcd"]:
            category = cat_data[imap[sid][category_idx]]
            try:
                size = rel_abd[sid][otuid] * args.scale_by
            except KeyError as ke:
                print "{} not found in {} sample.".format(ke, sid)
                continue
            category["pc1"].append(float(unifrac["pcd"][sid][0]))
            category["pc2"].append(float(unifrac["pcd"][sid][1]))
            category["size"].append(size)

        if args.verbose:
            print "Saving chart for {}".format(" ".join(otuname.split("_")))
        xr, yr = calculate_xy_range(cat_data)
        plot_PCoA(cat_data, otuname, unifrac, color_map.keys(),
                  color_map, xr, yr, args.output_dir,
                  args.save_as, args.ggplot2_style)
Example #22
0
def main():
    """Run LDA on a distance matrix or BIOM table and plot in 2D or 3D."""
    args = handle_program_options()

    # Parse and read mapping file
    try:
        header, imap = util.parse_map_file(args.map_fp)
        category_idx = header.index(args.group_by)
    except IOError as ioe:
        err_msg = "\nError in metadata mapping filepath (-m): {}\n"
        sys.exit(err_msg.format(ioe))

    # Obtain group colors
    try:
        assert args.colors is not None
    except AssertionError:
        # No user-supplied colors: cycle a qualitative palette over all
        # distinct values of the group-by category.
        categories = {v[category_idx] for k, v in imap.items()}
        color_cycle = cycle(Set3_12.hex_colors)
        class_colors = {c: color_cycle.next() for c in categories}
    else:
        class_colors = util.color_mapping(imap, header, args.group_by,
                                          args.colors)

    if args.dist_matrix_file:
        try:
            dm_data = pd.read_csv(args.dist_matrix_file, sep="\t", index_col=0)
        except IOError as ioe:
            err_msg = "\nError with unifrac distance matrix file (-d): {}\n"
            sys.exit(err_msg.format(ioe))
        # Prepend each sample's group label as the class column for run_LDA.
        dm_data.insert(0, "Condition",
                       [imap[str(sid)][category_idx] for sid in dm_data.index])
        if args.annotate_points:
            sampleids = [str(sid) for sid in dm_data.index]
        else:
            sampleids = None
        if args.save_lda_input:
            dm_data.to_csv(args.save_lda_input, sep="\t")
        # Run LDA
        X_lda, y_lda, exp_var = run_LDA(dm_data)
    else:
        # Load biom file and calculate relative abundance
        try:
            biomf = biom.load_table(args.otu_table)
        except IOError as ioe:
            err_msg = "\nError with biom format file (-d): {}\n"
            sys.exit(err_msg.format(ioe))
        # Get normalized relative abundances
        rel_abd = bc.relative_abundance(biomf)
        rel_abd = bc.arcsine_sqrt_transform(rel_abd)
        df_rel_abd = pd.DataFrame(rel_abd).T
        df_rel_abd.insert(
            0, "Condition",
            [imap[sid][category_idx] for sid in df_rel_abd.index])
        if args.annotate_points:
            sampleids = df_rel_abd.index
        else:
            sampleids = None
        if args.save_lda_input:
            df_rel_abd.to_csv(args.save_lda_input, sep="\t")
        # Run LDA
        X_lda, y_lda, exp_var = run_LDA(df_rel_abd)

    # Plot LDA: the 3D branch additionally passes dim=3 and view angles;
    # all other keyword arguments are identical between the two calls.
    if args.dimensions == 3:
        plot_LDA(X_lda,
                 y_lda,
                 class_colors,
                 exp_var,
                 style=args.ggplot2_style,
                 fig_size=args.figsize,
                 label_pad=args.label_padding,
                 font_size=args.font_size,
                 sids=sampleids,
                 dim=3,
                 zangles=args.z_angles,
                 pt_size=args.point_size,
                 out_fp=args.out_fp)
    else:
        plot_LDA(X_lda,
                 y_lda,
                 class_colors,
                 exp_var,
                 style=args.ggplot2_style,
                 fig_size=args.figsize,
                 label_pad=args.label_padding,
                 font_size=args.font_size,
                 sids=sampleids,
                 pt_size=args.point_size,
                 out_fp=args.out_fp)
Example #23
0
def main():
    args = handle_program_options()

    metrics = [m for m in alpha.__all__ if "_ci" not in m]
    try:
        metrics.remove("faith_pd")
    except ValueError:
        pass
    if args.show_available_metrics:
        print "\nAvailable alpha diversity metrics:"
        return "\n".join(metrics)

    # check that the output dir exists, create it if not
    msg = putil.ensure_dir(args.output_dir)
    # if an error occurs, print and exit
    if msg:
        sys.exit(msg)

    # parse mapping file
    try:
        header, sample_map = putil.parse_map_file(args.map_file)
    except Exception as ioe:
        err_msg = "\nError while processing the mapping file: {}\n"
        sys.exit(err_msg.format(ioe))

    # parse BIOM table
    try:
        biom_tbl = biom.load_table(args.biom_fp)
    except Exception as ioe:
        err_msg = "\nError loading BIOM table file: {}\n"
        sys.exit(err_msg.format(ioe))

    # group samples by category
    if args.category not in header:
        sys.exit("Category '{}' not found".format(args.category))
    cat_idx = header.index(args.category)
    cat_vals = {entry[cat_idx] for entry in sample_map.values()}

    plot_title = args.plot_title

    colors = putil.color_mapping(sample_map, header, args.category,
                                 args.color_by)

    # Perform diversity calculations and density plotting
    for method, x_label in izip_longest(args.diversity, args.x_label):
        if x_label is None:
            x_label = method.title()
        if method not in alpha.__all__:
            sys.exit("ERROR: Diversity metric not found: {}.".format(method))
        elif method in alpha.__all__ and method not in metrics:
            sys.exit(
                "Currently, PhyloToAST does not support {} metric.".format(
                    method))
        metric = eval("alpha." + method)
        div_calc, sample_ids = calc_diversity(metric, sample_map, biom_tbl,
                                              cat_vals, cat_idx)

        if args.save_calculations:
            write_diversity_metrics(div_calc, sample_ids,
                                    args.save_calculations)

        plot_group_diversity(div_calc, colors, plot_title, x_label,
                             args.output_dir, args.image_type)

        # calculate and print significance testing results
        if not args.suppress_stats:
            print "Diversity significance testing: {}".format(x_label)
            if len(cat_vals) == 2:
                print_MannWhitneyU(div_calc)
            elif len(cat_vals) > 2:
                print_KruskalWallisH(div_calc)
            print
        else:
            continue
Example #24
0
def main():
    args = handle_program_options()

    try:
        with open(args.otu_table):
            pass
    except IOError as ioe:
        sys.exit('\nError with OTU/Sample abundance BIOM format file:{}\n'.format(ioe))

    try:
        with open(args.unifrac):
            pass
    except IOError as ioe:
        sys.exit('\nError with principle coordinates analysis file:{}\n'.format(ioe))

    try:
        with open(args.names_colors_ids_fn):
            pass
    except IOError as ioe:
        sys.exit('\nError with input data file:{}\n'.format(ioe))

    try:
        with open(args.mapping):
            pass
    except IOError as ioe:
        sys.exit('\nError with mapping file:{}\n'.format(ioe))

    if not os.path.exists(args.output_dir):
        try:
            os.mkdir(args.output_dir)
        except OSError as oe:
            if os.errno == 2:
                msg = ('One or more directories in the path provided for ' +
                       '--output-dir ({}) do not exist. If you are specifying ' +
                       'a new directory for output, please ensure all other ' +
                       'directories in the path currently exist.')
                sys.exit(msg.format(args.output_dir))
            else:
                msg = ('An error occurred trying to create the output ' +
                       'directory ({}) with message: {}')
                sys.exit(msg.format(args.output_dir, oe.strerror))

    with open(args.otu_table) as bF:
        biom = json.loads(bF.readline())

    unifrac = parse_unifrac(args.unifrac)

    otus = {}
    with open(args.names_colors_ids_fn, 'rU') as nciF:
        category_names = nciF.readline().strip().split('\t')
        category_colors = nciF.readline().strip().split('\t')
        for line in nciF.readlines():
            line = line.split()
            otus[line[0]] = ' '.join(line[1:])
    header, imap = util.parse_map_file(args.mapping)

    try:
        category_idx = header.index(args.map_category)
    except ValueError:
        msg = "Error: Specified mapping category '{}' not found"
        sys.exit(msg.format(args.map_category))
    category_ids = link_samples_to_categories(imap, category_idx)

    # plot samples based on relative abundance of some OTU ID
    for otuID in otus:
        cat_data = {cat: {'pc1': [], 'pc2': [], 'size': [], 'zpc1': [], 'zpc2': []}
                    for cat in category_ids}

        for sid in unifrac['pcd']:
            category = cat_data[imap[sid][category_idx]]
            size = rel_abundance(otuID, sid, biom, args.scaling_factor)
            # if size > 0:
            category['pc1'].append(float(unifrac['pcd'][sid][0]))
            category['pc2'].append(float(unifrac['pcd'][sid][1]))
            category['size'].append(size)
#            else:
#                category['zpc1'].append(float(unifrac['pcd'][sid][1]))
#                category['zpc2'].append(float(unifrac['pcd'][sid][2]))

        if args.verbose:
            print 'Plotting chart for {}'.format(otus[otuID])
        xr, yr = calculate_xy_range(cat_data)
        plot_PCoA(cat_data, otus[otuID], unifrac, category_names,
                  category_colors, xr, yr, args.output_dir)
Example #25
0
def main():
    """Plot PCoA sample points sized by relative abundance of selected OTUs.

    For each OTU ID in ``--otu_ids_fp``, saves a chart placing each sample
    at its UniFrac principal-coordinate position, scaled by the (arcsine
    square-root transformed) relative abundance of that OTU.
    """
    args = handle_program_options()

    # Fail early with clear messages if required inputs are unreadable.
    try:
        with open(args.otu_table):
            pass
    except IOError as ioe:
        sys.exit("\nError with BIOM format file:{}\n".format(ioe))

    try:
        with open(args.pcoa_fp):
            pass
    except IOError as ioe:
        sys.exit("\nError with principal coordinates file:{}\n".format(ioe))

    try:
        with open(args.mapping):
            pass
    except IOError as ioe:
        sys.exit("\nError with mapping file:{}\n".format(ioe))

    # check that the output dir exists, create it if not
    util.ensure_dir(args.output_dir)

    # load the BIOM table
    biomtbl = biom.load_table(args.otu_table)

    # Read unifrac principal coordinates file
    unifrac = util.parse_unifrac(args.pcoa_fp)

    # Read otu data file (one OTU ID per line)
    otus = set()
    with open(args.otu_ids_fp, "rU") as nciF:
        for line in nciF.readlines():
            line = line.strip()
            otus.add(line)

    # Gather categories from mapping file
    header, imap = util.parse_map_file(args.mapping)
    try:
        category_idx = header.index(args.group_by)
    except ValueError:
        msg = "Error: Specified mapping category '{}' not found."
        sys.exit(msg.format(args.group_by))
    category_ids = util.gather_categories(imap, header, [args.group_by])
    color_map = util.color_mapping(imap, header, args.group_by, args.colors)
    # Variance-stabilized relative abundances for sizing points.
    rel_abd = bc.relative_abundance(biomtbl)
    rel_abd = bc.arcsine_sqrt_transform(rel_abd)

    # plot samples based on relative abundance of some OTU ID
    for otuid in otus:
        otuname = oc.otu_name(
            biomtbl.metadata(otuid, axis="observation")["taxonomy"])
        cat_data = {
            cat: {
                "pc1": [],
                "pc2": [],
                "size": []
            }
            for cat in category_ids
        }

        for sid in unifrac["pcd"]:
            category = cat_data[imap[sid][category_idx]]
            try:
                size = rel_abd[sid][otuid] * args.scale_by
            except KeyError as ke:
                # Sample lacks this OTU; skip it rather than abort.
                print("{} not found in {} sample.".format(ke, sid))
                continue
            category["pc1"].append(float(unifrac["pcd"][sid][0]))
            category["pc2"].append(float(unifrac["pcd"][sid][1]))
            category["size"].append(size)

        if args.verbose:
            print("Saving chart for {}".format(" ".join(otuname.split("_"))))
        xr, yr = calculate_xy_range(cat_data)
        plot_PCoA(cat_data, otuname, unifrac, color_map.keys(), color_map, xr,
                  yr, args.output_dir, args.save_as, args.ggplot2_style)
Example #26
0
def main():
    """Produce per-OTU LDA "bubble" plots.

    Runs LDA on either a UniFrac distance matrix or relative abundances,
    then for each OTU in ``--bubble`` saves a scatter plot where each
    sample point is scaled by its relative abundance of that OTU.
    """
    args = handle_program_options()

    # Parse and read mapping file
    try:
        header, imap = util.parse_map_file(args.map_fp)
        category_idx = header.index(args.group_by)
    except IOError as ioe:
        err_msg = "\nError in metadata mapping filepath (-m): {}\n"
        sys.exit(err_msg.format(ioe))
    # Obtain group colors
    class_colors = util.color_mapping(imap, header, args.group_by, args.color_by)

    # Get otus for LDA bubble plots
    try:
        bubble_otus = set(pd.read_csv(args.otu_ids_fp, sep="\n", header=None)[0])
    except IOError as ioe:
        err_msg = "\nError in OTU IDs file (--bubble): {}\n"
        sys.exit(err_msg.format(ioe))

    # Load biom file and calculate relative abundance
    try:
        biomf = biom.load_table(args.otu_table)
    except IOError as ioe:
        err_msg = "\nError with biom format file (-d): {}\n"
        sys.exit(err_msg.format(ioe))

    # Get normalized relative abundances
    rel_abd = bc.relative_abundance(biomf)
    rel_abd = bc.arcsine_sqrt_transform(rel_abd)
    # Derive the legend's bubble-size scale from the nonzero abundance range.
    abd_val = {abd for sid, v1 in rel_abd.items() for otuid, abd in v1.items() if abd > 0}
    bubble_range = np.linspace(min(abd_val), max(abd_val), num=5) * args.scale_by
    # Get abundance to the nearest 50
    bubble_range = [int(50 * round(float(abd)/50)) for abd in bubble_range[1:]]

    # Set up input for LDA calc and get LDA transformed data
    if args.dist_matrix_file:
        try:
            uf_data = pd.read_csv(args.dist_matrix_file, sep="\t", index_col=0)
        except IOError as ioe:
            err_msg = "\nError with unifrac distance matrix file (-d): {}\n"
            sys.exit(err_msg.format(ioe))
        uf_data.insert(0, "Condition", [imap[sid][category_idx] for sid in uf_data.index])
        sampleids = uf_data.index
        if args.save_lda_input:
            uf_data.to_csv(args.save_lda_input, sep="\t")
        # Run LDA
        X_lda, y_lda, exp_var = run_LDA(uf_data)
    else:
        df_rel_abd = pd.DataFrame(rel_abd).T
        df_rel_abd.insert(0, "Condition", [imap[sid][category_idx]
                                           for sid in df_rel_abd.index])
        sampleids = df_rel_abd.index
        if args.save_lda_input:
            df_rel_abd.to_csv(args.save_lda_input, sep="\t")
        # Run LDA
        X_lda, y_lda, exp_var = run_LDA(df_rel_abd)

    # Calculate position and size of SampleIDs to plot for each OTU
    for otuid in bubble_otus:
        otuname = oc.otu_name(biomf.metadata(otuid, axis="observation")["taxonomy"])
        plot_data = {cat: {"x": [], "y": [], "size": [], "label": []}
                     for cat in class_colors.keys()}
        for sid, data in zip(sampleids, X_lda):
            category = plot_data[imap[sid][category_idx]]
            try:
                size = rel_abd[sid][otuid] * args.scale_by
            except KeyError as ke:
                # Sample lacks this OTU; skip it rather than abort.
                print("{} not found in {} sample.".format(ke, sid))
                continue
            category["x"].append(float(data[0]))
            category["y"].append(float(data[1]))
            category["size"].append(size)

        # Plot LDA bubble for each OTU
        fig = plt.figure(figsize=args.figsize)
        ax = fig.add_subplot(111)
        for i, cat in enumerate(plot_data):
            plt.scatter(plot_data[cat]["x"], plot_data[cat]["y"],
                        s=plot_data[cat]["size"], label=cat, color=class_colors[cat],
                        alpha=0.85, edgecolors="k")
        if X_lda.shape[1] == 1:
            # Single discriminant: widen y-axis so coincident points show.
            plt.ylim((0.5, 2.5))
        plt.title(" ".join(otuname.split("_")), style="italic", fontsize=13)
        # FIX: the original used bare `except:` clauses, which also swallow
        # SystemExit/KeyboardInterrupt. Catch only the expected failures:
        # exp_var missing a component (IndexError) or a non-numeric entry
        # breaking the format spec (TypeError).
        try:
            plt.xlabel("LD1 (Percent Explained Variance: {:.3f}%)".format(exp_var[0]*100),
                       fontsize=13, labelpad=15)
        except (IndexError, TypeError):
            plt.xlabel("LD1", fontsize=13, labelpad=15)
        try:
            plt.ylabel("LD2 (Percent Explained Variance: {:.3f}%)".format(exp_var[1]*100),
                       fontsize=13, labelpad=15)
        except (IndexError, TypeError):
            plt.ylabel("LD2", fontsize=13, labelpad=15)

        lgnd1 = plt.legend(loc="best", scatterpoints=3, fontsize=13)
        for i in range(len(class_colors.keys())):
            lgnd1.legendHandles[i]._sizes = [80]  # Change the legend marker size manually
        # Add the legend manually to the current plot
        plt.gca().add_artist(lgnd1)

        # Second legend: reference bubbles showing the abundance size scale.
        c = [plt.scatter([], [], c="w", edgecolors="k", s=s1) for s1 in bubble_range]
        plt.legend(c, ["{}".format(s2) for s2 in bubble_range],
                   title="Scaled Bubble\n       Sizes", frameon=True, labelspacing=2,
                   fontsize=13, loc=4, scatterpoints=1, borderpad=1.1)

        # Set style for LDA bubble plots
        if args.ggplot2_style:
            gu.ggplot2_style(ax)
            fc = "0.8"
        else:
            fc = "none"

        # Save LDA bubble plots to output directory
        if args.verbose:
            print("Saving chart for {}".format(" ".join(otuname.split("_"))))
        fig.savefig(pj(args.output_dir, "_".join(otuname.split())) + "." + args.save_as,
                    facecolor=fc, edgecolor="none", dpi=300,
                    bbox_inches="tight", pad_inches=0.2)
        plt.close(fig)
Example #27
0
def main():
    """Plot PCoA results in 2D or 3D, colored by mapping-file categories."""
    args = handle_program_options()

    # Fail early with clear messages if required inputs are unreadable.
    try:
        with open(args.coord_fp):
            pass
    except IOError as ioe:
        err_msg = '\nError in input principal coordinates filepath (-i): {}\n'
        sys.exit(err_msg.format(ioe))

    try:
        with open(args.map_fp):
            pass
    except IOError as ioe:
        err_msg = '\nError in input metadata mapping filepath (-m): {}\n'
        sys.exit(err_msg.format(ioe))

    with open(args.coord_fp) as F:
        pcd = F.readlines()
    pcd = [line.split('\t') for line in pcd]

    map_header, imap = util.parse_map_file(args.map_fp)

    # Group sample IDs by the requested color-by categories, preserving
    # category order for stable color assignment.
    data_gather = util.gather_categories(imap, map_header,
                                         args.colorby.split(','))
    categories = OrderedDict([(condition, {'pc1': [], 'pc2': [], 'pc3': []})
                  for condition in data_gather.keys()])

    bmap = qualitative.Paired[12]
    bcolors = itertools.cycle(bmap.hex_colors)
    if not args.colors:
        colors = [bcolors.next() for _ in categories]
    else:
        colors = parse_colors(args.colors, categories)

    parsed_unifrac = util.parse_unifrac(args.coord_fp)

    # pco holds the 1-based PC numbers to plot on each axis.
    pco = args.pc_order if args.dimensions == 2 else [1, 2, 3]
    pc1v = parsed_unifrac['varexp'][pco[0]]
    pc2v = parsed_unifrac['varexp'][pco[1]]
    if args.dimensions == 3:
        pc3v = parsed_unifrac['varexp'][pco[2]]

    # Assign each sample's coordinates to its category bucket.
    for sid, points in parsed_unifrac['pcd'].iteritems():
        for condition, dc in data_gather.iteritems():
            if sid in dc.sids:
                cat = condition
                break
        categories[cat]['pc1'].append((sid, float(points[pco[0] - 1])))
        categories[cat]['pc2'].append((sid, float(points[pco[1] - 1])))

        if args.dimensions == 3:
            categories[cat]['pc3'].append((sid, float(points[pco[2] - 1])))

    axis_str = "PC{} - Percent variation explained {:.2f}%"
    # initialize plot
    fig = plt.figure(figsize=(14,8))
    if args.dimensions == 3:
        ax = fig.add_subplot(111, projection='3d')
        ax.view_init(elev=23., azim=-134.5)
        ax.set_zlabel(axis_str.format(pco[2], float(pc3v)))
        if args.z_limits:
            ax.set_zlim(args.z_limits)
    else:
        ax = fig.add_subplot(111)

    # plot data
    for i, cat in enumerate(categories):
        if args.dimensions == 3:
            ax.scatter(xs=[e[1] for e in categories[cat]['pc1']],
                       ys=[e[1] for e in categories[cat]['pc2']],
                       zs=[e[1] for e in categories[cat]['pc3']], zdir='z',
                       c=colors[i],
                       s=args.point_size)
        else:
            ax.scatter([e[1] for e in categories[cat]['pc1']],
                       [e[1] for e in categories[cat]['pc2']],
                       c=colors[i], s=args.point_size)

# Script to annotate PCoA points.
#             for x, y in zip(categories[cat]['pc1'], categories[cat]['pc2']):
#                 ax.annotate(
#                     x[0], xy=(x[1], y[1]), xytext=(-10, -15),
#                     textcoords='offset points', ha='center', va='center',
#                     )

    # customize plot options
    if args.x_limits:
        ax.set_xlim(args.x_limits)
    if args.y_limits:
        ax.set_ylim(args.y_limits)

    ax.set_xlabel(axis_str.format(pco[0], float(pc1v)))
    ax.set_ylabel(axis_str.format(pco[1], float(pc2v)))

    # Proxy rectangles stand in for the scatter series in the legend.
    ax.legend([Rectangle((0, 0), 1, 1, fc=colors[i])
              for i in range(len(categories))], categories.keys(), loc='best')

    if args.title:
        # NOTE(review): bare `title(...)` presumably comes from a pylab
        # star-import; verify, otherwise this raises NameError (plt.title
        # or ax.set_title would be the explicit forms).
        title(args.title)

    # save or display result
    if args.out_fp:
        fig.savefig(args.out_fp, facecolor='white',
                    edgecolor='none', dpi=args.dpi,
                    bbox_inches='tight', pad_inches=0.2)
    else:
        plt.show()
Example #28
0
def main():
    """Embed a precomputed sample distance matrix with t-SNE and plot the
    first two embedding axes, with points colored by sample group.

    Reads the distance matrix (-d) and metadata mapping file (-m) named in
    the parsed program options, runs exact t-SNE on the precomputed
    distances, and either saves the figure (if an output path was given)
    or displays it interactively.
    """
    args = handle_program_options()

    # Read in the distance data
    try:
        dm_data = pd.read_csv(args.dist_matrix_file, sep="\t", index_col=0)
        dm_data_sids = dm_data.index
        dm_data = pairwise_distances(dm_data[range(dm_data.shape[1])].values,
                                     metric="precomputed")
    except IOError as ioe:
        sys.exit("\nError reading in distance matrix file: {}.".format(ioe))

    # Mapping and colors info for plotting
    try:
        header, map_data = util.parse_map_file(args.map_fp)
    except IOError as ioe:
        sys.exit("\nError reading mapping file: {}.".format(ioe))
    # Group label for each sample, in distance-matrix order
    y = [map_data[sid][header.index(args.group_by)] for sid in dm_data_sids]

    # Get colors for all categories
    if not args.color_by:
        categories = set(y)
        bcolors = itertools.cycle(Set1_9.hex_colors)
        cond_colors = {c: bcolors.next() for c in categories}
    else:
        cond_colors = util.color_mapping(map_data, header, args.group_by, args.color_by)

    # Prep input data for t-SNE
    # NOTE(review): n_components=3 is requested but only the first two axes
    # are plotted below -- confirm whether the third component is intentional.
    X_tsne = TSNE(n_components=3, perplexity=args.perplexity, metric="precomputed",
                  method="exact", verbose=2, random_state=0, angle=0.8)
    X_new = X_tsne.fit_transform(dm_data)
    print("KL divergence after optimization: {}\n".format(X_tsne.kl_divergence_))
    # Rescale each embedding axis into [0, 1] for plotting
    x_min, x_max = np.min(X_new, 0), np.max(X_new, 0)
    X_new = (X_new - x_min) / (x_max - x_min)

    # Plot t-SNE result
    fig = plt.figure(figsize=(14, 8))
    # BUGFIX: create the axes once, instead of calling fig.add_subplot(111)
    # on every iteration of the scatter loop below.
    ax = fig.add_subplot(111)
    for cond, sid, xy in zip(y, dm_data_sids, X_new):
        ax.scatter(x=xy[0], y=xy[1], s=args.point_size, c=cond_colors[cond],
                   alpha=0.9, edgecolors="k")
        if args.annotate:
            ax.annotate(s=sid, xy=(xy[0], xy[1]), xytext=(12, 12),
                        textcoords="offset points", ha="center", va="center",
                        alpha=1, style="italic")
    if args.plot_title is not None:
        ax.set_title(args.plot_title, fontsize=16, weight="bold")
    # Proxy artists (empty scatters) so the legend shows one entry per group
    legend_handles = [plt.scatter([], [], c=cond_colors[cond], s=150, edgecolors="k")
                      for cond in cond_colors]
    plt.legend(legend_handles, ["{}".format(cond) for cond in cond_colors], loc="best",
               scatterpoints=3, frameon=True, framealpha=1, fontsize=14)
    ax.set_xlabel("t-SNE 1", fontsize=14)
    ax.set_ylabel("t-SNE 2", fontsize=14)
    plt.tight_layout()
    if args.ggplot2_style:
        gu.ggplot2_style(ax)
        fc = "0.8"
    else:
        fc = "none"

    # save or display result
    if args.out_fp:
        plt.savefig(args.out_fp, facecolor=fc, edgecolor="none", dpi=300, pad_inches=0.1,
                    bbox_inches="tight")
    else:
        plt.show()
Example #29
0
def main():
    """Run LDA on either a distance matrix or a BIOM abundance table and
    plot the resulting ordination, with samples colored by group.
    """
    args = handle_program_options()

    # Parse and read mapping file
    try:
        header, imap = util.parse_map_file(args.map_fp)
        category_idx = header.index(args.group_by)
    except IOError as ioe:
        sys.exit("\nError in metadata mapping filepath (-m): {}\n".format(ioe))

    # Obtain group colors
    try:
        assert args.colors is not None
    except AssertionError:
        # No explicit colors given: cycle a qualitative palette over groups
        group_names = {entry[category_idx] for entry in imap.values()}
        palette = cycle(Set3_12.hex_colors)
        class_colors = {name: palette.next() for name in group_names}
    else:
        class_colors = util.color_mapping(imap, header, args.group_by, args.colors)

    if args.dist_matrix_file:
        # LDA input is a (presumably UniFrac) sample distance matrix
        try:
            lda_input = pd.read_csv(args.dist_matrix_file, sep="\t", index_col=0)
        except IOError as ioe:
            sys.exit("\nError with unifrac distance matrix file (-d): {}\n".format(ioe))
        lda_input.insert(0, "Condition",
                         [imap[str(sid)][category_idx] for sid in lda_input.index])
        sampleids = [str(sid) for sid in lda_input.index] if args.annotate_points else None
    else:
        # LDA input is arcsine-sqrt-transformed relative abundance from BIOM
        try:
            biomf = biom.load_table(args.otu_table)
        except IOError as ioe:
            sys.exit("\nError with biom format file (-d): {}\n".format(ioe))
        lda_input = pd.DataFrame(bc.arcsine_sqrt_transform(bc.relative_abundance(biomf))).T
        lda_input.insert(0, "Condition",
                         [imap[sid][category_idx] for sid in lda_input.index])
        sampleids = lda_input.index if args.annotate_points else None

    if args.save_lda_input:
        lda_input.to_csv(args.save_lda_input, sep="\t")
    # Run LDA
    X_lda, y_lda, exp_var = run_LDA(lda_input)

    # Plot LDA: shared keyword set, with 3D extras added only when requested
    lda_kwargs = dict(style=args.ggplot2_style, fig_size=args.figsize,
                      label_pad=args.label_padding, font_size=args.font_size,
                      sids=sampleids, pt_size=args.point_size, out_fp=args.out_fp)
    if args.dimensions == 3:
        lda_kwargs.update(dim=3, zangles=args.z_angles)
    plot_LDA(X_lda, y_lda, class_colors, exp_var, **lda_kwargs)
Example #30
0
def main():
    args = handle_program_options()

    metrics = [m for m in alpha.__all__ if "_ci" not in m]
    try:
        metrics.remove("faith_pd")
    except ValueError:
        pass
    if args.show_available_metrics:
        print "\nAvailable alpha diversity metrics:"
        return "\n".join(metrics)

    # check that the output dir exists, create it if not
    msg = putil.ensure_dir(args.output_dir)
    # if an error occurs, print and exit
    if msg:
        sys.exit(msg)

    # parse mapping file
    try:
        header, sample_map = putil.parse_map_file(args.map_file)
    except Exception as ioe:
            err_msg = "\nError while processing the mapping file: {}\n"
            sys.exit(err_msg.format(ioe))

    # parse BIOM table
    try:
        biom_tbl = biom.load_table(args.biom_fp)
    except Exception as ioe:
        err_msg = "\nError loading BIOM table file: {}\n"
        sys.exit(err_msg.format(ioe))

    # group samples by category
    if args.category not in header:
        sys.exit("Category '{}' not found".format(args.category))
    cat_idx = header.index(args.category)
    cat_vals = {entry[cat_idx] for entry in sample_map.values()}

    plot_title = args.plot_title

    colors = putil.color_mapping(sample_map, header, args.category, args.color_by)

    # Perform diversity calculations and density plotting
    for method, x_label in izip_longest(args.diversity, args.x_label):
        if x_label is None:
            x_label = method.title()
        if method not in alpha.__all__:
            sys.exit("ERROR: Diversity metric not found: {}.".format(method))
        elif method in alpha.__all__ and method not in metrics:
            sys.exit("Currently, PhyloToAST does not support {} metric.".format(method))
        metric = eval("alpha."+method)
        div_calc, sample_ids = calc_diversity(metric, sample_map, biom_tbl,
                                              cat_vals, cat_idx)

        if args.save_calculations:
            write_diversity_metrics(div_calc, sample_ids, args.save_calculations)

        plot_group_diversity(div_calc, colors, plot_title, x_label, args.output_dir,
                             args.image_type)

        # calculate and print significance testing results
        if not args.suppress_stats:
            print "Diversity significance testing: {}".format(x_label)
            if len(cat_vals) == 2:
                print_MannWhitneyU(div_calc)
            elif len(cat_vals) > 2:
                print_KruskalWallisH(div_calc)
            print
        else:
            continue
Example #31
0
def main():
    """Write an iTOL (Interactive Tree of Life) dataset file of per-group
    OTU abundance results -- a multi-bar dataset for MRA/NMRA metrics or a
    gradient dataset for raw abundance -- and optionally rewrite a Newick
    tree's OTU IDs to human-readable OTU names.
    """
    args = handle_program_options()

    # Fail fast with a friendly message if either input file is unreadable
    try:
        with open(args.otu_table):
            pass
    except IOError as ioe:
        sys.exit(
            "\nError with OTU_Sample abundance data file:{}\n"
            .format(ioe)
        )

    try:
        with open(args.mapping):
            pass
    except IOError as ioe:
        sys.exit(
            "\nError with mapping file:{}\n"
            .format(ioe)
        )

    # input data
    biomf = biom.load_table(args.otu_table)
    map_header, imap = util.parse_map_file(args.mapping)

    # rewrite tree file with otu names
    if args.input_tree:
        with open(args.input_tree) as treF, open(args.output_tre, "w") as outF:
            tree = treF.readline()
            # strip single quotes so downstream ID replacement matches cleanly
            if "'" in tree:
                tree = tree.replace("'", '')
            outF.write(newick_replace_otuids(tree, biomf))

    # observation ID -> taxonomy metadata, used to build readable OTU names
    oid_rows = {id_: md["taxonomy"]
                for val, id_, md in biomf.iter(axis="observation")}

    # calculate analysis results
    categories = None
    if args.map_categories is not None:
        categories = args.map_categories.split(",")

    # set transform if --stabilize_variance is specified
    tform = bc.arcsine_sqrt_transform if args.stabilize_variance else None

    groups = util.gather_categories(imap, map_header, categories)
    for group in groups.values():
        if args.analysis_metric in ["MRA", "NMRA"]:
            results = bc.MRA(biomf, group.sids, transform=tform)
        elif args.analysis_metric == "raw":
            results = bc.transform_raw_abundance(biomf, sampleIDs=group.sids,
                                                 sample_abd=False)
        # key results by human-readable OTU name rather than observation ID
        group.results.update({oc.otu_name(oid_rows[oid]): results[oid]
                             for oid in results})

    # write iTol data set file
    with open(args.output_itol_table, "w") as itolF:
        if args.analysis_metric == "raw":
            # single-value gradient dataset: one shaded value per OTU
            itolF.write("DATASET_GRADIENT\nSEPARATOR TAB\n")
            itolF.write("DATASET_LABEL\tLog Total Abundance\n")
            itolF.write("COLOR\t#000000\n")
            itolF.write("LEGEND_TITLE\tLog Total Abundance\n")
            itolF.write("LEGEND_SHAPES\t1\n")
            itolF.write("LEGEND_COLORS\t#000000\n")
            itolF.write("LEGEND_LABELS\tLog Total Abundance\n")
            itolF.write("COLOR_MIN\t#FFFFFF\n")
            itolF.write("COLOR_MAX\t#000000\n")
        else:
            # multi-bar dataset: one bar per sample group for each OTU
            itolF.write("DATASET_MULTIBAR\nSEPARATOR TAB\n")
            itolF.write("DATASET_LABEL\tNMRA\n")
            itolF.write("FIELD_COLORS\t{}\n".format("\t".join(["#ff0000"
                        for _ in range(len(groups))])))
            itolF.write("FIELD_LABELS\t" + "\t".join(groups.keys())+"\n")
            itolF.write("LEGEND_TITLE\tNMRA\n")
            itolF.write("LEGEND_SHAPES\t{}\n".format("\t".join(["1"
                        for _ in range(len(groups))])))
            itolF.write("LEGEND_COLORS\t{}\n".format("\t".join(["#ff0000"
                        for _ in range(len(groups))])))
            itolF.write("LEGEND_LABELS\t" + "\t".join(groups.keys())+"\n")
            itolF.write("WIDTH\t300\n")
        itolF.write("DATA\n")
        all_otus = frozenset({oc.otu_name(md["taxonomy"])
                              for val, id_, md in
                              biomf.iter(axis="observation")})

        # One output row per OTU: name followed by one value per group,
        # built via a format template so group columns stay ordered
        for oname in all_otus:
            row = ["{name}"]        # \t{s:.2f}\t{ns:.2f}\n"
            row_data = {"name": oname}
            # msum: this OTU's total across groups, for NMRA normalization
            msum = 0
            for name, group in groups.iteritems():
                row.append("{{{}:.5f}}".format(name))
                if oname in group.results:
                    row_data[name] = group.results[oname]
                else:
                    # OTU absent from this group's results: report zero
                    row_data[name] = 0.0
                msum += row_data[name]
            # normalize avg relative abundance data
            if args.analysis_metric == "NMRA" and msum > 0:
                row_data.update({key: data/msum
                                for key, data in row_data.items()
                                if key != "name"})
            itolF.write("\t".join(row).format(**row_data) + "\n")
Example #32
0
def main():
    """Plot a 2D or 3D PCoA ordination from a principal-coordinates file,
    with points grouped and colored by mapping-file categories, then save
    or display the figure.
    """
    args = handle_program_options()

    # Fail fast with a friendly message if either input file is unreadable
    try:
        with open(args.coord_fp):
            pass
    except IOError as ioe:
        err_msg = "\nError in input principal coordinates filepath (-i): {}\n"
        sys.exit(err_msg.format(ioe))

    try:
        with open(args.map_fp):
            pass
    except IOError as ioe:
        err_msg = "\nError in input metadata mapping filepath (-m): {}\n"
        sys.exit(err_msg.format(ioe))

    # NOTE: removed dead code that read and tab-split args.coord_fp into a
    # local that was never used; util.parse_unifrac below re-reads the file.
    map_header, imap = util.parse_map_file(args.map_fp)

    data_gather = util.gather_categories(imap, map_header,
                                         args.group_by.split(","))
    categories = OrderedDict([(condition, {"pc1": [], "pc2": [], "pc3": []})
                              for condition in data_gather.keys()])

    bcolors = itertools.cycle(Set3_12.hex_colors)
    if not args.colors:
        colors = [bcolors.next() for _ in categories]
    else:
        colors = util.color_mapping(imap, map_header,
                                    args.group_by, args.colors)
        colors = colors.values()

    parsed_unifrac = util.parse_unifrac(args.coord_fp)

    # Principal-coordinate axes to plot; a 3D plot always uses PC3 as z
    pco = args.pc_order
    if args.dimensions == 3:
        pco.append(3)

    pc1v = parsed_unifrac["varexp"][pco[0] - 1]
    pc2v = parsed_unifrac["varexp"][pco[1] - 1]
    if args.dimensions == 3:
        pc3v = parsed_unifrac["varexp"][pco[2] - 1]

    # Bucket each sample's coordinates under its matching group
    for sid, points in parsed_unifrac["pcd"].items():
        for condition, dc in data_gather.items():
            if sid in dc.sids:
                cat = condition
                break
        else:
            # BUGFIX: an unmatched sample ID previously reused the category
            # matched by the prior iteration (or raised NameError on the
            # first sample); skip samples not in any requested group.
            continue
        categories[cat]["pc1"].append((sid, points[pco[0] - 1]))
        categories[cat]["pc2"].append((sid, points[pco[1] - 1]))

        if args.dimensions == 3:
            categories[cat]["pc3"].append((sid, points[pco[2] - 1]))

    axis_str = "PC{} (Percent Explained Variance {:.3f}%)"
    # initialize plot
    fig = plt.figure(figsize=args.figsize)
    if args.dimensions == 3:
        ax = fig.add_subplot(111, projection="3d")
        ax.view_init(elev=args.z_angles[1], azim=args.z_angles[0])
        ax.set_zlabel(axis_str.format(3, pc3v), labelpad=args.label_padding)
        if args.z_limits:
            ax.set_zlim(args.z_limits)
    else:
        ax = fig.add_subplot(111)

    # plot data
    for i, cat in enumerate(categories):
        if args.dimensions == 3:
            ax.scatter(xs=[e[1] for e in categories[cat]["pc1"]],
                       ys=[e[1] for e in categories[cat]["pc2"]],
                       zs=[e[1] for e in categories[cat]["pc3"]],
                       zdir="z", c=colors[i], s=args.point_size, label=cat,
                       edgecolors="k")
        else:
            ax.scatter([e[1] for e in categories[cat]["pc1"]],
                       [e[1] for e in categories[cat]["pc2"]],
                       c=colors[i], s=args.point_size, label=cat, edgecolors="k")

        # Annotate each 2D point with its sample ID
        if args.annotate_points:
            for x, y in zip(categories[cat]["pc1"], categories[cat]["pc2"]):
                ax.annotate(
                    x[0], xy=(x[1], y[1]), xytext=(-10, -15),
                    textcoords="offset points", ha="center", va="center",
                    )

    # customize plot options
    if args.x_limits:
        ax.set_xlim(args.x_limits)
    if args.y_limits:
        ax.set_ylim(args.y_limits)

    ax.set_xlabel(axis_str.format(pco[0], float(pc1v)), labelpad=args.label_padding)
    ax.set_ylabel(axis_str.format(pco[1], float(pc2v)), labelpad=args.label_padding)

    leg = plt.legend(loc="best", scatterpoints=3, frameon=True, framealpha=1)
    leg.get_frame().set_edgecolor('k')

    # Set the font characteristics
    # NOTE(review): "normal" is not a valid font family name; matplotlib
    # warns and falls back -- consider "sans-serif". Left as-is to preserve
    # current output.
    font = {"family": "normal", "weight": "bold", "size": args.font_size}
    mpl.rc("font", **font)

    if args.title:
        ax.set_title(args.title)

    if args.ggplot2_style and not args.dimensions == 3:
        gu.ggplot2_style(ax)

    # save or display result
    if args.out_fp:
        fig.savefig(args.out_fp, facecolor="white", edgecolor="none", bbox_inches="tight",
                    pad_inches=0.2)
    else:
        plt.show()
Example #33
0
def main():
    """Compute Spearman and Kendall correlations for all OTU pairs (in
    parallel), FDR-correct both, and write the pairs significant under both
    measures to a tab-separated output file.
    """
    args = program_options()

    try:
        biomf = biom.load_table(args.in_biomf)
    except IOError as ioe:
        sys.exit("Error with input BIOM format file: {}".format(ioe))
    else:
        # Arcsine-sqrt-transformed relative abundances
        ast_rel_abd = ast(relative_abundance(biomf))
        # Every unordered pair of OTUs to be tested for correlation
        otu_combos = list(combinations(biomf.ids("observation"), 2))

    try:
        mheader, mdata = parse_map_file(args.map_fnh)
    except IOError as ioe:
        sys.exit("Error with input mapping file: {}".format(ioe))
    else:
        # Gather sampleID categories
        sid_cat = gather_categories(mdata, mheader, [args.category_column])

    # Split the OTU pairs into fixed-size chunks: one pool job per chunk
    chunksize = 10000
    jobs = [(otu_combos[start:start + chunksize], sid_cat, ast_rel_abd)
            for start in xrange(0, len(otu_combos), chunksize)]
    print("{0} jobs created.".format(len(jobs)))

    # Start multiprocessing jobs
    try:
        print("Starting map_async()...")
        pool = Pool()
        res = pool.map_async(calc_corr_helper, jobs)
        pool.close()
        pool.join()
    except Exception:
        sys.exit("Error while calculating correlations\n{}".format(
            format_exc()))
    else:
        # Flatten per-chunk results, then split by correlation type
        all_entries = [entry for chunk in res.get() for entry in chunk]
        s_rho_calc = [entry for entry in all_entries if entry[0] == "Spearman"]
        k_tau_calc = [entry for entry in all_entries if entry[0] != "Spearman"]

    # Get FDR corrected correlation results
    print("Running FDR correction on {} Spearman's Rho.".format(
        len(s_rho_calc)))
    fdr_corr_s_rho = run_fdr(s_rho_calc)
    print("Running FDR correction on {} Kendall Tau.".format(len(k_tau_calc)))
    fdr_corr_k_tau = run_fdr(k_tau_calc)

    # Consolidate: keep OTU pairs significant under BOTH measures
    k_kos = {(entry[2], entry[3]) for entry in fdr_corr_k_tau}
    s_kos = {(entry[2], entry[3]) for entry in fdr_corr_s_rho}
    final_kos = s_kos & k_kos
    print(
        "{0} elements from KendallTau\n{1} elements from SpearmanRho\n{2} elements are "
        "common to both.".format(len(k_kos), len(s_kos), len(final_kos)))
    final_fdr_corr_results = [cdata[1:] for cdata in fdr_corr_s_rho
                              if (cdata[2], cdata[3]) in final_kos]

    # Write our results to file
    with open(args.out_fnh, "w") as outf:
        outf.write("Category\tVariable\tby Variable\tCorrelation\tp value\n")
        for rec in final_fdr_corr_results:
            outf.write("{0}\t{1}\t{2}\t{3}\t{4}\n".format(
                rec[0], rec[1], rec[2], rec[3], rec[4]))
Example #34
0
 def setUp(self):
     """
     Setting up files, or data for testing purposes.
     """
     self.map_header, self.map_data = ut.parse_map_file("phylotoast/test/test_mapping_file.txt")