def boxplot(phenotype):
    """Draw a boxplot for the given continuous phenotype.

    :param phenotype: Display a boxplot for the provided continuous phenotype.
    :type phenotype: str

    :returns: Whatever ``_response_from_img_filename`` builds for the saved
              image file.

    :raises REPLException: If the phenotype is not continuous.

    An exception will be raised if the required phenotype is not continuous.

    """
    data, meta = _get_data_meta(phenotype)

    # Validate the variable type before touching the values: the guard must
    # reject everything that is *not* continuous.
    _type = types.type_str(meta["variable_type"])
    if not _type.subtype_of(types.Continuous):
        raise REPLException("Can't draw boxplot for non-continuous variable "
                            "('{}').".format(phenotype))

    # Missing observations are dropped before plotting.
    data = data[~np.isnan(data)]

    fig, ax = plt.subplots(1, 1)
    ax.boxplot(data, vert=False)
    ax.set_xlabel(phenotype)

    # A single horizontal box has no meaningful y axis, so hide its labels
    # and ticks.
    ax.set_yticklabels([])
    ax.yaxis.set_ticks_position("none")

    filename = "cohort_plot.png"
    fig.savefig(filename, dpi=300)
    return _response_from_img_filename(filename)
def info(phen_or_command, drug_code=None):
    """Get information and summary statistics on a phenotype.

    :param phen_or_command: The name of the phenotype to get summary
                            information on, or the literal string 'drug' to
                            get information on a drug instead.
    :type phen_or_command: str

    :param drug_code: The drug code to look up; only used when
                      ``phen_or_command`` is 'drug'.

    :returns: A dict with a ``success`` flag and the formatted ``message``
              (for the drug form, whatever ``_info_drug`` returns).
    :rtype: dict

    Use 'list' to see all the available phenotypes for this command.

    This command can also be used to get information on a drug:

    info drug 12345 or C05 (ATC code)

    """
    if phen_or_command == "drug":
        return _info_drug(drug_code)

    phenotype = phen_or_command
    message = StringIO()
    data, meta = _get_data_meta(phenotype)

    print("Phenotype meta data:", file=message)
    for k, v in meta.items():
        # Pad before colouring: the ANSI escape codes added by colored()
        # would otherwise count towards the column width and misalign values.
        label = k.ljust(30)
        if COLOR:
            label = colored(label, "green")
        print("\t{}{}".format(label, v), file=message)

    print("\nSummary statistics:", file=message)
    n_missing = STATE["manager"].get_number_missing(phenotype)
    n_total = data.shape[0]
    print("\t{} / {} missing values ({:.3f}%)".format(
        n_missing, n_total, n_missing / n_total * 100
    ), file=message)

    t = types.type_str(meta["variable_type"])
    if t.subtype_of(types.Discrete):
        # Show information on prevalence (values 1 are counted as cases and
        # values 0 as controls).
        n_cases = np.sum(data == 1)
        n_controls = np.sum(data == 0)
        print("\t{} cases, {} controls; prevalence: {:.3f}%".format(
            n_cases, n_controls, n_cases / (n_cases + n_controls) * 100
        ), file=message)

    elif t.subtype_of(types.Continuous):
        mean = np.nanmean(data)
        std = np.nanstd(data)
        print(u"\tµ = {}, σ = {}".format(mean, std), file=message)
        print("\tmin = {}, max = {}".format(np.nanmin(data), np.nanmax(data)),
              file=message)

    elif t.subtype_of(types.Factor):
        print("\nCounts (rate):", file=message)
        # .items() replaces .iteritems(), which was removed in pandas 2.0.
        # Rates are relative to the total number of rows.
        for name, count in data.value_counts().items():
            print("\t{}: {} ({:.3f}%)".format(name, count,
                                              count / n_total * 100),
                  file=message)

    return {"success": True, "message": message.getvalue()}
def normal_qq_plot(phenotype):
    """Plot the Normal QQ plot of the observations.

    :param phenotype: The phenotype for which to draw the QQ plot.
    :type phenotype: str

    :returns: Whatever ``_response_from_img_filename`` builds for the saved
              image file.

    :raises REPLException: If the phenotype is not continuous.

    This function is only available for continuous phenotypes.

    """
    data, meta = _get_data_meta(phenotype)

    # Reject anything that is not continuous before looking at the values.
    if not types.type_str(meta["variable_type"]).subtype_of(types.Continuous):
        raise REPLException(
            "Could not create QQ plot for {} variable '{}'.".format(
                meta["variable_type"], phenotype
            )
        )

    # Drop missing observations and sort to get the observed quantiles.
    data = data[~np.isnan(data)]
    data = np.sort(data)

    # Expected quantiles of a normal distribution with the sample mean and
    # standard deviation, evaluated at i / (n + 1) for i = 1..n.
    n = data.shape[0]
    expected = scipy.stats.norm.ppf(
        np.arange(1, n + 1) / (n + 1),
        loc=np.mean(data),
        scale=np.std(data)
    )

    # NOTE(review): this draws on pyplot's implicit current figure;
    # presumably it is cleared elsewhere between commands -- confirm.
    plt.scatter(expected, data, color="black", marker="o", s=10)

    # Remember the limits chosen for the scatter so that the regression line
    # can span the full width without rescaling the axes.
    x_min, x_max = plt.xlim()
    y_min, y_max = plt.ylim()

    slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(
        expected, data
    )
    plt.plot(
        [x_min, x_max],
        [slope * x_min + intercept, slope * x_max + intercept],
        "--", color="black",
        label="$R^2 = {:.4f}$".format(r_value ** 2)
    )

    plt.legend(loc="lower right")
    plt.xlabel("Expected quantiles")
    plt.ylabel("Observed quantiles")
    plt.xlim([x_min, x_max])
    plt.ylim([y_min, y_max])

    # The image has to be written before a response can be built from it.
    filename = "cohort_plot.png"
    plt.savefig(filename, dpi=300)
    return _response_from_img_filename(filename)
def histogram(phenotype, nbins=None):
    """Draw a histogram (or a bar plot for discrete variables) of the data.

    :param phenotype: The phenotype for which to draw the histogram.
    :type phenotype: str

    :param nbins: The number of bins for the histogram (optional).
    :type nbins: int

    :returns: Whatever ``_response_from_img_filename`` builds for the saved
              image file.

    :raises REPLException: If the phenotype is neither continuous nor
                           discrete.

    This function will work on both continuous and discrete variables (but
    not factors).

    """
    data, meta = _get_data_meta(phenotype)
    t = types.type_str(meta["variable_type"])

    is_continuous = t.subtype_of(types.Continuous)
    is_discrete = not is_continuous and t.subtype_of(types.Discrete)

    # Validate the type before filtering: np.isnan may fail on the values of
    # an unsupported variable and would then mask the intended error.
    if not (is_continuous or is_discrete):
        raise REPLException("Could not generate histogram for '{}' variable."
                            "".format(meta["variable_type"]))

    # Missing observations are dropped before plotting.
    data = data[~np.isnan(data)]

    # NOTE(review): this draws on pyplot's implicit current figure;
    # presumably it is cleared elsewhere between commands -- confirm.
    if is_continuous:
        # Histogram; fall back to matplotlib's default binning when no
        # bin count was requested.
        if nbins:
            plt.hist(data, bins=nbins)
        else:
            plt.hist(data)
        plt.xlabel(phenotype)

    else:
        # Bar plot with one bar for the 0 values and one for the 1 values.
        plt.bar((0.1, 0.4), (np.sum(data == 0), np.sum(data == 1)), width=0.1)
        plt.xticks((0.15, 0.45), ("control", "case"))
        plt.xlim((0, 0.6))

    filename = "cohort_plot.png"
    plt.savefig(filename, dpi=300)
    return _response_from_img_filename(filename)