def read_ancestry_files(self, only_optimal_Ks=False):
    """Read the ADMIXTURE results of every dataset / K / panel combination.

    Every combination of ``Dataset.all_datasets()``, ``self.available_Ks()``
    and the regular plus control panels is visited. Combinations whose
    results directory (``ADMIXTURE_DIR/DATASET_PANEL``) does not exist are
    skipped. For the rest, the ``.Q`` file (ancestry ratios per sample) and
    the ``.fam`` file (sample IDs, in the same row order) are read, joined
    with the samples table from ``ThousandGenomes().all_samples()`` and
    tagged with the dataset, K and panel they come from.

    Args:
        only_optimal_Ks: when true, only the K stored in
            ``self.optimal_Ks()`` for each dataset label is read.

    Returns:
        A single DataFrame, the concatenation of all the frames read,
        indexed by ``["dataset", "K", "panel"]``.

    Raises:
        ValueError: raised by ``pd.concat`` when no results directory was
            found, since there is nothing to concatenate.
    """
    dataframes = []
    datasets = Dataset.all_datasets()
    Ks = self.available_Ks()
    panels = Panel.all_panels() + Panel.all_control_panels()

    # Looked up once instead of on every iteration of the product below.
    optimal_K_by_dataset = self.optimal_Ks() if only_optimal_Ks else None

    for dataset, K, panel in product(datasets, Ks, panels):
        if only_optimal_Ks and optimal_K_by_dataset[dataset.label] != K:
            continue

        # Results are sorted in directories named like DATASET_PANEL
        tag = "{}_{}".format(dataset.label, panel.label)
        basedir = join(ADMIXTURE_DIR, tag)
        if not isdir(basedir):
            continue

        # Read the .Q file for ratios of ancestry per sample
        fname = "{}.{}.Q".format(tag, K)
        ancestries_df = pd.read_csv(join(basedir, fname), sep=r"\s+",
                                    names=list(range(K)))

        # Read the .fam file for the sample IDs (they're in the same order)
        fname = "{}.fam".format(tag)
        samples = pd.read_csv(join(basedir, fname), sep=r"\s+",
                              index_col=0, usecols=[0], names=["sample"])
        ancestries_df.index = samples.index

        # Add population data to the sample IDs. Rows with any missing
        # value after the join are dropped.
        samples_df = ThousandGenomes().all_samples()
        ancestries_df = samples_df.join(ancestries_df).dropna()

        continents_present = len(ancestries_df["superpopulation"].unique())
        if continents_present >= 3:
            self.infer_ancestral_components_from_samples_origin(ancestries_df)
            # NOTE(review): the original layout of this method was lost, so
            # it is not certain that this second call belongs under the
            # `if` as well -- confirm against the file history.
            self.infer_ancestral_components_from_reference_pop(ancestries_df)

        # Arrange the hierarchical index
        ancestries_df.reset_index(inplace=True)
        ancestries_df["dataset"] = dataset.label
        ancestries_df["K"] = K
        ancestries_df["panel"] = panel.label
        ancestries_df.set_index(["dataset", "K", "panel"], inplace=True)

        dataframes.append(ancestries_df)

    return pd.concat(dataframes)
def plot_(self, components_df, explained_variance, title, filename,
          component_pairs=(("PC1", "PC2"),), plot_size=None, legend_on=True):
    """Draw one subplot per pair of components, plus an optional legend.

    Args:
        components_df: passed through to ``self.draw_ax``.
        explained_variance: passed through to ``self.draw_ax``.
        title: figure title; also passed through to ``self.draw_ax``.
        filename: name of the image written under ``self.FIGS_DIR``.
            Nothing is saved when it is None.
        component_pairs: iterable of pairs of component names; each pair
            gets its own subplot.
        plot_size: passed through to ``self._fig_dimensions``.
        legend_on: when true, an extra subplot holding the legend is added
            after the component subplots.

    Returns:
        None. The figure is left open in pyplot's current state.
    """
    # One subplot per pair, plus one slot that is always reserved for the
    # legend (even when legend_on is False) so the grid size is stable.
    n_axes = len(component_pairs) + 1
    nrows, ncols, figsize = self._fig_dimensions(n_axes, plot_size)
    fig = plt.figure(figsize=figsize)

    # Subplot positions are 1-based.
    for ax_id, components_to_compare in enumerate(component_pairs, start=1):
        ax = fig.add_subplot(nrows, ncols, ax_id)
        ax = self.draw_ax(ax, components_to_compare, components_df,
                          explained_variance, "PEL", title)

    # Without any component subplot there are no handles to build a legend
    # from, so the legend is skipped instead of failing on an unbound `ax`.
    if legend_on and n_axes > 1:
        # Legend subplot. It uses the handles and labels of the last ax
        handles, labels = ax.get_legend_handles_labels()
        populations_df = ThousandGenomes.population_names()
        # `.loc` and `.items()` replace `.ix` and `.iteritems()`, which
        # newer pandas releases no longer provide.
        descriptions = populations_df.loc[labels, "description"]
        legend_labels = [" - ".join([code, desc])
                         for code, desc in descriptions.items()]
        legend_ax = fig.add_subplot(nrows, ncols, n_axes)
        legend_subplot(legend_ax, handles, legend_labels)

    fig.suptitle(title, fontsize=18, position=(0.12, 1.1), ha="left",
                 family="serif")
    plt.subplots_adjust(wspace=0.05)

    if filename is not None:
        makedirs(self.FIGS_DIR, exist_ok=True)
        plt.savefig(join(self.FIGS_DIR, filename), facecolor="w",
                    bbox_inches="tight")
def _generate_maf_long_df(self):
    """Reshape the ThousandGenomes MAF tables into long format.

    ``ThousandGenomes.mafs()`` is expected to map each population level
    ("population" or "superpopulation") to a dict of DataFrames. For every
    level, those frames are placed side by side under their dict keys and
    then melted, so that each row holds a single value.

    Returns:
        An OrderedDict keyed by population level. Each value is a DataFrame
        with the columns ``["panel", "population", "MAF"]``.
    """
    melted_by_level = OrderedDict()

    for level, frames_by_key in ThousandGenomes.mafs().items():
        # The dict keys become the outer level of the column index, which
        # melt then turns into the first column of the long frame.
        wide = pd.concat(frames_by_key.values(), axis=1,
                         keys=frames_by_key.keys())
        tidy = pd.melt(wide)
        tidy.columns = ["panel", "population", "MAF"]
        melted_by_level[level] = tidy

    return melted_by_level