Example #1
0
    def read_ancestry_files(self, only_optimal_Ks=False):
        """Collect the admixture results of every dataset/K/panel combination.

        For each combination whose results directory exists under
        ADMIXTURE_DIR, the ``.Q`` file (ancestry ratios) and the ``.fam`` file
        (sample IDs) are read, joined with the samples table returned by
        ``ThousandGenomes().all_samples()`` and passed to the
        ``infer_ancestral_components_*`` helpers.

        Parameters
        ----------
        only_optimal_Ks : bool, optional
            When true, a K value is skipped unless it equals
            ``self.optimal_Ks()[dataset.label]``.

        Returns
        -------
        pandas.DataFrame
            Concatenation of the per-combination frames, indexed by
            ("dataset", "K", "panel").

        Raises
        ------
        ValueError
            Raised by ``pd.concat`` when no results directory was found, since
            there is nothing to concatenate.
        """
        dataframes = []

        datasets = Dataset.all_datasets()
        Ks = self.available_Ks()
        panels = Panel.all_panels() + Panel.all_control_panels()

        # Looked up once instead of on every iteration of the loop below.
        optimal_Ks = self.optimal_Ks() if only_optimal_Ks else None

        # Loaded lazily on the first results directory found, then reused.
        # It is only read from (``join`` returns a new frame).
        samples_df = None

        for dataset, K, panel in product(datasets, Ks, panels):

            if only_optimal_Ks and optimal_Ks[dataset.label] != K:
                continue

            # Results are sorted in directories named like DATASET_PANEL
            tag = "{}_{}".format(dataset.label, panel.label)
            basedir = join(ADMIXTURE_DIR, tag)

            if not isdir(basedir):
                continue

            # Read the .Q file for ratios of ancestry per sample.
            # The separator is a regex, hence the raw string.
            q_fname = "{}.{}.Q".format(tag, K)
            ancestries_df = pd.read_csv(join(basedir, q_fname), sep=r"\s+",
                                        names=list(range(K)))

            # Read the .fam file for the sample IDs (they're in the same order)
            fam_fname = "{}.fam".format(tag)
            samples = pd.read_csv(join(basedir, fam_fname), sep=r"\s+",
                                  index_col=0, usecols=[0], names=["sample"])
            ancestries_df.index = samples.index

            # Add population data to the sample IDs; rows without a match on
            # either side are dropped by dropna().
            if samples_df is None:
                samples_df = ThousandGenomes().all_samples()
            ancestries_df = samples_df.join(ancestries_df).dropna()

            continents_present = len(ancestries_df["superpopulation"].unique())
            if continents_present >= 3:
                self.infer_ancestral_components_from_samples_origin(ancestries_df)

            self.infer_ancestral_components_from_reference_pop(ancestries_df)

            # Arrange the hierarchical index
            ancestries_df.reset_index(inplace=True)
            ancestries_df["dataset"] = dataset.label
            ancestries_df["K"] = K
            ancestries_df["panel"] = panel.label
            ancestries_df.set_index(["dataset", "K", "panel"], inplace=True)

            dataframes.append(ancestries_df)

        return pd.concat(dataframes)
Example #2
0
    def plot_(self, components_df, explained_variance, title, filename,
              component_pairs=(("PC1", "PC2"),), plot_size=None,
              legend_on=True):
        """Draw one axes per pair of components, plus an optional legend axes.

        Parameters
        ----------
        components_df, explained_variance
            Forwarded unchanged to ``self.draw_ax`` for every pair.
        title : str
            Used as the figure suptitle and also forwarded to ``draw_ax``.
        filename : str or None
            When not None, the figure is saved under ``self.FIGS_DIR`` with
            this name (the directory is created if needed).
        component_pairs : sequence of pairs, optional
            Pairs of component names to compare; one axes is drawn per pair.
        plot_size : optional
            Forwarded to ``self._fig_dimensions``.
        legend_on : bool, optional
            When true, an extra axes holding the legend is added after the
            component axes, built from the handles/labels of the last one.
        """
        # One subplot slot per pair, plus one reserved for the legend.
        n_pairs = len(component_pairs)
        nrows, ncols, figsize = self._fig_dimensions(n_pairs + 1, plot_size)
        fig = plt.figure(figsize=figsize)

        ax = None
        # Subplot positions are 1-based.
        for ax_id, components_to_compare in enumerate(component_pairs, start=1):
            ax = fig.add_subplot(nrows, ncols, ax_id)
            # NOTE(review): "PEL" is hard-coded here; its meaning is defined
            # by draw_ax, which is not visible from this block.
            ax = self.draw_ax(ax, components_to_compare, components_df,
                              explained_variance, "PEL", title)

        # Without any drawn axes there are no handles to build a legend from.
        if legend_on and n_pairs:
            # Legend subplot. It will use the handles and labels of the last ax
            handles, labels = ax.get_legend_handles_labels()
            populations_df = ThousandGenomes.population_names()
            # Label-based lookup (.ix was removed from pandas).
            descriptions = populations_df.loc[labels, "description"]
            legend_labels = [" - ".join([code, desc])
                             for code, desc in descriptions.items()]

            ax = fig.add_subplot(nrows, ncols, n_pairs + 1)
            ax = legend_subplot(ax, handles, legend_labels)

        fig.suptitle(title, fontsize=18, position=(0.12, 1.1), ha="left",
                     family="serif")
        # Act on this figure explicitly rather than on pyplot's current one.
        fig.subplots_adjust(wspace=0.05)

        if filename is not None:
            makedirs(self.FIGS_DIR, exist_ok=True)
            fig.savefig(join(self.FIGS_DIR, filename), facecolor="w",
                        bbox_inches="tight")
Example #3
0
    def _generate_maf_long_df(self):
        """Reshape the MAF tables from ``ThousandGenomes.mafs()`` to long format.

        ``ThousandGenomes.mafs()`` is iterated as a mapping from population
        level to a mapping of names to dataframes. For each level the frames
        are concatenated side by side, keyed by those names, and melted into
        a three-column frame labelled "panel", "population" and "MAF".

        Returns an OrderedDict with one long dataframe per population level.
        """
        result = OrderedDict()

        # The level key can be "population" or "superpopulation"
        for level, frames_by_name in ThousandGenomes.mafs().items():
            wide = pd.concat(frames_by_name.values(), axis=1,
                             keys=frames_by_name.keys())
            melted = pd.melt(wide)
            melted.columns = ["panel", "population", "MAF"]
            result[level] = melted

        return result