Example #1
0
    def read_ancestry_files(self, only_optimal_Ks=False):
        """Read the ADMIXTURE results for every dataset / K / panel combination.

        For each combination whose results directory ``DATASET_PANEL`` exists
        under ``ADMIXTURE_DIR``, the ``.Q`` file (ancestry ratios) is read and
        labelled with the sample IDs taken from the matching ``.fam`` file.
        The result is joined onto the sample table returned by
        ``ThousandGenomes().all_samples()`` and passed to the
        ``infer_ancestral_components_*`` helpers before being collected.

        Parameters
        ----------
        only_optimal_Ks : bool, optional
            When true, a combination is kept only if its K equals
            ``self.optimal_Ks()[dataset.label]``. Defaults to False.

        Returns
        -------
        pandas.DataFrame
            The per-combination frames concatenated, indexed by
            ``("dataset", "K", "panel")``.

        Raises
        ------
        ValueError
            Raised by ``pd.concat`` when no combination produced a frame
            (e.g. none of the results directories exist).
        KeyError
            If ``only_optimal_Ks`` is true and a dataset label is missing
            from ``self.optimal_Ks()``.

        Errors from ``pd.read_csv`` (for instance a missing ``.Q`` or ``.fam``
        file inside an existing directory) are not caught here.
        """
        dataframes = []

        datasets = Dataset.all_datasets()
        Ks = self.available_Ks()
        panels = Panel.all_panels() + Panel.all_control_panels()

        # Loop invariants: look them up once rather than once per combination.
        optimal_Ks = self.optimal_Ks() if only_optimal_Ks else None
        # Loaded on first use so nothing is built when no directory exists.
        samples_df = None

        for dataset, K, panel in product(datasets, Ks, panels):

            if only_optimal_Ks and optimal_Ks[dataset.label] != K:
                continue

            # Results are sorted in directories named like DATASET_PANEL
            tag = "{}_{}".format(dataset.label, panel.label)
            basedir = join(ADMIXTURE_DIR, tag)

            if not isdir(basedir):
                continue

            # Read the .Q file for ratios of ancestry per sample.
            # One whitespace-separated column per component, named 0..K-1.
            fname = "{}.{}.Q".format(tag, K)
            ancestries_df = pd.read_csv(join(basedir, fname), sep=r"\s+",
                                        names=list(range(K)))

            # Read the .fam file for the sample IDs (they're in the same order)
            # Only the first column is loaded, and it becomes the index.
            fname = "{}.fam".format(tag)
            samples = pd.read_csv(join(basedir, fname), sep=r"\s+", index_col=0,
                                  usecols=[0], names=["sample"])
            ancestries_df.index = samples.index

            # Add population data to the sample IDs. dropna() discards every
            # row with a missing value, which removes the samples that have
            # no ancestry row in this particular run.
            if samples_df is None:
                samples_df = ThousandGenomes().all_samples()
            ancestries_df = samples_df.join(ancestries_df).dropna()

            # The origin-based inference only runs when at least three
            # distinct superpopulation values are present in this run.
            # NOTE(review): both helpers are called for their effect on
            # ancestries_df (return values are ignored) -- presumably they
            # modify the frame in place; confirm against their definitions.
            continents_present = len(ancestries_df["superpopulation"].unique())
            if continents_present >= 3:
                self.infer_ancestral_components_from_samples_origin(ancestries_df)

            self.infer_ancestral_components_from_reference_pop(ancestries_df)

            # Arrange the hierarchical index: the previous index goes back
            # to a regular column and (dataset, K, panel) takes its place.
            ancestries_df.reset_index(inplace=True)
            ancestries_df["dataset"] = dataset.label
            ancestries_df["K"] = K
            ancestries_df["panel"] = panel.label
            ancestries_df.set_index(["dataset", "K", "panel"], inplace=True)

            dataframes.append(ancestries_df)

        return pd.concat(dataframes)