def _generate_distances_long_format(self):
    """Return the SNP distances of every panel as a long-format frame.

    For each panel in ``Panel.all_panels()`` the result of
    ``PanelAnalyser.snp_distances_per_chromosome`` (computed against
    ``Genome.regions()``) is collected. The collected results are
    concatenated, transposed and finally unpivoted with ``pd.melt``.
    """
    regions = Genome.regions()
    analyser = PanelAnalyser()

    per_panel_distances = []
    for panel in Panel.all_panels():
        panel_distances = analyser.snp_distances_per_chromosome(panel, regions)
        per_panel_distances.append(panel_distances)

    # Transpose before melting, exactly as the concatenated result requires.
    wide = pd.concat(per_panel_distances).transpose()
    return pd.melt(wide)
def read_ancestry_files(self, only_optimal_Ks=False):
    """Read the ADMIXTURE results of every dataset/K/panel combination.

    For each combination whose results directory exists, the ``.Q`` file
    (ancestry ratios, one row per sample) is read and indexed with the
    sample IDs taken from the matching ``.fam`` file. The ratios are then
    joined onto the frame returned by ``ThousandGenomes().all_samples()``
    and rows containing missing values are dropped.

    Args:
        only_optimal_Ks: When true, a combination is skipped unless its K
            equals ``self.optimal_Ks()[dataset.label]``.

    Returns:
        A single DataFrame with the results of all runs concatenated,
        indexed by ``("dataset", "K", "panel")``.

    Raises:
        ValueError: Raised by ``pd.concat`` when no run was read at all
            (no results directory exists for any combination).
    """
    dataframes = []
    datasets = Dataset.all_datasets()
    Ks = self.available_Ks()
    panels = Panel.all_panels() + Panel.all_control_panels()

    # Looked up once instead of on every iteration of the product below.
    # NOTE(review): assumes optimal_Ks() does not depend on state changed
    # by the infer_* calls inside the loop -- confirm.
    optimal_Ks = self.optimal_Ks() if only_optimal_Ks else None

    # Population data is identical for every run; it is loaded lazily so
    # that nothing is read when no results directory exists.
    samples_df = None

    for dataset, K, panel in product(datasets, Ks, panels):
        if only_optimal_Ks and optimal_Ks[dataset.label] != K:
            continue

        # Results are sorted in directories named like DATASET_PANEL
        tag = "{}_{}".format(dataset.label, panel.label)
        basedir = join(ADMIXTURE_DIR, tag)

        if not isdir(basedir):
            continue

        # Read the .Q file for ratios of ancestry per sample.
        # The separator is a raw string: "\s" is an invalid escape
        # sequence in a normal string literal.
        fname = "{}.{}.Q".format(tag, K)
        ancestries_df = pd.read_csv(join(basedir, fname), sep=r"\s+",
                                    names=list(range(K)))

        # Read the .fam file for the sample IDs (they're in the same order)
        fname = "{}.fam".format(tag)
        samples = pd.read_csv(join(basedir, fname), sep=r"\s+",
                              index_col=0, usecols=[0], names=["sample"])
        ancestries_df.index = samples.index

        # Add population data to the sample IDs. join() returns a new
        # frame, so the shared samples_df is never modified.
        if samples_df is None:
            samples_df = ThousandGenomes().all_samples()
        ancestries_df = samples_df.join(ancestries_df).dropna()

        continents_present = len(ancestries_df["superpopulation"].unique())
        if continents_present >= 3:
            # Return values are discarded; presumably these helpers modify
            # ancestries_df in place -- verify against their definitions.
            self.infer_ancestral_components_from_samples_origin(ancestries_df)
            self.infer_ancestral_components_from_reference_pop(ancestries_df)

        # Arrange the hierarchical index
        ancestries_df.reset_index(inplace=True)
        ancestries_df["dataset"] = dataset.label
        ancestries_df["K"] = K
        ancestries_df["panel"] = panel.label
        ancestries_df.set_index(["dataset", "K", "panel"], inplace=True)

        dataframes.append(ancestries_df)

    return pd.concat(dataframes)