def safe_sort_index(index: Index) -> Index:
    """
    Return *index* sorted ascending, keeping its dtype and name attributes.

    Parameters
    ----------
    index : Index

    Returns
    -------
    Index
        A sorted index; the original object when it is already sorted or
        its contents are not orderable.
    """
    # Fast path: already in ascending order, nothing to rebuild.
    if index.is_monotonic_increasing:
        return index

    try:
        sorted_values = safe_sort(index)
    except TypeError:
        # Unorderable contents — hand the index back unchanged.
        return index

    sorted_values = cast(np.ndarray, sorted_values)
    if isinstance(index, MultiIndex):
        return MultiIndex.from_tuples(sorted_values, names=index.names)
    return Index(sorted_values, name=index.name, dtype=index.dtype)
def analyze_one(nm, genedf, min_pos_reads, min_perc_poss, min_total_var_support,
                min_maf, min_samples, min_variants, gene_seqs, pnpn_groups):
    """Compute all per-gene metrics for the gene named ``nm``.

    This method prepares a separate dataframe for each gene (subsampling etc.)
    and then calculates all of the required metrics.

    Parameters
    ----------
    nm : gene identifier; also used as a key into ``gene_seqs``.
    genedf : per-gene variant dataframe handed to ``_prepgene`` for filtering.
    min_pos_reads, min_perc_poss, min_total_var_support, min_maf,
    min_samples, min_variants : filtering thresholds forwarded to ``_prepgene``.
    gene_seqs : mapping from gene name to its nucleotide sequence.
    pnpn_groups : group definitions forwarded to ``calc_pnpn_all``.

    Returns
    -------
    tuple of length 5: ``(sites, pnps, percmut, (ffdeg_pw, ffdeg_poss), pnpn)``,
    or ``[None] * 5`` when the gene fails the ``_prepgene`` filters.
    """
    genedf = _prepgene(nm, genedf, min_pos_reads, min_perc_poss,
                       min_total_var_support, min_maf, min_samples, min_variants)
    # Gene did not pass filtering — return a placeholder matching the
    # 5-element success shape so callers can unpack uniformly.
    if genedf is None:
        return [None] * 5
    sites, pnps = calc_pnps(genedf, gene_seqs[nm])
    percmut = calc_percodon_mut(genedf, gene_seqs[nm])
    pnpn = calc_pnpn_all(genedf, gene_seqs[nm], pnpn_groups)
    # Prefix the pnpn index with the gene name: (gene, group, inner-key).
    pnpn.index = MultiIndex.from_tuples([(nm, ix[0], ix[1]) for ix in pnpn.index])
    # Likewise prefix sites (when present) and the first percmut frame
    # with the gene name as an outer index level.
    for df in ([sites, percmut[0]] if sites is not None else [percmut[0]]):
        df.index = MultiIndex.from_product([[nm], df.index])
    # Second percmut element becomes a one-row frame labeled by the gene.
    percmut[1] = Series(percmut[1]).to_frame(nm).T
    ffdeg_pw, ffdeg_poss = calc_ffdeg_pi_within(genedf, gene_seqs[nm])
    return sites, (pnps.to_frame(nm).T if pnps is not None else pnps), percmut, \
        (ffdeg_pw.to_frame(nm).T, ffdeg_poss), pnpn
def apply_counts(ldf):
    """Attach per-codon totals to one gene's sub-frame and zero-fill gaps.

    NOTE(review): relies on ``cod`` and ``allcodons`` from the enclosing
    scope — presumably a per-gene codon-count table and the full codon-pair
    universe; verify against the caller.
    """
    # The gene this sub-frame belongs to (outer index level).
    gene = ldf.index.get_level_values(0)[0]
    # Strip the gene level so rows are indexed by the inner tuples alone.
    ldf.index = MultiIndex.from_tuples(ldf.index.get_level_values(1))
    ldf = ldf.dropna(how='all', axis=1)
    # Total count for each codon key; codons missing from `cod` count as 0.
    totals = {}
    for key in allcodons:
        totals[key] = cod.loc[gene, key[0]] if key[0] in cod.loc[gene] else 0
    totals = Series(totals)
    # Left-join onto the full codon universe (the throwaway 'cod' column
    # only carries the index), then add a '<col>_s' totals column per
    # data column.
    ldf = totals.to_frame('cod').join(ldf).drop('cod', axis=1)
    for col in list(ldf.columns):
        ldf[col + '_s'] = totals
    return ldf.fillna(0)
def create_codon_trans_matrix(dirsname, tmpdir, grpnm): chdirmkifnotexist(join(tmpdir, 'tmpmq')) # TODO: IMPORTANT! Wrap this for loop with your hpc job submission pipeline for fname in glob(join(dirsname, '*codons*')): if exists( join(tmpdir, grpnm + basename(fname).replace('codons', 'sums'))): continue do_one_unite(fname, grpnm, tmpdir) df = None for f in glob(join(tmpdir, grpnm + '*sums.df')): print(basename(f)) if df is None: df = read_pickle(f) else: df = concat([df, read_pickle(f)], sort=False).groupby(level=[0, 1]).sum() df.columns = MultiIndex.from_tuples([('per',i) if '_s' not in i \ else ('sums',i.replace('_s','')) for i in df.columns]) sms = df.sums.sum().truediv(3) def applyfunc(ldf): ret = ldf.per.truediv(ldf.sums).loc[ldf.name].iloc[:, 0] ret[ldf.name] = 1 - ret.sum() return ret def applyfunc_counts(ldf): adds = ldf.sums.iloc[0][0] ret = ldf.per.loc[ldf.name].iloc[:, 0] ret[ldf.name] = adds - ret.sum() return ret df_counts = df.groupby( level=1, axis=1).apply(lambda x: x.groupby(level=0).apply(applyfunc_counts)) df_norm = df.groupby( level=1, axis=1).apply(lambda x: x.groupby(level=0).apply(applyfunc)) aas, _, _ = get_codon_table() df_aas = df_counts df_aas['aa_start'] = [aas[a] for a in df_counts.index.get_level_values(0)] df_aas['aa_end'] = [aas[a2] + ('.' if a1!=a2 and aas[a1]==aas[a2] else '') \ for a1, a2 in \ zip(df_counts.index.get_level_values(0),df_counts.index.get_level_values(1))] df_aas = df_aas.groupby(['aa_start', 'aa_end']).sum() df_norm.to_pickle( join(General.Basepath, grpnm + '_4_60_mutation_codons.df')) df_counts.to_pickle( join(General.Basepath, grpnm + '_4_60_mutation_codon_counts.df')) df_aas.to_pickle( join(General.Basepath, grpnm + '_4_60_mutation_aas_counts.df')) sms.to_pickle(join(General.Basepath, grpnm + '_4_60_nucleotide_count.df'))
def _Intercept_2const(df): from pandas.core.indexes.multi import MultiIndex if isinstance(df.index, MultiIndex): new_index = [] for v in df.index.values: v = list(v) if 'Intercept' in v: v[v.index('Intercept')] = 'const' new_index.append(v) multi_index = lzip(*new_index) df.index = MultiIndex.from_arrays(multi_index) else: index_value = df.index.tolist() if 'Intercept' in index_value: index_value[index_value.index('Intercept')] = 'const' df.index = index_value return df
def _Intercept_2const(df): from pandas.core.indexes.multi import MultiIndex if df.index.contains('Intercept'): if isinstance(df.index,MultiIndex): new_index = [] for i in df.index.values: i = list(i) if 'Intercept' in i: i[i.index('Intercept')] = 'const' new_index.append(i) multi_index = lzip(*new_index) df.index = MultiIndex.from_arrays(multi_index) else: index_list = df.index.tolist() idx = index_list.index('Intercept') index_list[idx] = 'const' df.index = index_list return df
def _Intercept_2const(df): from pandas.core.indexes.multi import MultiIndex if "Intercept" in df.index: if isinstance(df.index, MultiIndex): new_index = [] for i in df.index.values: i = list(i) if "Intercept" in i: i[i.index("Intercept")] = "const" new_index.append(i) multi_index = lzip(*new_index) df.index = MultiIndex.from_arrays(multi_index) else: index_list = df.index.tolist() idx = index_list.index("Intercept") index_list[idx] = "const" df.index = index_list return df
def calc_pnpn_all(genedf, geneseq, pnpn_groups):
    """Calculate pN(group1)/pN(group2) for every group definition given.

    Returns one concatenated Series/frame whose index pairs each group
    name with the per-group result keys, or None when no consensus
    sequence can be derived for the gene.
    """
    # Consensus sequence is the reference against which SNPs are judged.
    consensus = _get_consensus(genedf, geneseq)
    if consensus is None:
        return None

    # Split the consensus into codons (non-overlapping triplets).
    cons_codons = [consensus[pos:pos + 3] for pos in range(0, len(consensus), 3)]

    # Normalize by the number of reads at each position to obtain the
    # relative abundance of each SNP.
    normdf = genedf.groupby(level=1).apply(lambda x: x.truediv(x.sum(0)))

    results = []
    for grp_name, (g1, g2, nononymity) in pnpn_groups.items():
        # Compute the ratio for this group pair, then tag the result
        # index with the group name.
        one = calc_pnpn_one(cons_codons, normdf, g1, g2, nononymity)
        one.index = MultiIndex.from_tuples([(grp_name, i) for i in one.index])
        results.append(one)
    return concat(results, sort=False)
def _multiindex(self) -> MultiIndex: return MultiIndex.from_arrays([self.left, self.right], names=["left", "right"])
def _multiindex(self): return MultiIndex.from_arrays([self.left, self.right], names=['left', 'right'])