Example #1
0
def safe_sort_index(index: Index) -> Index:
    """
    Return a sorted version of ``index``.

    The result keeps the original dtype and name attributes; if the
    contents are unorderable (``safe_sort`` raises ``TypeError``) the
    index is handed back unchanged.

    Parameters
    ----------
    index : an Index

    Returns
    -------
    Index
    """
    # Already sorted — nothing to do.
    if index.is_monotonic_increasing:
        return index

    try:
        sorted_values = safe_sort(index)
    except TypeError:
        # Mixed/unorderable contents: give back the original index.
        return index

    sorted_values = cast(np.ndarray, sorted_values)
    if isinstance(index, MultiIndex):
        return MultiIndex.from_tuples(sorted_values, names=index.names)
    return Index(sorted_values, name=index.name, dtype=index.dtype)
def analyze_one(nm, genedf, min_pos_reads, min_perc_poss,
                min_total_var_support, min_maf, min_samples, min_variants,
                gene_seqs, pnpn_groups):
    """
    Prepare the per-gene dataframe for gene ``nm`` (filtering/subsampling
    via ``_prepgene``) and compute all required metrics for it.

    Returns
    -------
    A 5-element result: ``(sites, pnps, percmut, (ffdeg_pw, ffdeg_poss),
    pnpn)``; if the gene fails the ``_prepgene`` thresholds, ``[None] * 5``
    is returned instead so callers can unpack a fixed-size result.
    Individual elements may be None when the corresponding metric is
    unavailable.
    """
    genedf = _prepgene(nm, genedf, min_pos_reads, min_perc_poss,
                       min_total_var_support, min_maf, min_samples,
                       min_variants)
    if genedf is None:
        return [None] * 5
    sites, pnps = calc_pnps(genedf, gene_seqs[nm])
    percmut = calc_percodon_mut(genedf, gene_seqs[nm])
    pnpn = calc_pnpn_all(genedf, gene_seqs[nm], pnpn_groups)
    # BUG FIX: calc_pnpn_all returns None when no consensus sequence can be
    # derived; the original code re-indexed unconditionally and would raise
    # AttributeError on None.
    if pnpn is not None:
        pnpn.index = MultiIndex.from_tuples([(nm, ix[0], ix[1])
                                             for ix in pnpn.index])
    # Prefix each per-position frame with the gene name as an outer level.
    for df in ([sites, percmut[0]] if sites is not None else [percmut[0]]):
        df.index = MultiIndex.from_product([[nm], df.index])
    percmut[1] = Series(percmut[1]).to_frame(nm).T
    ffdeg_pw, ffdeg_poss = calc_ffdeg_pi_within(genedf, gene_seqs[nm])
    return sites, (pnps.to_frame(nm).T if pnps is not None else pnps), percmut, \
            (ffdeg_pw.to_frame(nm).T, ffdeg_poss), pnpn
Example #3
0
 def apply_counts(ldf):
     """
     Re-shape one gene's count frame and attach per-codon totals.

     NOTE(review): ``cod`` and ``allcodons`` come from the enclosing
     scope — presumably a per-gene codon-count table and the set of all
     codon keys; confirm against the surrounding code.
     """
     # The outer index level holds the gene name, shared by every row.
     nm = ldf.index.get_level_values(0)[0]
     # Drop that outer level; keep the inner tuples as a MultiIndex.
     ldf.index = MultiIndex.from_tuples(ldf.index.get_level_values(1))
     ldf = ldf.dropna(how='all', axis=1)
     # Per-codon totals for this gene; codons missing from cod count as 0.
     codldf = Series({
         i: (cod.loc[nm, i[0]] if i[0] in cod.loc[nm] else 0)
         for i in allcodons
     })
     # Join onto the codon totals so every codon key is present as a row,
     # then drop the temporary 'cod' helper column.
     ldf = codldf.to_frame('cod').join(ldf).drop('cod', axis=1)
     # Attach the totals once per data column as '<col>_s'.
     # NOTE(review): this appends columns while iterating — pandas iterates
     # the column Index object captured at loop start, so only the original
     # columns appear to get a '_s' partner; verify this is the intent.
     for col in ldf.columns:
         ldf[col + '_s'] = codldf
     return ldf.fillna(0)
Example #4
0
def create_codon_trans_matrix(dirsname, tmpdir, grpnm):
    """
    Unite per-directory codon files into codon-transition matrices and
    pickle the results.

    Produces four pickles under ``General.Basepath`` for group ``grpnm``:
    normalized transition rates, raw transition counts, amino-acid level
    counts, and the per-sample nucleotide count.
    """
    chdirmkifnotexist(join(tmpdir, 'tmpmq'))
    # TODO: IMPORTANT! Wrap this for loop with your hpc job submission pipeline
    for fname in glob(join(dirsname, '*codons*')):
        # Skip inputs whose sums file already exists, so reruns resume.
        if exists(
                join(tmpdir,
                     grpnm + basename(fname).replace('codons', 'sums'))):
            continue
        do_one_unite(fname, grpnm, tmpdir)
    # Accumulate every per-file sums frame into one, summing overlapping
    # (level 0, level 1) index entries.
    df = None
    for f in glob(join(tmpdir, grpnm + '*sums.df')):
        print(basename(f))
        if df is None:
            df = read_pickle(f)
        else:
            df = concat([df, read_pickle(f)],
                        sort=False).groupby(level=[0, 1]).sum()
    # Split columns into two groups: raw per-transition columns ('per') and
    # total-count columns named '<x>_s' ('sums').
    # NOTE(review): if no '*sums.df' files matched, df is still None here
    # and this line raises AttributeError — confirm inputs always exist.
    df.columns = MultiIndex.from_tuples([('per',i) if '_s' not in i \
                                         else ('sums',i.replace('_s','')) for i in df.columns])
    # Nucleotide count per sample: codon totals divided by 3.
    sms = df.sums.sum().truediv(3)

    def applyfunc(ldf):
        # Normalized transition rates for one source codon; the diagonal
        # entry is set so each row sums to 1.
        ret = ldf.per.truediv(ldf.sums).loc[ldf.name].iloc[:, 0]
        ret[ldf.name] = 1 - ret.sum()
        return ret

    def applyfunc_counts(ldf):
        # Raw transition counts for one source codon; the diagonal entry is
        # the remainder of the codon's total.
        adds = ldf.sums.iloc[0][0]
        ret = ldf.per.loc[ldf.name].iloc[:, 0]
        ret[ldf.name] = adds - ret.sum()
        return ret

    # NOTE(review): groupby(..., axis=1) is deprecated in recent pandas —
    # consider transposing instead if the pandas version is upgraded.
    df_counts = df.groupby(
        level=1,
        axis=1).apply(lambda x: x.groupby(level=0).apply(applyfunc_counts))
    df_norm = df.groupby(
        level=1, axis=1).apply(lambda x: x.groupby(level=0).apply(applyfunc))
    aas, _, _ = get_codon_table()
    # NOTE(review): df_aas aliases df_counts (no copy), so the aa_start /
    # aa_end columns added below also end up in the pickled df_counts —
    # confirm this is intended.
    df_aas = df_counts
    df_aas['aa_start'] = [aas[a] for a in df_counts.index.get_level_values(0)]
    # Mark synonymous changes between distinct codons with a trailing '.'.
    df_aas['aa_end'] = [aas[a2] + ('.' if a1!=a2 and aas[a1]==aas[a2] else '') \
                        for a1, a2 in \
                        zip(df_counts.index.get_level_values(0),df_counts.index.get_level_values(1))]
    df_aas = df_aas.groupby(['aa_start', 'aa_end']).sum()
    df_norm.to_pickle(
        join(General.Basepath, grpnm + '_4_60_mutation_codons.df'))
    df_counts.to_pickle(
        join(General.Basepath, grpnm + '_4_60_mutation_codon_counts.df'))
    df_aas.to_pickle(
        join(General.Basepath, grpnm + '_4_60_mutation_aas_counts.df'))
    sms.to_pickle(join(General.Basepath, grpnm + '_4_60_nucleotide_count.df'))
Example #5
0
 def _Intercept_2const(df):
     """Rename an 'Intercept' label in df's index to 'const', in place."""
     from pandas.core.indexes.multi import MultiIndex
     if isinstance(df.index, MultiIndex):
         # Rewrite each index tuple, swapping 'Intercept' for 'const'.
         rows = []
         for tup in df.index.values:
             tup = list(tup)
             if 'Intercept' in tup:
                 tup[tup.index('Intercept')] = 'const'
             rows.append(tup)
         # Transpose row tuples into per-level arrays.
         df.index = MultiIndex.from_arrays(list(zip(*rows)))
     else:
         labels = df.index.tolist()
         if 'Intercept' in labels:
             labels[labels.index('Intercept')] = 'const'
         df.index = labels
     return df
Example #6
0
 def _Intercept_2const(df):
     """
     Rename an 'Intercept' index label to 'const', in place.

     Returns the (mutated) df for call chaining.
     """
     from pandas.core.indexes.multi import MultiIndex
     # BUG FIX: Index.contains() was deprecated in pandas 0.25 and removed
     # in pandas 1.0 — use the `in` operator instead (consistent with the
     # other _Intercept_2const variant in this file).
     if 'Intercept' in df.index:
         if isinstance(df.index, MultiIndex):
             new_index = []
             for i in df.index.values:
                 i = list(i)
                 if 'Intercept' in i:
                     i[i.index('Intercept')] = 'const'
                 new_index.append(i)
             # Transpose row tuples into per-level arrays (idiom fix:
             # list(zip(...)) replaces the py2-compat lzip shim).
             df.index = MultiIndex.from_arrays(list(zip(*new_index)))
         else:
             index_list = df.index.tolist()
             idx = index_list.index('Intercept')
             index_list[idx] = 'const'
             df.index = index_list
     return df
Example #7
0
    def _Intercept_2const(df):
        """Replace an "Intercept" index label with "const", in place."""
        from pandas.core.indexes.multi import MultiIndex

        # Guard clause: nothing to rename.
        if "Intercept" not in df.index:
            return df
        if isinstance(df.index, MultiIndex):
            rows = [list(tup) for tup in df.index.values]
            for tup in rows:
                if "Intercept" in tup:
                    tup[tup.index("Intercept")] = "const"
            # Transpose row tuples into per-level arrays.
            df.index = MultiIndex.from_arrays(list(zip(*rows)))
        else:
            labels = df.index.tolist()
            labels[labels.index("Intercept")] = "const"
            df.index = labels
        return df
def calc_pnpn_all(genedf, geneseq, pnpn_groups):
    """
    Calculate pN(group1)/pN(group2) for every group definition in
    ``pnpn_groups``; returns None when no consensus sequence is available.
    """
    consensus = _get_consensus(genedf, geneseq)
    if consensus is None:
        return None
    # Break the consensus sequence into codon triplets.
    cons_codons = [consensus[pos:pos + 3]
                   for pos in range(0, len(consensus), 3)]
    # Normalize by read depth at each position to get per-SNP relative
    # abundances.
    normdf = genedf.groupby(level=1).apply(lambda grp: grp.truediv(grp.sum(0)))
    results = []
    for grp_name, (g1, g2, nononymity) in pnpn_groups.items():
        # One calculation per group definition, indexed by group name.
        res = calc_pnpn_one(cons_codons, normdf, g1, g2, nononymity)
        res.index = MultiIndex.from_tuples([(grp_name, key)
                                            for key in res.index])
        results.append(res)
    return concat(results, sort=False)
Example #9
0
 def _multiindex(self) -> MultiIndex:
     """Left/right bounds as a two-level MultiIndex."""
     arrays = [self.left, self.right]
     return MultiIndex.from_arrays(arrays, names=["left", "right"])
Example #10
0
 def _multiindex(self):
     """Return a MultiIndex built from the left and right bounds."""
     level_names = ['left', 'right']
     return MultiIndex.from_arrays([self.left, self.right],
                                   names=level_names)
Example #11
0
 def _multiindex(self):
     """Build a two-level MultiIndex from the left/right bound arrays."""
     bounds = [self.left, self.right]
     return MultiIndex.from_arrays(bounds, names=['left', 'right'])