Example #1
import pandas as pd
import periodictable as pt  # formula(...).mass gives the molar mass of a formula

def to_molecular(df: pd.DataFrame, renorm=True):
    """
    Converts mass quantities to molar quantities of the same order.

    Parameters
    ----------
    df : :class:`pandas.DataFrame`
        Dataframe to transform.
    renorm : :class:`bool`, :code:`True`
        Whether to renormalise the dataframe after converting to relative moles.

    Returns
    -------
    :class:`pandas.DataFrame`
        Transformed dataframe.

    Notes
    -----
    Does not convert units (i.e. mass% --> mol%; mass-ppm --> mol-ppm).
    """
    MWs = [pt.formula(c).mass for c in df.columns]  # molar mass of each column's formula
    if renorm:
        # `renormalise` (assumed defined elsewhere in the module) closes rows to a constant sum
        return renormalise(df.div(MWs))
    else:
        return df.div(MWs)
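A quick usage sketch (toy values; assumes periodictable is installed and a renormalise helper is in scope):

oxides = pd.DataFrame({"SiO2": [50.0, 45.0], "MgO": [30.0, 38.0]})  # mass%
mol = to_molecular(oxides, renorm=False)
# each column is divided by its molar mass (SiO2 ~ 60.1, MgO ~ 40.3),
# giving relative moles on the same order as the input units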
Example #2
    def test_arith_flex_series(self):
        df = self.simple

        row = df.xs('a')
        col = df['two']
        # after arithmetic refactor, add truediv here
        ops = ['add', 'sub', 'mul', 'mod']
        for op in ops:
            f = getattr(df, op)
            op = getattr(operator, op)
            assert_frame_equal(f(row), op(df, row))
            assert_frame_equal(f(col, axis=0), op(df.T, col).T)

        # special case for some reason
        assert_frame_equal(df.add(row, axis=None), df + row)

        # cases which will be refactored after big arithmetic refactor
        assert_frame_equal(df.div(row), df / row)
        assert_frame_equal(df.div(col, axis=0), (df.T / col).T)

        # broadcasting issue in GH7325
        df = DataFrame(np.arange(3 * 2).reshape((3, 2)), dtype='int64')
        expected = DataFrame([[np.nan, np.inf], [1.0, 1.5], [1.0, 1.25]])
        result = df.div(df[0], axis='index')
        assert_frame_equal(result, expected)

        df = DataFrame(np.arange(3 * 2).reshape((3, 2)), dtype='float64')
        expected = DataFrame([[np.nan, np.inf], [1.0, 1.5], [1.0, 1.25]])
        result = df.div(df[0], axis='index')
        assert_frame_equal(result, expected)
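The axis semantics exercised above can be seen in a standalone sketch (toy data, not part of the test suite):

import numpy as np
import pandas as pd

df = pd.DataFrame(np.arange(1, 7).reshape(3, 2), columns=['a', 'b'])
col = df['a']
# axis=0 aligns the Series with the row labels, which is equivalent to
# transposing, dividing column-wise, and transposing back
assert df.div(col, axis=0).equals((df.T / col).T)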
Example #3
def propNoteGraph(data_test, b_u, b_i, mu, L, R):
    # Build a per-rating table comparing the naive and latent-factor predictors
    index_note = np.arange(1,6)
    count_1 = np.zeros([5,2])
    count_2 = np.zeros([5,2])
    notes = DataFrame(count_1,index=index_note,columns=['BON','MAUVAIS'])
    notes_naif = DataFrame(count_2,index=index_note,columns=['BON','MAUVAIS'])
    
    for r in range(data_test.shape[0]):
        # baseline prediction: global mean plus user and item biases
        mean = mu + b_u[data_test[r, 0]] + b_i[data_test[r, 1]]
        # full prediction adds the latent-factor term from L and R
        r_pred = round(mean + np.dot(L[data_test[r, 0], :], R[data_test[r, 1], :]))
        r_pred = min(5, r_pred)
        r_pred = max(1, r_pred)
        r_true = int(round(mean + data_test[r, 2]))
        r_naif = round(mean)

        if r_naif==r_true:
            notes_naif.BON[r_true]+=1
        else:
            notes_naif.MAUVAIS[r_true]+=1
        
        if r_pred==r_true:
            notes.BON[r_true]+=1
        else:
            notes.MAUVAIS[r_pred]+=1
                
    notes_naif_prop = notes_naif.div(notes_naif.sum(1),axis=0)
    notes_prop = notes.div(notes.sum(1),axis=0)
    
    notes_naif_VS_algo = pd.concat([notes_prop.BON,notes_naif_prop.BON], axis=1)
    notes_naif_VS_algo.columns = ['ALGO','NAIF']
    return notes_naif_VS_algo
Example #5
from pandas import DataFrame

def hmm_build(alphabet, aln, threshold, sigma):
    '''Given alphabet, multiple alignment aln, insertion threshold and pseudocount sigma,
    return the profile HMM transition and emission matrix.'''
    
    aln_cols = list(zip(*(aln)))
    m, n = len(aln), len(aln_cols)       # m sequences, n columns

    # indices of columns where '-' count is below threshold
    match_cols = [i for i in range(n) if aln_cols[i].count('-') / m < threshold]

    # state names
    k = len(match_cols)   # k match states
    states_ = ['M{0} D{0} I{0}'.format(i).split() for i in range(1, k + 1)]
    states = ['S', 'I0'] + [i for j in states_ for i in j] + ['E']
    
    # building matrices
    transitions = DataFrame(data=0.0, columns=states, index=states)
    emissions = DataFrame(data=0.0, columns=alphabet, index=states) 

    for seq in aln:  # iterate through each sequence
        state_ix = 0
        last_state = 'S'
        for i in range(n):
            if i in match_cols:
                state_ix += 1
                if seq[i] != '-':
                    current_state = 'M' + str(state_ix)
                    emissions.loc[current_state, seq[i]] += 1
                else:
                    current_state = 'D' + str(state_ix)
                
                transitions.loc[last_state, current_state] += 1
                last_state = current_state
            
            elif seq[i] != '-':
                current_state = 'I' + str(state_ix)
                transitions.loc[last_state, current_state] += 1
                emissions.loc[current_state, seq[i]] += 1
                last_state = current_state
                            
        transitions.loc[last_state, 'E'] += 1

    # normalise rows so each sums to 1 (the epsilon avoids division by zero)
    transitions = transitions.div(transitions.sum(1) + 1e-10, axis=0).round(3)
    emissions = emissions.div(emissions.sum(1) + 1e-10, axis=0).round(3)
    
    # add pseudocounts
    transitions.iloc[:2, 1:4] += sigma
    transitions.iloc[-4:-1, -2:] += sigma
    for i in range(k):
        transitions.iloc[i*3-1:i*3+2, i*3+1:i*3+4] += sigma
        emissions.iloc[i*3+1:i*3+3, :] += sigma
    emissions.iloc[-2, :] += sigma
    
    # renormalise after adding pseudocounts
    transitions = transitions.div(transitions.sum(1) + 1e-10, axis=0).round(3)
    emissions = emissions.div(emissions.sum(1) + 1e-10, axis=0).round(3)

    return transitions, emissions
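A minimal usage sketch on a toy alignment (illustrative only; the exact probabilities depend on threshold and sigma):

aln = ['ACDEF', 'ACD-F', 'AC-EF']   # three aligned sequences
alphabet = list('ACDEF')
transitions, emissions = hmm_build(alphabet, aln, threshold=0.4, sigma=0.01)
print(transitions.loc['S'])   # transition probabilities out of the start state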
Example #6
from pandas import DataFrame

def hmm_build(alphabet, aln, threshold, sigma):
    '''Given alphabet, multiple alignment aln, insertion threshold and pseudocount sigma,
    return the profile HMM transition and emission matrix.'''
    
    aln_cols = list(zip(*(aln)))
    m, n = len(aln), len(aln_cols)       # m sequences, n columns

    # indices of columns where '-' count is below threshold
    match_cols = [i for i in range(n) if aln_cols[i].count('-') / m < threshold]

    # state names
    k = len(match_cols)   # there are k match (M) states
    states_ = [('M'+ str(i), 'D' + str(i), 'I' + str(i)) for i in range(1, k + 1)]
    states = ['S', 'I0'] + [i for j in states_ for i in j] + ['E']
    
    # building matrices
    transitions = DataFrame(data=0.0, index=states, columns=states)
    emissions   = DataFrame(data=0.0, index=states, columns=alphabet) 

    for seq in aln:  # iterate through each sequence
        state_ix = 0
        last_state = 'S'
        for i in range(n):
            if i in match_cols:
                state_ix += 1
                if seq[i] != '-':
                    current_state = 'M' + str(state_ix)
                    emissions.loc[current_state, seq[i]] += 1
                else:
                    current_state = 'D' + str(state_ix)
                
                transitions.loc[last_state, current_state] += 1
                last_state = current_state
            
            elif seq[i] != '-':
                current_state = 'I' + str(state_ix)
                transitions.loc[last_state, current_state] += 1
                emissions.loc[current_state, seq[i]] += 1
                last_state = current_state
                            
        transitions.loc[last_state, 'E'] += 1

    # normalise rows so each sums to 1 (the epsilon avoids division by zero)
    transitions = transitions.div(transitions.sum(1) + 1e-10, axis=0).round(3)
    emissions = emissions.div(emissions.sum(1) + 1e-10, axis=0).round(3)
    
    # add pseudocounts
    transitions.iloc[:2, 1:4] += sigma
    transitions.iloc[-4:-1, -2:] += sigma
    for i in range(k):
        transitions.iloc[i*3-1:i*3+2, i*3+1:i*3+4] += sigma
        emissions.iloc[i*3+1:i*3+3, :] += sigma
    emissions.iloc[-2, :] += sigma
    
    # renormalise after adding pseudocounts; the tiny additive floor keeps
    # every probability strictly positive
    transitions = transitions.div(transitions.sum(1) + 1e-10, axis=0) + 1e-100
    emissions = emissions.div(emissions.sum(1) + 1e-10, axis=0) + 1e-100

    return transitions, emissions, states, k
Example #7
def to_molecular(df: pd.DataFrame, renorm=True):
    """
    Converts mass quantities to molar quantities of the same order.
    E.g.:
    mass% --> mol%
    mass-ppm --> mol-ppm
    """
    MWs = [pt.formula(c).mass for c in df.columns]
    if renorm:
        return renormalise(df.div(MWs))
    else:
        return df.div(MWs)
Example #9
 def word_vector(self, strx, stry):
     NVframe = DataFrame(self.data_oversam,columns=[strx, stry])
     NVframe[u'case']= NVframe[strx]+'_'+NVframe[stry]
     casecounts = NVframe[u'case'].value_counts()
     NVframe = NVframe.reset_index()
     del NVframe[u'index']
     Count_ob = Count()
     count = Count_ob.casecount(NVframe, casecounts)
     NVframe[u'count']= count
     NVframe = NVframe[NVframe[u'case'].notnull()]
     NVframe = NVframe.drop_duplicates()
     NVframe=NVframe.set_index([strx, stry])
     del NVframe[u'case']
     NVframe = NVframe.unstack()
     NVframe = NVframe.fillna(0)
     NVframe.columns = NVframe.columns.get_level_values(1)
     NVframe = NVframe.div(NVframe.sum(1),axis=0)
     # NVframe is the co-occurrence frequency matrix
     # standardisation step
     SVD_ob = SVD()
     Uframe,Vframe,Sframe = SVD_ob.SVD_run(NVframe)
     Sframe.plot()
     plt.plot( Sframe, 'o')
     #print Sframe
     #print Vframe
     m = Uframe.mean(0)
     s = Uframe.std(0)
     # z-score standardisation of the left singular vectors
     nd = Uframe.sub(m, axis=1).div(s, axis=1)
     SN = SVD_ob.sf(Sframe)
     return nd, SN
Example #10
import pandas as pd

def min_max_scale_df(df: pd.DataFrame) -> pd.DataFrame:
    """
    Scales the data frame values between 0 and 1 across the columns allowing for easier comparison of line shape on plots
    :param df: data frame to be scaled
    :return: scaled dataframe
    """
    return df.div(df.max(), axis=1)
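For instance, with toy non-negative data:

import pandas as pd

df = pd.DataFrame({"x": [1.0, 2.0, 4.0], "y": [10.0, 5.0, 20.0]})
print(min_max_scale_df(df))   # each column's maximum becomes 1.0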
Example #11
    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Divide dataframe by reference feature column.
        Parameters
        ----------
        X : Pandas DataFrame of shape = [n_samples, n_features]
            The data to be transformed.
        Raises
        ------
        TypeError
            If the input is not a Pandas DataFrame
        ValueError
            - If the dataframe is not of the same size as that used in fit().
        Returns
        -------
        X : pandas dataframe
            The dataframe with the transformed variables.
        """

        # Check method fit has been called
        check_is_fitted(self)

        # check that input is a dataframe
        X = _to_dataframe(X)

        # Check if input data contains same number of columns as dataframe used to fit.
        _ensure_ncols(X, self.input_shape_[1])

        # transform
        X = X.div(self.scaling_factors_, axis=0)
        return X
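For context, a minimal sketch of the kind of transformer this method could belong to; the class name and fit() logic are assumptions for illustration, not the library's actual implementation:

import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted

class ReferenceDivider(BaseEstimator, TransformerMixin):
    """Divide each row by per-row scaling factors learned in fit() (hypothetical)."""

    def __init__(self, reference: str):
        self.reference = reference

    def fit(self, X: pd.DataFrame, y=None):
        self.scaling_factors_ = X[self.reference]  # per-row divisors
        self.input_shape_ = X.shape
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        check_is_fitted(self)
        return X.div(self.scaling_factors_, axis=0)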
Example #12
from pandas import DataFrame

def hmm_build(alphabet, aln, threshold):
    '''Given alphabet, multiple alignment aln, and insertion threshold,
    return the profile HMM transition and emission matrix.'''
    
    aln_cols = list(zip(*(aln)))
    m, n = len(aln), len(aln_cols)       # m sequences, n columns

    # indices of columns where '-' count is below threshold
    match_cols = [i for i in range(n) if aln_cols[i].count('-') / m < threshold]

    # state names
    k = len(match_cols)   # k match states
    states_ = [('M'+ str(i), 'D' + str(i), 'I' + str(i)) for i in range(1, k + 1)]
    states = ['S', 'I0'] + [i for j in states_ for i in j] + ['E']

    # building matrices
    transitions = DataFrame(data=0.0, columns=states, index=states)
    emissions = DataFrame(data=0.0, columns=alphabet, index=states) 

    for seq in aln:  # iterate through each sequence
        state_ix = 0
        last_state = 'S'
        for i in range(n):
            if i in match_cols:
                state_ix += 1
                if seq[i] != '-':
                    current_state = 'M' + str(state_ix)
                    emissions.loc[current_state, seq[i]] += 1
                else:
                    current_state = 'D' + str(state_ix)
                
                transitions.loc[last_state, current_state] += 1
                last_state = current_state
            
            elif seq[i] != '-':
                current_state = 'I' + str(state_ix)
                transitions.loc[last_state, current_state] += 1
                emissions.loc[current_state, seq[i]] += 1
                last_state = current_state
                            
        transitions.loc[last_state, 'E'] += 1

    # normalize rows
    transitions = transitions.div(transitions.sum(1) + 1e-10, axis=0).round(3)
    emissions = emissions.div(emissions.sum(1) + 1e-10, axis=0).round(3)

    return transitions, emissions
Example #13
def scatter_pie_from_df(
    df: pd.DataFrame,
    x: str,
    y: str,
    cols: Optional[list] = [],
    normalize: bool = True,
    return_df: bool = False,
    palette: Optional[dict] = None,
    cmap: Optional[str] = "tab10",
    **kwargs,
) -> Axes:
    """
    Plot scatter pie based on columns in a DataFrame.
    
    Parameters:
        df: Dataframe containing x, y, and additional count columns.
        x: Column to use as x-values.
        y: Column to use as y-values.
        cols: List of columns in dataframe to use as ratios and plotting. 
            If [], uses all columns besides x and y.
        normalize: If True, calculate ratios using selected columns. 
        return_df: If True, also return normalized dataframe.
        palette: Dictionary mapping column name to color. 
            If None, create mapping using cmap. 
        cmap: Name of colormap to use if palette not provided. 
        kwargs: Arguments passed to :func:`scatter_pie`

    Returns:
        A :class:`~matplotlib.axes.Axes` and normalized df if `return_df` is True.
    """
    # make copy of dataframe and set xy as index
    df = df.copy().set_index([x, y])

    if isinstance(cols, list) and len(cols) > 1:
        # use the specified list of columns
        df = df.loc[:, cols]
    elif cols != []:
        raise ValueError("cols must be a list of more than one column header")

    # row normalize
    categories = df.columns
    df = df.div(df.sum(axis=1), axis=0).fillna(0)
    df = df.reset_index()

    # generate mapping of category to color
    if palette is None:
        palette = get_palette(categories, cmap)

    ratios = df[categories].to_records(index=False).tolist()
    colors = [palette[cat] for cat in categories]
    ax = scatter_pie(df[x].values, df[y].values, ratios, colors, **kwargs)

    # optionally return the normalised dataframe along with the axes
    if return_df:
        return ax, df
    return ax
Example #14
def jevons_index(
    prices: pd.DataFrame,
    base_prices: pd.DataFrame,
    axis: int = 1,
) -> pd.Series:
    """Calculates an index using the Jevons method which takes the
    geometric mean of price relatives.
    """
    price_relatives = prices.div(base_prices)
    return geo_mean(price_relatives, axis) * 100
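geo_mean is defined elsewhere in the source package; a usage sketch with a stand-in geometric mean (an assumption about its behaviour) could look like this:

import pandas as pd
from scipy.stats import gmean

def geo_mean(df: pd.DataFrame, axis: int = 1) -> pd.Series:
    # stand-in: geometric mean along the given axis
    return pd.Series(gmean(df.to_numpy(), axis=axis),
                     index=df.index if axis == 1 else df.columns)

prices = pd.DataFrame([[105.0, 99.0], [110.0, 102.0]], columns=["itemA", "itemB"])
base_prices = pd.DataFrame(100.0, index=prices.index, columns=prices.columns)
print(jevons_index(prices, base_prices))   # one index value per period (row)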
Example #15
def get_quality_adjusted_prices(
    prices: pd.DataFrame,
    base_prices: pd.DataFrame,
    adjustments: pd.DataFrame,
    axis: pd._typing.Axis = 1,
) -> pd.DataFrame:
    """Applies the quality adjustments to get new base prices."""
    adjustment_factor = prices.div(prices - adjustments)

    return base_prices * adjustment_factor.cumprod(axis)
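A small numeric illustration (made-up values):

import pandas as pd

prices = pd.DataFrame([[100.0, 102.0]], columns=["jan", "feb"])
base_prices = pd.DataFrame([[100.0, 100.0]], columns=["jan", "feb"])
adjustments = pd.DataFrame([[0.0, 2.0]], columns=["jan", "feb"])
# a 2-unit quality change in feb scales the base price by 102 / (102 - 2) = 1.02
print(get_quality_adjusted_prices(prices, base_prices, adjustments))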
Example #16
def carli_index(
    prices: pd.DataFrame,
    base_prices: pd.DataFrame,
    axis: int = 1,
) -> pd.Series:
    """Calculates an index using the Carli method which takes the mean
    of price relatives.
    """
    price_relatives = prices.div(base_prices)
    return price_relatives.mean(axis) * 100
Example #17
def laspeyres_index(
    prices: pd.DataFrame,
    base_prices: pd.DataFrame,
    weights: pd.DataFrame,
    axis: int = 1,
) -> pd.Series:
    """Calculates an index using the Laspeyres method which takes a
    sum of the product of the price relatives and weight shares.
    """
    price_relatives = prices.div(base_prices)
    return aggregate(price_relatives, weights, axis=axis) * 100
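aggregate is defined elsewhere in the source package; judging by Example #20 it also accepts a method argument for a geometric variant. A stand-in sketch of the arithmetic case, stated as an assumption:

import pandas as pd

def aggregate(relatives: pd.DataFrame, weights: pd.DataFrame,
              axis: int = 1) -> pd.Series:
    # assumed behaviour: arithmetic mean of relatives weighted by weight shares
    shares = weights.div(weights.sum(axis=axis), axis=1 - axis)
    return relatives.mul(shares).sum(axis=axis)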
Example #18
def plot_lines(df: pd.DataFrame, normalize=False):
    if normalize:
        # rebase every series to its first value so all lines start at 1.0
        df = df.div(df.iloc[0], axis=1)
    fig = go.Figure()
    for col in df.columns:
        fig.add_trace(go.Scatter(x=df.index, y=df[col], name=col))

    fig.update_layout(showlegend=True,
                      xaxis={'hoverformat': '%d%b%Y'},
                      yaxis={'hoverformat': '.1%'})
    fig.show()
Example #19
def _clean_up(df: pd.DataFrame) -> pd.DataFrame:
    """Форматирование данных."""
    df = df.transpose().stack()
    first_year = df.index[0][0]
    df.index = pd.date_range(
        name=col.DATE,
        freq="M",
        start=pd.Timestamp(year=first_year, month=1, day=END_OF_JAN),
        periods=len(df),
    )
    df = df.div(100)
    return df.to_frame(col.CPI)
Example #20
def geometric_laspeyres_index(
    prices: pd.DataFrame,
    base_prices: pd.DataFrame,
    weights: pd.DataFrame,
    axis: int = 1,
) -> pd.Series:
    """Calculates an index using the geometric Laspeyres method which
    takes the geometric mean of the price relatives multiplied by weight
    shares.
    """
    price_relatives = prices.div(base_prices)
    index = aggregate(price_relatives, weights, method='geomean', axis=axis)
    return index * 100
Example #21
def get_rolling_beta(df: pd.DataFrame, hist: pd.DataFrame, mark: pd.DataFrame,
                     n: int) -> pd.DataFrame:
    """Turns a holdings portfolio into a rolling beta dataframe

    Parameters
    ----------
    df : pd.DataFrame
        The dataframe of daily holdings
    hist : pd.DataFrame
        A dataframe of historical returns
    mark : pd.DataFrame
        The dataframe of market performance
    n : int
        The period to get returns for

    Returns
    ----------
    final : pd.DataFrame
        Dataframe with rolling beta
    """
    df = df["Holding"]
    uniques = df.columns.tolist()
    res = df.div(df.sum(axis=1), axis=0)
    res = res.fillna(0)
    comb = pd.merge(hist["Close"],
                    mark["Market"],
                    how="outer",
                    left_index=True,
                    right_index=True)
    comb = comb.fillna(method="ffill")
    for col in hist["Close"].columns:
        exog = sm.add_constant(comb["Close"])
        rols = RollingOLS(comb[col], exog, window=252)
        rres = rols.fit()
        res[f"beta_{col}"] = rres.params["Close"]
    final = res.fillna(method="ffill")
    for uni in uniques:
        final[f"prod_{uni}"] = final[uni] * final[f"beta_{uni}"]
    dropped = final[[f"beta_{x}" for x in uniques]].copy()
    final = final.drop(columns=[f"beta_{x}" for x in uniques] + uniques)
    final["total"] = final.sum(axis=1)
    final = final[final.index >= datetime.now() - timedelta(days=n + 1)]
    comb = pd.merge(final,
                    dropped,
                    how="left",
                    left_index=True,
                    right_index=True)
    return comb
Example #22
 def _get_proportional_weights(signal_df: pd.DataFrame,
                               values_df: pd.DataFrame,
                               inversely: bool) -> pd.DataFrame:
     """Assumes signal_df and values_df are two DataFrames with the same index and columns. inversely is bool and
     decides if the weights are proportional or inversely-proportional."""
     if not dataframe_has_same_index_and_column_names(signal_df, values_df):
         raise ValueError(
             'signal_df and values_df does not have the same composition.')
     values_df.iloc[:, 0] = np.nan
     if inversely:
         values_df = values_df.apply(lambda x: 1.0 / x)
     values_df *= signal_df
     values_sum_s = values_df.sum(axis=1)
     proportional_weight_df = values_df.div(values_sum_s,
                                            axis='index').fillna(value=0)
     return proportional_weight_df
Example #23
def generate_probability_vector_result(output_path):

    cluster_frame = pd.read_csv(output_path + '/clusters.csv', header=None)
    cluster_frame = cluster_frame.set_index(cluster_frame.iloc[:, 0]).iloc[:, 1:]
    cluster_array = cluster_frame.values

    points_frame = pd.read_csv(output_path + '/points.csv', header=None)
    # points_frame = points_frame.drop_duplicates()
    points_array = points_frame.values

    distance_matrix = pw.euclidean_distances(cluster_array, points_array)
    distance_matrix = distance_matrix.T
    distance_frame = DataFrame(distance_matrix)
    # print(distance_frame)
    # print(distance_frame.sum(axis=1))
    distance_frame = distance_frame.div(distance_frame.sum(axis=1), axis=0)
    distance_frame.to_csv(output_path + '/probability.csv')
Example #25
def winter_monthly(df: pd.DataFrame) -> pd.DataFrame:
    """Compute winter monthly deaths as a %age of all winter deaths."""
    df = df.query(("Date >= '1 Jul 2020' and Date <= '30 Jun 2021'"))
    df = df.resample("M").sum()
    assert df["UK"].sum() == 95234  # quality check

    # convert to monthly percentage of total
    df = df.div(df.sum()) * 100

    # data is to mid April 2021: pad remaining months to end of winter period with None
    idx = pd.to_datetime(
        [datetime(2021, 5, 31, 0, 0, 0), datetime(2021, 6, 30, 0, 0, 0)]
    )
    null_data = pd.DataFrame(columns=["UK"], data=[None, None], index=idx)
    df = pd.concat([df, null_data])  # DataFrame.append was removed in pandas 2.0

    return df
Example #26
def feature_statistics_per_class(features, targets, target_names, bins=5):
    from pandas import DataFrame
    from sklearn.preprocessing import LabelBinarizer
    from sklearn.feature_extraction import DictVectorizer
    import numpy as np

    binned_df = (features.div(features.max()) * bins).astype(int).astype(str)
    feature_dict = binned_df.to_dict(orient='records')

    dv = DictVectorizer()
    x = dv.fit_transform(feature_dict)
    y = LabelBinarizer().fit_transform(targets)

    feature_count_df = DataFrame(np.dot(x.T.todense(), y),
                                 columns=target_names,
                                 index=dv.get_feature_names_out())  # get_feature_names() in older scikit-learn
    feature_count_norm_df = feature_count_df.div(
        DataFrame(y, columns=target_names).sum())
    return feature_count_norm_df
Example #27
def get_quality_adjustments(
    quality_value: pd.DataFrame,
    to_reset: Optional[pd.DataFrame] = None,
    to_adjust: Optional[pd.DataFrame] = None,
) -> pd.DataFrame:
    """Return cumulative quality adjustment factors for given values.

    Accumulates the quality adjustments across each Feb-Jan+1 window,
    resetting back to no adjustment (a factor of 1) if a reset occurs.
    By default, adjustment factors are determined by dividing quality
    values by the value in the period before, but this can be subset
    using ``to_adjust``.

    Parameters
    ----------
    quality_value : DataFrame
        The quality value used to calculate quality adjustments.
    to_reset : DataFrame
        Boolean mask of quality adjustments to be reset.
    to_adjust : DataFrame
        Boolean mask of values to be adjusted.

    Returns
    -------
    DataFrame
        Cumulative adjustment factors for base prices.

    """
    # Divide size by the period before.
    adjustment_factors = quality_value.div(quality_value.shift(1, axis=1))

    if to_adjust is not None:
        adjustment_factors[~to_adjust] = 1

    if to_reset is not None:
        # Get the inverse cumulative growth for resetting.
        impute_resets = get_cumulative_adjustments(adjustment_factors).pow(-1)
        adjustment_factors = adjustment_factors.mask(to_reset, impute_resets)

    # Fill data lost in first period with 1 i.e. no adjustment.
    return get_cumulative_adjustments(adjustment_factors).fillna(1)
Example #28
def _axis_wise(
    df:             pd.DataFrame,
    level:          int,
    totals_name:    str,
    subtotals_name: str,
    ndigits:        int,
    unit:           int,
    **kwargs
) -> pd.DataFrame:
    if level > 0:
        totals_name = subtotals_name
    if isinstance(df.index, pd.MultiIndex):
        totals = (
            df.xs(totals_name, level=level, drop_level=False)
            .reindex(df.index)
            .bfill()
        )
    else:
        totals = df.loc[totals_name]
    result = df.div(totals).multiply(unit)
    return result.pipe(round_percentages, ndigits=ndigits)
Example #29
def _table_wise(
    df:             pd.DataFrame,
    level:          int,
    subtotals_name: str,
    ndigits:        int,
    unit:           int,
    **kwargs
) -> pd.DataFrame:
    if level == 0:
        totals = df.iloc[-1, -1]
        if df.index.nlevels > 1 or df.columns.nlevels > 1:
            frame = pd.DataFrame().reindex_like(df)
            frame.iloc[-1, -1] = totals
            totals = frame.bfill().bfill(axis=1)
    else:
        totals = (
            df.xs(subtotals_name, level=level, drop_level=False)
            .xs(subtotals_name, axis=1, level=level, drop_level=False)
            .reindex_like(df).bfill().bfill(axis=1)
        )
    result = df.div(totals).multiply(unit)
    return result.pipe(round_percentages, ndigits=ndigits)
Example #30
def _table_wise_multilevel(
    df:             pd.DataFrame,
    axlevels:       Any,
    totals_name:    str,
    subtotals_name: str,
    ndigits:        int,
    unit:           int,
    **kwargs
) -> pd.DataFrame:
    axlevels = [min(level) for level in axlevels]

    row_totals = totals_name if axlevels[0] == 0 else subtotals_name
    col_totals = totals_name if axlevels[1] == 0 else subtotals_name

    totals = (
        df.xs(row_totals, level=axlevels[0], drop_level=False)
        .xs(col_totals, axis=1, level=axlevels[1], drop_level=False)
        .reindex_like(df).bfill().bfill(axis=1)
    )

    result = df.div(totals).multiply(unit)
    return result.pipe(round_percentages, ndigits=ndigits)
Example #31
import numpy as np
import pandas as pd

def dataFrameMathTest():
    # Note: the methods that return a Series default to working on columns.
    # Load a DataFrame from a CSV file
    org_df = pd.read_csv('mlg.csv')
    df = org_df.iloc[:, 1:7]
    
    resAbs = df.abs() # absolute values
    print(resAbs)
    #resAdd = df.add(o) # add df, Series or value
    #print(resAdd)
    resCount = df.count() # non NA/null values
    print(resCount)
    resCumMax = df.cummax() # (cols default axis)
    print(resCumMax)
    resCumMin = df.cummin() # (cols default axis)
    print(resCumMin)
    resCumSum = df.cumsum() # (cols default axis)
    print(resCumSum)
    resDiff = df.diff() # 1st diff (col def axis)
    print(resDiff)
    resDiv = df.div(12) # div by df, Series, value
    print(resDiv)
    #resDot = df.dot(13) # matrix dot product
    #print(resDot)
    resMax = df.max() # max of axis (col def)
    print(resMax)
    resMean = df.mean() # mean (col default axis)
    print(resMean)
    resMedian = df.median()# median (col default)
    print(resMedian)
    resMin = df.min() # min of axis (col def)
    print(resMin)
    resMul = df.mul(2) # mul by df Series val
    print(resMul)
    resSum = df.sum() # sum axis (cols default)
    print(resSum)
    resWhere = df.where(df > 0.5, other=np.nan)
    print(resWhere)
Example #32
import numpy as np
import pandas as pd

def compute_percentages(data_: pd.DataFrame):
    # divide every column by the row's last column (assumed to hold the row total)
    percentages = data_.div(data_.iloc[:, -1], axis=0)
    percentages.replace([np.inf, -np.inf, np.nan], 0, inplace=True)
    return percentages
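For example, with a totals column last (made-up values):

import pandas as pd

data = pd.DataFrame({"a": [2.0, 0.0], "b": [3.0, 0.0], "total": [5.0, 0.0]})
print(compute_percentages(data))   # all-zero rows become 0 instead of NaN/inf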
Example #33
'''Given an emitted string x over an alphabet, a hidden path and the HMM states,
estimate the matrices of transition and emission probabilities.
'''

from pandas import DataFrame
from io import StringIO

f = open('rosalind_ba10h.txt').read().rstrip().split('--------\n')
x = list(f[0].rstrip())
alphabet = f[1].rstrip().split()
path = list(f[2].rstrip())
states = f[3].rstrip().split()    

transitions = DataFrame(data=0.0, index=states, columns=states)
emissions   = DataFrame(data=0.0, index=states, columns=alphabet)

for t in zip(path[:-1], path[1:]):
    transitions.loc[t] += 1

for a in zip(path, x):
    emissions.loc[a] += 1

transitions = transitions.div(transitions.sum(1) + 1e-10, axis=0).round(3)
emissions = emissions.div(emissions.sum(1) + 1e-10, axis=0).round(3)

f = StringIO()
transitions.to_csv(f, sep='\t', float_format='%g')
f.write('--------\n')
emissions.to_csv(f, sep='\t', float_format='%g')

open('rosalind_ba10h_sub.txt', 'wt').write(f.getvalue().rstrip())
Example #34
    def initial_setup(
        self,
        iot_p: pd.DataFrame,
        dtilde_iot: pd.DataFrame,
        ytilde_iot: pd.DataFrame,
        p_tau: float,
        substitution_rate: float,
    ) -> None:
        """
        One-time setup of the GDP model

        :param iot_p: primary input-output data
        :param dtilde_iot: intermediate input-output data
        :param ytilde_iot: final input-output data
        :param p_tau: tax rate on products and production (fraction of pre-crisis levels)
        :param substitution_rate: Rate at which capital can be substituted for labour and vice versa
        """
        self.iot_p = iot_p
        self.dtilde_iot = dtilde_iot
        self.ytilde_iot = ytilde_iot
        self.xtilde_iot = pd.concat(
            [
                iot_p[PrimaryInput.IMPORTS],
                iot_p[PrimaryInput.COMPENSATION],
                iot_p[[
                    PrimaryInput.FIXED_CAPITAL_CONSUMPTION,
                    PrimaryInput.NET_OPERATING_SURPLUS,
                ]].sum(axis=1),
            ],
            axis=1,
        ).T
        self.xtilde_iot.index = M
        # x~[M.K, T] == 0, so we add a small epsilon
        self.xtilde_iot = np.maximum(self.xtilde_iot, 1e-6)
        self.ytilde_total_iot = self.ytilde_iot.sum(axis=1)
        self.gamma_d = dtilde_iot.div(
            dtilde_iot.sum(axis=0) + self.xtilde_iot.sum(axis=0))
        self.gamma_x = self.xtilde_iot.div(
            dtilde_iot.sum(axis=0) + self.xtilde_iot.sum(axis=0))

        self.o_iot = iot_p[[
            PrimaryInput.TAXES_PRODUCTION, PrimaryInput.TAXES_PRODUCTS
        ]].T
        self.q_iot = dtilde_iot.sum(axis=0) + iot_p.sum(axis=1)
        assert np.allclose(
            (dtilde_iot.sum(axis=0) + iot_p.sum(axis=1)),
            (dtilde_iot.sum(axis=1) + self.ytilde_total_iot),
            rtol=1e-6,
        )  # errors are due to rounding and omission of household sector
        assert np.allclose(
            (dtilde_iot.sum(axis=0) + self.xtilde_iot.sum(axis=0) +
             self.o_iot.sum(axis=0)),
            (dtilde_iot.sum(axis=1) + self.ytilde_total_iot),
            rtol=1e-6,
        )  # errors are due to rounding and omission of household sector
        assert np.allclose(self.gamma_d.sum(axis=0) + self.gamma_x.sum(axis=0),
                           1.0,
                           atol=1e-9)
        assert (self.gamma_d >= 0).all().all()
        assert (self.gamma_x >= 0).all().all()
        # depends on p_tau
        cd_prod_fun = dtilde_iot.pow(self.gamma_d).prod(
            axis=0) * self.xtilde_iot.pow(self.gamma_x).prod(axis=0)
        min_prod_fun = pd.concat(
            [
                dtilde_iot.multiply(1 / self.gamma_d).min(),
                self.xtilde_iot.multiply(1 / self.gamma_x).min(),
            ],
            axis=1,
        ).min(axis=1)
        sum_prod_fun = (dtilde_iot.multiply(self.gamma_d).sum() +
                        self.xtilde_iot.multiply(self.gamma_x).sum())
        lin_prod_fun = (1 - substitution_rate
                        ) * min_prod_fun + substitution_rate * sum_prod_fun
        prod_fun = lin_prod_fun

        self.Lambda = (1 / (1 -
                            (self.o_iot.sum(axis=0) / self.q_iot) * p_tau) *
                       (dtilde_iot.sum(axis=0) + self.xtilde_iot.sum(axis=0)) /
                       prod_fun)

        self.gamma_d_dict = {(i, j): self.gamma_d.loc[i, j]
                             for i in Sector for j in Sector}
        self.gamma_x_dict = {(m, j): self.gamma_x.loc[m, j]
                             for m in M for j in Sector}
        self.Lambda_dict = {i: self.Lambda[i] for i in Sector}

        weight_taxes = {
            i: p_tau * self.o_iot.loc[PrimaryInput.TAXES_PRODUCTION, i] /
            self.q_iot[i]
            for i in Sector
        }
        self.gdp_per_sector = {
            i: self.indicator("xtilde", M.L, i) +
            self.indicator("xtilde", M.K, i) +
            self.indicator("q", i) * weight_taxes[i]
            for i in Sector
        }
        self.surplus_per_sector = {
            i: self.indicator("xtilde", M.K, i)
            for i in
            Sector  # households don't have capital input to production
        }
        self.objective_c = -np.sum(list(
            self.gdp_per_sector.values()), axis=0) - np.sum(
                list(self.surplus_per_sector.values()), axis=0)
        assert self.objective_c.shape[0] == len(self.variables)
        self.max_gdp_per_sector = (
            self.xtilde_iot.loc[M.L] + self.xtilde_iot.loc[M.K] +
            self.o_iot.loc[PrimaryInput.TAXES_PRODUCTION])
        self.max_gdp = self.max_gdp_per_sector.sum()

        self.c_production_function_lin(self.gamma_d_dict, self.gamma_x_dict,
                                       self.Lambda_dict, substitution_rate)
        self.c_input(self.o_iot, self.q_iot, p_tau)
        self.c_output(self.q_iot)
Example #35
def add_shares(table: pd.DataFrame):
    """Добавляет к таблице долю инвесторов в портфеле."""
    share = table.div(table["Portfolio"], axis="index")
    share.index = ["%"] * len(share)
    return pd.concat([table, share])
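A tiny illustration (made-up numbers):

import pandas as pd

table = pd.DataFrame({"Investor1": [30.0], "Investor2": [70.0], "Portfolio": [100.0]},
                     index=["value"])
print(add_shares(table))   # appends a '%' row with each column as a share of Portfolio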
Example #36
import numpy as np
from pandas import DataFrame

frame1 = DataFrame(np.arange(9).reshape(3, 3),
                   columns=list('abc'))
frame2 = DataFrame(np.arange(1,10).reshape(3,3),
                   columns=list('abc'))
print(frame1)
print(frame2)

# frame addition
add = frame1.add(frame2)
print(add)

# frame subtraction
sub = frame2.sub(frame1)
print(sub)

# frame division: div = frame2 / frame1
div = frame2.div(frame1)
print(div)  # inf where the denominator is 0

# frame multiplication
mul = frame1.mul(frame2)
print(mul)

# row/column-wise sum / mean / max / min

sum1 = mul.sum(axis=1)  # row-wise
sum2 = mul.sum(axis=0)  # column-wise
print('Row-wise sums:\n', sum1)
print('Column-wise sums:\n', sum2)


avg1 = mul.mean(axis=1)  # row-wise mean