Example #1
def total_energy(positions: pd.DataFrame, velocities: pd.DataFrame) -> float:
    """
    Computes the total energy of the system for the given positions and velocities.
    """
    pot = positions.abs().sum(axis=1)
    kin = velocities.abs().sum(axis=1)
    return sum(pot * kin)
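A minimal usage sketch (hypothetical values; assumes pandas imported as pd):

import pandas as pd

# Two bodies with x/y/z positions and velocities (made-up numbers).
positions = pd.DataFrame({"x": [1, -2], "y": [3, 0], "z": [-4, 5]})
velocities = pd.DataFrame({"x": [-1, 1], "y": [2, -2], "z": [0, 3]})

# Potential = sum of |coordinates| per body, kinetic = sum of |velocity
# components| per body; total = sum over bodies of pot * kin.
print(total_energy(positions, velocities))  # 8*3 + 7*6 = 66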
Example #2
def linear_envelope(
    signal_df: pandas.DataFrame,
    critical_freqs: Union[float, Sequence[float]],
    sampling_frequency: int,
    order: int,
    filter_type: str = "butter",
    zero_lag: bool = True,
    cheby_param: Optional[float] = None,
    zero_center_: bool = True,
    inplace: bool = False,
) -> pandas.DataFrame:
    """Find the linear envelope of a signal.

    This function finds the linear envelope of the raw EMG signal by:

    1. (optionally) zero-centering each signal.
    2. Taking the `abs` of each value (full-wave rectification).
    3. Low-pass filtering the signal.

    Args:
        signal_df: a :py:class:`~pandas.DataFrame` with a different
            discrete-time signal in each of its columns.

        critical_freqs: passed along to :py:func:`digital_filter`.

        sampling_frequency: passed along to :py:func:`digital_filter`.

        order: passed along to :py:func:`digital_filter`.

        filter_type: passed along to :py:func:`digital_filter`.

        zero_lag: passed along to :py:func:`digital_filter`.

        cheby_param: passed along to :py:func:`digital_filter`.

        zero_center_: if `True`, zero-center the data before taking its
            absolute value.

        inplace: if `True`, the data in the original
            :py:class:`~pandas.DataFrame` will be modified directly. If
            `False`, the transformations will be applied to a copy of the data.
    """
    if zero_center_:
        signal_df = zero_center(signal_df, inplace=inplace)
    if inplace:
        signal_df[:] = signal_df.abs()
    else:
        signal_df = signal_df.abs()

    return digital_filter(
        signal_df=signal_df,
        critical_freqs=critical_freqs,
        sampling_frequency=sampling_frequency,
        order=order,
        filter_type=filter_type,
        band_type="lowpass",
        zero_lag=zero_lag,
        cheby_param=cheby_param,
        inplace=inplace,
    )
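For reference, steps 2 and 3 can be reproduced with SciPy directly. A minimal sketch assuming a Butterworth low-pass; this stands in for the project's `digital_filter`, whose internals are not shown here:

import numpy as np
import pandas as pd
from scipy import signal

fs = 1000  # sampling frequency in Hz (assumed)
t = np.arange(0, 1, 1 / fs)
emg = pd.DataFrame({"ch1": np.random.randn(t.size)})  # stand-in raw EMG

rectified = (emg - emg.mean()).abs()  # zero-center, then full-wave rectify
b, a = signal.butter(4, 6, btype="lowpass", fs=fs)  # order 4, 6 Hz cutoff
envelope = pd.DataFrame(signal.filtfilt(b, a, rectified.to_numpy(), axis=0),
                        columns=emg.columns)  # forward-backward: zero lag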
Example #3
def reduce_correlation_matrix(correlations: pd.DataFrame, reduction_size: int):
    best_indicators: List[str] = []

    correlations = correlations.abs()
    correlations_original = correlations.copy()

    cor = correlations.to_numpy()
    row_sums = np.sum(cor, axis=1)
    min_row = np.argmin(row_sums)
    best_indicators.append(correlations.columns[min_row])

    correlations.drop(correlations.index[min_row], axis="index", inplace=True)
    correlations.drop(correlations.columns[min_row],
                      axis="columns",
                      inplace=True)

    while len(best_indicators) < reduction_size:
        row_sums = []
        for index in correlations.index:
            row_sums.append(correlations_original.loc[index,
                                                      best_indicators].sum())
        min_row = np.argmin(row_sums)
        ind = correlations.columns[min_row]
        if ind not in best_indicators:
            best_indicators.append(ind)
            correlations.drop(correlations.index[min_row],
                              axis="index",
                              inplace=True)
            correlations.drop(correlations.columns[min_row],
                              axis="columns",
                              inplace=True)

    ret = correlations_original.loc[best_indicators, best_indicators]
    return ret
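A small usage sketch with a toy symmetric correlation matrix (made-up values):

import pandas as pd

corr = pd.DataFrame([[1.0, 0.9, 0.1],
                     [0.9, 1.0, 0.2],
                     [0.1, 0.2, 1.0]],
                    index=["a", "b", "c"], columns=["a", "b", "c"])

# Greedily keeps the 2 indicators least correlated with each other: c, then a.
print(reduce_correlation_matrix(corr, 2))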
Example #4
def plot_feature_importance(coefficients: DataFrame,
                            limit: Optional[int] = None) -> None:
    with pd.option_context('display.max_rows', None, 'display.max_columns',
                           None):
        if not limit:
            limit = len(coefficients)

        coefficients = coefficients.reindex(coefficients.abs().sort_values(
            ascending=True, by='mean').index)
        coefficients = coefficients[-limit:]

    plt.figure(figsize=(4, 7 * (limit / 25)))

    plt.tick_params(
        axis='y',  # changes apply to the y-axis
        which='both',  # both major and minor ticks are affected
        left=False,  # ticks along the left edge are off
    )
    # plt.tick_params(axis='x', labelcolor='#414141', color='#b9b8b9')

    rects = plt.barh(
        coefficients.index,
        coefficients['mean'],
        color="#f89f76",
    )

    max_width = pipe(
        rects,
        map(lambda rect: rect.get_width()),
        max,
    )

    for index, rect in enumerate(rects):
        number = coefficients.iloc[index]['mean']
        plt.text(
            max_width * 1.1 + (-0.02 if number < 0 else 0),
            rect.get_y() + 0.2,
            f'{number:.3f}',
            # color='#060606',
            ha='left',
        )
    # plt.gcf().patch.set_facecolor('#fdeadd')
    plt.margins(y=0.01)
    # plt.gca().patch.set_facecolor('white')
    plt.gca().spines['top'].set_visible(False)
    plt.gca().spines['bottom'].set_visible(False)
    plt.gca().spines['right'].set_visible(False)
    plt.gca().spines['left'].set_linewidth(1)
    plt.gca().spines['left'].set_color('#b9b8b9')
    plt.gca().set_axisbelow(True)

    import matplotlib as mpl
    mpl.rcParams['figure.dpi'] = 100

    plt.grid(axis='x')

    plt.gca().xaxis.grid(linestyle='--', which='major', linewidth=1)
    plt.gca().get_xgridlines()[1].set_linestyle('-')
Example #5
 def test_where_complex(self):
     # GH 6345
     expected = DataFrame([[1 + 1j, 2], [np.nan, 4 + 1j]],
                          columns=["a", "b"])
     df = DataFrame([[1 + 1j, 2], [5 + 1j, 4 + 1j]], columns=["a", "b"])
     df[df.abs() >= 5] = np.nan
     tm.assert_frame_equal(df, expected)
Example #6
    def process(self, data, num_phonemes=None, wavelets=False):
        """Sum CWT energies from EMG data into one window per phoneme in the file.

        Args:
            data: a pandas DataFrame containing subvocalization EMG data
                processed by CWT, with any number of columns.
            num_phonemes: number of windows to split the data into.
            wavelets: if True, compute and return the summed energy windows.
        Returns:
            a DataFrame containing summed energy-spectrum windows for each
            phoneme, based on an even splitting of the data.
        """

        if num_phonemes:
            num_windows = num_phonemes
            self.samples_per_window = int(data.shape[0] // num_windows)

        if wavelets:
            windows = DataFrame()
            # Go through each row from first index to last for each window
            for window in range(num_windows):
                first_index = int(window * self.samples_per_window)
                last_index = int(first_index + self.samples_per_window)
                data_window = DataFrame(data.iloc[first_index:last_index])
                # Sum up the squares of amplitudes in each column
                new_row = data_window.abs().pow(2).sum(axis=0)
                # DataFrame.append was removed in pandas 2.0; concat the row instead
                windows = pd.concat([windows, new_row.to_frame().T],
                                    ignore_index=True)
            # Return all the windows
            return windows
Example #7
def create_df_with_most_and_least_correlated_features(
        corr_df: pd.DataFrame,
        corr_column: str,
        max_variables: int,
        return_most_correlated: bool = False,
        return_least_correlated: bool = False) -> pd.DataFrame:
    """
    Creates data frame with max_variables/2 most correlated and max_variables/2 least correlated
    features
    :param corr_df: table with correlations
    :param corr_column: name of column with correlations
    :param max_variables: maximum variables in the output data frame
    :param return_most_correlated: boolean variable if we want to create df with only most
                                   correlated variables
    :param return_least_correlated: boolean variable if we want to create df with only least
                                    correlated variables
    Return: data frame with most or/and least correlated features
    """

    sorted_df = corr_df.abs().sort_values(by=[corr_column], ascending=False)

    most_correlated = corr_df.loc[sorted_df.index].head(int(max_variables / 2))
    least_correlated = corr_df.loc[sorted_df.index].tail(int(max_variables /
                                                             2))

    if return_most_correlated:
        return most_correlated
    elif return_least_correlated:
        return least_correlated
    else:
        return pd.concat([most_correlated, least_correlated])
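A usage sketch, assuming a single column of correlations (hypothetical values):

import pandas as pd

corr_df = pd.DataFrame({"corr": [0.95, -0.80, 0.02, -0.01]},
                       index=["f1", "f2", "f3", "f4"])

# Two strongest plus two weakest |correlations| in one frame.
print(create_df_with_most_and_least_correlated_features(corr_df, "corr", 4))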
Example #8
def true_range(high, low, close, drift=None, offset=None, **kwargs):
    """Indicator: True Range"""
    # Validate arguments
    high = verify_series(high)
    low = verify_series(low)
    close = verify_series(close)
    high_low_range = non_zero_range(high, low)
    drift = get_drift(drift)
    offset = get_offset(offset)

    # Calculate Result
    prev_close = close.shift(drift)
    ranges = [high_low_range, high - prev_close, prev_close - low]
    true_range = DataFrame(ranges).T
    true_range = true_range.abs().max(axis=1)

    # Offset
    if offset != 0:
        true_range = true_range.shift(offset)

    # Handle fills
    if 'fillna' in kwargs:
        true_range.fillna(kwargs['fillna'], inplace=True)
    if 'fill_method' in kwargs:
        true_range.fillna(method=kwargs['fill_method'], inplace=True)

    # Name and Categorize it
    true_range.name = f"TRUERANGE_{drift}"
    true_range.category = 'volatility'

    return true_range
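The core computation is TR = max(high - low, |high - prev_close|, |prev_close - low|); a plain-pandas sketch without the pandas_ta helpers (made-up prices):

import pandas as pd

high = pd.Series([10.0, 11.5, 12.0])
low = pd.Series([9.0, 10.5, 11.0])
close = pd.Series([9.5, 11.0, 11.8])

prev_close = close.shift(1)  # drift = 1
ranges = pd.concat([high - low, high - prev_close, prev_close - low], axis=1)
print(ranges.abs().max(axis=1))  # row 0 falls back to high - low (prev_close is NaN)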
Example #9
def compute_top_correlations_features(corr: pd.DataFrame,
                                      max_features: int) -> list:
    """
    Returns the max_features features having top correlations.

    Parameters
    ----------
    corr: pd.DataFrame
    max_features : int

    Returns
    -------
    list
    """
    sorted_corr = corr.abs().unstack().sort_values(kind="quicksort")[::-1]
    set_features = set()
    i = 0
    while len(set_features) < max_features and i < len(sorted_corr):
        if sorted_corr.index[i][0] != sorted_corr.index[i][1]:
            set_features.add(sorted_corr.index[i][0])
            # Last iteration can add one more feature otherwise
            if len(set_features) != max_features:
                set_features.add(sorted_corr.index[i][1])
        i += 1
    return list(set_features)
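A usage sketch on a correlation matrix built from random data:

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
df = pd.DataFrame(rng.normal(size=(100, 5)), columns=list("abcde"))

# The 3 features involved in the strongest off-diagonal correlations.
print(compute_top_correlations_features(df.corr(), max_features=3))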
Example #10
def classify_open_closed_loci_with_quant(
        df: pd.DataFrame,
        quant: float = 0.1) -> Tuple[pd.DataFrame, pd.DataFrame]:
    abs_sub = df.abs()
    q = abs_sub.intensity.quantile(quant)
    open_areas = abs_sub.query("intensity <= @q")
    closed_areas = abs_sub.query("intensity > @q")
    return open_areas, closed_areas
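A usage sketch; the frame is assumed to carry an `intensity` column:

import pandas as pd

df = pd.DataFrame({"intensity": [-0.05, 0.2, -1.3, 0.8, 0.01]})

# Lowest 40% of |intensity| are classified open, the rest closed.
open_areas, closed_areas = classify_open_closed_loci_with_quant(df, quant=0.4)
print(len(open_areas), len(closed_areas))  # 2 3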
Example #11
def up_down_from_characteristic_direction(expr: pd.DataFrame, top_n=600):
    '''Using the output of `characteristic_direction`, extract the top n genes
    with the highest absolute characteristic-direction coefficients and split
    them into `up` and `down`.
    '''
    highest_abs_expr = expr.loc[expr.abs().sort_values(
        'CD-coefficient', ascending=False)[:top_n].index]
    return type(
        'UpDownGeneset', tuple(),
        dict(
            up=list(highest_abs_expr[highest_abs_expr > 0].dropna().index),
            down=list(highest_abs_expr[highest_abs_expr < 0].dropna().index),
        ))
Example #12
def plot_feature_permutation_importances(
    X_train_coefs: DataFrame,
    X_test: DataFrame,
    y_test: Series,
    est: BaseEstimator,
    sort_by: str = "coefficient",
    figsize: Tuple = (12, 8),
    ptitle: str = "plot title",
    savefig: Path = Path.cwd() / "reports" / "figures",
    save_pref: bool = False,
) -> DataFrame:
    """Plot feature and permutation importances"""
    fig, axs = plt.subplots(figsize=figsize, nrows=1, ncols=2)
    plt.subplots_adjust(wspace=0.6)
    axf = axs[0]
    # Sort by absolute value
    X_train_coefs = X_train_coefs.reindex(X_train_coefs.abs().sort_values(
        by=sort_by, ascending=True).index)
    X_train_coefs.plot(kind="barh", ax=axf, legend=False)
    axf.set_ylabel(None)
    axf.set_title(ptitle, fontweight="bold")
    labels = [
        item.get_text().replace("_", " ") for item in axf.get_yticklabels()
    ]
    axf.set_yticklabels(labels)

    axp = axs[1]
    result = permutation_importance(est,
                                    X=X_test,
                                    y=y_test,
                                    n_repeats=10,
                                    random_state=42,
                                    n_jobs=-1)
    sorted_idx = result.importances_mean.argsort()
    axp.boxplot(
        result.importances[sorted_idx].T,
        vert=False,
        labels=X_test.columns[sorted_idx],
        patch_artist=True,
    )
    axp.set_title("Permutation Importances (test data)",
                  fontweight="bold",
                  loc="left")
    curr_datetime = datetime.now().strftime("%Y%m%d-%H%M%S")
    filename = f"feature_importances__{curr_datetime}.png"
    if save_pref and not (savefig / filename).is_file():
        fig.savefig(savefig / filename, bbox_inches="tight", dpi=300)
    return X_train_coefs
Example #13
    def explain_local(self,
                      x_explain: pd.DataFrame,
                      n_cols: Optional[int] = None):
        res = []
        for sample_explanation in x_explain.abs().to_dict(orient='records'):
            importance = self._regularize(
                sorted(sample_explanation.items(),
                       key=operator.itemgetter(1),
                       reverse=True))

            total_mvmt = sum(map(operator.itemgetter(1), importance))
            res_ind = dict(importance[:n_cols])
            res_ind['rest'] = total_mvmt - sum(res_ind.values())
            res.append(res_ind)

        return res
Example #14
def comprehensive_analysis(records, **conf):
    rates = DataFrame()
    for code, values in records:
        rates[code] = change_rate(
            values, conf['col'], conf['slide'], conf['days'])

    # print '==== plot rates ===='
    # plot_items(rates, 2, 400)

    # print '==== highest correlations ===='
    # smoothed = rates.applymap(lambda x: round(x / 5.0))
    # sims = smoothed.corr()
    # sims = sims[sims != 1]
    # print sims.head().stack().nlargest(5)

    print('==== highest liquidities (variances) ====')
    var_highest = rates.abs().sum().nlargest(5)
    print(var_highest)
Example #15
    def process(self,
                data: pd.DataFrame,
                factWeight: pd.DataFrame,
                method: str = 'Equal',
                rp: int = 60,
                hp: int = 5,
                **kwargs) -> pd.DataFrame:
        """
        部分权重会用到未来数据,所以需要对权重进行平移与相应的因子值进行匹配
        Parameters
        ----------
        hp : 持有期
        rp : 滚动周期
        data : 因子集
        factWeight :因子权重
        method : 因子合成方法
        kwargs :

        Returns
        -------

        """
        self.rp, self.hp = rp, hp

        factDir = np.sign(factWeight.rolling(rp, min_periods=1).mean())
        factDir = factDir.shift(hp + 1)  # returns are the label (predicted values); weighting with historical return data needs the extra + 1

        # Flip factors to a positive orientation so factor returns and related metrics become monotonic
        factNew = data.mul(factDir, level=0).dropna()
        factWeightNew = factWeight.abs()

        method_dict = {"RetWeight": self.retWeighted,
                       "OPT": self.MAX_IC_IR
                       }

        if method is None:
            return data

        res = method_dict[method](fact=factNew, factWeight=factWeightNew, **kwargs)
        return res
Example #16
def dataFrameMathTest():
    #Note : The methods that return a series default to working on columns.
    # Load a DataFrame from a CSV file
    org_df = pd.read_csv('mlg.csv')
    df = org_df.iloc[:, 1:7]
    
    resAbs = df.abs() # absolute values
    print(resAbs)
    #resAdd = df.add(o) # add df, Series or value
    #print(resAdd)
    resCount = df.count() # non NA/null values
    print(resCount)
    resCumMax = df.cummax() # (cols default axis)
    print(resCumMax)
    resCumMin = df.cummin() # (cols default axis)
    print(resCumMin)
    resCumSum = df.cumsum() # (cols default axis)
    print(resCumSum)
    resDiff = df.diff() # 1st diff (col def axis)
    print(resDiff)
    resDiv = df.div(12) # div by df, Series, value
    print(resDiv)
    #resDot = df.dot(13) # matrix dot product
    #print(resDot)
    resMax = df.max() # max of axis (col def)
    print(resMax)
    resMean = df.mean() # mean (col default axis)
    print(resMean)
    resMedian = df.median()# median (col default)
    print(resMedian)
    resMin = df.min() # min of axis (col def)
    print(resMin)
    resMul = df.mul(2) # mul by df Series val
    print(resMul)
    resSum = df.sum() # sum axis (cols default)
    print(resSum)
    resWhere = df.where(df > 0.5, other=np.nan)
    print(resWhere)
Example #17
def remove_duplicate_columns(df: pd.DataFrame, tol: float = 1e-8) -> List[str]:
    """Remove duplicate columns.

    Parameters
    ----------
    df : DataFrame
        Input dataframe.
    tol : float, optional
        Tolerance to assess duplicate columns. Default is 1e-8.

    Returns
    -------
    columns: list of str
        Columns to keep after removing duplicates.

    """

    df = df / df.abs().sum(0)
    df *= 1 / tol
    # keep = df.round().T.drop_duplicates(keep="last").T.columns  # Slow!!
    idx = np.unique(df.round().values, axis=1, return_index=True)[-1]
    keep = df.columns[sorted(idx)]
    return keep
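Because each column is first normalized by its absolute sum, a scaled copy of a column counts as a duplicate. A quick sketch:

import numpy as np
import pandas as pd

df = pd.DataFrame({"a": [1.0, 2.0, 3.0],
                   "b": [2.0, 4.0, 6.0],  # scaled copy of "a"
                   "c": [1.0, 0.0, 1.0]})
print(list(remove_duplicate_columns(df)))  # "b" collapses onto "a": ['a', 'c']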
Example #18
import numpy as np
from pandas import DataFrame

npdata = np.random.randn(5, 3)
columnNames = ['x1', 'x2', 'x3']
data = DataFrame(npdata, columns=columnNames)

print(data.abs())

print('\nMaximum value per column:')
print(data.max())
print('\nMinimum value per row:')
print(data.min(axis=1))
print('\nSum of values per column:')
print(data.sum())
print('\nAverage value per row:')
print(data.mean(axis=1))
print('\nCalculate max - min per column')
f = lambda x: x.max() - x.min()
print(data.apply(f))
print('\nCalculate max - min per row')
f = lambda x: x.max() - x.min()
print(data.apply(f, axis=1))
Example #19
class Portfolio(object):

    """This class represents portfolio and its events."""

    def __repr__(self):
        return '<Portfolio {}>'.format(self.prices.shape)

    def __init__(self, ohlcs, starting_capital=100000, price_type='cprices',
                 transaction_fee_bps=15., transaction_fee_min=7):
        self.price_type = price_type
        self.transaction_fee_bps = transaction_fee_bps
        self.transaction_fee_min = transaction_fee_min
        self.prices = self.from_ohlcs(ohlcs, price_type)
        self.volumes = self.from_ohlcs(ohlcs, 'volumes')
        self.trades = DataFrame(zeros(self.prices.shape), self.prices.index,
                                self.prices.columns)
        self.fees = DataFrame(zeros(self.prices.shape), self.prices.index,
                              self.prices.columns)
        self.starting_capital = starting_capital
        self.capital = []
        self.quantities = []
        self.values = []
        self.refresh()

    def from_ohlcs(self, ohlcs, price_type):
        """Set prices using a list of ohlc classes."""
        dfs = []
        for ohlc in ohlcs:
            df = DataFrame(getattr(ohlc, price_type),
                           posix_as_dt(ohlc.timestamps))
            dfs.append(df)
        prices = concat(dfs, join='outer', axis=1)
        prices.columns = [ohlc.symbol for ohlc in ohlcs]
        return prices.ffill()  # fillna(method='pad') is deprecated

    def refresh(self):
        """Calculates positions, values, free capital and costs from trades.
        Fees of short positions (if any) are same as cost for long. This is not
        realistic, but the class is intended to represent long only portfolios.
        """
        self.fees = self.transaction_fee_bps * self.trades.abs() * \
            self.prices / 10000
        small = self.fees < self.transaction_fee_min
        nonzero = self.trades.abs() > 0
        self.fees[small & nonzero] = self.transaction_fee_min
        self.quantities = self.trades.cumsum()
        self.values = self.quantities * self.prices
        self.capital = self.starting_capital + self.total_trade_values - \
            self.total_fees

    def trade(self, timestring, symbol, quantity):
        """Convenience function to enter trades and refresh."""
        self.trades.loc[timestring, symbol] = quantity  # avoid chained assignment
        self.refresh()

    def trade_max(self):
        """Trade all capital on first day, equal sized positions."""
        first_day = dt_as_str(self.prices.index[0])
        trade_sizes = zeros(len(self.prices.columns))
        trade_sizes[:] = self.starting_capital / float(len(trade_sizes))
        trade_sizes = [trade_sizes[ind] / self.prices.iloc[0].values[ind] for
                       ind in arange(len(trade_sizes))]
        trade_sizes = trunc(trade_sizes)
        for ind in arange(len(self.prices.columns)):
            self.trade(first_day, self.prices.columns[ind], trade_sizes[ind])
        self.refresh()

    @property
    def market_value(self):
        """Value of equity positions at each time."""
        return self.values.sum(axis=1)

    @property
    def total_value(self):
        """Total value of portfolio at each time."""
        return self.market_value + self.capital

    @property
    def trade_values(self):
        """Trade values for each trade."""
        tvals = -self.trades * self.prices
        to_0 = (tvals == 0) + (isnull(tvals))
        tvals[to_0] = 0
        return tvals

    @property
    def total_trade_values(self):
        """Cumulative sum of all trades."""
        return self.trade_values.sum(axis=1).cumsum()

    @property
    def total_fees(self):
        """Cumulative sum of fees."""
        return self.fees.sum(axis=1).cumsum()
Example #20
#%%
#STEP 4
# This cell calculates the absolute values of the proportions of one race to another in the wards.
# The assign function from pandas is used to create new columns populated with these calculations.

# Reads the Excel file (assigned so the proportion calculations below can reference it)
disindx_assign1 = pd.read_excel(
    r"S:\GEOG 6293.10 Special Topics 201603\Volpe, Travis - TVolpe1\Final Project\GIS Python Project\Project Data\Data\Wards_2010_DCcounts.xlsx"
)

# Calculates proportion differences and uses assign to create new columns;
# df.abs() takes no arguments, so .abs() is applied to each difference instead
disindx_assign2 = (
    disindx_assign1
    .assign(AbsWB=(disindx_assign1['wi_WT'] - disindx_assign1['bi_BT']).abs())
    .assign(AbsWA=(disindx_assign1['wi_WT'] - disindx_assign1['ai_AT']).abs())
    # .assign(AbsBW=...) not needed; only used to confirm the absolute value was working
    .assign(AbsBA=(disindx_assign1['bi_BT'] - disindx_assign1['ai_AT']).abs())
    .head(n=9))

#Creates an excel file populated with the data calculated in disindx_assign2.
writer = pd.ExcelWriter(
    r"S:\GEOG 6293.10 Special Topics 201603\Volpe, Travis - TVolpe1\Final Project\GIS Python Project\Project Data\Data\Wards_2010_DCratio.xlsx"
)
disindx_assign2.to_excel(writer)
writer.save()

# Step 4a: in the exported Excel file, calculate the dissimilarity index
# D = 1/2 * SUM(Abs_Races)
# As the dissimilarity index is a global measure, and in this context is only being
# calculated for the entire city, it did not seem necessary to use pandas for this.
Example #21
    def test_operators_timedelta64(self):
        df = DataFrame(
            dict(
                A=date_range("2012-1-1", periods=3, freq="D"),
                B=date_range("2012-1-2", periods=3, freq="D"),
                C=Timestamp("20120101") - timedelta(minutes=5, seconds=5),
            ))

        diffs = DataFrame(dict(A=df["A"] - df["C"], B=df["A"] - df["B"]))

        # min
        result = diffs.min()
        assert result[0] == diffs.loc[0, "A"]
        assert result[1] == diffs.loc[0, "B"]

        result = diffs.min(axis=1)
        assert (result == diffs.loc[0, "B"]).all()

        # max
        result = diffs.max()
        assert result[0] == diffs.loc[2, "A"]
        assert result[1] == diffs.loc[2, "B"]

        result = diffs.max(axis=1)
        assert (result == diffs["A"]).all()

        # abs
        result = diffs.abs()
        result2 = abs(diffs)
        expected = DataFrame(dict(A=df["A"] - df["C"], B=df["B"] - df["A"]))
        tm.assert_frame_equal(result, expected)
        tm.assert_frame_equal(result2, expected)

        # mixed frame
        mixed = diffs.copy()
        mixed["C"] = "foo"
        mixed["D"] = 1
        mixed["E"] = 1.0
        mixed["F"] = Timestamp("20130101")

        # results in an object array
        result = mixed.min()
        expected = Series(
            [
                pd.Timedelta(timedelta(seconds=5 * 60 + 5)),
                pd.Timedelta(timedelta(days=-1)),
                "foo",
                1,
                1.0,
                Timestamp("20130101"),
            ],
            index=mixed.columns,
        )
        tm.assert_series_equal(result, expected)

        # excludes numeric
        result = mixed.min(axis=1)
        expected = Series([1, 1, 1.0], index=[0, 1, 2])
        tm.assert_series_equal(result, expected)

        # works when only those columns are selected
        result = mixed[["A", "B"]].min(1)
        expected = Series([timedelta(days=-1)] * 3)
        tm.assert_series_equal(result, expected)

        result = mixed[["A", "B"]].min()
        expected = Series([timedelta(seconds=5 * 60 + 5),
                           timedelta(days=-1)],
                          index=["A", "B"])
        tm.assert_series_equal(result, expected)

        # GH 3106
        df = DataFrame({
            "time": date_range("20130102", periods=5),
            "time2": date_range("20130105", periods=5),
        })
        df["off1"] = df["time2"] - df["time"]
        assert df["off1"].dtype == "timedelta64[ns]"

        df["off2"] = df["time"] - df["time2"]
        df._consolidate_inplace()
        assert df["off1"].dtype == "timedelta64[ns]"
        assert df["off2"].dtype == "timedelta64[ns]"
Example #22
yc10 - yc10[1]

frame7 = DataFrame(np.arange(12.).reshape((4, 3)),
                   columns=list('abc'),
                   index=['Utah', 'Ohio', 'Texas', 'Oregon'])
series_7 = frame7.iloc[0]
frame7 - series_7  # the series is subtracted from every row: this is broadcasting
series_7_1 = frame7['b']
frame7.sub(series_7_1, axis=0)

## Function application and mapping
frame8 = DataFrame(np.random.randn(4, 3),
                   columns=list('bde'),
                   index=['Ohio', 'Utah', 'Texas', 'Oregon'])
frame8
frame8.abs()  # = np.abs(frame8), take absolute values

f = lambda x: x.max() - x.min()
frame8.apply(f)
frame8.apply(f, axis=1)


def f(x):
    return Series([x.min(), x.max()], index=['min', 'max'])


frame8.apply(f)  # the applied function may return a Series of several values

f = lambda x: '%.2f' % x
frame8.applymap(f)  # note: applymap for element-wise application on a DataFrame!
frame8['b'].map(f)  # a Series uses map for element-wise functions
Example #23
def correlation_matrix_to_sorted_pairs(corr: pd.DataFrame):
    df = corr.abs().stack().reset_index()
    df = df.loc[(df['level_0'] != 'level_0') & (df['level_1'] != 'level_0')]
    df = df.loc[df['level_0'] != df['level_1']]
    return df.sort_values([0]).iloc[::2].reset_index(drop=True)
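A usage sketch; the `.iloc[::2]` keeps one of each symmetric (i, j)/(j, i) pair:

import numpy as np
import pandas as pd

rng = np.random.default_rng(1)
df = pd.DataFrame(rng.normal(size=(50, 3)), columns=["x", "y", "z"])

# One row per feature pair, sorted by |correlation| ascending.
print(correlation_matrix_to_sorted_pairs(df.corr()))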
Example #24
def plot_comparatives(data: DataFrame):
    columns = data.columns

    fig = figure(figsize=[14, 7])

    gs = GridSpec(3,
                  5,
                  width_ratios=[1.5, 1.5, 1.5, .06, 2.5],
                  height_ratios=[.3, 1, 1])
    gs.update(left=0.05, right=0.95, top=.95, wspace=0.3, hspace=0)

    hmap_original = subplot(gs[1, 0], xticks=[])
    hmap_original_x = subplot(gs[0, 0], yticks=[], xticks=[])

    hmap_blurred = subplot(gs[1, 1], xticks=[], yticks=[])
    hmap_blurred_x = subplot(gs[0, 1], yticks=[], xticks=[])

    hmap_contrast = subplot(gs[1, 2], xticks=[], yticks=[])
    hmap_contrast_x = subplot(gs[0, 2], yticks=[], xticks=[])

    hmap_cbar = subplot(gs[1, 3])

    gs_s = GridSpec(3, 2, width_ratios=[5, 2], height_ratios=[.5, 1, 1])
    gs_s.update(left=0.05, right=0.95, top=.95, wspace=0.5, hspace=0.3)
    hist = subplot(gs_s[0, 1], yticks=[])
    scat = subplot(gs_s[1, 1], xticks=[], yticks=[])

    gs2 = GridSpec(1, 2)
    gs2.update(left=0.05, right=0.95, top=.4, wspace=0.05)

    box1 = subplot(gs2[0, 0], xticks=[])
    box2 = subplot(gs2[0, 1], yticks=[], xticks=[])

    # ------------------------------------------------------------------

    functions = (
        template('r', 'Modulus of medians for\n2 sec windows Kendall Tau',
                 hmap_original, hmap_original_x, lambda x: x, 'original'),
        template('g', 'Modulus of Gaussian for\ncorrelation matrix',
                 hmap_blurred, hmap_blurred_x,
                 lambda x: gaussian_filter(x, sigma=(1, 1), order=0),
                 'gaussian'),
        template('b', 'Modulus of Gaussian\nwith increased contrast',
                 hmap_contrast, hmap_contrast_x,
                 lambda x: _adjust_contrast(x, lower=10, upper=90),
                 'high contrast'),
    )

    dm = list()
    for start in data.index[::512]:
        m = data[start:start + 512].corr('kendall').abs()
        dm.append(m.to_numpy())  # .as_matrix() was removed from pandas

    dm = asarray(dm)
    dm = nanmedian(dm, axis=0)

    hmap_kws = dict(xticklabels=10,
                    yticklabels=10,
                    square=True,
                    vmin=0,
                    vmax=1)

    # HEATMAPS ----------------------------------------------

    dfm = DataFrame(dm, columns=columns, index=columns).abs()
    results = dict()

    for index, item in enumerate(functions):
        dfm = DataFrame(item.func(dfm.to_numpy()),
                        columns=columns,
                        index=columns).abs()
        results[item.hist_label] = dfm.abs()

        sns.heatmap(dfm, ax=item.ax, cbar_ax=hmap_cbar, **hmap_kws)
        item.ax.set_yticklabels(item.ax.get_yticklabels(),
                                rotation=60,
                                fontsize=8)
        item.ax.set_xticklabels(item.ax.get_xticklabels(),
                                rotation=30,
                                fontsize=8)

        item.upper_ax.set_title(item.title)
        sum_dt = _smooth_line(dfm.to_numpy().sum(axis=0))
        item.upper_ax.plot(sum_dt)
        item.upper_ax.set_xlim(0, sum_dt.size)

        d_nan = dfm.abs().copy()
        d_nan[d_nan == 1] = NaN
        sns.kdeplot(d_nan.to_numpy().ravel(),
                    ax=hist,
                    c=item.c,
                    label=item.hist_label,
                    lw=0.8)
        hist.set_xticks([0])
        hist.legend(fontsize=8)

        if index == 0:
            item.upper_ax.set_ylabel('Sum', fontsize=8)
            continue

        item.ax.set_yticks([])
    else:
        hist.set_title(
            'Distributions of the medians\nof 2 sec Kendall tau windows')
        sns.despine(left=True,
                    right=True,
                    top=True,
                    bottom=False,
                    offset=5,
                    ax=hist)

        hmap_cbar.set_aspect(10)

    # BOX PLOTS -------------------------------------

    dfm = results['original']
    dfm[dfm == 1] = 0
    sns.boxplot(data=results['original'], ax=box1, linewidth=0.5, fliersize=3)
    box1.set_title('Distribution of each channel (original)')
    sns.despine(left=False,
                right=True,
                top=True,
                bottom=True,
                ax=box1,
                offset=5)
    box1.set_yticks([0, 1])
    box1.set_yticklabels([0, 1])
    box1.set_xticks([])
    box1.set_xlabel('Channels', fontsize=8)
    box1.set_ylim(0, 1)

    dfm = DataFrame(dm, columns=columns, index=columns).abs()
    dfm[dfm == 1] = 0
    sns.boxplot(data=results['high contrast'],
                ax=box2,
                linewidth=0.5,
                fliersize=3)
    sns.despine(left=False,
                right=True,
                top=True,
                bottom=True,
                ax=box2,
                offset=5)
    box2.set_title('Distribution of each channel (high contrast)')
    box2.set_yticks([0, 1])
    box2.set_yticklabels([])
    box2.set_xticks([])
    box2.set_xlabel('Channels', fontsize=8)
    box2.set_ylim(0, 1)

    dfm = results['high contrast']
    dfm[dfm == 1] = 0
    clustered_dt = _cluster(dfm.to_numpy())
    scat.scatter(clustered_dt['X0'],
                 clustered_dt['X1'],
                 c=clustered_dt['columns_list'],
                 cmap='Spectral',  # 'spectral' was removed from matplotlib
                 s=15)
    scat.set_title('Ward linkage')
    scat.set_xticks([])
    scat.set_yticks([])

    return fig
Example #25
    'nova lox': 'salmon'
}

data['animal'] = data['food'].map(str.lower).map(meat_to_animal)
print(data)

print(data['food'].map(lambda x: meat_to_animal[x.lower()]))

print('-------------------------')
# Data normalization
datafile = './data/normalization_data.xls'  # initialize parameters
data = pd.read_excel(datafile, header=None)  # read the data

print((data - data.min()) / (data.max() - data.min()))  # min-max normalization
print((data - data.mean()) / data.std())  # zero-mean (z-score) normalization
print(data / 10 ** np.ceil(np.log10(data.abs().max())))  # decimal-scaling normalization

print('-------------------------')
# Replacing values
data = Series([1., -999., 2., -999., -1000., 3.])
print(data)

print(data.replace(-999, np.nan))

print(data.replace([-999, -1000], np.nan))

print(data.replace([-999, -1000], [np.nan, 0]))

print(data.replace({-999: np.nan, -1000: 0}))

print('-------------------------')
Example #26
def compute_diff_size(diff: pd.DataFrame):
    return diff.abs().values.sum()
Example #27
df1+df2
df1.add(df2, fill_value=0)

# Operations between DataFrame and Series
frame = DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'),
                  index=['Utah', 'Ohio', 'Texas', 'Oregon'])
series = frame.iloc[0]  # .ix was removed from pandas; use .iloc
frame - series  # broadcast across each row
series2 = frame['d']
frame.sub(series2, axis=0)  # broadcast down each column

#Function application and mapping------------------------
# NumPy ufuncs are applied element-wise
frame = DataFrame(np.random.randn(4, 3), columns=list('bde'),index=['Utah', 'Ohio', 'Texas', 'Oregon'])
np.abs(frame)
frame.abs()
# DataFrame.apply applies the function to each column by default
f = lambda x: x.max() - x.min()  # x is a Series (a column or a row)
frame.apply(f)
frame.apply(f, axis=1)  # applied to each row

def f(x):
    return Series([x.min(), x.max()], index = ['min', 'max'])
frame.apply(f)
#Element-wise Python functions go through applymap; for a Series, use map
format = lambda x: '%.2f' % x
frame.applymap(format)
frame['e'].map(format)

#Hierarchical indexing=====================================
data = Series(np.random.randn(10),index=[['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'd', 'd'],[1, 2, 3, 1, 2, 3, 1, 2, 2, 3]])
Example #28
 def _generate_absolute_upper_triangle_corrs(self, corrs: DataFrame,
                                             mask: ndarray) -> DataFrame:
     return corrs.abs().where(mask, np.nan)
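The `mask` argument is presumably a boolean upper-triangle matrix; a sketch of how it might be built with NumPy:

import numpy as np
import pandas as pd

corrs = pd.DataFrame(np.random.default_rng(2).normal(size=(100, 4)),
                     columns=list("abcd")).corr()

mask = np.triu(np.ones(corrs.shape, dtype=bool), k=1)  # strict upper triangle
print(corrs.abs().where(mask, np.nan))  # lower triangle and diagonal become NaN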
Example #29
import numpy as np
from pandas import DataFrame

npdata = np.random.randn(5, 3)  #create a 5 by 3 random matrix
columnNames = ['x1', 'x2', 'x3']
data = DataFrame(npdata, columns=columnNames)

print(data.abs())  #get the absolute value for each element

print("\nMaximum value per column : ")
print(data.max())  #get maximum value for each column

print('\nMinimum value per row : ')
print(data.min(axis=1))  #get minimum value for each row

print('\nSum of values per column : ')
print(data.sum())  #get sum of values for each column

print('\nAverage value per row : ')
print(data.mean(axis=1))  #get average value for each row

print('\nCalculate max - min per column')
f = lambda x: x.max() - x.min()
print(data.apply(f))

print('\nCalculate max -  min per row')
f = lambda x: x.max() - x.min()
print(data.apply(f, axis=1))
Example #30
  'honey ham': 'pig',
  'nova lox': 'salmon'
}

data['animal'] = data['food'].map(str.lower).map(meat_to_animal)
data

data['food'].map(lambda x: meat_to_animal[x.lower()])

# Data normalization
datafile = 'd:/data/normalization_data.xls' # initialize parameters
data = pd.read_excel(datafile, header = None) # read the data

(data - data.min())/(data.max() - data.min()) # min-max normalization
(data - data.mean())/data.std() # zero-mean (z-score) normalization
data/10**np.ceil(np.log10(data.abs().max())) # decimal-scaling normalization


### Replacing values
data = Series([1., -999., 2., -999., -1000., 3.])
data

data.replace(-999, np.nan)

data.replace([-999, -1000], np.nan)

data.replace([-999, -1000], [np.nan, 0])

data.replace({-999: np.nan, -1000: 0})