def clean(numpy_array):
        """Preprocess raw data and min-max scale every column to [0, 1].

        Args:
            numpy_array: raw data (e.g. loaded from a CSV file); it is handed
                to ``ut.preprocessData`` unchanged.

        Returns:
            numpy.ndarray: the normalized values, stored as ``float16``.
            Entries that come out as NaN (for instance a constant column,
            where max - min is 0) are replaced with -1.
        """
        data = ut.preprocessData(numpy_array)

        # numpy -> pandas; float16 keeps the original (reduced) precision
        df = pd.DataFrame(data).astype('float16')

        # X_norm = (X - Xmin) / (Xmax - Xmin), column by column
        col_min = df.min()
        df_norm = (df - col_min) / (df.max() - col_min)
        df_norm = df_norm.fillna(-1)

        # DataFrame.as_matrix() was removed in pandas 1.0; .values returns the
        # same ndarray on every pandas version.
        return df_norm.values
Example #2
0
    def __generate_trace(self, objectives: DataFrame, metadata: list = None, legend: str = '', normalize: bool = False,
                         **kwargs):
        """Build the plotly trace that matches the number of objective columns.

        Two columns give a ``go.Scattergl`` trace, three a ``go.Scatter3d``
        trace and any other column count a ``go.Parcoords`` trace.

        :param objectives: one column per objective; the scatter traces read columns ``0``, ``1`` (and ``2``).
        :param metadata: passed as ``customdata`` to the scatter traces (not used by the parallel-coordinates trace).
        :param legend: name of the trace.
        :param normalize: when true, min-max scale every column before building the trace.
        :param kwargs: entries merged into (and overriding) the default marker settings.
        :return: the plotly trace object.
        """
        column_count = objectives.shape[1]

        if normalize:
            lowest = objectives.min()
            objectives = (objectives - lowest) / (objectives.max() - lowest)

        marker = {
            'color': 'rgb(127, 127, 127)',
            'size': 3,
            'symbol': 'x',
            'line': {'color': 'rgb(204, 204, 204)', 'width': 1},
            'opacity': 0.8,
        }
        marker.update(**kwargs)

        if column_count == 2:
            return go.Scattergl(
                x=objectives[0],
                y=objectives[1],
                mode='markers',
                marker=marker,
                name=legend,
                customdata=metadata
            )

        if column_count == 3:
            return go.Scatter3d(
                x=objectives[0],
                y=objectives[1],
                z=objectives[2],
                mode='markers',
                marker=marker,
                name=legend,
                customdata=metadata
            )

        # Any other dimensionality: one parallel-coordinates axis per column.
        dimensions = []
        for column in objectives:
            # Slicing (instead of indexing) tolerates a missing label for this column.
            label_slice = self.axis_labels[column:column + 1]
            dimensions.append({
                'range': [0, 1],
                'label': label_slice[0] if label_slice else None,
                'values': objectives[column],
            })

        return go.Parcoords(
            line={'color': 'blue'},
            dimensions=dimensions,
            name=legend,
        )
Example #3
0
def analyze():
    """Summarise received signals per frequency and suggest a frequency.

    Reads the signal log from ``FILE_SIGNALS``, counts signals per
    (frequency, device id), writes the resulting table to ``spectrum.csv``,
    prints a suggested frequency and saves an area plot to ``FILE_SPECTRUM``.
    """
    signals = read_csv(FILE_SIGNALS)
    devices = signals["id"].unique()

    print("got %d signals from %d devices" % (len(signals), len(devices)))

    # Count signals per (frequency, device) and make every combination of
    # SPECTRUM x devices explicit, using 0 where nothing was received.
    signals = signals.groupby(["frequency", "id"]).size()
    signals = signals.reindex(MultiIndex.from_product([SPECTRUM, devices],
                                                      names=signals.index.names),
                              fill_value=0)
    signals = signals.unstack("id")

    # let's only keep frequencies with all signals present
    # NOTE(review): after the reindex with fill_value=0 there should be no NaN
    # left, so this dropna() looks like a no-op - confirm the intent.
    candidates = signals.dropna()
    # suggest frequency where the weakest sensor has the most
    # received signals, and then the frequency with most total
    # received signals for all sensors
    candidates = DataFrame({"total":   candidates.sum(axis=1),
                            "weakest": candidates.min(axis=1)})
    # DataFrame.sort() was removed from pandas; sort_values() is its replacement.
    appropriate_freq = candidates.sort_values(["weakest", "total"],
                                              ascending=False).index[0]
    print("suggesting frequency %s" % mhz(appropriate_freq))

    signals.to_csv("spectrum.csv")

    import matplotlib.pyplot as plt
    from matplotlib.ticker import EngFormatter

    # Plot kinds are matched case-sensitively by current pandas, so "Area" is rejected.
    p = signals.plot(kind="area")
    p.xaxis.set_major_formatter(EngFormatter(unit='Hz', places=2))
    plt.savefig(FILE_SPECTRUM, dpi=300)
    print("saved spectrum as %s" % FILE_SPECTRUM)
class LogAggregate:
    """Column aggregations over a dataset, optionally grouped.

    Every ``get_*`` method takes keyword arguments:

    * ``key`` (required): the column (or list of columns) to aggregate.
    * ``group_by`` (optional): column(s) to group by before aggregating.

    Positional arguments are accepted for backward compatibility and ignored.
    A missing ``key`` raises ``KeyError``.
    """

    def __init__(self, dataset):
        self.dataset = DataFrame(dataset)

    def _aggregate(self, how, kwarg):
        """Apply the pandas reduction named ``how`` to the requested column."""
        key = kwarg['key']
        # ``dict.has_key`` does not exist in Python 3; ``in`` works everywhere.
        if 'group_by' in kwarg:
            selected = self.dataset.groupby(kwarg['group_by'])[key]
        else:
            selected = self.dataset[key]
        # Selecting the column first avoids reducing unrelated (possibly
        # non-numeric) columns only to throw their results away.
        return getattr(selected, how)()

    def get_median(self, *arg, **kwarg):
        """Return the median of ``key`` (per group when ``group_by`` is given)."""
        return self._aggregate('median', kwarg)

    def get_average(self, *arg, **kwarg):
        """Return the mean of ``key`` (per group when ``group_by`` is given)."""
        return self._aggregate('mean', kwarg)

    def get_min(self, *arg, **kwarg):
        """Return the minimum of ``key`` (per group when ``group_by`` is given)."""
        return self._aggregate('min', kwarg)

    def get_max(self, *arg, **kwarg):
        """Return the maximum of ``key`` (per group when ``group_by`` is given)."""
        return self._aggregate('max', kwarg)

    def get_count(self, *arg, **kwarg):
        """Return the non-null count of ``key`` (per group when ``group_by`` is given)."""
        return self._aggregate('count', kwarg)
Example #5
0
    def test_min_max_dt64_with_NaT(self):
        # Both NaT and Timestamp are in DataFrame.
        df = DataFrame({"foo": [pd.NaT, pd.NaT, Timestamp("2012-05-01")]})

        res = df.min()
        exp = Series([Timestamp("2012-05-01")], index=["foo"])
        tm.assert_series_equal(res, exp)

        res = df.max()
        exp = Series([Timestamp("2012-05-01")], index=["foo"])
        tm.assert_series_equal(res, exp)

        # GH12941, only NaTs are in DataFrame.
        df = DataFrame({"foo": [pd.NaT, pd.NaT]})

        res = df.min()
        exp = Series([pd.NaT], index=["foo"])
        tm.assert_series_equal(res, exp)

        res = df.max()
        exp = Series([pd.NaT], index=["foo"])
        tm.assert_series_equal(res, exp)
Example #6
0
def get_val(df: pd.DataFrame, method):
    """Resolve the replacement value(s) described by ``method``.

    Args:
        df: frame whose column statistics are used for string methods.
        method: ``"mean"``, ``"max"`` or ``"min"`` (case-insensitive) to get
            the per-column statistic, or an int/float that is returned as is.

    Returns:
        The per-column statistic of ``df`` for a string method, otherwise
        ``method`` itself.

    Raises:
        ValueError: if ``method`` is neither a supported string nor a number.
    """
    if isinstance(method, str):
        reducers = {"mean": df.mean, "max": df.max, "min": df.min}
        reducer = reducers.get(method.lower())
        # Unknown strings used to fall through and return None silently;
        # they are now rejected like every other unsupported value.
        if reducer is not None:
            return reducer()
    elif isinstance(method, (int, float)):
        return method

    raise ValueError(f"unknown method {method} to replace nan vlaues")
def _flow_and_probability_mapper(monthly_data: pd.DataFrame, to_probability: bool = False,
                                 to_flow: bool = False, extrapolate: bool = False) -> interpolate.interp1d:
    if not to_flow and not to_probability:
        raise ValueError('You need to specify either to_probability or to_flow as True')

    # get maximum value to bound histogram
    max_val = math.ceil(np.max(monthly_data.max()))
    min_val = math.floor(np.min(monthly_data.min()))

    if max_val == min_val:
        warnings.warn('The observational data has the same max and min value. You may get unanticipated results.')
        max_val += .1

    # determine number of histograms bins needed
    number_of_points = len(monthly_data.values)
    number_of_classes = math.ceil(1 + (3.322 * math.log10(number_of_points)))

    # specify the bin width for histogram (in m3/s)
    step_width = (max_val - min_val) / number_of_classes

    # specify histogram bins
    bins = np.arange(-np.min(step_width), max_val + 2 * np.min(step_width), np.min(step_width))

    if bins[0] == 0:
        bins = np.concatenate((-bins[1], bins))
    elif bins[0] > 0:
        bins = np.concatenate((-bins[0], bins))

    # make the histogram
    counts, bin_edges = np.histogram(monthly_data, bins=bins)

    # adjust the bins to be the center
    bin_edges = bin_edges[1:]

    # normalize the histograms
    counts = counts.astype(float) / monthly_data.size

    # calculate the cdfs
    cdf = np.cumsum(counts)

    # interpolated function to convert simulated streamflow to prob
    if to_probability:
        if extrapolate:
            return interpolate.interp1d(bin_edges, cdf, fill_value='extrapolate')
        return interpolate.interp1d(bin_edges, cdf)
    # interpolated function to convert simulated prob to observed streamflow
    elif to_flow:
        if extrapolate:
            return interpolate.interp1d(cdf, bin_edges, fill_value='extrapolate')
        return interpolate.interp1d(cdf, bin_edges)
Example #8
0
def normalize(df: DataFrame, is_string: bool = False) -> DataFrame:
    """Min-max scale the columns of ``df``, keeping the patient ID column aside.

    When ``PATIENT_ID_COL_NAME`` is one of the columns it is taken out with
    ``get_del_col`` before scaling and concatenated back in front afterwards.
    With ``is_string`` set, the remaining values are converted to floats first.
    """
    has_ptid = PATIENT_ID_COL_NAME in list(df)

    # Set the patient ID column aside so that it is not scaled.
    ptid_col = get_del_col(data_set=df, col_name=PATIENT_ID_COL_NAME) if has_ptid else None

    if is_string:
        df = DataFrame(data=df.to_numpy(dtype=float), columns=list(df))

    # (x - min) / (max - min), column by column
    column_min = df.min(axis=0)
    df = (df - column_min) / (df.max(axis=0) - column_min)

    if not has_ptid:
        return df

    # Put the patient ID column back in front.
    return concat([ptid_col, df], axis=1)
Example #9
0
 def get_dist_to_nearest_target(self, bagfile):
     """Return, per position sample, the distance to the nearest detected target.

     Targets come from ``self.detect_targets()`` and positions from
     ``utilities.get_positions_from_bag(bagfile)``. The x and y offsets are
     divided by 5.2 and 4.8 before the Euclidean distance is taken. The result
     is also stored on ``self.dtarget``.
     """
     self._targets = self.detect_targets()
     positions = utilities.get_positions_from_bag(bagfile)

     distances = DataFrame()
     for number, (px, py, pr) in enumerate(self._targets):
         # the target radius ``pr`` is unpacked but not subtracted
         dx = (positions['fly_x'] - px) / 5.2
         dy = (positions['fly_y'] - py) / 4.8
         distances['d' + str(number)] = (dx**2 + dy**2)**0.5

     distances['Timestamp'] = positions.Timestamp
     distances = utilities.convert_timestamps(distances)

     nearest = distances.min(axis=1)
     self.dtarget = DataFrame(nearest, columns=['dtarget'])
     return self.dtarget
Example #10
0
def init_scaler(source_df: pd.DataFrame,
                target_columns: List[str]) -> MinMaxScaler:
    """Fit a ``MinMaxScaler`` on the expected value range of every target column.

    For each column the range comes from ``data_ranges`` when defined there,
    widened (with a logged warning) if the source data falls outside it.
    Columns without a defined range use the min and max of ``source_df``.

    Args:
        source_df: data whose observed minima and maxima are used.
        target_columns: columns the scaler is fitted for, in this order.

    Returns:
        MinMaxScaler: scaler with feature range (0, 1) fitted on one row of
        minima and one row of maxima.

    Raises:
        ValueError: if a column is neither in ``data_ranges`` nor in ``source_df``.
    """
    # use min and max from source data if no definition is available
    source_min = source_df.min()
    source_max = source_df.max()
    target_min_df = pd.DataFrame(index=np.arange(0, 1), columns=target_columns)
    target_max_df = pd.DataFrame(index=np.arange(0, 1), columns=target_columns)

    # NOTE: values are written with a single .loc[row, column] call; chained
    # indexing (.iloc[0][column] = ...) may assign to a temporary copy.
    for column in target_columns:
        in_source = column in source_min.index

        if column in data_ranges:
            range_min = data_ranges[column]["min"]
            range_max = data_ranges[column]["max"]

            # check if real data point is within defined data range. adapt accordingly
            if in_source and source_min[column] < range_min:  # "bad" case 1
                logger.write(
                    "Scaler init warning: Defined data range for column {}: [{}, {}], got minimum of {}"
                    .format(column, range_min, range_max, source_min[column]))
                target_min_df.loc[0, column] = source_min[column]
            else:
                target_min_df.loc[0, column] = range_min

            if in_source and source_max[column] > range_max:  # "bad" case 2
                logger.write(
                    "Scaler init warning: Defined data range for column {}: [{}, {}], got maximum of {}"
                    .format(column, range_min, range_max, source_max[column]))
                target_max_df.loc[0, column] = source_max[column]
            else:
                target_max_df.loc[0, column] = range_max

        elif in_source:
            target_min_df.loc[0, column] = source_min[column]
            # The maximum row previously received the source *minimum*,
            # collapsing the fitted range of the column to zero width.
            target_max_df.loc[0, column] = source_max[column]
        else:
            raise ValueError(
                f"Unknown column {column}! No min and max values available!")

    min_max_df = pd.concat([target_min_df, target_max_df])
    min_max_data = min_max_df.to_numpy()
    # copy = False, if input already is numpy array
    scaler = MinMaxScaler(feature_range=(0, 1), copy=False)
    scaler.fit(min_max_data)
    return scaler
Example #11
0
    def get_min_dist(df: pds.DataFrame, tol: float = 1e-10):
        """
        Get the label of the first column whose minimum exceeds ``tol``.

        NOTE(review): the original description said the index holds the radial
        distance and the columns the time step, yet the value returned is a
        column label - confirm the expected orientation of ``df``.

        Args:
            df (DataFrame): table scanned column by column, left to right.
            tol (float): any float number less than tol is considered as zero.

        Returns:
            ``float(column label)`` of the first column whose smallest value is
            greater than ``tol``, or None when there is no such column.
        """
        # TODO: Add unittest
        # Reduce once and pair the minima with the columns positionally;
        # ``Series[i]`` is a label lookup, which is wrong or fails whenever the
        # column labels are not 0..n-1.
        column_minima = df.min(axis="index").to_numpy()
        for col, min_dist in zip(df.columns, column_minima):
            if min_dist > tol:
                return float(col)
        return None
Example #12
0
def get_totals(df: pd.DataFrame):
    """
    Summarise every column of ``df`` with a fixed set of statistics.

    The result has one row per input column and the columns ``min``, ``per15``,
    ``qr1``, ``median``, ``qr3``, ``per85``, ``max``, ``count``, ``mean`` and
    ``iqr`` (third quartile minus first quartile).
    """
    lower_quartile = df.quantile(0.25)
    upper_quartile = df.quantile(0.75)

    summary = {
        'min': df.min(),
        'per15': df.quantile(0.15),
        'qr1': lower_quartile,
        'median': df.median(),
        'qr3': upper_quartile,
        'per85': df.quantile(0.85),
        'max': df.max(),
        'count': df.count(),
        'mean': df.mean(),
        'iqr': upper_quartile - lower_quartile,
    }
    return pd.DataFrame(summary)
Example #13
0
    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Apply ``log2((x + sqrt(x**2 + lamb**2)) / 2)`` to every value of ``X``.

        ``lamb`` is ``self.lamb`` when set; otherwise it defaults to one tenth
        of the smallest value found in ``X``.

        Parameters
        ----------
        X : Pandas DataFrame of shape = [n_samples, n_features]
            The data to be transformed. It is passed through ``_to_dataframe``
            first, which may perform additional validation.

        Raises
        ------
        ValueError
            - If some variables contain zero or negative values.
            - If some variables contain infinite values.

        Returns
        -------
        X : pandas dataframe
            A new dataframe with the transformed values and the index and
            columns of the input.
        """
        X = _to_dataframe(X)

        # zero or negative values are rejected
        non_positive = (X <= 0).any().any()
        if non_positive:
            raise ValueError(
                "Some variables contain zero or negative values, can't apply log2"
            )

        # so are infinite values
        if np.isinf(X).values.any():
            raise ValueError(
                "Some of the variables contain infinite values, can't apply log2"
            )

        # fall back to the data-driven default when no lambda was configured
        lamb = X.min().min() / 10.0 if self.lamb is None else self.lamb

        values = X.values
        transformed = np.log2((values + (values**2 + lamb**2)**0.5) / 2)
        return pd.DataFrame(transformed, index=X.index, columns=X.columns)
Example #14
0
    def get_dist_to_nearest_target(self, bagfile):
        """Compute the distance from every recorded position to its closest target.

        Targets are refreshed through ``self.detect_targets()``; positions are
        read with ``utilities.get_positions_from_bag(bagfile)``. The x offset is
        scaled by 1/5.2 and the y offset by 1/4.8 before taking the Euclidean
        norm. The resulting single-column frame is stored on ``self.dtarget``
        and returned.
        """
        self._targets = self.detect_targets()
        positions = utilities.get_positions_from_bag(bagfile)

        distances = DataFrame()
        for target_id, target in enumerate(self._targets):
            px, py, pr = target  # pr (radius) is not used in the distance
            scaled_x = (positions['fly_x'] - px) / 5.2
            scaled_y = (positions['fly_y'] - py) / 4.8
            distances['d' + str(target_id)] = (scaled_x**2 + scaled_y**2)**0.5

        distances['Timestamp'] = positions.Timestamp
        distances = utilities.convert_timestamps(distances)

        row_minimum = distances.min(axis=1)
        self.dtarget = DataFrame(row_minimum, columns=['dtarget'])
        return self.dtarget
def generate_series_data(df: pd.DataFrame, column_length: int) -> pd.Series:
    """Flatten ``df`` into one Series and append per-column statistics.

    The stacked values are labelled ``<column>_<n>``, where ``n`` counts up
    after every ``column_length`` values. They are followed by the per-column
    ``<column>_max``, ``<column>_min``, ``<column>_mean`` and ``<column>_var``.
    """
    def suffixed(stat: pd.Series, suffix: str) -> pd.Series:
        # relabel a per-column statistic as "<column><suffix>"
        stat.index = stat.index + suffix
        return stat

    col_max = suffixed(df.max(), '_max')
    col_min = suffixed(df.min(), '_min')
    col_mean = suffixed(df.mean(), '_mean')
    col_var = suffixed(df.var(), '_var')

    flat = df.stack()
    group_count = int(len(flat.index) / column_length)
    # 0,0,...,1,1,... : one counter value per block of column_length entries
    counters = np.repeat(np.arange(group_count), column_length).astype(str)
    flat.index = flat.index.get_level_values(1) + '_' + counters

    return pd.concat([flat, col_max, col_min, col_mean, col_var])
Example #16
0
def dataframe_to_image(df: pd.DataFrame, image_filename: str) -> None:
    """Plot ``df`` as a line chart and save it to ``image_filename``.

    The y axis spans 98% of the smallest column minimum to 102% of the
    largest column maximum (truncated to integers) with a tick every 5 units.
    The title shows the first and last index value as ISO dates.
    """
    plt.figure(figsize=(12, 9))
    ax = plt.subplot(111)

    # hide the top/right frame lines and keep ticks on the bottom/left only
    for side in ("top", "right"):
        ax.spines[side].set_visible(False)
    ax.get_xaxis().tick_bottom()
    ax.get_yaxis().tick_left()

    lower = int(min(df.min().values * 0.98))
    upper = int(max(df.max().values * 1.02))
    plt.ylim(lower, upper)
    plt.yticks(range(lower, upper, 5), fontsize=14)

    first_day = df.index.min().isoformat()[:10]
    last_day = df.index.max().isoformat()[:10]
    plt.title(f"Weight {first_day} - {last_day}", fontsize=22)

    plt.plot(df, lw=2.5)
    plt.savefig(image_filename, bbox_inches="tight")
def cross_validate_trades(trades, N = 20, subset_fraction = 0.7):
    """Summarise ``trades`` over ``N`` random ticker subsets.

    Each run draws ``subset_fraction`` of the tickers without replacement and
    stores ``summary_report`` of the matching trades as one column.

    Returns a tuple ``(result, summary)``: ``result`` holds the report of the
    full trade set ('Base') next to the row-wise Mean, Std, Median, Max and Min
    across runs; ``summary`` holds one column per run.
    """
    all_tickers = trades.tickers
    subset_size = round(len(all_tickers) * subset_fraction)

    summary = DataFrame(dtype = float)
    for run in range(N):
        chosen = list(random.choice(all_tickers, subset_size, replace = False))
        subset = trades.find(lambda T: T.ticker in chosen)
        summary[run] = summary_report(subset)

    result = DataFrame(dtype = float)
    result['Base'] = summary_report(trades)
    across_runs = (
        ('Mean', summary.mean),
        ('Std', summary.std),
        ('Median', summary.median),
        ('Max', summary.max),
        ('Min', summary.min),
    )
    for label, reducer in across_runs:
        result[label] = reducer(axis = 1)

    return (result, summary)
Example #18
0
def resilience(sim: pd.DataFrame, rec_th=1., idxs=[]) -> Tuple[float, float]:
    """
    Measure shock depth and recovery time for every column of ``sim``.

    Parameters
    ----------
    sim : pd.DataFrame
        return of simulate()
    rec_th : float
        value a column has to reach (>=) to count as recovered
    idxs : list
        optional column labels; when non-empty only these entries are returned

    Returns
    -------
    Tuple[float, float]
        shock intensity (column minimum minus 1), time to recovery (index of
        the first row after the first one with a value >= rec_th)
    """
    shock = sim.min() - 1

    # The first row is skipped when looking for the recovery point.
    recovered = sim >= rec_th
    recovery_time = recovered[1:].idxmax()

    if len(idxs) == 0:
        return shock, recovery_time

    return shock[idxs], recovery_time[idxs]
Example #19
0
    def animated_training(self, data: np.ndarray, df: pd.DataFrame, mus,
                          sigmas):
        """Animate training epoch by epoch, then plot the final predictions.

        Every animation frame runs one epoch on ``data`` and redraws the
        current prediction over the raw points. After the animation window is
        closed, ``self.epochs`` further epochs are run and two static graphs
        are drawn through ``self.graph``: one in the scaled space of ``data``
        and one mapped back with ``sigmas[1]`` and ``mus[1]``.

        Args:
            data: array whose first column is the x value and second column
                the y value (both are indexed that way below).
            df: frame with ``km`` and ``price`` columns; its first column
                bounds the x range of the final graph.
            mus: indexable; element 1 is added back to the predictions.
            sigmas: indexable; element 1 scales the predictions.
        """
        fig, ax = plt.subplots(figsize=(16, 9), dpi=70)

        def animate(epoch: int):
            self.run_epoch(data)
            ax.clear()
            plt.title(f'epoch = {epoch}')
            ax.set_xlabel('km')
            ax.set_ylabel('price')
            ax.set_xlim(data.min(axis=0)[0] - 1, data.max(axis=0)[0] + 1)
            ax.set_ylim(-4, 4)
            x = np.linspace(start=data.min(axis=0)[0] - 1,
                            stop=data.max(axis=0)[0] + 1,
                            num=100)
            y = self.estimator.predict(x)
            line = plt.plot(x, y, label='prediction')
            plt.scatter(data[:, 0], data[:, 1], label='raw data', marker='x')
            plt.legend()
            return line,

        # Keep a reference: the animation stops if the object is garbage collected.
        ani = animation.FuncAnimation(fig,
                                      animate,
                                      frames=self.epochs,
                                      interval=10,
                                      blit=False)
        plt.show()
        for _ in range(self.epochs):
            self.run_epoch(data)
        scaled_x = np.linspace(start=data.min(axis=0)[0] - 1,
                               stop=data.max(axis=0)[0] + 1,
                               num=100)
        self.graph(scaled_x, self.estimator.predict(scaled_x), data, 'k',
                   f'Scaled data ({self.epochs})')
        # df.min(axis=0) is a Series labelled by column name; [0] relied on the
        # removed positional fallback, .iloc[0] is the explicit first column.
        x_lin = np.linspace(start=df.min(axis=0).iloc[0] - 1,
                            stop=df.max(axis=0).iloc[0] + 1,
                            num=100)
        y_lin = self.estimator.predict(scaled_x) * sigmas[1] + mus[1]
        self.graph(x_lin, y_lin, (np.matrix([df.km, df.price]).T).A, 'b',
                   'Resulting unscaled prediction')
        return
Example #20
0
def _feature_extraction(data: pd.DataFrame) -> pd.Series:
    def nlargest_index(df, n):
        return df.nlargest(n).index.unique()[0:n]

    # first 225 statistical features
    statistical = data.min()
    statistical = statistical.append(data.max(), ignore_index=True)
    statistical = statistical.append(data.mean(), ignore_index=True)
    statistical = statistical.append(data.skew(), ignore_index=True)
    statistical = statistical.append(data.kurtosis(), ignore_index=True)

    # FFT features
    fft = pd.DataFrame(np.fft.fft(data))
    fft_angle = fft.applymap(np.angle)
    fft = fft.applymap(np.abs)
    largest_values = pd.Series()
    largest_angles = pd.Series()
    largest_indices = pd.Series()
    for i in range(0, 45):
        five_largest_idx = nlargest_index(fft.ix[:, i].map(abs), 5)  # is map(abs) redundant?
        largest_indices = largest_indices.append(pd.Series(five_largest_idx),
                                                 ignore_index=True)
        five_largest = fft_angle.ix[five_largest_idx, i].T
        largest_angles = largest_angles.append(five_largest)
        five_largest = fft.ix[five_largest_idx, i].T
        largest_values = largest_values.append(five_largest)

    # Autocorrelation
    autocorrelation = pd.Series()
    autocorrelation = autocorrelation.append(data.apply(lambda col: col.autocorr(1), axis=0))
    for i in range(5, 51, 5):
        autocorrelation = autocorrelation.append(data.apply(lambda col: col.autocorr(i), axis=0))

    # Make result
    feature_vector = pd.Series()
    feature_vector = feature_vector.append(statistical)
    feature_vector = feature_vector.append(largest_values)
    feature_vector = feature_vector.append(largest_angles)
    feature_vector = feature_vector.append(largest_indices)
    feature_vector = feature_vector.append(autocorrelation)
    return feature_vector
def cross_validate_trades(trades, N=20, subset_fraction=0.7):
    """Report how stable the trade summary is across random ticker samples.

    ``N`` times, a sample of ``subset_fraction`` of the tickers is drawn
    without replacement and ``summary_report`` is computed for the trades of
    those tickers.

    Returns ``(result, summary)``. ``summary`` has one column per sample;
    ``result`` has the full-set report ('Base') plus the row-wise 'Mean',
    'Std', 'Median', 'Max' and 'Min' of ``summary``.
    """
    tickers = trades.tickers
    sample_size = round(len(tickers) * subset_fraction)

    summary = DataFrame(dtype=float)
    for sample_no in range(N):
        picked = list(random.choice(tickers, sample_size, replace=False))
        summary[sample_no] = summary_report(
            trades.find(lambda T: T.ticker in picked))

    result = DataFrame(dtype=float)
    result['Base'] = summary_report(trades)
    for column, aggregate in [('Mean', 'mean'), ('Std', 'std'),
                              ('Median', 'median'), ('Max', 'max'),
                              ('Min', 'min')]:
        result[column] = getattr(summary, aggregate)(axis=1)

    return (result, summary)
def dataFrameMathTest():
    """Print the results of a tour of DataFrame math methods on 'mlg.csv'.

    Columns at positions 1 to 6 of the CSV file are used. Methods that return
    a Series (count, max, mean, ...) work column-wise by default.
    """
    df = DataFrame()
    # Load a DataFrame from a CSV file
    org_df = pd.read_csv('mlg.csv')
    df = org_df.iloc[:, 1:7]

    # Each entry is evaluated and printed in turn.
    operations = (
        lambda: df.abs(),       # absolute values
        lambda: df.count(),     # non NA/null values
        lambda: df.cummax(),    # cumulative max (cols default axis)
        lambda: df.cummin(),    # cumulative min (cols default axis)
        lambda: df.cumsum(),    # cumulative sum (cols default axis)
        lambda: df.diff(),      # 1st difference (col default axis)
        lambda: df.div(12),     # divide by df, Series or value
        lambda: df.max(),       # max of axis (col default)
        lambda: df.mean(),      # mean (col default axis)
        lambda: df.median(),    # median (col default)
        lambda: df.min(),       # min of axis (col default)
        lambda: df.mul(2),      # multiply by df, Series or value
        lambda: df.sum(),       # sum of axis (cols default)
        lambda: df.where(df > 0.5, other=np.nan),
    )
    for operation in operations:
        print(operation())
Example #23
0
def normalize(data_frame: pd.DataFrame,
              norm_type="mean",
              df_mean: pd.Series = None,
              df_std: pd.Series = None,
              df_min: pd.Series = None,
              df_max: pd.Series = None) -> pd.DataFrame:
    """Normalize the columns of ``data_frame``.

    Args:
        data_frame: data to normalize.
        norm_type: ``"min_max"`` for ``(x - min) / (max - min)``; any other
            value selects standardization ``(x - mean) / std``.
        df_mean, df_std: statistics to use for standardization; each one is
            computed from ``data_frame`` when not supplied.
        df_min, df_max: statistics to use for min-max scaling; each one is
            computed from ``data_frame`` when not supplied.

    Returns:
        pd.DataFrame: the normalized data with NaN (e.g. 0/0 for a constant
        column) replaced by 0.
    """
    if norm_type == "min_max":
        if df_min is None:
            df_min = data_frame.min()
        if df_max is None:
            df_max = data_frame.max()

        result = (data_frame - df_min) / (df_max - df_min)
    else:
        if df_mean is None:
            df_mean = data_frame.mean()
        # This used to test df_mean again, so the standard deviation was never
        # computed and the division below failed on None.
        if df_std is None:
            df_std = data_frame.std()

        result = (data_frame - df_mean) / df_std

    return result.fillna(0)
Example #24
0
    def test_min_max_dt64_with_NaT_skipna_false(self, tz_naive_fixture):
        # GH#36907
        tz = tz_naive_fixture
        df = DataFrame({
            "a": [
                Timestamp("2020-01-01 08:00:00", tz=tz),
                Timestamp("1920-02-01 09:00:00", tz=tz),
            ],
            "b": [Timestamp("2020-02-01 08:00:00", tz=tz), pd.NaT],
        })

        res = df.min(axis=1, skipna=False)
        expected = Series([df.loc[0, "a"], pd.NaT])
        assert expected.dtype == df["a"].dtype

        tm.assert_series_equal(res, expected)

        res = df.max(axis=1, skipna=False)
        expected = Series([df.loc[0, "b"], pd.NaT])
        assert expected.dtype == df["a"].dtype

        tm.assert_series_equal(res, expected)
Example #25
0
def simulate_df_with_same_variation(df: pd.DataFrame,
                                    sampling_size: int) -> pd.DataFrame:
    """Draw uniform samples spanning the same range as every column of ``df``.

    Args:
        df: reference data; only the minimum and maximum of each column are used.
        sampling_size: number of rows to generate.

    Returns:
        pd.DataFrame: ``sampling_size`` rows and one column per column of
        ``df`` (labelled 0..k-1), where column ``i`` is drawn uniformly between
        the minimum and maximum of the i-th column of ``df``.
    """
    # Plain arrays make the pairing positional: indexing the min/max Series
    # with [i] is a label lookup and breaks for named columns. Iterating over
    # all columns also covers single-column frames.
    lows = df.min().to_numpy()
    highs = df.max().to_numpy()

    # Columns are drawn one after another, in order, so the sequence of random
    # numbers consumed from the global NumPy generator is unchanged.
    samples = [
        np.random.uniform(low, high, sampling_size)
        for low, high in zip(lows, highs)
    ]
    return pd.DataFrame(np.column_stack(samples))
Example #26
0
def optimal_weights(exp_rets: pd.DataFrame, cov: pd.DataFrame, n_points: int) -> np.array:
    """Return the volatility-minimizing weights for a range of target returns.

    ``n_points`` equally spaced target returns are taken between the smallest
    and the largest expected return, and ``minimize_vol`` is run for each.

    Args:
        exp_rets (pd.DataFrame): expected returns.
        cov (pd.DataFrame): covariance matrix.
        n_points (int): number of equally spaced target returns between the
        lowest and the highest expected return.

    Returns:
        A list with one ``minimize_vol`` result per target return.
    """
    lowest, highest = exp_rets.min(), exp_rets.max()

    weights = []
    for target_return in np.linspace(lowest, highest, n_points):
        weights.append(minimize_vol(target_return, exp_rets, cov))
    return weights
def get_Ys(do_pca=False):
    """Return the target values (Ys) used for fitting as a DataFrame.

    Without PCA the measurements are rescaled column-wise to the range -1..1.
    With ``do_pca`` the data returned by ``pca_X()`` is projected onto the
    principal components that explain 99% of the variance.
    """
    sv_db = access_db(0, True)
    measurements = get_msrmnts(sv_db, Q)

    if not do_pca:
        # shift to start at 0, scale to 0..1, then stretch to -1..1
        shifted = measurements - measurements.min()
        unit_scaled = shifted / shifted.max()
        return unit_scaled * 2 - 1

    X, df = pca_X()
    my_pca = PCA(n_components=0.99)
    my_pca.fit(X)
    projected = my_pca.transform(X)

    row_labels = list(df.index)
    component_names = []
    for i in range(my_pca.n_components_):
        component_names.append('PCA Comp_' + str(i + 1))

    return DataFrame(projected, index=row_labels, columns=component_names)
Example #28
0
    def get_scatter_view_lims(counts_df: pd.DataFrame) -> Tuple[float, float]:
        """Calculates scatter view limits for the counts dataframe

        The lower limit is the smallest non-zero row minimum and the upper
        limit the overall maximum; both are padded in log2 space by the
        configured ``axes.xmargin``.

        Returns ``(1e-300, 1e-300)`` (after printing a message) when the limits
        are not finite, the lower limit is not a float, or the maximum is not
        positive.
        """
        x0 = counts_df.min(axis='columns').where(lambda x: x != 0).dropna().min()
        x1 = counts_df.max().max()
        minpos = 1e-300

        # np.float was only an alias of the builtin float and was removed in
        # NumPy 1.24; the builtin also matches np.float64 values.
        if not np.isfinite([x0, x1]).all() or not isinstance(x0, float) or x1 <= 0:
            print("The provided dataset contains invalid values.")
            return (minpos, minpos)

        # keep both limits strictly positive so they can be log-transformed
        x0, x1 = (minpos if x0 <= 0 else x0,
                  minpos if x1 <= 0 else x1)

        transform = LogTransform(base=2)
        inverse_trans = transform.inverted()

        x0t, x1t = transform.transform([x0, x1])
        delta = (x1t - x0t) * mpl.rcParams.get('axes.xmargin', 0)
        if not np.isfinite(delta):
            delta = 0

        return inverse_trans.transform([x0t - delta, x1t + delta])
def group_msgs_by_term(df_msgs: pd.DataFrame, term: str) -> dict:
    """Group tokenized messages into consecutive time windows counted back from now.

    Args:
        df_msgs: messages with a ``timestamp`` column (seconds since the epoch)
            and a ``wakati_msg`` column holding the tokenized text.
        term: ``'lm'`` selects 31-day windows; any other value selects 8-day windows.

    Returns:
        dict: maps ``'term_ago_000'``, ``'term_ago_001'``, ... (000 is the most
        recent window) to the space-joined ``wakati_msg`` values of that window.
        Grouping stops at the first window without messages. An empty frame
        gives an empty dict.
    """
    # set term
    term_days = 31 if term == 'lm' else 8
    print('group messages every {0} days'.format(term_days))

    # Without messages there is no oldest timestamp to measure from.
    if df_msgs.empty:
        return {}

    # analyze timestamp
    epoch = datetime.fromtimestamp(0, JST)
    now_in_sec = (datetime.now(JST) - epoch).total_seconds()
    interval_seconds = timedelta(days=term_days).total_seconds()
    # Reduce only the timestamp column: taking the minimum of the whole frame
    # also compares the text column, which is wasted work and can fail on
    # mixed str/NaN values.
    oldest_timestamp = df_msgs['timestamp'].min()
    oldest_ts_in_sec = (datetime.fromtimestamp(oldest_timestamp, JST) -
                        epoch).total_seconds()
    loop_num = (abs(now_in_sec - oldest_ts_in_sec) / interval_seconds) + 1

    # extract by term
    dict_msgs_by_term = {}
    df_tmp = df_msgs
    # NOTE: the query strings below refer to the locals now_tmp and
    # interval_seconds by name.
    now_tmp = now_in_sec
    for i in range(int(loop_num)):
        # make current term string
        cur_term_s = 'term_ago_{0}'.format(str(i).zfill(3))
        print(cur_term_s)
        # current messages
        df_msgs_cur = df_tmp.query('@now_tmp - timestamp < @interval_seconds')
        df_msgs_other = df_tmp.query(
            '@now_tmp - timestamp >= @interval_seconds')
        # messages does not exist. break.
        if df_msgs_cur.shape[0] == 0:
            break
        # add current messages to dict
        dict_msgs_by_term[cur_term_s] = ' '.join(
            df_msgs_cur.wakati_msg.dropna().values.tolist())
        # update temp value for next loop
        now_tmp = now_tmp - interval_seconds
        df_tmp = df_msgs_other
    return dict_msgs_by_term
Example #30
0
def _r2_containment(data: pd.DataFrame, curve: pd.Series,
                    relax: bool) -> float:
    """
    Produces \lambda_r with the given input data, using the standard ordering on R as the definition for containment.
    Parameters:
    ----------

    data: list 
        DataFrame of real-valued functions that define our band in R^2 (columns are time intervals, rows are functions)
    curve: pd.Series
        Function to check containment on 
    relax: bool
        If False, we use the strict definition of containment. If True, we consider the proportion of time the curve is in the band

    Returns:
    ----------
    float: If relax=False, then 0 if the function is not contained in the curve, 1 if it is. If relax=True, then we consider the proportion of time the curve is in the band, so we will return a number between 0 and 1. 
    """

    containment = 0

    y_range = []

    # Grab the mins/maxs across all rows (functions at each time index)
    mins = data.min(axis=1)
    maxs = data.max(axis=1)

    # Generate intervals in R over each time index
    intervals = [[i, j] for i, j in zip(mins, maxs)]

    # Check if each value in the curve is contained within the band
    for index, val in enumerate(curve):
        if intervals[index][0] <= val <= intervals[index][1]:
            containment += 1

    # If relax=True, then we return the proportion of points in the band, else, Python integer division will round down to 0 unless all points are contained in the band (strict containment)
    return containment / len(curve) if relax else containment // len(curve)
Example #31
0
def calc_stats(dataset: pd.DataFrame, dataname: str):
    """Print a summary-statistics report for *dataset*, rounded to 3 decimals.

    The report lists min, mean, max, the quartiles, the interquartile range
    and the 1.5 * IQR outlier bounds. Nothing is returned.

    NOTE(review): every reducer result is passed through ``float(...)``, so
    each one has to be a single scalar -- presumably a one-dimensional input
    such as a Series is expected despite the annotation; confirm with callers.
    """
    def _rounded(value):
        # Round to three decimals and coerce to a plain Python float
        return float(np.round(value, 3))

    lowest = _rounded(dataset.min())
    highest = _rounded(dataset.max())
    average = _rounded(np.mean(dataset))
    middle = _rounded(np.median(dataset))
    q1 = _rounded(dataset.quantile(0.25))
    q3 = _rounded(dataset.quantile(0.75))

    # Outlier fences sit 1.5 IQRs outside the quartiles
    spread = np.round(q3 - q1, 3)
    fence_low = np.round(q1 - spread * 1.5, 3)
    fence_high = np.round(q3 + spread * 1.5, 3)

    report = (
        f'{dataname} summary statistics',
        f'Min                      : {lowest}',
        f'Mean                     : {average}',
        f'Max                      : {highest}',
        '',
        f'25th percentile          : {q1}',
        f'Median                   : {middle}',
        f'75th percentile          : {q3}',
        f'Interquartile range (IQR): {spread}',
        '',
        f'Lower outlier bound      : {fence_low}',
        f'Upper outlier bound      : {fence_high}',
        '--------------------------------',
    )
    for line in report:
        print(line)
Example #32
0
def draw_rphi_map(W       : Dict[int, List[KrSector]],
                  aMap    : DataFrame,
                  alims   : Optional[Tuple[float, float]] = None,
                  title   : str                           = 'E',
                  cmap    :  Colormap                     = matplotlib.cm.viridis,
                  alpha   : float                         = 1.0,  # level of transparency
                  rmax    : float                         = 200,  # the largest radius
                  scale   : float                         = 0.5,  # needed to fit the map
                  figsize : Tuple[float, float]           = (14,10)):
    """Draw *aMap* on a single-axes figure with a colour bar and show it.

    The drawing itself is delegated to ``add_map_values_to_axis_``, which
    receives ``W``, ``aMap``, the axes, ``cmap``, ``alpha``, ``rmax`` and
    ``scale`` unchanged.

    Parameters
    ----------
    W, aMap
        Forwarded as-is to ``add_map_values_to_axis_``. ``aMap`` is also used
        to derive the colour limits when ``alims`` is not given.
    alims
        ``(low, high)`` colour limits. When None, the overall minimum and
        maximum of ``aMap`` are used.
    title
        Title set on the plot.
    cmap, alpha, rmax, scale
        Forwarded as-is to ``add_map_values_to_axis_``.
    figsize
        Size passed to ``plt.figure``.

    Returns
    -------
    None. The figure is displayed with ``plt.show()``.
    """
    fig = plt.figure(figsize=figsize) # give plots a rectangular frame

    ax = fig.add_subplot(1,1,1)
    # Identity test on purpose: `alims == None` compares element-wise when the
    # limits arrive as an array-like, which makes the truth value ambiguous.
    if alims is None:
        e0M = aMap.max().max()
        e0m = aMap.min().min()
    else:
        e0m, e0M = alims[0], alims[1]
    p = add_map_values_to_axis_(W, aMap, ax, cmap, alpha, rmax, scale, clims=(e0m, e0M))
    fig.colorbar(p, ax=ax)
    plt.title(title)
    plt.tight_layout()
    plt.show()
Example #33
0
def selected_set_index(df: pd.DataFrame, indices: List[int],
                       minimize: bool) -> List[float]:
    """
    Reduce a positional selection of rows to its minimum or maximum.

    Parameters
    ----------
    df: pd.DataFrame
        The data to reduce. A pd.Series is accepted as well.
    indices: List
        Row positions to keep; they are handed to ``df.iloc``.
    minimize: bool
        If True the minimum over the selected rows is taken (axis=0),
        otherwise the maximum.

    Returns
    -------
    List[float]
         For a DataFrame: the result of ``min``/``max`` along axis 0 of the
         selected rows (one value per column). For a Series: a one-element
         list holding the single reduced value.

    Raises
    ------
    ValueError
        If the sum of the reduced values is NaN.
    """
    # Remember the input kind before it is replaced by the selection
    input_is_series = isinstance(df, pd.Series)
    chosen = df.iloc[indices]

    reduced = chosen.min(axis=0) if minimize else chosen.max(axis=0)
    # A Series reduces to a bare scalar; wrap it so the result is iterable
    result = [reduced] if input_is_series else reduced

    if np.isnan(sum(result)):
        raise ValueError('None of the results of this function should be NaN')
    return result
def normalize(dataframe: DataFrame, column: str) -> DataFrame:
    """Min-max scale one column of *dataframe*.

    Only the requested column is reduced, so unrelated columns (for example
    ones holding values that cannot be ordered) no longer take part in the
    min/max computation.

    Parameters
    ----------
    dataframe
        Frame containing the column to scale.
    column
        Label of the column to scale.

    Returns
    -------
    ``(values - min) / (max - min)`` for the selected column, i.e. the result
    of selecting ``dataframe[column]`` and rescaling it (a Series for a
    single column label, despite the annotation). A constant column makes
    the denominator zero.
    """
    values = dataframe[column]
    low = values.min()
    high = values.max()
    return (values - low) / (high - low)
Example #35
0
    def __generate_trace(self, points: pd.DataFrame, legend: str, metadata: list = None, normalize: bool = False,
                         **kwargs):
        """Build the plotly trace used to draw *points*.

        Two columns give a ``go.Scattergl`` trace, three columns a
        ``go.Scatter3d`` trace, and any other column count a ``go.Parcoords``
        trace with one dimension per column.

        :param points: One point per row; columns 0, 1 (and 2) are read for the scatter traces.
        :param legend: Name given to the trace.
        :param metadata: Passed as ``customdata`` to the scatter traces (not used by the parallel-coordinates trace).
        :param normalize: When True, each column is min-max scaled before plotting.
        :param kwargs: Extra entries merged into the marker settings, overriding the defaults.
        :return: The trace object.
        """
        dimension = points.shape[1]

        # if indicated, perform normalization
        if normalize:
            lowest = points.min()
            points = (points - lowest) / (points.max() - lowest)

        # markers are drawn smaller on 3D plots
        marker = {
            'color': '#236FA4',
            'size': 4 if dimension == 3 else 8,
            'symbol': 'circle',
            'line': {'color': '#236FA4', 'width': 1},
            'opacity': 0.8,
        }
        marker.update(**kwargs)

        if dimension == 2:
            return go.Scattergl(
                x=points[0],
                y=points[1],
                mode='markers',
                marker=marker,
                name=legend,
                customdata=metadata
            )

        if dimension == 3:
            return go.Scatter3d(
                x=points[0],
                y=points[1],
                z=points[2],
                mode='markers',
                marker=marker,
                name=legend,
                customdata=metadata
            )

        # any other dimensionality: one parallel-coordinates axis per column
        dimensions = []
        for column in points:
            # slicing (rather than indexing) tolerates a missing label
            label_slice = self.axis_labels[column:column + 1]
            dimensions.append({
                'range': [0, 1],
                'label': label_slice[0] if label_slice else None,
                'values': points[column],
            })

        return go.Parcoords(
            line={'color': '#236FA4'},
            dimensions=dimensions,
            name=legend,
        )
Example #36
0
class Experiment:
    """Train/test data plus error metrics for comparing estimators.

    On construction the reference estimator is fitted, a training set and an
    importance-sampled test set are drawn from it, a uniform test set is drawn
    inside the bounding box of the training data, and a KD-tree over the
    training data is queried once so the neighbour results can be shared with
    the estimators under test.
    """

    def __init__(self, n_training, n_test, dimensions, actualsEstimator, name):
        self.__name__ = name
        self.n_training = n_training
        self.n_test = n_test
        self.dimensions = dimensions

        # Reference model: everything below is sampled from / scored by it
        reference = actualsEstimator.fit()
        self.actualsEstimator = reference
        self.train = DataFrame(reference.sample(self.n_training))
        self.importance_test = DataFrame(reference.sample(self.n_test))

        # Uniform test points inside the per-column range of the training data
        self.lows = self.train.min(axis=0)
        self.highs = self.train.max(axis=0)
        uniform_points = uniform(low=self.lows,
                                 high=self.highs,
                                 size=(self.n_test, self.dimensions))
        self.uniform_test = DataFrame(uniform_points)

        self.importance_actuals = reference.predict(self.importance_test)
        self.uniform_actuals = reference.predict(self.uniform_test)

        # The importance-sampled set is the one used for scoring
        self.test = self.importance_test
        self.test_actuals = self.importance_actuals

        # Neighbour lookups are precomputed once: for the test points, and for
        # the training points themselves (the "loo" results)
        self.kdt_ = KDTree(self.train, leaf_size=30, metric='euclidean')
        neighbours_test = int(1 + self.n_test**0.5)
        self.dist_, self.nn_ = self.kdt_.query(
            self.test, k=neighbours_test, return_distance=True)
        neighbours_train = int(1 + self.n_training**0.5)
        self.dist_loo_, self.nn_loo_ = self.kdt_.query(
            self.train, k=neighbours_train, return_distance=True)

    def ISE(self, estimates, actuals):
        """Square root of the mean squared difference between *estimates* and *actuals*.

        No per-point weighting is applied.
        """
        squared_error = (estimates - actuals)**2.
        return mean(squared_error)**0.5

    def IAE(self, estimates, actuals):
        """Mean absolute difference between *estimates* and *actuals*.

        No per-point weighting is applied.
        """
        absolute_error = abs(estimates - actuals)
        return mean(absolute_error)

    def EmpericalEntropy(self, estimates):
        """Base-2 entropy of *estimates*."""
        return entropy(estimates, base=2)

    def JensenShannon(self, estimates, actuals):
        """Average of the base-2 relative entropies of both inputs to their midpoint."""
        midpoint = 0.5 * (estimates + actuals)
        to_mid_from_estimates = entropy(estimates, midpoint, base=2)
        to_mid_from_actuals = entropy(actuals, midpoint, base=2)
        return 0.5 * (to_mid_from_estimates + to_mid_from_actuals)

    def KullbackLeiber(self, estimates, actuals):
        """Base-2 relative entropy of *actuals* with respect to *estimates*."""
        return entropy(actuals, estimates, base=2)

    def getResults(self, estimator, prekdt=False):
        """Score *estimator* on the stored test set.

        With ``prekdt=True`` the precomputed neighbour results and the KD-tree
        are attached to the estimator before it predicts. The same attributes
        are reset to None on the estimator afterwards in either case.

        Returns the tuple (ISE, IAE, JensenShannon, KullbackLeiber,
        EmpericalEntropy) of the predictions against the stored actuals.
        """
        shared = ('nn_', 'dist_', 'nn_loo_', 'dist_loo_', 'kdt_')

        #Attach some  pre calculated results to the estimator
        if prekdt:
            for attribute in shared:
                setattr(estimator, attribute, getattr(self, attribute))

        est = estimator.predict(self.test)
        actuals = self.test_actuals

        for attribute in shared:
            setattr(estimator, attribute, None)

        ise = self.ISE(est, actuals)
        iae = self.IAE(est, actuals)
        jensen_shannon = self.JensenShannon(est, actuals)
        kullback_leibler = self.KullbackLeiber(est, actuals)
        empirical_entropy = self.EmpericalEntropy(est)
        return ise, iae, jensen_shannon, kullback_leibler, empirical_entropy
Example #37
0
def pca(x, y=None, ylev=None,
        nlab=0, lsize=10, lalpha=1,
        center="both", scale="none",
        legend=True, cname="variable",
        color=None):
    """Draw a PCA biplot of *x* (rows and columns on the first two components).

    The decomposition is delegated to ``svdForPca``; element 0 of its result
    is used for the row coordinates and element 2 for the column coordinates
    (presumably an SVD-style ``(U, s, Vt)`` triple -- confirm against
    ``svdForPca``). Both coordinate sets are divided by their per-component
    range before plotting.

    Parameters
    ----------
    x : DataFrame
        Data to decompose. Columns whose standard deviation is not positive
        are dropped first.
    y : Series, optional
        Grouping of the rows of *x*; each level is drawn as its own series.
    ylev : iterable, optional
        Levels of *y* to draw, in order. Defaults to ``y.unique()``.
    nlab : int
        Number of columns to label, taken from those with the largest squared
        coordinate magnitude. Values larger than the number of columns are
        clamped.
    lsize, lalpha
        Font size and alpha of the column labels.
    center, scale
        Forwarded to ``svdForPca``.
    legend : bool
        Whether to shrink the axes and draw a legend to their right.
    cname : str
        Legend label of the column markers.
    color : dict, optional
        Mapping from level of *y* to a colour; anything that is not a dict is
        ignored.
    """
    if not isinstance(color, dict):
        color = None

    # label-based selection with a boolean mask over the columns
    xForSvd = x.loc[:, x.std(axis=0) > 0]
    xsvd = svdForPca(xForSvd, center, scale)

    svdRowPlot = DataFrame(
        xsvd[0][:, 0:2],
        index = xForSvd.index,
        columns = ["PC1", "PC2"]
    )
    svdRowPlot = svdRowPlot.divide(svdRowPlot.max(axis=0) -
                                   svdRowPlot.min(axis=0), axis=1)
    svdColPlot = DataFrame(
        numpy.transpose(xsvd[2][0:2, :]),
        index = xForSvd.columns,
        columns = ["PC1", "PC2"]
    )
    svdColPlot = svdColPlot.divide(svdColPlot.max(axis=0) -
                                   svdColPlot.min(axis=0), axis=1)

    # never try to label more columns than exist
    nLabelled = min(nlab, len(svdColPlot)) if nlab > 0 else 0
    if nlab > 0:
        # order the columns by decreasing squared magnitude, then name the
        # first nLabelled of them (by position) after their index label
        svdColPlotMag = (svdColPlot**2).sum(axis=1).sort_values(ascending=False)
        svdColPlot = svdColPlot.loc[svdColPlotMag.index]
        labels = [""] * len(svdColPlot)
        labels[:nLabelled] = svdColPlot.index[:nLabelled]
        svdColPlot["label"] = labels

    if legend:
        ax = plt.subplot(111)
    plt.plot(svdColPlot["PC1"], svdColPlot["PC2"],
             "o", color=(0, 0, 0, 0.1), markersize=5,
             label=cname)
    for i in range(nLabelled):
        plt.text(svdColPlot["PC1"].iloc[i],
                 svdColPlot["PC2"].iloc[i],
                 svdColPlot["label"].iloc[i],
                 fontsize = lsize,
                 color = (0, 0, 0, lalpha),
                 label = None)

    if y is not None:
        if ylev is None:
            ylev = y.unique()
        for level in ylev:
            rows = y == level
            plotArgs = {"markersize": 8, "label": level}
            # an explicit colour is used only for levels present in the mapping
            if color is not None and level in color:
                plotArgs["color"] = color[level]
            plt.plot(svdRowPlot.loc[rows, "PC1"],
                     svdRowPlot.loc[rows, "PC2"],
                     "o",
                     **plotArgs)
    else:
        plt.plot(svdRowPlot["PC1"], svdRowPlot["PC2"],
                 "o", markersize=8)

    if legend:
        # shrink the axes to 80% width so the legend fits on the right
        box = ax.get_position()
        ax.set_position([box.x0, box.y0, box.width*0.8, box.height])
        ax.legend(loc="center left", bbox_to_anchor=(1, 0.5), numpoints=1)
    plt.show()
Example #38
0
# <codecell>


# Each entry pairs a test function with the suffix used to name its result column.
check_functions = [(MVhypergeo_test, "MV_hypergeo"), (fishers_test, "Fishers")]
# Result table with a 0..k-1 index, where k counts the entries of refseq that
# differ from "-" (presumably the non-gap positions of a reference sequence).
results = DataFrame(index=range(0, (refseq != "-").sum()))
# Run every test function on every grouping: one result column per
# (grouping, test) pair, named "<group name>_<test name>".
for (g1seqs, g2seqs, nref, gname), (func, funcname) in product(grouping_seq, check_functions):
    print(gname, funcname)
    res = func(g1seqs, g2seqs, nref)
    aggres = resolve_indices(res, nref)
    colname = gname + "_" + funcname
    results[colname] = aggres


# <codecell>

# Column-wise minimum of the result table (a bare expression: only shown when
# run as the last statement of a notebook cell).
results.min()

# <codecell>

from collections import defaultdict

# For each result column, collect the row labels whose value is below 0.05
# (presumably a significance threshold on p-values -- confirm).
naggres = defaultdict(set)
for col in results.columns:
    naggres[col] = set(results[col][results[col] < 0.05].index)
print(naggres)

# <codecell>

for c1, c2 in combinations(naggres.keys(), 2):
    common = naggres[c1] & naggres[c2]
    if common:
Example #39
0
__author__ = 'Executor'

import numpy as np
import pandas as pa
from pandas import Series, DataFrame


# 2x3 demo array with one NaN in each row
arr = np.array([[1, 2, np.nan], [np.nan, 3, 4]])
dframe1 = DataFrame(arr, index=['A', 'B'], columns=['One', 'Two', 'Three'])
# sum() with no arguments, then the same reduction along axis=1
print(dframe1.sum())
print(dframe1.sum(axis=1))
# minimum values, the frame itself, and the index labels of the minima
print(dframe1.min())
print(dframe1)
print(dframe1.idxmin())

# cumulative sum, printed next to the original frame for comparison
print(dframe1)
print(dframe1.cumsum())

# summary statistics table
print(dframe1.describe())

from IPython.display import YouTubeVideo
# Bare calls: the created objects are neither assigned nor passed to a
# display function.
YouTubeVideo('xGbpuFNR1ME')
YouTubeVideo('4EXNedimDMs')

''' stupid thing doesn't work!'''
Example #40
0
def plotter(df,
            title=False,
            kind='line',
            x_label=None,
            y_label=None,
            style='ggplot',
            figsize=(8, 4),
            save=False,
            legend_pos='best',
            reverse_legend='guess',
            num_to_plot=7,
            tex='try',
            colours='default',
            cumulative=False,
            pie_legend=True,
            partial_pie=False,
            show_totals=False,
            transparent=False,
            output_format='png',
            interactive=False,
            black_and_white=False,
            show_p_val=False,
            indices=False,
            transpose=False,
            rot=False,
            **kwargs):
    """Visualise corpus interrogations.
    :param title: A title for the plot
    :type title: str
    :param df: Data to be plotted
    :type df: Pandas DataFrame
    :param x_label: A label for the x axis
    :type x_label: str
    :param y_label: A label for the y axis
    :type y_label: str
    :param kind: The kind of chart to make
    :type kind: str ('line'/'bar'/'barh'/'pie'/'area')
    :param style: Visual theme of plot
    :type style: str ('ggplot'/'bmh'/'fivethirtyeight'/'seaborn-talk'/etc)
    :param figsize: Size of plot
    :type figsize: tuple (int, int)
    :param save: If bool, save with *title* as name; if str, use str as name
    :type save: bool/str
    :param legend_pos: Where to place legend
    :type legend_pos: str ('upper right'/'outside right'/etc)
    :param reverse_legend: Reverse the order of the legend
    :type reverse_legend: bool
    :param num_to_plot: How many columns to plot
    :type num_to_plot: int/'all'
    :param tex: Use TeX to draw plot text
    :type tex: bool
    :param colours: Colourmap for lines/bars/slices
    :type colours: str
    :param cumulative: Plot values cumulatively
    :type cumulative: bool
    :param pie_legend: Show a legend for pie chart
    :type pie_legend: bool
    :param partial_pie: Allow plotting of pie slices only
    :type partial_pie: bool
    :param show_totals: Print sums in plot where possible
    :type show_totals: str -- 'legend'/'plot'/'both'
    :param transparent: Transparent .png background
    :type transparent: bool
    :param output_format: File format for saved image
    :type output_format: str -- 'png'/'pdf'
    :param black_and_white: Create black and white line styles
    :type black_and_white: bool
    :param show_p_val: Attempt to print p values in legend if contained in df
    :type show_p_val: bool
    :param indices: To use when plotting "distance from root"
    :type indices: bool
    :param stacked: When making bar chart, stack bars on top of one another
    :type stacked: str
    :param filled: For area and bar charts, make every column sum to 100
    :type filled: str
    :param legend: Show a legend
    :type legend: bool
    :param rot: Rotate x axis ticks by *rot* degrees
    :type rot: int
    :param subplots: Plot each column separately
    :type subplots: bool
    :param layout: Grid shape to use when *subplots* is True
    :type layout: tuple -- (int, int)
    :param interactive: Experimental interactive options
    :type interactive: list -- [1, 2, 3]
    :returns: matplotlib figure
    """
    import corpkit
    import os
    try:
        from IPython.utils.shimmodule import ShimWarning
        import warnings
        warnings.simplefilter('ignore', ShimWarning)
    except:
        pass

    kwargs['rot'] = rot

    import matplotlib as mpl
    from matplotlib import rc

    # prefer seaborn plotting
    try:
        import seaborn as sns
    except ImportError:
        pass   
    
    if interactive:
        import matplotlib.pyplot as plt, mpld3
    else:
        import matplotlib.pyplot as plt
    
    import pandas
    from pandas import DataFrame, Series

    from time import localtime, strftime
    from process import checkstack

    if interactive:
        import mpld3
        import collections
        from mpld3 import plugins, utils
        from plugins import InteractiveLegendPlugin, HighlightLines

    have_mpldc = False
    try:
        from mpldatacursor import datacursor, HighlightingDataCursor
        have_mpldc = True
    except ImportError:
        pass

    # check what environment we're in
    tk = checkstack('tkinter')
    running_python_tex = checkstack('pythontex')
    running_spider = checkstack('spyder')

    if not title:
        title = ''

    def truncate_colormap(cmap, minval=0.0, maxval=1.0, n=100):
        """Build a colourmap from the [minval, maxval] portion of *cmap*.

        *cmap* is sampled at *n* evenly spaced points between *minval* and
        *maxval*; the samples become a new segmented colourmap. Used to drop
        the extreme ends of a map (no pure white).
        """
        from matplotlib.colors import LinearSegmentedColormap
        import numpy as np

        label = 'trunc({n},{a:.2f},{b:.2f})'.format(n=cmap.name, a=minval, b=maxval)
        sampled = cmap(np.linspace(minval, maxval, n))
        return LinearSegmentedColormap.from_list(label, sampled)

    def get_savename(imagefolder, save = False, title = False, ext = 'png'):
        """Come up with the savename for the image.

        :param imagefolder: Directory the file name is joined onto
        :param save: If a string, it is used as the base of the file name
        :param title: Fallback base of the file name when *save* is not a string
        :param ext: File extension, with or without the leading dot
        :returns: Path made of *imagefolder*, the urlified base name and *ext*
        :raises ValueError: if *save* is not a string and *title* is empty
        """
        import os
        from corpkit.process import urlify

        if not ext.startswith('.'):
            ext = '.' + ext

        # an explicit string passed as *save* wins over the title
        if isinstance(save, STRINGTYPE):
            stem = save
        elif title:
            stem = title
        else:
            # previously this fell through and failed on an unbound local
            raise ValueError('Cannot work out a save name: pass a string as '
                             '*save* or give the plot a title.')
        savename = os.path.join(imagefolder, urlify(stem) + ext)

        # remove duplicated ext: cut the trailing copy only, so an identical
        # sequence earlier in the path is left alone
        if savename.endswith(ext + ext):
            savename = savename[:-len(ext)]
        return savename

    def rename_data_with_total(dataframe, was_series = False, using_tex = False, absolutes = True):
        """adds totals (abs, rel, keyness) to entry name strings

        :param dataframe: Data whose entry names get a total appended
        :param was_series: If True the entries are the index labels of a
            single-column frame; otherwise they are the column labels
        :param using_tex: Escape the percent sign for TeX in relative labels
        :param absolutes: If True append ``(n=<sum>)``; if False append the
            percentage for series input and leave column labels untouched
        :returns: For column input, the same frame with its columns renamed in
            place; for series input, a new single-column frame named 'Total'
        """
        if was_series:
            where_the_words_are = dataframe.index
            # one transpose up front instead of one per entry
            by_entry = dataframe.T
        else:
            where_the_words_are = dataframe.columns
            by_entry = dataframe

        # the backslash is doubled so the literal holds exactly `\%%`
        perc_template = '%s (%.2f\\%%)' if using_tex else '%s (%.2f %%)'

        the_labs = []
        for w in list(where_the_words_are):
            if absolutes:
                the_labs.append('%s (n=%d)' % (w, by_entry[w].sum()))
            elif was_series:
                # positional access: the single value stored for this entry
                perc = by_entry[w].iloc[0]
                the_labs.append(perc_template % (w, perc))
            else:
                the_labs.append(w)

        if not was_series:
            dataframe.columns = the_labs
        else:
            vals = list(dataframe[list(dataframe.columns)[0]].values)
            dataframe = pandas.DataFrame(vals, index = the_labs)
            dataframe.columns = ['Total']
        return dataframe

    def auto_explode(dataframe, tinput, was_series = False, num_to_plot = 7):
        """Turn entry names/positions into an ``explode`` list for a pie chart.

        Returns a list of *num_to_plot* offsets: 0.1 for every entry selected
        by *tinput* and 0 for the rest. *tinput* may be a single name, a
        single position, or a list mixing both; names are looked up in the
        index (series input) or the columns. Any other *tinput* selects
        nothing.
        """
        output = [0] * num_to_plot
        names = list(dataframe.index if was_series else dataframe.columns)

        # a lone name or position is handled like a one-element list
        if isinstance(tinput, (STRINGTYPE, int)):
            tinput = [tinput]
        if not isinstance(tinput, list):
            return output

        for item in tinput:
            position = names.index(item) if isinstance(item, STRINGTYPE) else item
            output[position] = 0.1
        return output

    # get a few options from kwargs
    sbplt = kwargs.get('subplots', False)
    show_grid = kwargs.pop('grid', True)
    the_rotation = kwargs.get('rot', False)
    dragmode = kwargs.pop('draggable', False)
    leg_frame = kwargs.pop('legend_frame', True)
    leg_alpha = kwargs.pop('legend_alpha', 0.8)
    # auto set num to plot based on layout
    lo = kwargs.get('layout', None)
    if lo:
        num_to_plot = lo[0] * lo[1]

    # todo: get this dynamically instead.
    styles = ['dark_background', 'bmh', 'grayscale', 'ggplot', 'fivethirtyeight', 'matplotlib', False, 'mpl-white']
    #if style not in styles:
        #raise ValueError('Style %s not found. Use %s' % (str(style), ', '.join(styles)))

    if style == 'mpl-white':
        try:
            sns.set_style("whitegrid")
        except:
            pass
        style = 'matplotlib'

    if kwargs.get('savepath'):
        mpl.rcParams['savefig.directory'] = kwargs.get('savepath')
        kwargs.pop('savepath', None)

    mpl.rcParams['savefig.bbox'] = 'tight'
    mpl.rcParams.update({'figure.autolayout': True})

    # try to use tex
    # TO DO:
    # make some font kwargs here
    using_tex = False
    mpl.rcParams['font.family'] = 'sans-serif'
    mpl.rcParams['text.latex.unicode'] = True
    
    if tex == 'try' or tex is True:
        try:
            rc('text', usetex=True)
            rc('font', **{'family': 'serif', 'serif': ['Computer Modern']})
            using_tex = True
        except:
            matplotlib.rc('font', family='sans-serif') 
            matplotlib.rc('font', serif='Helvetica Neue') 
            matplotlib.rc('text', usetex='false') 
            rc('text', usetex=False)
    else:
        rc('text', usetex=False)  

    if interactive:
        using_tex = False 

    if show_totals is False:
        show_totals = 'none'

    # find out what kind of plot we're making, and enable
    # or disable interactive values if need be
    kwargs['kind'] = kind.lower()

    if interactive:
        if kwargs['kind'].startswith('bar'):
            interactive_types = [3]
        elif kwargs['kind'] == 'area':
            interactive_types = [2, 3]
        elif kwargs['kind'] == 'line':
            interactive_types = [2, 3]
        elif kwargs['kind'] == 'pie':
            interactive_types = None
            warnings.warn('Interactive plotting not yet available for pie plots.')
        else:
            interactive_types = [None]
    if interactive is False:
        interactive_types = [None]

    # find out if pie mode, add autopct format
    piemode = False
    if kind == 'pie':
        piemode = True
        # always the best spot for pie
        #if legend_pos == 'best':
            #legend_pos = 'lower left'
        if show_totals.endswith('plot') or show_totals.endswith('both'):
            kwargs['pctdistance'] = 0.6
            if using_tex:
                kwargs['autopct'] = r'%1.1f\%%'
            else:
                kwargs['autopct'] = '%1.1f%%'

    # copy data, make series into df
    dataframe = df.copy()
    was_series = False
    if isinstance(dataframe, Series):
        was_series = True
        if not cumulative:
            dataframe = DataFrame(dataframe)
        else:
            dataframe = DataFrame(dataframe.cumsum())
    else:
        # don't know if this is much good.
        if transpose:
            dataframe = dataframe.T
        if cumulative:
            dataframe = DataFrame(dataframe.cumsum())
        if len(list(dataframe.columns)) == 1:
            was_series = True
    
    # attempt to convert x axis to ints:
    #try:
    #    dataframe.index = [int(i) for i in list(dataframe.index)]
    #except:
    #    pass

    # remove totals and tkinter order
    if not was_series:
        for name, ax in zip(['Total'] * 2 + ['tkintertable-order'] * 2, [0, 1, 0, 1]):
            try:
                dataframe = dataframe.drop(name, axis = ax, errors = 'ignore')
            except:
                pass
    
    try:
        dataframe = dataframe.drop('tkintertable-order', errors = 'ignore')
    except:
        pass
    try:
        dataframe = dataframe.drop('tkintertable-order', axis = 1, errors = 'ignore')
    except:
        pass

    # look at columns to see if all can be ints, in which case, set up figure
    # for depnumming
    if not was_series:
        if indices == 'guess':
            def isint(x):
                """Return True if *x* converts to a number with no fractional part.

                Values that cannot be converted at all (non-numeric text,
                unsupported types, NaN, infinities) give False.
                """
                try:
                    a = float(x)
                    b = int(a)
                # A tuple is required: `except ValueError or OverflowError`
                # evaluates to `except ValueError` and lets the overflow
                # raised by int(inf) escape.
                except (ValueError, OverflowError, TypeError):
                    return False
                else:
                    return a == b

            if all([isint(x) is True for x in list(dataframe.columns)]):
                indices = True
            else:
                indices = False

        # if depnumming, plot all, transpose, and rename axes
        if indices is True:
            num_to_plot = 'all'
            dataframe = dataframe.T
            if y_label is None:
                y_label = 'Percentage of all matches'
            if x_label is None:
                x_label = ''

    # set backend?
    output_formats = ['svgz', 'ps', 'emf', 'rgba', 'raw', 'pdf', 'svg', 'eps', 'png', 'pgf']
    if output_format not in output_formats:
        raise ValueError('%s output format not recognised. Must be: %s' % (output_format, ', '.join(output_formats)))
    
    # don't know if these are necessary
    if 'pdf' in output_format:
        plt.switch_backend(output_format) 
    if 'pgf' in output_format:
        plt.switch_backend(output_format)

    if num_to_plot == 'all':
        if was_series:
            if not piemode:
                num_to_plot = len(dataframe)
            else:
                num_to_plot = len(dataframe)
        else:
            if not piemode:
                num_to_plot = len(list(dataframe.columns))
            else:
                num_to_plot = len(dataframe.index)

    # explode pie, or remove if not piemode
    if piemode and not sbplt and kwargs.get('explode'):
        kwargs['explode'] = auto_explode(dataframe, 
                                        kwargs['explode'], 
                                        was_series=was_series, 
                                        num_to_plot=num_to_plot)
    else:
        kwargs.pop('explode', None)

    legend = kwargs.get('legend', True)

    #cut data short
    plotting_a_totals_column = False
    if was_series:
        if list(dataframe.columns)[0] != 'Total':
            try:
                can_be_ints = [int(x) for x in list(dataframe.index)]
                num_to_plot = len(dataframe)
            except:
                dataframe = dataframe[:num_to_plot]
        elif list(dataframe.columns)[0] == 'Total':
            plotting_a_totals_column = True
            if not 'legend' in kwargs:
                legend = False
            num_to_plot = len(dataframe)
    else:
        if transpose:
            dataframe = dataframe.head(num_to_plot)
        else:
            dataframe = dataframe.T.head(num_to_plot).T

    # remove stats fields, put p in entry text, etc.
    statfields = ['slope', 'intercept', 'r', 'p', 'stderr']
    try:
        dataframe = dataframe.drop(statfields, axis = 1, errors = 'ignore')
    except:
        pass    
    try:
        dataframe.ix['p']
        there_are_p_vals = True
    except:
        there_are_p_vals = False
    if show_p_val:
        if there_are_p_vals:
            newnames = []
            for col in list(dataframe.columns):
                pval = dataframe[col]['p']

                def p_string_formatter(val):
                    """Format a p value for a legend entry.

                    Values below 0.001 are reported as "p < 0.001" (with the
                    less-than sign in TeX math mode when TeX is in use);
                    everything else is printed with three decimals.
                    """
                    if val < 0.001:
                        return r'p $<$ 0.001' if using_tex else 'p < 0.001'
                    return 'p = %s' % format(val, '.3f')

                pstr = p_string_formatter(pval)
                newname = '%s (%s)' % (col, pstr)
                newnames.append(newname)
            dataframe.columns = newnames
            dataframe.drop(statfields, axis = 0, inplace = True, errors = 'ignore')
        else:
            warnings.warn('No p-values calculated to show.\n\nUse keep_stats kwarg while editing to generate these values.')
    else:
        if there_are_p_vals:
            dataframe.drop(statfields, axis = 0, inplace = True, errors = 'ignore')

    # make and set y label
    absolutes = True
    if type(dataframe) == DataFrame:
        try:
            if not all([s.is_integer() for s in dataframe.iloc[0,:].values]):
                absolutes = False
        except:
            pass
    else:
        if not all([s.is_integer() for s in dataframe.values]):        
            absolutes = False

    ##########################################
    ################ COLOURS #################
    ##########################################

    # set defaults, with nothing for heatmap yet
    if colours is True or colours == 'default':
        if kind != 'heatmap':
            colours = 'viridis'
        else:
            colours = 'default'
    
    # assume it's a single color, unless string denoting map
    cmap_or_c = 'color'
    if colours is not False and type(colours) == str:
        cmap_or_c = 'colormap'
    from matplotlib.colors import LinearSegmentedColormap
    if type(colours)==LinearSegmentedColormap:
        cmap_or_c = 'colormap'

    # for heatmaps, it's always a colormap
    if kind == 'heatmap':
        cmap_or_c = 'cmap'
        # if it's a defaulty string, set accordingly
        if type(colours) == str:
            if colours.lower().startswith('diverg'):
                colours = sns.diverging_palette(10, 133, as_cmap=True)

            # if default not set, do diverge for any df with a number < 0
            elif colours.lower() == 'default':
                mn = dataframe.min()
                if type(mn) == Series:
                    mn = mn.min()
                if mn < 0:
                    colours = sns.diverging_palette(10, 133, as_cmap=True)
                else:
                    colours = sns.light_palette("green", as_cmap=True)

    if 'seaborn' not in style:
        kwargs[cmap_or_c] = colours
    #if not was_series:
    #    if kind in ['pie', 'line', 'area']:
    #        if colours and not plotting_a_totals_column:
    #            kwargs[cmap_or_c] = colours
    #    else:
    #        if colours:
    #            kwargs[cmap_or_c] = colours
    #if piemode:
    #    if num_to_plot > 0:
    #        kwargs[cmap_or_c] = colours
    #    else:
    #        if num_to_plot > 0:
    #            kwargs[cmap_or_c] = colours
    
    # multicoloured bar charts
    #if colours and cmap_or_c == 'colormap':
    #    if kind.startswith('bar'):
    #        if len(list(dataframe.columns)) == 1:
    #            if not black_and_white:
    #                import numpy as np
    #                the_range = np.linspace(0, 1, num_to_plot)
    #                middle = len(the_range) / 2
    #                try:
    #                    cmap = plt.get_cmap(colours)
    #                    kwargs[cmap_or_c] = [cmap(n) for n in the_range][middle]
    #                except ValueError:
    #                    kwargs[cmap_or_c] = colours
    #            # make a bar width ... ? ...
    #            #kwargs['width'] = (figsize[0] / float(num_to_plot)) / 1.5
    

    # reversing legend option
    if reverse_legend is True:
        rev_leg = True
    elif reverse_legend is False:
        rev_leg = False

    # show legend or don't, guess whether to reverse based on kind
    if kind in ['bar', 'barh', 'area', 'line', 'pie']:
        if was_series:
            legend = False
        if kind == 'pie':
            if pie_legend:
                legend = True
            else:
                legend = False
    if kind in ['barh', 'area']:
        if reverse_legend == 'guess':
            rev_leg = True
    if not 'rev_leg' in locals():
        rev_leg = False

    # the default legend placement
    if legend_pos is True:
        legend_pos = 'best'

    # cut dataframe if just_totals
    try:
        tst = dataframe['Combined total']
        dataframe = dataframe.head(num_to_plot)
    except:
        pass

    # no title for subplots because ugly,
    if title and not sbplt:
        kwargs['title'] = title
        
    # no interactive subplots yet:
    if sbplt and interactive:
        import warnings
        interactive = False
        warnings.warn('No interactive subplots yet, sorry.')
        return
        
    # not using pandas for labels or legend anymore.
    #kwargs['labels'] = None
    #kwargs['legend'] = False

    if legend:
        if num_to_plot > 6:
            if not kwargs.get('ncol'):
                kwargs['ncol'] = num_to_plot / 7
        # kwarg options go in leg_options
        leg_options = {'framealpha': leg_alpha,
                       'shadow': kwargs.get('shadow', False),
                       'ncol': kwargs.pop('ncol', 1)}    

        # determine legend position based on this dict
        if legend_pos:
            possible = {'best': 0, 'upper right': 1, 'upper left': 2, 'lower left': 3, 'lower right': 4, 
                        'right': 5, 'center left': 6, 'center right': 7, 'lower center': 8, 'upper center': 9, 
                        'center': 10, 'o r': 2, 'outside right': 2, 'outside upper right': 2, 
                        'outside center right': 'center left', 'outside lower right': 'lower left'}

            if type(legend_pos) == int:
                the_loc = legend_pos
            elif type(legend_pos) == str:
                try:
                    the_loc = possible[legend_pos]
                except KeyError:
                    raise KeyError('legend_pos value must be one of:\n%s\n or an int between 0-10.' %', '.join(list(possible.keys())))
            leg_options['loc'] = the_loc
            #weirdness needed for outside plot
            if legend_pos in ['o r', 'outside right', 'outside upper right']:
                leg_options['bbox_to_anchor'] = (1.02, 1)
            if legend_pos == 'outside center right':
                leg_options['bbox_to_anchor'] = (1.02, 0.5)
            if legend_pos == 'outside lower right':
                leg_options['loc'] == 'upper right'
                leg_options['bbox_to_anchor'] = (0.5, 0.5)
        
        # a bit of distance between legend and plot for outside legends
        if type(legend_pos) == str:
            if legend_pos.startswith('o'):
                leg_options['borderaxespad'] = 1

    if not piemode:
        if show_totals.endswith('both') or show_totals.endswith('legend'):
            dataframe = rename_data_with_total(dataframe, 
                                           was_series = was_series, 
                                           using_tex = using_tex, 
                                           absolutes = absolutes)
    else:
        if pie_legend:
            if show_totals.endswith('both') or show_totals.endswith('legend'):
                dataframe = rename_data_with_total(dataframe, 
                                           was_series = was_series, 
                                           using_tex = using_tex, 
                                           absolutes = absolutes)

    if piemode:
        if partial_pie:
            dataframe = dataframe / 100.0

    # some pie things
    if piemode:
        if not sbplt:
            kwargs['y'] = list(dataframe.columns)[0]
    
    def filler(df):
        """Rescale each row of ``df`` to percentages of that row's total.

        Works on a transposed copy, so the frame passed in is not modified.
        Returns a new frame with the same orientation as ``df``.
        """
        flipped = df.T.copy()
        # after transposing, every original row is a column we can rescale
        for name in list(flipped.columns):
            row_total = flipped[name].sum()
            scaled = flipped[name] * 100.0
            flipped[name] = scaled / row_total
        return flipped.T

    areamode = False
    if kind == 'area':
        areamode = True

    if legend is False:
        kwargs['legend'] = False

    # line highlighting option for interactive!
    if interactive:
        if 2 in interactive_types:
            if kind == 'line':
                kwargs['marker'] = ','
        if not piemode:
            kwargs['alpha'] = 0.1
    
    # convert dates --- works only in my current case!
    if plotting_a_totals_column or not was_series:
        try:
            can_it_be_int = int(list(dataframe.index)[0])
            can_be_int = True
        except:
            can_be_int = False
        if can_be_int:
            if 1500 < int(list(dataframe.index)[0]):
                if 2050 > int(list(dataframe.index)[0]):
                    n = pandas.PeriodIndex([d for d in list(dataframe.index)], freq='A')
                    dataframe = dataframe.set_index(n)

        if kwargs.get('filled'):
            if areamode or kind.startswith('bar'):
                dataframe = filler(dataframe)
            kwargs.pop('filled', None)

    MARKERSIZE = 4
    COLORMAP = {
            0: {'marker': None, 'dash': (None,None)},
            1: {'marker': None, 'dash': [5,5]},
            2: {'marker': "o", 'dash': (None,None)},
            3: {'marker': None, 'dash': [1,3]},
            4: {'marker': "s", 'dash': [5,2,5,2,5,10]},
            5: {'marker': None, 'dash': [5,3,1,2,1,10]},
            6: {'marker': 'o', 'dash': (None,None)},
            7: {'marker': None, 'dash': [5,3,1,3]},
            8: {'marker': "1", 'dash': [1,3]},
            9: {'marker': "*", 'dash': [5,5]},
            10: {'marker': "2", 'dash': [5,2,5,2,5,10]},
            11: {'marker': "s", 'dash': (None,None)}
            }

    HATCHES = {
            0:  {'color': '#dfdfdf', 'hatch':"/"},
            1:  {'color': '#6f6f6f', 'hatch':"\\"},
            2:  {'color': 'b', 'hatch':"|"},
            3:  {'color': '#dfdfdf', 'hatch':"-"},
            4:  {'color': '#6f6f6f', 'hatch':"+"},
            5:  {'color': 'b', 'hatch':"x"}
            }

    if black_and_white:
        if kind == 'line':
            kwargs['linewidth'] = 1

        cmap = plt.get_cmap('Greys')
        new_cmap = truncate_colormap(cmap, 0.25, 0.95)
        if kind == 'bar':
            # darker if just one entry
            if len(dataframe.columns) == 1:
                new_cmap = truncate_colormap(cmap, 0.70, 0.90)
        kwargs[cmap_or_c] = new_cmap

    # remove things from kwargs if heatmap
    if kind == 'heatmap':
        hmargs = {'annot': kwargs.pop('annot', True),
              cmap_or_c: kwargs.pop(cmap_or_c, None),
              'fmt': kwargs.pop('fmt', ".2f"),
              'cbar': kwargs.pop('cbar', False)}

        for i in ['vmin', 'vmax', 'linewidths', 'linecolor',
                  'robust', 'center', 'cbar_kws', 'cbar_ax',
                  'square', 'mask']:
            if i in kwargs.keys():
                hmargs[i] = kwargs.pop(i, None)

    class dummy_context_mgr():
        """No-op context manager, used when plotting without a style context.

        Entering yields ``None`` and exiting never suppresses an exception.
        The 'classic' style in newer matplotlib may make this unnecessary.
        """
        def __enter__(self):
            # nothing to set up, nothing to bind in ``with ... as``
            return None

        def __exit__(self, exc_type, exc_value, traceback):
            # a falsy return lets any in-flight exception propagate
            return False

    with plt.style.context((style)) if style != 'matplotlib' else dummy_context_mgr():

        if not sbplt:
            # check if negative values, no stacked if so
            if areamode:
                if not kwargs.get('ax'):
                    kwargs['legend'] = False
                if dataframe.applymap(lambda x: x < 0.0).any().any():
                    kwargs['stacked'] = False
                    rev_leg = False
            if kind != 'heatmap':
                # turn off pie labels at the last minute
                if kind == 'pie' and pie_legend:
                    kwargs['labels'] = None
                    kwargs['autopct'] = '%.2f'
                if kind == 'pie':
                    kwargs.pop('color', None)
                ax = dataframe.plot(figsize=figsize, **kwargs)
            else:
                plt.figure(figsize=figsize)
                if title:
                    plt.title(title)
                ax = kwargs.get('ax', plt.axes())
                sns.heatmap(dataframe, ax=ax, **hmargs)
                plt.yticks(rotation=0)

            if areamode and not kwargs.get('ax'):
                handles, labels = plt.gca().get_legend_handles_labels()
                del handles
                del labels

            if x_label:
                ax.set_xlabel(x_label)
            if y_label:
                ax.set_ylabel(y_label)

        else:
            if not kwargs.get('layout'):
                plt.gcf().set_tight_layout(False)

            if kind != 'heatmap':
                ax = dataframe.plot(figsize=figsize, **kwargs)
            else:
                plt.figure(figsize=figsize)
                if title:
                    plt.title(title)
                ax = plt.axes()
                sns.heatmap(dataframe, ax=ax, **hmargs)
                plt.xticks(rotation=0)
                plt.yticks(rotation=0)

        def rotate_degrees(rotation, labels):
            """Resolve the tick-label rotation, in degrees.

            rotation -- None to decide automatically, True/False to force
                        45/0 degrees, or any other value to use as given.
            labels   -- the tick label strings (only inspected when
                        ``rotation`` is None).

            Returns the rotation to pass on to ``set_xticklabels``.
            """
            if rotation is None:
                # Auto mode: tilt the labels only if some label is long.
                # The previous code compared the longest label *string* with 6
                # (a TypeError on Python 3, always true on Python 2) and
                # crashed on an empty label list; compare lengths instead.
                if any(len(label) > 6 for label in labels):
                    return 45
                return 0
            if rotation is False:
                return 0
            if rotation is True:
                return 45
            return rotation
        
        if sbplt:
            if 'layout' not in kwargs:
                axes = [l for index, l in enumerate(ax)]
            else:
                axes = []
                cols = [l for index, l in enumerate(ax)]
                for col in cols:
                    for bit in col:
                        axes.append(bit)

            for index, a in enumerate(axes):
                labels = [item.get_text() for item in a.get_xticklabels()]
                rotation = rotate_degrees(the_rotation, labels)                
                a.set_xticklabels(labels, rotation = rotation, ha='right')
        else:
            if kind == 'heatmap':
                labels = [item.get_text() for item in ax.get_xticklabels()]
                rotation = rotate_degrees(the_rotation, labels)
                ax.set_xticklabels(labels, rotation = rotation, ha='right')

        if transparent:
            plt.gcf().patch.set_facecolor('white')
            plt.gcf().patch.set_alpha(0)

        if black_and_white:
            if kind == 'line':
                # white background
                # change everything to black and white with interesting dashes and markers
                c = 0
                for line in ax.get_lines():
                    line.set_color('black')
                    #line.set_width(1)
                    line.set_dashes(COLORMAP[c]['dash'])
                    line.set_marker(COLORMAP[c]['marker'])
                    line.set_markersize(MARKERSIZE)
                    c += 1
                    if c == len(list(COLORMAP.keys())):
                        c = 0

        # draw legend with proper placement etc
        if legend:
            if not piemode and not sbplt and kind != 'heatmap':
                if 3 not in interactive_types:
                    handles, labels = plt.gca().get_legend_handles_labels()
                    # area doubles the handles and labels. this removes half:
                    #if areamode:
                    #    handles = handles[-len(handles) / 2:]
                    #    labels = labels[-len(labels) / 2:]
                    if rev_leg:
                        handles = handles[::-1]
                        labels = labels[::-1]
                    if kwargs.get('ax'):
                        lgd = plt.gca().legend(handles, labels, **leg_options)
                        ax.get_legend().draw_frame(leg_frame)
                    else:
                        lgd = plt.legend(handles, labels, **leg_options)
                        lgd.draw_frame(leg_frame)

    if interactive:
        # 1 = highlight lines
        # 2 = line labels
        # 3 = legend switches
        ax = plt.gca()
        # fails for piemode
        lines = ax.lines
        handles, labels = plt.gca().get_legend_handles_labels()
        if 1 in interactive_types:
            plugins.connect(plt.gcf(), HighlightLines(lines))

        if 3 in interactive_types:
            plugins.connect(plt.gcf(), InteractiveLegendPlugin(lines, labels, alpha_unsel=0.0))

        for i, l in enumerate(lines):
            y_vals = l.get_ydata()
            x_vals = l.get_xdata()
            x_vals = [str(x) for x in x_vals]
            if absolutes:
                ls = ['%s (%s: %d)' % (labels[i], x_val, y_val) for x_val, y_val in zip(x_vals, y_vals)]
            else:
                ls = ['%s (%s: %.2f%%)' % (labels[i], x_val, y_val) for x_val, y_val in zip(x_vals, y_vals)]
            if 2 in interactive_types:
                #if 'kind' in kwargs and kwargs['kind'] == 'area':
                tooltip_line = mpld3.plugins.LineLabelTooltip(lines[i], labels[i])
                mpld3.plugins.connect(plt.gcf(), tooltip_line)
                #else:
                if kind == 'line':
                    tooltip_point = mpld3.plugins.PointLabelTooltip(l, labels = ls)
                    mpld3.plugins.connect(plt.gcf(), tooltip_point)
        
    if piemode:
        if not sbplt:
            plt.axis('equal')
            ax.get_xaxis().set_visible(False)
            ax.get_yaxis().set_visible(False)

    # add x label
    # this could be revised now!
    # if time series period, it's year for now
    if type(dataframe.index) == pandas.tseries.period.PeriodIndex:
        x_label = 'Year'

    y_l = False
    if not absolutes:
        y_l = 'Percentage'
    else:
        y_l = 'Absolute frequency'

    # hacky: turn legend into subplot titles :)
    if sbplt:
        # title the big plot
        #plt.gca().suptitle(title, fontsize = 16)
        #plt.subplots_adjust(top=0.9)
        # get all axes
        if 'layout' not in kwargs:
            axes = [l for index, l in enumerate(ax)]
        else:
            axes = []
            cols = [l for index, l in enumerate(ax)]
            for col in cols:
                for bit in col:
                    axes.append(bit)
    
        # set subplot titles
        for index, a in enumerate(axes):
            try:
                titletext = list(dataframe.columns)[index]
            except:
                pass
            a.set_title(titletext)
            try:
                a.legend_.remove()
            except:
                pass
            #try:
            #    from matplotlib.ticker import MaxNLocator
            #    from corpkit.process import is_number
            #    indx = list(dataframe.index)
            #    if all([is_number(qq) for qq in indx]):
            #        ax.get_xaxis().set_major_locator(MaxNLocator(integer=True))
            #except:
            #    pass
            # remove axis labels for pie plots
            if piemode:
                a.axes.get_xaxis().set_visible(False)
                a.axes.get_yaxis().set_visible(False)
                a.axis('equal')

            a.grid(b=show_grid)
        
    # add sums to bar graphs and pie graphs
    # doubled right now, no matter

    if not sbplt:
        
        # show grid
        ax.grid(b=show_grid)

        if kind.startswith('bar'):
            width = ax.containers[0][0].get_width()

    if was_series:
        the_y_limit = plt.ylim()[1]
        if show_totals.endswith('plot') or show_totals.endswith('both'):
            # make plot a bit higher if putting these totals on it
            plt.ylim([0,the_y_limit * 1.05])
            for i, label in enumerate(list(dataframe.index)):
                if len(dataframe.ix[label]) == 1:
                    score = dataframe.ix[label][0]
                else:
                    if absolutes:
                        score = dataframe.ix[label].sum()
                    else:
                        #import warnings
                        #warnings.warn("It's not possible to determine total percentage from individual percentages.")
                        continue
                if not absolutes:
                    plt.annotate('%.2f' % score, (i, score), ha = 'center', va = 'bottom')
                else:
                    plt.annotate(score, (i, score), ha = 'center', va = 'bottom')
    else:
        the_y_limit = plt.ylim()[1]
        if show_totals.endswith('plot') or show_totals.endswith('both'):
            for i, label in enumerate(list(dataframe.columns)):
                if len(dataframe[label]) == 1:
                    score = dataframe[label][0]
                else:
                    if absolutes:
                        score = dataframe[label].sum()
                    else:
                        #import warnings
                        #warnings.warn("It's not possible to determine total percentage from individual percentages.")
                        continue
                if not absolutes:
                    plt.annotate('%.2f' % score, (i, score), ha='center', va='bottom')
                else:
                    plt.annotate(score, (i, score), ha='center', va='bottom')        

    if not kwargs.get('layout') and not sbplt and not kwargs.get('ax'):
        plt.tight_layout()
    if kwargs.get('ax'):
        try:
            plt.gcf().set_tight_layout(False)
        except:
            pass
        try:
            plt.set_tight_layout(False)
        except:
            pass

    if save:
        if running_python_tex:
            imagefolder = '../images'
        else:
            imagefolder = 'images'

        savename = get_savename(imagefolder, save=save, title=title, ext=output_format)

        if not os.path.isdir(imagefolder):
            os.makedirs(imagefolder)

        # save image and get on with our lives
        if legend_pos.startswith('o') and not sbplt:
            plt.gcf().savefig(savename, dpi=150, bbox_extra_artists=(lgd,), 
                              bbox_inches='tight', format=output_format)
        else:
            plt.gcf().savefig(savename, dpi=150, format=output_format)
        time = strftime("%H:%M:%S", localtime())
        if os.path.isfile(savename):
            print('\n' + time + ": " + savename + " created.")
        else:
            raise ValueError("Error making %s." % savename)

    if dragmode:
        plt.legend().draggable()

    if sbplt:
        plt.subplots_adjust(right=.8)
        plt.subplots_adjust(left=.1)

    # add DataCursor to notebook backend if possible
    if have_mpldc:
        if kind == 'line':
            HighlightingDataCursor(plt.gca().get_lines(), highlight_width=4, highlight_color = False,
                    formatter=lambda **kwargs: '%s: %s' % (kwargs['label'], "{0:.3f}".format(kwargs['y'])))
        else:
            datacursor(formatter=lambda **kwargs: '%s: %s' % (kwargs['label'], "{0:.3f}".format(kwargs['height'])))

    #if not interactive and not running_python_tex and not running_spider \
    #    and not tk:
    #    plt.gcf().show()
    #    return plt
    #elif running_spider or tk:
    #    return plt

    if interactive:
        plt.subplots_adjust(right=.8)
        plt.subplots_adjust(left=.1)
        try:
            ax.legend_.remove()
        except:
            pass
        return mpld3.display()
    else:
        return plt
class KMeansPlusPlus:
    """k-means clustering of DataFrame rows, seeded with k-means++.

    Typical use::

        km = KMeansPlusPlus(df, 3, columns=['x', 'y'])
        km.cluster()
        km.clusters   # Series of labels 0..k-1, indexed like ``df``
        km.centers    # k x n DataFrame, row i is the center of cluster i

    Distances are squared Euclidean distances over ``columns``.  Random
    choices are drawn from NumPy's global ``np.random`` state, so results
    can be made reproducible with ``np.random.seed``.
    """

    def __init__(self, data_frame, k, columns=None, max_iterations=None,
                 appended_column_name=None):
        """Validate the arguments and set up empty clustering state.

        data_frame           -- non-empty pandas DataFrame (m rows).
        k                    -- number of clusters, a positive integer.
        columns              -- columns to cluster on; every one must exist,
                                be numeric and contain no NaN.  Defaults to
                                all columns (which are not validated).
        max_iterations       -- optional positive cap on refinement rounds.
        appended_column_name -- if given, ``cluster()`` stores the labels in
                                ``data_frame`` under this column name.

        Raises TypeError / ValueError (both ``Exception`` subclasses, so
        existing ``except Exception`` handlers keep working).
        """
        if not isinstance(data_frame, DataFrame):
            raise TypeError("data_frame argument is not a pandas DataFrame")
        if data_frame.empty:
            raise ValueError("The given data frame is empty")

        if max_iterations is not None and max_iterations <= 0:
            raise ValueError("max_iterations must be positive!")

        if not isinstance(k, Integral) or k <= 0:
            raise ValueError("The value of k must be a positive integer")

        self.data_frame = data_frame  # m x n
        self.numRows = data_frame.shape[0]  # m

        # k x n, the i,j entry being the jth coordinate of center i
        self.centers = None

        # m x k, the i,j entry is the squared distance from point i to
        # center j (where i and j start at 0)
        self.distance_matrix = None

        # Series of length m, consisting of integers 0,1,...,k-1
        self.clusters = None

        # To keep track of clusters in the previous iteration
        self.previous_clusters = None

        self.max_iterations = max_iterations
        self.appended_column_name = appended_column_name
        self.k = k

        if columns is None:
            self.columns = data_frame.columns
        else:
            # Materialise once so tuples and one-shot iterables also work
            # both for validation and for later column selection.
            columns = list(columns)
            for col in columns:
                if col not in data_frame.columns:
                    raise ValueError(
                        "Column '%s' not found in the given DataFrame" % col)
                if not self._is_numeric(col):
                    raise ValueError(
                        "The column '%s' is either not numeric or contains NaN values" % col)
            self.columns = columns

    def _populate_initial_centers(self):
        """Pick k starting centers with k-means++ (D^2-weighted) sampling.

        The first center is a uniformly random row; each further center is a
        row drawn with probability proportional to its squared distance from
        the nearest center chosen so far.
        """
        points = self.data_frame[self.columns]
        rows = [self._grab_random_point()]

        while len(rows) < self.k:
            distances = self._distances_from_point_list(rows)
            total = distances.sum()

            if total > 0:
                # Inverse-CDF sampling.  side='right' finds the first row
                # whose cumulative weight exceeds the roll, so rows with
                # zero weight (already-chosen points) are never selected.
                cumulative = (distances / total).values.cumsum()
                position = int(np.searchsorted(
                    cumulative, np.random.rand(), side='right'))
                # Rounding can leave cumulative[-1] slightly below 1.
                position = min(position, self.numRows - 1)
            else:
                # Every row coincides with a chosen center (or the total is
                # NaN): the weights are undefined, so fall back to uniform.
                position = np.random.randint(0, self.numRows)

            # Positional lookup: the data frame's index labels are arbitrary.
            rows.append(points.iloc[position, :].values)

        # Float storage with a fresh 0..k-1 index, so the centers can later
        # be overwritten with cluster means row by row.
        self.centers = DataFrame(
            np.asarray(rows, dtype=float), columns=self.columns)

    def _compute_distances(self):
        """Fill ``distance_matrix`` with point-to-center squared distances."""
        if self.centers is None:
            raise Exception(
                "Must populate centers before distances can be calculated!")

        column_dict = {}
        for i in range(self.k):
            column_dict[i] = self._distances_from_point(
                self.centers.iloc[i, :].values)

        self.distance_matrix = DataFrame(
            column_dict, columns=list(range(self.k)))

    def _get_clusters(self):
        """Assign every row to its nearest center and store it in ``clusters``.

        Ties go to the lowest-numbered center, so each row always receives
        exactly one label.
        """
        if self.distance_matrix is None:
            raise Exception(
                "Must compute distances before closest centers can be calculated")

        nearest = self.distance_matrix.values.argmin(axis=1)
        self.clusters = Series(nearest, index=self.data_frame.index)

    def _compute_new_centers(self):
        """Move each center to the mean of the rows currently assigned to it.

        A center whose cluster is empty is left where it is; overwriting it
        with the (NaN) mean of no rows would make it appear to be at
        distance 0 from every point on the next pass.
        """
        if self.centers is None:
            raise Exception("Centers not initialized!")

        if self.clusters is None:
            raise Exception("Clusters not computed!")

        points = self.data_frame[self.columns]
        for i in range(self.k):
            # Plain boolean array: selects rows by position, independent of
            # the data frame's index labels.
            members = (self.clusters == i).values
            if members.any():
                self.centers.iloc[i, :] = points[members].mean().values

    def cluster(self):
        """Run the clustering.

        Seeds the centers, then alternates center updates and reassignment
        until the labels stop changing or ``max_iterations`` rounds have
        run.  Results are left in ``clusters``, ``centers`` and
        ``distance_matrix``; if ``appended_column_name`` was given the
        labels are also written into ``data_frame``.  Returns None.
        """
        self._populate_initial_centers()
        self._compute_distances()
        self._get_clusters()

        counter = 0

        while True:
            counter += 1

            self.previous_clusters = self.clusters.copy()

            self._compute_new_centers()
            self._compute_distances()
            self._get_clusters()

            if self.max_iterations is not None and counter >= self.max_iterations:
                break
            elif (self.clusters.values == self.previous_clusters.values).all():
                break

        if self.appended_column_name is not None:
            try:
                self.data_frame[self.appended_column_name] = self.clusters
            except Exception:
                # Best effort: the labels stay available on the instance.
                warnings.warn(
                    "Unable to append a column named %s to your data." %
                    self.appended_column_name)
                warnings.warn(
                    "However, the clusters are available via the clusters attribute")

    def _distances_from_point(self, point):
        """Return a Series of squared distances from every row to ``point``."""
        return np.power(self.data_frame[self.columns] - point, 2).sum(axis=1)

    def _distances_from_point_list(self, point_list):
        """Return each row's squared distance to its nearest listed point.

        Returns None when ``point_list`` is empty.
        """
        result = None

        for point in point_list:
            distances = self._distances_from_point(point)
            if result is None:
                result = distances
            else:
                result = np.minimum(result, distances)

        return result

    def _grab_random_point(self):
        """Return a uniformly random row (clustering columns) as an array."""
        # randint's upper bound is exclusive, so this covers 0..numRows-1.
        index = np.random.randint(0, self.numRows)
        # NumPy array
        return self.data_frame[self.columns].iloc[index, :].values

    def _is_numeric(self, col):
        """Return True if column ``col`` has a numeric dtype and no NaN."""
        values = self.data_frame[col]
        if not pd.api.types.is_numeric_dtype(values):
            return False
        return not bool(values.isnull().any())
Example #42
0
  'pastrami': 'cow',
  'corned beef': 'cow',
  'honey ham': 'pig',
  'nova lox': 'salmon'
}

# Look up each lower-cased food name in meat_to_animal and store the result
# as a new 'animal' column.
data['animal'] = data['food'].map(str.lower).map(meat_to_animal)
data

# The same lookup done in a single map() call; the result is only displayed.
data['food'].map(lambda x: meat_to_animal[x.lower()])

# Data normalization
datafile = 'd:/data/normalization_data.xls' # path of the input spreadsheet
data = pd.read_excel(datafile, header = None) # read the data (no header row)

(data - data.min())/(data.max() - data.min()) # min-max normalization
(data - data.mean())/data.std() # zero-mean (z-score) normalization
data/10**np.ceil(np.log10(data.abs().max())) # decimal-scaling normalization


### Replacing values
data = Series([1., -999., 2., -999., -1000., 3.])
data

# Replace a single sentinel value with NaN (results are displayed, not stored).
data.replace(-999, np.nan)

# Replace several sentinel values with the same NaN.
data.replace([-999, -1000], np.nan)

# Replace each listed value with the value at the same position in the second list.
data.replace([-999, -1000], [np.nan, 0])

# The same pairwise replacement, written as a {old: new} mapping.
data.replace({-999: np.nan, -1000: 0})
import numpy as np
from pandas import Series, DataFrame
import pandas as pd

# 2 x 3 sample array with one missing value (NaN) in each row
arr = np.array([[1, 2, np.nan], [np.nan, 3, 4]])

dframe1 = DataFrame(arr, index=["A", "B"], columns=["One", "Two", "Three"])
dframe1

# Sum method
dframe1.sum()  # sum of each column; NaN values are skipped (i.e. count as 0)
dframe1.sum(axis=1)  # sum of each row

# Min method
dframe1.min()  # minimum value in each column
dframe1.min(axis=1)  # minimum value in each row

dframe1.idxmin()  # index label of the minimum value in each column

# Max method
dframe1.max()  # maximum value in each column
dframe1.idxmax()  # index label of the maximum value in each column

# Cumulative sum
dframe1.cumsum()  # running total down each column

# Describe method
dframe1.describe()  # summary statistics of the DataFrame, column by column

# correlation and covariance