def clean(numpy_array):
    """Preprocess raw data and min-max normalize every column.

    The input is passed through ``ut.preprocessData``, converted to a
    ``float16`` DataFrame and scaled column-wise with
    ``X_norm = (X - Xmin) / (Xmax - Xmin)``.  Entries that come out as NaN
    (for example in a constant column, where the range is zero) are
    replaced by ``-1``.

    Args:
        numpy_array: Raw data, e.g. loaded from a CSV file.

    Returns:
        numpy.ndarray: The normalized values.
    """
    data = ut.preprocessData(numpy_array)

    df = pd.DataFrame(data).astype('float16')

    col_min = df.min()
    col_range = df.max() - col_min
    df_norm = ((df - col_min) / col_range).fillna(-1)

    # DataFrame.as_matrix() no longer exists in current pandas; .values
    # returns the same ndarray and is available in every pandas version.
    return df_norm.values
def __generate_trace(self, objectives: DataFrame, metadata: list = None, legend: str = '', normalize: bool = False,
                     **kwargs):
    """Build the plotly trace for a table of objective values.

    Two columns produce a ``Scattergl`` trace, three columns a ``Scatter3d``
    trace and any other column count a ``Parcoords`` trace.

    :param objectives: One row per point, one column per objective.
    :param metadata: Forwarded to the scatter traces as ``customdata``.
    :param legend: Name given to the trace.
    :param normalize: If True, min-max scale every column before plotting.
    :param kwargs: Overrides for the default marker settings.
    :return: The plotly trace object.
    """
    number_of_objectives = objectives.shape[1]

    if normalize:
        lowest = objectives.min()
        objectives = (objectives - lowest) / (objectives.max() - objectives.min())

    default_marker = {
        'color': 'rgb(127, 127, 127)',
        'size': 3,
        'symbol': 'x',
        'line': {'color': 'rgb(204, 204, 204)', 'width': 1},
        'opacity': 0.8,
    }
    # Caller-supplied settings win over the defaults.
    marker = {**default_marker, **kwargs}

    if number_of_objectives == 2:
        return go.Scattergl(
            x=objectives[0],
            y=objectives[1],
            mode='markers',
            marker=marker,
            name=legend,
            customdata=metadata
        )

    if number_of_objectives == 3:
        return go.Scatter3d(
            x=objectives[0],
            y=objectives[1],
            z=objectives[2],
            mode='markers',
            marker=marker,
            name=legend,
            customdata=metadata
        )

    # Parallel coordinates: one axis per column, labelled from
    # self.axis_labels when an entry exists at that position.
    dimensions = []
    for column in objectives:
        label_slice = self.axis_labels[column:column + 1]
        dimensions.append({
            'range': [0, 1],
            'label': label_slice[0] if label_slice else None,
            'values': objectives[column],
        })

    return go.Parcoords(
        line=dict(color='blue'),
        dimensions=dimensions,
        name=legend,
    )
def analyze():
    """Suggest a frequency and export/plot the received-signal spectrum.

    Reads the signal log from ``FILE_SIGNALS``, counts the received signals
    per (frequency, device) pair over every frequency in ``SPECTRUM``,
    prints the suggested frequency, writes the count table to
    ``spectrum.csv`` and saves an area plot to ``FILE_SPECTRUM``.
    """
    signals = read_csv(FILE_SIGNALS)
    devices = signals["id"].unique()
    print("got %d signals from %d devices" % (len(signals), len(devices)))

    signals = signals.groupby(["frequency", "id"]).size()
    # Make sure every (frequency, device) pair has a row, 0 when nothing was received.
    full_grid = MultiIndex.from_product([SPECTRUM, devices], names=signals.index.names)
    signals = signals.reindex(full_grid, fill_value=0)
    signals = signals.unstack("id")

    # let's only keep frequencies with all signals present
    candidates = signals.dropna()

    # suggest frequency where the weakest sensor has the most
    # received signals, and then the frequency with most total
    # received signals for all sensors
    candidates = DataFrame({"total": candidates.sum(axis=1),
                            "weakest": candidates.min(axis=1)})
    # DataFrame.sort() no longer exists in pandas; sort_values() replaces it.
    ranked = candidates.sort_values(["weakest", "total"], ascending=False)
    appropriate_freq = ranked.index[0]
    print("suggesting frequency %s" % mhz(appropriate_freq))

    signals.to_csv("spectrum.csv")

    import matplotlib.pyplot as plt
    from matplotlib.ticker import EngFormatter

    # Plot kinds are lower-case names; "Area" is rejected by current pandas.
    p = signals.plot(kind="area")
    p.xaxis.set_major_formatter(EngFormatter(unit='Hz', places=2))
    plt.savefig(FILE_SPECTRUM, dpi=300)
    print("saved spectrum as %s" % FILE_SPECTRUM)
class LogAggregate:
    """Compute simple aggregates over a tabular log data set.

    Every ``get_*`` method takes keyword arguments:

    * ``key`` (required): the column (or list of columns) to aggregate.
    * ``group_by`` (optional): column(s) to group by before aggregating.

    Without ``group_by`` the aggregate of the whole column is returned;
    with it, one value per group.
    """

    def __init__(self, dataset):
        self.dataset = DataFrame(dataset)

    def _aggregate(self, how, kwarg):
        """Apply the reduction named *how* to the ``key`` column, per group if requested."""
        key = kwarg['key']
        # ``in`` replaces dict.has_key(), which does not exist in Python 3.
        if 'group_by' in kwarg:
            selected = self.dataset.groupby(kwarg['group_by'])[key]
        else:
            # Reducing only the requested column keeps unrelated
            # (e.g. text) columns from interfering with the reduction.
            selected = self.dataset[key]
        return getattr(selected, how)()

    def get_median(self, *arg, **kwarg):
        """Return the median of ``key``."""
        return self._aggregate('median', kwarg)

    def get_average(self, *arg, **kwarg):
        """Return the mean of ``key``."""
        return self._aggregate('mean', kwarg)

    def get_min(self, *arg, **kwarg):
        """Return the minimum of ``key``."""
        return self._aggregate('min', kwarg)

    def get_max(self, *arg, **kwarg):
        """Return the maximum of ``key``."""
        return self._aggregate('max', kwarg)

    def get_count(self, *arg, **kwarg):
        """Return the number of non-null values of ``key``."""
        return self._aggregate('count', kwarg)
def test_min_max_dt64_with_NaT(self):
    """min()/max() ignore NaT next to real timestamps and give NaT for all-NaT columns."""
    stamp = Timestamp("2012-05-01")

    # Both NaT and Timestamp are in DataFrame.
    mixed = DataFrame({"foo": [pd.NaT, pd.NaT, stamp]})
    for reduce_mixed in (mixed.min, mixed.max):
        expected = Series([Timestamp("2012-05-01")], index=["foo"])
        tm.assert_series_equal(reduce_mixed(), expected)

    # GH12941, only NaTs are in DataFrame.
    only_nat = DataFrame({"foo": [pd.NaT, pd.NaT]})
    for reduce_nat in (only_nat.min, only_nat.max):
        expected = Series([pd.NaT], index=["foo"])
        tm.assert_series_equal(reduce_nat(), expected)
def get_val(df: pd.DataFrame, method):
    """Resolve a NaN-replacement spec into the value(s) to fill with.

    Args:
        df: Data whose column statistics are used for the string methods.
        method: ``"mean"``, ``"max"`` or ``"min"`` (case-insensitive) to get
            the corresponding column-wise statistic of *df*, or an ``int`` /
            ``float`` that is returned unchanged.

    Returns:
        The per-column statistic of *df*, or the number that was passed in.

    Raises:
        ValueError: If *method* is neither a supported name nor a number.
    """
    if isinstance(method, str):
        name = method.lower()
        if name == "mean":
            return df.mean()
        if name == "max":
            return df.max()
        if name == "min":
            return df.min()
    elif isinstance(method, (int, float)):
        return method

    # Unsupported strings used to fall through and silently return None;
    # they now raise like every other unsupported value.
    raise ValueError(f"unknown method {method} to replace nan vlaues")
def _flow_and_probability_mapper(monthly_data: pd.DataFrame, to_probability: bool = False, to_flow: bool = False, extrapolate: bool = False) -> interpolate.interp1d: if not to_flow and not to_probability: raise ValueError('You need to specify either to_probability or to_flow as True') # get maximum value to bound histogram max_val = math.ceil(np.max(monthly_data.max())) min_val = math.floor(np.min(monthly_data.min())) if max_val == min_val: warnings.warn('The observational data has the same max and min value. You may get unanticipated results.') max_val += .1 # determine number of histograms bins needed number_of_points = len(monthly_data.values) number_of_classes = math.ceil(1 + (3.322 * math.log10(number_of_points))) # specify the bin width for histogram (in m3/s) step_width = (max_val - min_val) / number_of_classes # specify histogram bins bins = np.arange(-np.min(step_width), max_val + 2 * np.min(step_width), np.min(step_width)) if bins[0] == 0: bins = np.concatenate((-bins[1], bins)) elif bins[0] > 0: bins = np.concatenate((-bins[0], bins)) # make the histogram counts, bin_edges = np.histogram(monthly_data, bins=bins) # adjust the bins to be the center bin_edges = bin_edges[1:] # normalize the histograms counts = counts.astype(float) / monthly_data.size # calculate the cdfs cdf = np.cumsum(counts) # interpolated function to convert simulated streamflow to prob if to_probability: if extrapolate: return interpolate.interp1d(bin_edges, cdf, fill_value='extrapolate') return interpolate.interp1d(bin_edges, cdf) # interpolated function to convert simulated prob to observed streamflow elif to_flow: if extrapolate: return interpolate.interp1d(cdf, bin_edges, fill_value='extrapolate') return interpolate.interp1d(cdf, bin_edges)
def normalize(df: DataFrame, is_string: bool = False) -> DataFrame:
    """Min-max normalize the columns of a data frame.

    When the frame has a ``PATIENT_ID_COL_NAME`` column it is handed to
    ``get_del_col`` before scaling (presumably removing it from ``df`` --
    confirm against that helper) and put back as the first column afterwards.

    Args:
        df: The data to normalize.
        is_string: If True, convert the values to floats before scaling.

    Returns:
        The frame with every column scaled as ``(x - min) / (max - min)``.
    """
    has_ptid: bool = PATIENT_ID_COL_NAME in list(df)
    ptid_col = get_del_col(data_set=df, col_name=PATIENT_ID_COL_NAME) if has_ptid else None

    if is_string:
        df = DataFrame(data=df.to_numpy(dtype=float), columns=list(df))

    col_min = df.min(axis=0)
    col_max = df.max(axis=0)
    df = (df - col_min) / (col_max - col_min)

    if not has_ptid:
        return df

    # Reattach the patient ID column in front of the scaled data
    return concat([ptid_col, df], axis=1)
def get_dist_to_nearest_target(self, bagfile):
    """Compute, per sample, the distance from the fly to its nearest target.

    Targets come from ``self.detect_targets()`` (also stored on
    ``self._targets``) and positions from the given bag file.  The x and y
    offsets are divided by 5.2 and 4.8 before the Euclidean distance is
    taken -- presumably per-axis scale factors; confirm the units.

    The result is stored on ``self.dtarget`` and returned as a one-column
    (``dtarget``) DataFrame.
    """
    self._targets = self.detect_targets()
    positions = utilities.get_positions_from_bag(bagfile)

    distances = DataFrame()
    for number, (px, py, pr) in enumerate(self._targets):
        dx = (positions['fly_x'] - px) / 5.2
        dy = (positions['fly_y'] - py) / 4.8
        # the target radius ``pr`` is deliberately not subtracted
        distances['d' + str(number)] = (dx**2 + dy**2)**0.5  # - pr

    distances['Timestamp'] = positions.Timestamp
    distances = utilities.convert_timestamps(distances)

    nearest = distances.min(axis=1)
    self.dtarget = DataFrame(nearest, columns=['dtarget'])
    return self.dtarget
def init_scaler(source_df: pd.DataFrame, target_columns: List[str]) -> MinMaxScaler:
    """Fit a ``MinMaxScaler`` on the value range of every target column.

    For a column listed in the module-level ``data_ranges`` the defined range
    is used, widened to the observed data whenever the data falls outside of
    it (a warning is written to ``logger``).  For any other column the
    observed minimum and maximum of ``source_df`` are used.

    Args:
        source_df: Data providing the observed minima and maxima.
        target_columns: Columns, in order, that the scaler is fitted for.

    Returns:
        A scaler with ``feature_range=(0, 1)`` fitted on one row of minima
        and one row of maxima.

    Raises:
        ValueError: If a column is neither in ``data_ranges`` nor in
            ``source_df``.
    """
    # use min and max from source data if no definition is available
    source_min = source_df.min()
    source_max = source_df.max()

    target_min = []
    target_max = []

    for column in target_columns:
        if column in data_ranges:
            defined_min = data_ranges[column]["min"]
            defined_max = data_ranges[column]["max"]

            # check if real data is within the defined range, widen it otherwise
            if column in source_min and source_min[column] < defined_min:
                # "bad" case 1
                logger.write(
                    "Scaler init warning: Defined data range for column {}: [{}, {}], got minimum of {}"
                    .format(column, defined_min, defined_max, source_min[column]))
                target_min.append(source_min[column])
            else:
                target_min.append(defined_min)

            if column in source_max and source_max[column] > defined_max:
                # "bad" case 2
                logger.write(
                    "Scaler init warning: Defined data range for column {}: [{}, {}], got maximum of {}"
                    .format(column, defined_min, defined_max, source_max[column]))
                target_max.append(source_max[column])
            else:
                target_max.append(defined_max)
        elif column in source_min:
            target_min.append(source_min[column])
            # This used to copy the observed *minimum* into the maximum as
            # well, collapsing the column's range to a single point.
            target_max.append(source_max[column])
        else:
            raise ValueError(
                f"Unknown column {column}! No min and max values available!")

    # Plain lists avoid the chained ``frame.iloc[0][column] = value``
    # assignments, which pandas does not guarantee to write through.
    min_max_data = np.array([target_min, target_max], dtype=float)

    # copy = False, because the input already is a numpy array
    scaler = MinMaxScaler(feature_range=(0, 1), copy=False)
    scaler.fit(min_max_data)

    return scaler
def get_min_dist(df: pds.DataFrame, tol: float = 1e-10):
    """
    Return the label of the first column whose minimum value exceeds ``tol``.

    Columns are scanned in their stored order.

    NOTE(review): the previous docstring described the index as the radial
    distance (Angstrom) and the columns as time steps (ps), yet the value
    returned is a *column* label -- confirm the expected table orientation
    with the callers.

    Args:
        df (DataFrame): table to scan; column labels must be convertible
            to ``float``.
        tol (float): any float number less than tol is considered as zero.

    Returns:
        The matching column label as a ``float``, or ``None`` when no
        column has a minimum above ``tol``.
    """
    # TODO: Add unittest
    # Compute all column minima once, and pair them with the labels by
    # position so that integer or otherwise unusual labels cannot be
    # mistaken for positions.
    column_minima = df.min(axis="index")
    for col, min_dist in zip(df.columns, column_minima.to_numpy()):
        if min_dist > tol:
            return float(col)
    return None
def get_totals(df: pd.DataFrame):
    """
    Summarise every column of a DataFrame.

    The result has one row per column of ``df`` and one column per
    statistic: min, 15th percentile, first quartile, median, third quartile,
    85th percentile, max, count, mean and interquartile range.
    """
    first_quartile = df.quantile(0.25)
    third_quartile = df.quantile(0.75)

    summary = {
        'min': df.min(),
        'per15': df.quantile(0.15),
        'qr1': first_quartile,
        'median': df.median(),
        'qr3': third_quartile,
        'per85': df.quantile(0.85),
        'max': df.max(),
        'count': df.count(),
        'mean': df.mean(),
        'iqr': third_quartile - first_quartile,
    }
    return pd.DataFrame(summary)
def transform(self, X: pd.DataFrame) -> pd.DataFrame:
    """
    Apply the smoothed base-2 log transformation to every value.

    Each value ``y`` becomes ``log2((y + sqrt(y**2 + lamb**2)) / 2)``.
    ``lamb`` is ``self.lamb`` or, when that is None, one tenth of the
    smallest value in ``X``.

    Parameters
    ----------
    X : Pandas DataFrame of shape = [n_samples, n_features]
        The data to be transformed.

    Raises
    ------
    ValueError
        - If some variables contain zero or negative values.
        - If some variables contain infinite values.

    Input validation beyond that is delegated to ``_to_dataframe``.

    Returns
    -------
    X : pandas dataframe
        A new dataframe with the transformed values and the index and
        columns of the input.
    """
    X = _to_dataframe(X)

    has_non_positive = (X <= 0).any().any()
    if has_non_positive:
        raise ValueError("Some variables contain zero or negative values, can't apply log2")

    has_infinite = np.isinf(X).values.any()
    if has_infinite:
        raise ValueError("Some of the variables contain infinite values, can't apply log2")

    # Fall back to the data-driven default when no lambda was configured
    lamb = X.min().min() / 10.0 if self.lamb is None else self.lamb

    values = X.values
    root = (values**2 + lamb**2)**0.5
    transformed = np.log2((values + root) / 2)

    return pd.DataFrame(transformed, index=X.index, columns=X.columns)
def get_dist_to_nearest_target(self, bagfile):
    """Return the per-sample distance from the fly to its nearest target.

    Targets are detected with ``self.detect_targets()`` (and kept on
    ``self._targets``); positions are read from the bag file.  Before the
    Euclidean distance is taken the x offset is divided by 5.2 and the y
    offset by 4.8 -- presumably per-axis scale factors; confirm the units.

    The one-column (``dtarget``) result is also stored on ``self.dtarget``.
    """
    self._targets = self.detect_targets()
    positions = utilities.get_positions_from_bag(bagfile)

    distances = DataFrame()
    for target_number, (px, py, pr) in enumerate(self._targets):
        scaled_dx = (positions['fly_x'] - px) / 5.2
        scaled_dy = (positions['fly_y'] - py) / 4.8
        # the target radius ``pr`` is deliberately not subtracted
        distances['d' + str(target_number)] = (scaled_dx**2 + scaled_dy**2)**0.5  # - pr

    distances['Timestamp'] = positions.Timestamp
    distances = utilities.convert_timestamps(distances)

    self.dtarget = DataFrame(distances.min(axis=1), columns=['dtarget'])
    return self.dtarget
def generate_series_data(df: pd.DataFrame, column_length: int) -> pd.Series:
    """Flatten a frame into one Series and append per-column statistics.

    Every cell becomes an entry labelled ``<column>_<n>``, where ``n``
    counts the groups of ``column_length`` consecutive stacked values.  The
    per-column max, min, mean and variance follow, labelled
    ``<column>_max``, ``<column>_min``, ``<column>_mean`` and
    ``<column>_var``.  Column labels must be strings.
    """
    def with_suffix(stat: pd.Series, suffix: str) -> pd.Series:
        stat.index = stat.index + suffix
        return stat

    df_max = with_suffix(df.max(), '_max')
    df_min = with_suffix(df.min(), '_min')
    df_mean = with_suffix(df.mean(), '_mean')
    df_var = with_suffix(df.var(), '_var')

    flattened = df.stack()
    group_count = int(len(flattened.index) / column_length)
    # 0,0,...,1,1,... : each group number repeated column_length times
    group_numbers = np.repeat(np.arange(group_count), column_length).astype(str)
    flattened.index = flattened.index.get_level_values(1) + '_' + group_numbers

    return pd.concat([flattened, df_max, df_min, df_mean, df_var])
def dataframe_to_image(df: pd.DataFrame, image_filename: str) -> None:
    """Plot *df* as a line chart and save it to *image_filename*.

    The y axis spans 98% of the smallest to 102% of the largest value
    (truncated to integers) with a tick every 5 units; the title shows the
    first and last date of the index.
    """
    plt.figure(figsize=(12, 9))
    ax = plt.subplot(111)

    # Drop the top/right frame lines and keep ticks on the remaining axes
    for side in ("top", "right"):
        ax.spines[side].set_visible(False)
    ax.get_xaxis().tick_bottom()
    ax.get_yaxis().tick_left()

    y_min = int(min(df.min().values * 0.98))
    y_max = int(max(df.max().values * 1.02))
    plt.ylim(y_min, y_max)
    plt.yticks(range(y_min, y_max, 5), fontsize=14)

    # Only the date part (YYYY-MM-DD) of the ISO timestamp is shown
    start_date, end_date = (
        stamp.isoformat()[:10] for stamp in (df.index.min(), df.index.max())
    )
    plt.title(f"Weight {start_date} - {end_date}", fontsize=22)

    plt.plot(df, lw=2.5)
    plt.savefig(image_filename, bbox_inches="tight")
def cross_validate_trades(trades, N = 20, subset_fraction = 0.7):
    """Compare the summary report of all trades with reports on random ticker subsets.

    ``N`` times, a random sample of ``subset_fraction`` of the tickers is
    drawn without replacement and ``summary_report`` is computed for the
    trades of those tickers.

    Returns a tuple ``(result, summary)``: ``summary`` has one column per
    run; ``result`` holds the report on all trades (``Base``) next to the
    row-wise ``Mean``, ``Std``, ``Median``, ``Max`` and ``Min`` over the runs.
    """
    tickers = trades.tickers
    sample_size = round(len(tickers) * subset_fraction)

    summary = DataFrame(dtype = float)
    for run in range(N):
        chosen = list(random.choice(tickers, sample_size, replace = False))
        subset = trades.find(lambda T: T.ticker in chosen)
        summary[run] = summary_report(subset)

    result = DataFrame(dtype = float)
    result['Base'] = summary_report(trades)
    across_runs = (
        ('Mean', summary.mean),
        ('Std', summary.std),
        ('Median', summary.median),
        ('Max', summary.max),
        ('Min', summary.min),
    )
    for column, statistic in across_runs:
        result[column] = statistic(axis = 1)

    return (result, summary)
def resilience(sim: pd.DataFrame, rec_th=1., idxs=None) -> Tuple[pd.Series, pd.Series]:
    """
    Measure shock depth and time to recovery for every simulated column.

    Parameters
    ----------
    sim : pd.DataFrame
        return of simulate()
    rec_th : float
        Level a column has to reach to count as recovered.
    idxs : sequence, optional
        Column labels to restrict the result to.  ``None`` or an empty
        sequence returns all columns.

    Returns
    -------
    Tuple[pd.Series, pd.Series]
        shock intensity (column minimum minus 1) and time to recovery (index
        label of the first row after the initial one where the column is
        at or above ``rec_th``), both indexed by column
    """
    average_shock = sim.min() - 1
    # The first row is skipped so the initial state does not count as recovery.
    time_to_rec = (sim >= rec_th)[1:].idxmax()

    if idxs is not None and len(idxs) > 0:
        return average_shock[idxs], time_to_rec[idxs]
    return average_shock, time_to_rec
def animated_training(self, data: np.ndarray, df: pd.DataFrame, mus, sigmas):
    """Train while animating the fit, then plot the final prediction.

    Each animation frame runs one ``self.run_epoch(data)`` and redraws the
    current prediction over the data.  After the window is closed another
    ``self.epochs`` epochs are run, and the prediction is graphed once
    against ``data`` and once un-scaled against ``df``.

    Args:
        data: 2-D array; column 0 is plotted on x, column 1 on y
            (presumably the scaled km/price pairs -- confirm).
        df: Frame with ``km`` and ``price`` columns; its first column
            provides the x range of the un-scaled plot.
        mus: ``mus[1]`` is added back to the scaled prediction.
        sigmas: ``sigmas[1]`` multiplies the scaled prediction.
    """
    fig, ax = plt.subplots(figsize=(16, 9), dpi=70)

    def animate(epoch: int):
        self.run_epoch(data)
        ax.clear()
        plt.title(f'epoch = {epoch}')
        ax.set_xlabel('km')
        ax.set_ylabel('price')
        x_low = data.min(axis=0)[0] - 1
        x_high = data.max(axis=0)[0] + 1
        ax.set_xlim(x_low, x_high)
        ax.set_ylim(-4, 4)
        x = np.linspace(start=x_low, stop=x_high, num=100)
        y = self.estimator.predict(x)
        line = plt.plot(x, y, label='prediction')
        plt.scatter(data[:, 0], data[:, 1], label='raw data', marker='x')
        plt.legend()
        return line,

    # The animation object is held in a local while the window is shown.
    ani = animation.FuncAnimation(fig, animate, frames=self.epochs, interval=10, blit=False)
    plt.show()

    for epoch in range(self.epochs):
        self.run_epoch(data)

    scaled_x = np.linspace(start=data.min(axis=0)[0] - 1, stop=data.max(axis=0)[0] + 1, num=100)
    self.graph(scaled_x, self.estimator.predict(scaled_x), data, 'k', f'Scaled data ({self.epochs})')

    # .iloc[0] picks the first column by position; plain [0] on the
    # label-indexed result of df.min()/df.max() relied on a deprecated
    # positional fallback.
    x_lin = np.linspace(start=df.min(axis=0).iloc[0] - 1, stop=df.max(axis=0).iloc[0] + 1, num=100)
    y_lin = self.estimator.predict(scaled_x) * sigmas[1] + mus[1]
    self.graph(x_lin, y_lin, (np.matrix([df.km, df.price]).T).A, 'b', 'Resulting unscaled prediction')
    return
def _feature_extraction(data: pd.DataFrame) -> pd.Series: def nlargest_index(df, n): return df.nlargest(n).index.unique()[0:n] # first 225 statistical features statistical = data.min() statistical = statistical.append(data.max(), ignore_index=True) statistical = statistical.append(data.mean(), ignore_index=True) statistical = statistical.append(data.skew(), ignore_index=True) statistical = statistical.append(data.kurtosis(), ignore_index=True) # FFT features fft = pd.DataFrame(np.fft.fft(data)) fft_angle = fft.applymap(np.angle) fft = fft.applymap(np.abs) largest_values = pd.Series() largest_angles = pd.Series() largest_indices = pd.Series() for i in range(0, 45): five_largest_idx = nlargest_index(fft.ix[:, i].map(abs), 5) # is map(abs) redundant? largest_indices = largest_indices.append(pd.Series(five_largest_idx), ignore_index=True) five_largest = fft_angle.ix[five_largest_idx, i].T largest_angles = largest_angles.append(five_largest) five_largest = fft.ix[five_largest_idx, i].T largest_values = largest_values.append(five_largest) # Autocorrelation autocorrelation = pd.Series() autocorrelation = autocorrelation.append(data.apply(lambda col: col.autocorr(1), axis=0)) for i in range(5, 51, 5): autocorrelation = autocorrelation.append(data.apply(lambda col: col.autocorr(i), axis=0)) # Make result feature_vector = pd.Series() feature_vector = feature_vector.append(statistical) feature_vector = feature_vector.append(largest_values) feature_vector = feature_vector.append(largest_angles) feature_vector = feature_vector.append(largest_indices) feature_vector = feature_vector.append(autocorrelation) return feature_vector
def cross_validate_trades(trades, N=20, subset_fraction=0.7):
    """Check how stable the trade summary is across random ticker subsets.

    For each of ``N`` runs a sample of ``subset_fraction`` of the tickers is
    drawn without replacement and ``summary_report`` is evaluated on the
    trades of the sampled tickers.

    Returns a tuple ``(result, summary)``.  ``summary`` holds one column per
    run.  ``result`` holds the report on all trades (``Base``) and the
    row-wise ``Mean``, ``Std``, ``Median``, ``Max`` and ``Min`` of the runs.
    """
    tickers = trades.tickers
    sample_size = round(len(tickers) * subset_fraction)

    summary = DataFrame(dtype=float)
    for iteration in range(N):
        picked = list(random.choice(tickers, sample_size, replace=False))
        picked_trades = trades.find(lambda T: T.ticker in picked)
        summary[iteration] = summary_report(picked_trades)

    result = DataFrame(dtype=float)
    result['Base'] = summary_report(trades)
    for label, reducer in (('Mean', summary.mean),
                           ('Std', summary.std),
                           ('Median', summary.median),
                           ('Max', summary.max),
                           ('Min', summary.min)):
        result[label] = reducer(axis=1)

    return (result, summary)
def dataFrameMathTest():
    """Print the result of common DataFrame math methods on columns 1-6 of ``mlg.csv``."""
    # Note : The methods that return a series default to working on columns.
    df = DataFrame()

    # Load a DataFrame from a CSV file
    org_df = pd.read_csv('mlg.csv')
    df = org_df.iloc[:, 1:7]

    demonstrations = (
        lambda frame: frame.abs(),      # absolute values
        lambda frame: frame.count(),    # non NA/null values
        lambda frame: frame.cummax(),   # (cols default axis)
        lambda frame: frame.cummin(),   # (cols default axis)
        lambda frame: frame.cumsum(),   # (cols default axis)
        lambda frame: frame.diff(),     # 1st diff (col def axis)
        lambda frame: frame.div(12),    # div by df, Series, value
        lambda frame: frame.max(),      # max of axis (col def)
        lambda frame: frame.mean(),     # mean (col default axis)
        lambda frame: frame.median(),   # median (col default)
        lambda frame: frame.min(),      # min of axis (col def)
        lambda frame: frame.mul(2),     # mul by df Series val
        lambda frame: frame.sum(),      # sum axis (cols default)
        lambda frame: frame.where(frame > 0.5, other=np.nan),
    )
    # Each result is computed and printed before the next one is evaluated.
    for demonstrate in demonstrations:
        print(demonstrate(df))
def normalize(data_frame: pd.DataFrame, norm_type="mean", df_mean: pd.Series = None,
              df_std: pd.Series = None, df_min: pd.Series = None,
              df_max: pd.Series = None) -> pd.DataFrame:
    """Normalize the columns of a data frame.

    Args:
        data_frame: The data to normalize.
        norm_type: ``"min_max"`` scales as ``(x - min) / (max - min)``; any
            other value standardizes as ``(x - mean) / std``.
        df_mean: Column means to use; computed from the data when None.
        df_std: Column standard deviations to use; computed when None.
        df_min: Column minima to use; computed when None.
        df_max: Column maxima to use; computed when None.

    Returns:
        The normalized frame, with NaN results (e.g. from a zero range or a
        zero standard deviation) replaced by 0.
    """
    if norm_type == "min_max":
        if df_min is None:
            df_min = data_frame.min()
        if df_max is None:
            df_max = data_frame.max()
        result = (data_frame - df_min) / (df_max - df_min)
    else:
        if df_mean is None:
            df_mean = data_frame.mean()
        # This used to test df_mean a second time, so a missing df_std was
        # never filled in when a mean had been supplied.
        if df_std is None:
            df_std = data_frame.std()
        result = (data_frame - df_mean) / df_std

    return result.fillna(0)
def test_min_max_dt64_with_NaT_skipna_false(self, tz_naive_fixture):
    """Row-wise min/max with skipna=False yield NaT for rows containing NaT."""
    # GH#36907
    tz = tz_naive_fixture
    column_a = [
        Timestamp("2020-01-01 08:00:00", tz=tz),
        Timestamp("1920-02-01 09:00:00", tz=tz),
    ]
    column_b = [Timestamp("2020-02-01 08:00:00", tz=tz), pd.NaT]
    df = DataFrame({"a": column_a, "b": column_b})

    # Row 0 has no NaT: its minimum is in "a" and its maximum in "b".
    # Row 1 contains NaT, so both reductions give NaT.
    for reduction, first_row_column in ((df.min, "a"), (df.max, "b")):
        res = reduction(axis=1, skipna=False)
        expected = Series([df.loc[0, first_row_column], pd.NaT])
        assert expected.dtype == df["a"].dtype
        tm.assert_series_equal(res, expected)
def simulate_df_with_same_variation(df: pd.DataFrame, sampling_size: int) -> pd.DataFrame:
    """Draw uniform random observations within the per-column range of *df*.

    Args:
        df: Reference data; each column's minimum and maximum bound the
            values drawn for that column.
        sampling_size: Number of rows to generate.

    Returns:
        A frame with ``sampling_size`` rows and one column per column of
        *df*, labelled ``0 .. n-1``.  A frame without columns gives a result
        without columns.
    """
    # Work on plain arrays so columns are addressed by position, whatever
    # their labels are.
    lows = df.min().to_numpy()
    highs = df.max().to_numpy()

    if len(lows) == 0:
        return pd.DataFrame(index=range(sampling_size))

    # Values are drawn one column at a time, in column order; any number of
    # columns (including a single one) is supported.
    drawn_columns = [
        np.random.uniform(low, high, sampling_size)
        for low, high in zip(lows, highs)
    ]
    return pd.DataFrame(np.column_stack(drawn_columns))
def optimal_weights(exp_rets: pd.DataFrame, cov: pd.DataFrame, n_points: int) -> np.array:
    """Return the volatility-minimizing weights for a range of target returns.

    ``n_points`` equally spaced target returns are taken between the
    smallest and the largest expected return, and ``minimize_vol`` is run
    for each of them with the expected returns ``exp_rets`` and the
    covariance matrix ``cov``.

    Args:
        exp_rets (pd.DataFrame): expected returns.
        cov (pd.DataFrame): covariance matrix.
        n_points (int): number of equally spaced target returns between the
            lowest and the highest expected return.

    Returns:
        A list with one ``minimize_vol`` result per target return.
    """
    lowest_return = exp_rets.min()
    highest_return = exp_rets.max()

    weights = []
    for target_return in np.linspace(lowest_return, highest_return, n_points):
        weights.append(minimize_vol(target_return, exp_rets, cov))
    return weights
def get_Ys(do_pca=False):
    """Get the Ys as a DataFrame for fitting.

    With ``do_pca`` the result holds the PCA components (explaining 99% of
    the variance) of the data returned by ``pca_X``.  Without it the
    measurements are shifted and scaled column-wise to the range -1 to 1.
    """
    sv_db = access_db(0, True)
    measurements = get_msrmnts(sv_db, Q)

    if not do_pca:
        shifted = measurements - measurements.min()
        unit_range = shifted / shifted.max()
        return unit_range * 2 - 1

    X, df = pca_X()
    my_pca = PCA(n_components=0.99)
    my_pca.fit(X)
    components = my_pca.transform(X)

    component_names = [
        'PCA Comp_' + str(number + 1) for number in range(my_pca.n_components_)
    ]
    return DataFrame(components, index=list(df.index), columns=component_names)
def get_scatter_view_lims(counts_df: pd.DataFrame) -> Tuple[float, float]:
    """Calculates scatter view limits for the counts dataframe.

    The lower limit is the smallest non-zero row minimum, the upper limit
    the overall maximum.  Both are padded in log2 space by matplotlib's
    ``axes.xmargin`` setting.

    Returns:
        The (lower, upper) limits.  ``(1e-300, 1e-300)`` is returned, after
        printing a message, when the limits are not finite, the lower limit
        is not a float, or the maximum is not positive.
    """
    x0 = counts_df.min(axis='columns').where(lambda x: x != 0).dropna().min()
    x1 = counts_df.max().max()
    minpos = 1e-300

    # ``np.float`` was only an alias of the builtin ``float`` and no longer
    # exists in NumPy >= 1.24; np.float64 values are instances of ``float``.
    if not np.isfinite([x0, x1]).all() or not isinstance(x0, float) or x1 <= 0:
        print("The provided dataset contains invalid values.")
        return (minpos, minpos)

    # Clamp non-positive limits so that the log transform stays defined
    x0, x1 = (minpos if x0 <= 0 else x0, minpos if x1 <= 0 else x1)

    transform = LogTransform(base=2)
    inverse_trans = transform.inverted()

    x0t, x1t = transform.transform([x0, x1])
    delta = (x1t - x0t) * mpl.rcParams.get('axes.xmargin', 0)
    if not np.isfinite(delta):
        delta = 0

    return inverse_trans.transform([x0t - delta, x1t + delta])
def group_msgs_by_term(df_msgs: pd.DataFrame, term: str) -> dict:
    """Join the messages of each consecutive time window, newest window first.

    Windows are 31 days long when ``term`` is ``'lm'`` and 8 days long
    otherwise, counted backwards from the current time.

    Args:
        df_msgs: Messages with a ``timestamp`` column (epoch seconds) and a
            ``wakati_msg`` text column.
        term: Window selector, see above.

    Returns:
        A dict mapping ``'term_ago_000'``, ``'term_ago_001'``, ... to the
        space-joined non-null ``wakati_msg`` values of that window.
        Grouping stops at the first window without messages, so an empty
        frame gives an empty dict.
    """
    # set term
    term_days = 31 if term == 'lm' else 8
    print('group messages every {0} days'.format(term_days))

    if df_msgs.empty:
        return {}

    # analyze timestamp
    epoch = datetime.fromtimestamp(0, JST)
    now_in_sec = (datetime.now(JST) - epoch).total_seconds()
    interval_seconds = timedelta(days=term_days).total_seconds()
    # Reduce only the timestamp column: a whole-frame min() would also have
    # to reduce the text column, which fails on mixed str/NaN values.
    oldest_timestamp = df_msgs['timestamp'].min()
    oldest_ts_in_sec = (datetime.fromtimestamp(oldest_timestamp, JST) - epoch).total_seconds()
    loop_num = (abs(now_in_sec - oldest_ts_in_sec) / interval_seconds) + 1

    # extract by term
    dict_msgs_by_term = {}
    df_tmp = df_msgs
    now_tmp = now_in_sec
    for i in range(int(loop_num)):
        # make current term string
        cur_term_s = 'term_ago_{0}'.format(str(i).zfill(3))
        print(cur_term_s)

        # split into the current window and everything older
        age = now_tmp - df_tmp['timestamp']
        df_msgs_cur = df_tmp[age < interval_seconds]
        df_msgs_other = df_tmp[age >= interval_seconds]

        # messages does not exist. break.
        if df_msgs_cur.shape[0] == 0:
            break

        # add current messages to dict
        dict_msgs_by_term[cur_term_s] = ' '.join(
            df_msgs_cur.wakati_msg.dropna().values.tolist())

        # update temp value for next loop
        now_tmp = now_tmp - interval_seconds
        df_tmp = df_msgs_other

    return dict_msgs_by_term
def _r2_containment(data: pd.DataFrame, curve: pd.Series, relax: bool) -> float: """ Produces \lambda_r with the given input data, using the standard ordering on R as the definition for containment. Parameters: ---------- data: list DataFrame of real-valued functions that define our band in R^2 (columns are time intervals, rows are functions) curve: pd.Series Function to check containment on relax: bool If False, we use the strict definition of containment. If True, we consider the proportion of time the curve is in the band Returns: ---------- float: If relax=False, then 0 if the function is not contained in the curve, 1 if it is. If relax=True, then we consider the proportion of time the curve is in the band, so we will return a number between 0 and 1. """ containment = 0 y_range = [] # Grab the mins/maxs across all rows (functions at each time index) mins = data.min(axis=1) maxs = data.max(axis=1) # Generate intervals in R over each time index intervals = [[i, j] for i, j in zip(mins, maxs)] # Check if each value in the curve is contained within the band for index, val in enumerate(curve): if intervals[index][0] <= val <= intervals[index][1]: containment += 1 # If relax=True, then we return the proportion of points in the band, else, Python integer division will round down to 0 unless all points are contained in the band (strict containment) return containment / len(curve) if relax else containment // len(curve)
def calc_stats(dataset: pd.DataFrame, dataname: str):
    """Print summary statistics and outlier bounds of *dataset*.

    All numbers are rounded to 3 decimals.  The outlier bounds lie 1.5 times
    the interquartile range below the first and above the third quartile.
    """
    def rounded(value):
        return float(np.round(value, 3))

    mean = rounded(np.mean(dataset))
    median = rounded(np.median(dataset))
    min_value = rounded(dataset.min())
    max_value = rounded(dataset.max())
    quartile_1 = rounded(dataset.quantile(0.25))
    quartile_3 = rounded(dataset.quantile(0.75))

    iqr = np.round(quartile_3 - quartile_1, 3)
    whisker = iqr * 1.5
    lower_bound = np.round(quartile_1 - whisker, 3)
    upper_bound = np.round(quartile_3 + whisker, 3)

    report = (
        f'{dataname} summary statistics',
        f'Min : {min_value}',
        f'Mean : {mean}',
        f'Max : {max_value}',
        '',
        f'25th percentile : {quartile_1}',
        f'Median : {median}',
        f'75th percentile : {quartile_3}',
        f'Interquartile range (IQR): {iqr}',
        '',
        f'Lower outlier bound : {lower_bound}',
        f'Upper outlier bound : {upper_bound}',
        '--------------------------------',
    )
    for line in report:
        print(line)
def draw_rphi_map(W       : Dict[int, List[KrSector]],
                  aMap    : DataFrame,
                  alims   : Optional[Tuple[float, float]] = None,
                  title   : str                           = 'E',
                  cmap    : Colormap                      = matplotlib.cm.viridis,
                  alpha   : float                         = 1.0,   # level of transparency
                  rmax    : float                         = 200,   # the largest radius
                  scale   : float                         = 0.5,   # needed to fit the map
                  figsize : Tuple[float, float]           = (14,10)):
    """Draw the map values on a single axis with a colour bar and show it.

    Parameters
    ----------
    W       : sectors, passed on to ``add_map_values_to_axis_``.
    aMap    : the map values to draw.
    alims   : (min, max) of the colour scale; taken from the smallest and
              largest value of ``aMap`` when None.
    title   : plot title.
    cmap, alpha, rmax, scale : passed on to ``add_map_values_to_axis_``.
    figsize : figure size.
    """
    fig = plt.figure(figsize=figsize)  # give plots a rectangular frame
    ax = fig.add_subplot(1, 1, 1)

    # Identity test: ``== None`` would compare element-wise if an array
    # were passed as limits.
    if alims is None:
        e0M = aMap.max().max()
        e0m = aMap.min().min()
    else:
        e0m, e0M = alims[0], alims[1]

    p = add_map_values_to_axis_(W, aMap, ax, cmap, alpha, rmax, scale, clims=(e0m, e0M))
    fig.colorbar(p, ax=ax)

    plt.title(title)
    plt.tight_layout()
    plt.show()
def selected_set_index(df: pd.DataFrame, indices: List[int], minimize: bool) -> List[float]:
    """
    Column-wise minimum or maximum over a selection of rows.

    Parameters
    ----------
    df: pd.DataFrame
        A dataframe with each column representing a dataset, and each row
        representing a configuration. A Series is accepted as well.
    indices: List
        Positions of the rows to select
    minimize: bool
        True to take the column-wise minimum of the selected rows, False to
        take the column-wise maximum

    Returns
    -------
    The per-column result; for Series input a one-element list.

    Raises
    ------
    ValueError
        If the sum of the results is NaN.
    """
    was_series = isinstance(df, pd.Series)

    # keep only the rows that belong to the selected set
    selection = df.iloc[indices]

    # best score per dataset among the selected rows
    result = selection.min(axis=0) if minimize else selection.max(axis=0)

    if was_series:
        result = [result]

    if np.isnan(sum(result)):
        raise ValueError('None of the results of this function should be NaN')
    return result
def normalize(dataframe: DataFrame, column: str) -> DataFrame:
    """Min-max normalize one column of a data frame.

    Args:
        dataframe: The frame holding the column.
        column: Label of the column to normalize.

    Returns:
        The values of ``column`` scaled as ``(x - min) / (max - min)``.
    """
    # Only the requested column is reduced; computing min/max of the whole
    # frame was wasted work and fails when other columns are not numeric.
    values = dataframe[column]
    lowest = values.min()
    return (values - lowest) / (values.max() - lowest)
def __generate_trace(self, points: pd.DataFrame, legend: str, metadata: list = None, normalize: bool = False,
                     **kwargs):
    """Build the plotly trace for a table of points.

    Two columns produce a ``Scattergl`` trace, three columns a ``Scatter3d``
    trace and any other column count a ``Parcoords`` trace.

    :param points: One row per point, one column per dimension.
    :param legend: Name given to the trace.
    :param metadata: Forwarded to the scatter traces as ``customdata``.
    :param normalize: If True, min-max scale every column before plotting.
    :param kwargs: Overrides for the default marker settings.
    :return: The plotly trace object.
    """
    dimension = points.shape[1]

    # markers are drawn smaller in 3D plots
    marker_size = 4 if dimension == 3 else 8

    # if indicated, perform normalization
    if normalize:
        lowest = points.min()
        points = (points - lowest) / (points.max() - points.min())

    default_marker = {
        'color': '#236FA4',
        'size': marker_size,
        'symbol': 'circle',
        'line': {'color': '#236FA4', 'width': 1},
        'opacity': 0.8,
    }
    # Caller-supplied settings win over the defaults.
    marker = {**default_marker, **kwargs}

    if dimension == 2:
        return go.Scattergl(
            x=points[0],
            y=points[1],
            mode='markers',
            marker=marker,
            name=legend,
            customdata=metadata
        )

    if dimension == 3:
        return go.Scatter3d(
            x=points[0],
            y=points[1],
            z=points[2],
            mode='markers',
            marker=marker,
            name=legend,
            customdata=metadata
        )

    # Parallel coordinates: one axis per column, labelled from
    # self.axis_labels when an entry exists at that position.
    dimensions = []
    for column in points:
        label_slice = self.axis_labels[column:column + 1]
        dimensions.append({
            'range': [0, 1],
            'label': label_slice[0] if label_slice else None,
            'values': points[column],
        })

    return go.Parcoords(
        line=dict(color='#236FA4'),
        dimensions=dimensions,
        name=legend,
    )
class Experiment():
    """Training/test data and error measures for comparing estimators.

    On construction the reference estimator is fitted and sampled for a
    training set and an importance-sampled test set, a uniform test set is
    drawn within the range of the training data, the reference predictions
    for both test sets are stored, and a KD-tree over the training set is
    built and queried for the test and training points.
    """

    def __init__(self, n_training, n_test, dimensions, actualsEstimator, name):
        self.__name__ = name
        self.n_training = n_training
        self.n_test = n_test
        self.dimensions = dimensions
        self.actualsEstimator = actualsEstimator.fit()

        # Samples drawn from the reference estimator
        self.train = DataFrame(self.actualsEstimator.sample(self.n_training))
        self.importance_test = DataFrame(
            self.actualsEstimator.sample(self.n_test))

        # Uniform samples within the per-dimension range of the training data
        self.lows = self.train.min(axis=0)
        self.highs = self.train.max(axis=0)
        uniform_shape = (self.n_test, self.dimensions)
        self.uniform_test = DataFrame(
            uniform(low=self.lows, high=self.highs, size=uniform_shape))

        self.importance_actuals = self.actualsEstimator.predict(
            self.importance_test)
        self.uniform_actuals = self.actualsEstimator.predict(self.uniform_test)

        # The importance-sampled set is the one used for evaluation
        self.test = self.importance_test
        self.test_actuals = self.importance_actuals

        # Build up a KDTree for faster processing
        self.kdt_ = KDTree(self.train, leaf_size=30, metric='euclidean')
        test_neighbours = int(1 + self.n_test**0.5)
        self.dist_, self.nn_ = self.kdt_.query(
            self.test, k=test_neighbours, return_distance=True)
        train_neighbours = int(1 + self.n_training**0.5)
        self.dist_loo_, self.nn_loo_ = self.kdt_.query(
            self.train, k=train_neighbours, return_distance=True)

    def ISE(self, estimates, actuals):
        r"""
        Squared-error measure: the square root of the mean squared difference.

        .. math:: \sqrt{\frac{1}{N}\sum_{i}(e_i-a_i)^2}

        The samples are not explicitly weighted by their density.
        """
        squared_error = (estimates - actuals)**2.
        return mean(squared_error)**0.5

    def IAE(self, estimates, actuals):
        r"""
        Absolute-error measure: the mean absolute difference.

        .. math:: \frac{1}{N}\sum_{i}|e_i-a_i|

        The samples are not explicitly weighted by their density.
        """
        absolute_error = abs(estimates - actuals)
        return mean(absolute_error)

    def EmpericalEntropy(self, estimates):
        """Entropy (base 2) of the estimates."""
        return entropy(estimates, base=2)

    def JensenShannon(self, estimates, actuals):
        """Jensen-Shannon divergence (base 2) between estimates and actuals."""
        mixture = 0.5 * (estimates + actuals)
        to_estimates = entropy(estimates, mixture, base=2)
        to_actuals = entropy(actuals, mixture, base=2)
        return 0.5 * (to_estimates + to_actuals)

    def KullbackLeiber(self, estimates, actuals):
        """Kullback-Leibler divergence (base 2) of the actuals from the estimates."""
        return entropy(actuals, estimates, base=2)

    def getResults(self, estimator, prekdt=False):
        """Score an estimator on the test set.

        With ``prekdt`` the precomputed KD-tree and neighbour queries are
        attached to the estimator before predicting.  Those attributes are
        always reset to None afterwards.

        Returns the tuple ``(ISE, IAE, JensenShannon, KullbackLeiber,
        EmpericalEntropy)``.
        """
        cached_attributes = ('nn_', 'dist_', 'nn_loo_', 'dist_loo_', 'kdt_')

        # Attach some pre calculated results to the estimator
        if prekdt:
            for attribute in cached_attributes:
                setattr(estimator, attribute, getattr(self, attribute))

        est = estimator.predict(self.test)
        actuals = self.test_actuals

        for attribute in cached_attributes:
            setattr(estimator, attribute, None)

        return (
            self.ISE(est, actuals),
            self.IAE(est, actuals),
            self.JensenShannon(est, actuals),
            self.KullbackLeiber(est, actuals),
            self.EmpericalEntropy(est),
        )
def pca(x, y=None, ylev=None, nlab=0, lsize=10, lalpha=1,
        center="both", scale="none", legend=True, cname="variable",
        color=None):
    """Draw a two-component PCA biplot of the rows and columns of ``x``.

    Columns of ``x`` whose standard deviation is not positive are dropped,
    the remainder is passed to ``svdForPca`` and the first two components
    of the row and column results are plotted with matplotlib. Each of the
    two point clouds is divided by its own per-component range so that both
    fit on the same axes.

    :param x: pandas DataFrame, observations in rows, variables in columns
    :param y: optional grouping of the rows (a pandas Series; compared with
        ``==`` against each level and, when ``ylev`` is None, asked for
        ``unique()``)
    :param ylev: levels of ``y`` to draw, in order; defaults to ``y.unique()``
    :param nlab: number of variables (largest squared loadings first) to
        annotate with their column name; values larger than the number of
        variables are clamped
    :param lsize: font size of the variable labels
    :param lalpha: alpha of the variable labels
    :param center: forwarded to ``svdForPca``
    :param scale: forwarded to ``svdForPca``
    :param legend: when true, shrink the axes and draw a legend to the right
    :param cname: legend entry used for the variable points
    :param color: optional mapping level -> matplotlib colour; anything that
        is not a dict is ignored
    :returns: None; the figure is displayed with ``plt.show()``
    """
    if not isinstance(color, dict):
        color = None

    # Constant columns carry no variance and are excluded from the SVD.
    xForSvd = x.loc[:, x.std(axis=0) > 0]
    xsvd = svdForPca(xForSvd, center, scale)

    # Row coordinates: first two columns of the first SVD result.
    svdRowPlot = DataFrame(
        xsvd[0][:, 0:2],
        index=xForSvd.index,
        columns=["PC1", "PC2"],
    )
    svdRowPlot = svdRowPlot.divide(
        svdRowPlot.max(axis=0) - svdRowPlot.min(axis=0), axis=1)

    # Column coordinates: first two rows of the third SVD result, transposed.
    svdColPlot = DataFrame(
        numpy.transpose(xsvd[2][0:2, :]),
        index=xForSvd.columns,
        columns=["PC1", "PC2"],
    )
    svdColPlot = svdColPlot.divide(
        svdColPlot.max(axis=0) - svdColPlot.min(axis=0), axis=1)

    n_labelled = 0
    if nlab > 0:
        # Order the variables by squared distance from the origin so the
        # first ``nlab`` rows are the ones that get a text label.
        magnitude = (svdColPlot**2).sum(axis=1).sort_values(ascending=False)
        svdColPlot = svdColPlot.loc[magnitude.index]
        n_labelled = min(nlab, len(svdColPlot))
        labels = [""] * len(svdColPlot)
        labels[:n_labelled] = list(svdColPlot.index[:n_labelled])
        svdColPlot["label"] = labels

    if legend:
        ax = plt.subplot(111)

    plt.plot(svdColPlot["PC1"], svdColPlot["PC2"],
             "o", color=(0, 0, 0, 0.1), markersize=5, label=cname)

    for i in range(n_labelled):
        variable = svdColPlot.iloc[i]
        plt.text(variable["PC1"], variable["PC2"], variable["label"],
                 fontsize=lsize, color=(0, 0, 0, lalpha), label=None)

    if y is not None:
        if ylev is None:
            ylev = y.unique()
        for level in ylev:
            in_level = (y == level)
            point_options = {"markersize": 8, "label": level}
            if color is not None and level in color:
                point_options["color"] = color[level]
            plt.plot(svdRowPlot.loc[in_level, "PC1"],
                     svdRowPlot.loc[in_level, "PC2"],
                     "o", **point_options)
    else:
        plt.plot(svdRowPlot["PC1"], svdRowPlot["PC2"], "o", markersize=8)

    if legend:
        # Make room on the right-hand side for the legend box.
        box = ax.get_position()
        ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
        ax.legend(loc="center left", bbox_to_anchor=(1, 0.5), numpoints=1)

    plt.show()
# <codecell> check_functions = [(MVhypergeo_test, "MV_hypergeo"), (fishers_test, "Fishers")] results = DataFrame(index=range(0, (refseq != "-").sum())) for (g1seqs, g2seqs, nref, gname), (func, funcname) in product(grouping_seq, check_functions): print(gname, funcname) res = func(g1seqs, g2seqs, nref) aggres = resolve_indices(res, nref) colname = gname + "_" + funcname results[colname] = aggres # <codecell> results.min() # <codecell> from collections import defaultdict naggres = defaultdict(set) for col in results.columns: naggres[col] = set(results[col][results[col] < 0.05].index) print(naggres) # <codecell> for c1, c2 in combinations(naggres.keys(), 2): common = naggres[c1] & naggres[c2] if common:
__author__ = 'Executor'

import numpy as np
import pandas as pa
from pandas import Series, DataFrame

# Small 2x3 frame with one missing value per row, used to show how the
# summary methods below treat NaN.
arr = np.array([[1, 2, np.nan], [np.nan, 3, 4]])
dframe1 = DataFrame(arr, index=['A', 'B'], columns=['One', 'Two', 'Three'])

# Printed in order: column sums, row sums, column minima, the frame itself,
# index label of each column minimum, the frame again, cumulative sums and
# the describe() summary.
for view in (
        dframe1.sum(),
        dframe1.sum(axis=1),
        dframe1.min(),
        dframe1,
        dframe1.idxmin(),
        dframe1,
        dframe1.cumsum(),
        dframe1.describe(),
):
    print(view)

from IPython.display import YouTubeVideo

# The objects are created but nothing is done with the results here.
YouTubeVideo('xGbpuFNR1ME')
YouTubeVideo('4EXNedimDMs')
''' stupid thing doesn't work!'''
def plotter(df,
            title=False,
            kind='line',
            x_label=None,
            y_label=None,
            style='ggplot',
            figsize=(8, 4),
            save=False,
            legend_pos='best',
            reverse_legend='guess',
            num_to_plot=7,
            tex='try',
            colours='default',
            cumulative=False,
            pie_legend=True,
            partial_pie=False,
            show_totals=False,
            transparent=False,
            output_format='png',
            interactive=False,
            black_and_white=False,
            show_p_val=False,
            indices=False,
            transpose=False,
            rot=False,
            **kwargs):
    """Visualise corpus interrogations.

    :param title: A title for the plot
    :type title: str
    :param df: Data to be plotted
    :type df: Pandas DataFrame
    :param x_label: A label for the x axis
    :type x_label: str
    :param y_label: A label for the y axis
    :type y_label: str
    :param kind: The kind of chart to make (case-insensitive)
    :type kind: str ('line'/'bar'/'barh'/'pie'/'area'/'heatmap')
    :param style: Visual theme of plot
    :type style: str ('ggplot'/'bmh'/'fivethirtyeight'/'seaborn-talk'/etc)
    :param figsize: Size of plot
    :type figsize: tuple (int, int)
    :param save: If bool, save with *title* as name; if str, use str as name
    :type save: bool/str
    :param legend_pos: Where to place legend
    :type legend_pos: str ('upper right'/'outside right'/etc) or int 0-10
    :param reverse_legend: Reverse the order of the legend
    :type reverse_legend: bool
    :param num_to_plot: How many columns to plot
    :type num_to_plot: int/'all'
    :param tex: Use TeX to draw plot text
    :type tex: bool
    :param colours: Colourmap for lines/bars/slices
    :type colours: str
    :param cumulative: Plot values cumulatively
    :type cumulative: bool
    :param pie_legend: Show a legend for pie chart
    :type pie_legend: bool
    :param partial_pie: Allow plotting of pie slices only
    :type partial_pie: bool
    :param show_totals: Print sums in plot where possible
    :type show_totals: str -- 'legend'/'plot'/'both'
    :param transparent: Transparent .png background
    :type transparent: bool
    :param output_format: File format for saved image
    :type output_format: str -- 'png'/'pdf'
    :param black_and_white: Create black and white line styles
    :type black_and_white: bool
    :param show_p_val: Attempt to print p values in legend if contained in df
    :type show_p_val: bool
    :param indices: To use when plotting "distance from root"
    :type indices: bool
    :param stacked: When making bar chart, stack bars on top of one another
    :type stacked: str
    :param filled: For area and bar charts, make every column sum to 100
    :type filled: str
    :param legend: Show a legend
    :type legend: bool
    :param rot: Rotate x axis ticks by *rot* degrees
    :type rot: int
    :param subplots: Plot each column separately
    :type subplots: bool
    :param layout: Grid shape to use when *subplots* is True
    :type layout: tuple -- (int, int)
    :param interactive: Experimental interactive options
    :type interactive: list -- [1, 2, 3]

    Further keyword arguments read here: ``grid``, ``draggable``,
    ``legend_frame``, ``legend_alpha``, ``savepath``, ``explode``, ``ncol``,
    ``shadow`` and, for heatmaps, ``annot``/``fmt``/``cbar``/``vmin``/
    ``vmax``/``linewidths``/``linecolor``/``robust``/``center``/
    ``cbar_kws``/``cbar_ax``/``square``/``mask``. Whatever remains in
    ``kwargs`` is passed to ``DataFrame.plot``.

    :returns: the ``matplotlib.pyplot`` module; the result of
              ``mpld3.display()`` when *interactive*; ``None`` when both
              *interactive* and *subplots* are requested
    :raises ValueError: for an unknown *output_format*, when a file cannot
                        be named, or when the saved file does not exist
    :raises KeyError: for an unknown *legend_pos* string
    """
    import corpkit
    import os
    # imported unconditionally: it is used further down even when the
    # optional IPython import below fails
    import warnings

    try:
        from IPython.utils.shimmodule import ShimWarning
        warnings.simplefilter('ignore', ShimWarning)
    except Exception:
        pass

    kwargs['rot'] = rot

    import matplotlib as mpl
    from matplotlib import rc

    # prefer seaborn plotting
    try:
        import seaborn as sns
    except ImportError:
        pass

    import matplotlib.pyplot as plt

    import pandas
    from pandas import DataFrame, Series

    from time import localtime, strftime
    from process import checkstack

    if interactive:
        import mpld3
        import collections
        from mpld3 import plugins, utils
        from plugins import InteractiveLegendPlugin, HighlightLines

    have_mpldc = False
    try:
        from mpldatacursor import datacursor, HighlightingDataCursor
        have_mpldc = True
    except ImportError:
        pass

    # check what environment we're in
    tk = checkstack('tkinter')
    running_python_tex = checkstack('pythontex')
    running_spider = checkstack('spyder')

    if not title:
        title = ''

    # the chart kind is compared many times below; normalise it once so
    # that 'Pie' and 'pie' behave the same way
    kind = kind.lower()

    def truncate_colormap(cmap, minval=0.0, maxval=1.0, n=100):
        """remove extreme values from colourmap --- no pure white"""
        import matplotlib.colors as colors
        import numpy as np
        new_cmap = colors.LinearSegmentedColormap.from_list(
            'trunc({n},{a:.2f},{b:.2f})'.format(n=cmap.name, a=minval, b=maxval),
            cmap(np.linspace(minval, maxval, n)))
        return new_cmap

    def get_savename(imagefolder, save=False, title=False, ext='png'):
        """Come up with the savename for the image."""
        import os
        from corpkit.process import urlify

        if not ext.startswith('.'):
            ext = '.' + ext
        if isinstance(save, STRINGTYPE):
            savename = os.path.join(imagefolder, (urlify(save) + ext))
        elif title:
            savename = os.path.join(imagefolder, urlify(title) + ext)
        else:
            # previously this fell through to an unbound local
            raise ValueError('Cannot name the image: pass a title, '
                             'or a filename as save.')

        # remove duplicated ext
        if savename.endswith('%s%s' % (ext, ext)):
            savename = savename.replace('%s%s' % (ext, ext), ext, 1)
        return savename

    def rename_data_with_total(dataframe, was_series=False,
                               using_tex=False, absolutes=True):
        """adds totals (abs, rel, keyness) to entry name strings"""
        if was_series:
            where_the_words_are = dataframe.index
        else:
            where_the_words_are = dataframe.columns
        the_labs = []
        for w in list(where_the_words_are):
            if not absolutes:
                if not was_series:
                    # percentages of several rows cannot be summed
                    the_labs.append(w)
                    continue
                perc = dataframe.T[w][0]
                if using_tex:
                    the_labs.append('%s (%.2f\\%%)' % (w, perc))
                else:
                    the_labs.append('%s (%.2f %%)' % (w, perc))
            else:
                if was_series:
                    score = dataframe.T[w].sum()
                else:
                    score = dataframe[w].sum()
                the_labs.append('%s (n=%d)' % (w, score))
        if not was_series:
            dataframe.columns = the_labs
        else:
            vals = list(dataframe[list(dataframe.columns)[0]].values)
            dataframe = pandas.DataFrame(vals, index=the_labs)
            dataframe.columns = ['Total']
        return dataframe

    def auto_explode(dataframe, tinput, was_series=False, num_to_plot=7):
        """give me a list of strings and i'll output explode option"""
        output = [0 for s in range(num_to_plot)]
        if was_series:
            l = list(dataframe.index)
        else:
            l = list(dataframe.columns)

        if isinstance(tinput, (STRINGTYPE, int)):
            tinput = [tinput]
        if isinstance(tinput, list):
            for i in tinput:
                if isinstance(i, STRINGTYPE):
                    index = l.index(i)
                else:
                    index = i
                output[index] = 0.1
        return output

    def filler(df):
        """rescale every row of df so that it sums to 100"""
        pby = df.T.copy()
        for i in list(pby.columns):
            tot = pby[i].sum()
            pby[i] = pby[i] * 100.0 / tot
        return pby.T

    def rotate_degrees(rotation, labels):
        """pick an x tick rotation: None means guess from label length"""
        if rotation is None:
            # compare the length of the longest label, not the label itself
            if labels and len(max(labels, key=len)) > 6:
                return 45
            else:
                return 0
        elif rotation is False:
            return 0
        elif rotation is True:
            return 45
        else:
            return rotation

    def flatten_axes(plotted):
        """list the axes of a subplot grid (nested when a layout was given)"""
        if 'layout' not in kwargs:
            return list(plotted)
        return [bit for col in plotted for bit in col]

    # get a few options from kwargs
    sbplt = kwargs.get('subplots', False)
    show_grid = kwargs.pop('grid', True)
    the_rotation = kwargs.get('rot', False)
    dragmode = kwargs.pop('draggable', False)
    leg_frame = kwargs.pop('legend_frame', True)
    leg_alpha = kwargs.pop('legend_alpha', 0.8)
    # auto set num to plot based on layout
    lo = kwargs.get('layout', None)
    if lo:
        num_to_plot = lo[0] * lo[1]

    if style == 'mpl-white':
        try:
            sns.set_style("whitegrid")
        except Exception:
            pass
        style = 'matplotlib'

    # always removed from kwargs so it never reaches DataFrame.plot
    savepath = kwargs.pop('savepath', None)
    if savepath:
        mpl.rcParams['savefig.directory'] = savepath

    mpl.rcParams['savefig.bbox'] = 'tight'
    mpl.rcParams.update({'figure.autolayout': True})

    # try to use tex
    # TO DO:
    # make some font kwargs here
    using_tex = False
    mpl.rcParams['font.family'] = 'sans-serif'
    # this key does not exist in every matplotlib release; assigning an
    # unknown key raises, so only set it when present
    if 'text.latex.unicode' in mpl.rcParams:
        mpl.rcParams['text.latex.unicode'] = True

    if tex == 'try' or tex is True:
        try:
            rc('text', usetex=True)
            rc('font', **{'family': 'serif', 'serif': ['Computer Modern']})
            using_tex = True
        except Exception:
            rc('font', family='sans-serif')
            rc('font', serif='Helvetica Neue')
            rc('text', usetex=False)
    else:
        rc('text', usetex=False)

    if interactive:
        using_tex = False

    if show_totals is False:
        show_totals = 'none'

    # find out what kind of plot we're making, and enable
    # or disable interactive values if need be
    kwargs['kind'] = kind

    # [None] means "no interactive feature"; membership tests below rely on
    # this always being a list
    interactive_types = [None]
    if interactive:
        if kind.startswith('bar'):
            interactive_types = [3]
        elif kind in ('area', 'line'):
            interactive_types = [2, 3]
        elif kind == 'pie':
            warnings.warn('Interactive plotting not yet available for pie plots.')

    # find out if pie mode, add autopct format
    piemode = kind == 'pie'
    if piemode:
        if show_totals.endswith('plot') or show_totals.endswith('both'):
            kwargs['pctdistance'] = 0.6
            if using_tex:
                kwargs['autopct'] = r'%1.1f\%%'
            else:
                kwargs['autopct'] = '%1.1f%%'

    # copy data, make series into df
    dataframe = df.copy()
    was_series = False
    if isinstance(dataframe, Series):
        was_series = True
        if not cumulative:
            dataframe = DataFrame(dataframe)
        else:
            dataframe = DataFrame(dataframe.cumsum())
    else:
        # don't know if this is much good.
        if transpose:
            dataframe = dataframe.T
        if cumulative:
            dataframe = DataFrame(dataframe.cumsum())
        if len(list(dataframe.columns)) == 1:
            was_series = True

    # remove totals and tkinter order
    if not was_series:
        for name, axis_no in zip(['Total'] * 2 + ['tkintertable-order'] * 2,
                                 [0, 1, 0, 1]):
            try:
                dataframe = dataframe.drop(name, axis=axis_no, errors='ignore')
            except Exception:
                pass

    try:
        dataframe = dataframe.drop('tkintertable-order', errors='ignore')
    except Exception:
        pass
    try:
        dataframe = dataframe.drop('tkintertable-order', axis=1, errors='ignore')
    except Exception:
        pass

    # look at columns to see if all can be ints, in which case, set up figure
    # for depnumming
    if not was_series:
        if indices == 'guess':
            def isint(x):
                try:
                    a = float(x)
                    b = int(a)
                except (ValueError, OverflowError, TypeError):
                    return False
                else:
                    return a == b

            indices = all(isint(x) is True for x in list(dataframe.columns))

        # if depnumming, plot all, transpose, and rename axes
        if indices is True:
            num_to_plot = 'all'
            dataframe = dataframe.T
            if y_label is None:
                y_label = 'Percentage of all matches'
            if x_label is None:
                x_label = ''

    # set backend?
    output_formats = ['svgz', 'ps', 'emf', 'rgba', 'raw', 'pdf', 'svg',
                      'eps', 'png', 'pgf']
    if output_format not in output_formats:
        raise ValueError('%s output format not recognised. Must be: %s' % (output_format, ', '.join(output_formats)))

    # don't know if these are necessary
    if 'pdf' in output_format:
        plt.switch_backend(output_format)
    if 'pgf' in output_format:
        plt.switch_backend(output_format)

    if num_to_plot == 'all':
        if was_series or piemode:
            num_to_plot = len(dataframe.index)
        else:
            num_to_plot = len(list(dataframe.columns))

    # explode pie, or remove if not piemode
    if piemode and not sbplt and kwargs.get('explode'):
        kwargs['explode'] = auto_explode(dataframe,
                                         kwargs['explode'],
                                         was_series=was_series,
                                         num_to_plot=num_to_plot)
    else:
        kwargs.pop('explode', None)

    legend = kwargs.get('legend', True)

    # cut data short
    plotting_a_totals_column = False
    if was_series:
        if list(dataframe.columns)[0] != 'Total':
            # an index made of integer-like labels is plotted in full;
            # anything else is cut to num_to_plot rows
            try:
                for x in list(dataframe.index):
                    int(x)
            except Exception:
                dataframe = dataframe[:num_to_plot]
            else:
                num_to_plot = len(dataframe)
        else:
            plotting_a_totals_column = True
            if 'legend' not in kwargs:
                legend = False
            num_to_plot = len(dataframe)
    else:
        if transpose:
            dataframe = dataframe.head(num_to_plot)
        else:
            dataframe = dataframe.T.head(num_to_plot).T

    # remove stats fields, put p in entry text, etc.
    statfields = ['slope', 'intercept', 'r', 'p', 'stderr']
    try:
        dataframe = dataframe.drop(statfields, axis=1, errors='ignore')
    except Exception:
        pass

    there_are_p_vals = 'p' in dataframe.index

    def p_string_formatter(val):
        if val < 0.001:
            if not using_tex:
                return 'p < 0.001'
            else:
                return r'p $<$ 0.001'
        else:
            return 'p = %s' % format(val, '.3f')

    if show_p_val:
        if there_are_p_vals:
            newnames = []
            for col in list(dataframe.columns):
                pstr = p_string_formatter(dataframe[col]['p'])
                newnames.append('%s (%s)' % (col, pstr))
            dataframe.columns = newnames
            dataframe.drop(statfields, axis=0, inplace=True, errors='ignore')
        else:
            warnings.warn('No p-values calculated to show.\n\nUse keep_stats kwarg while editing to generate these values.')
    else:
        if there_are_p_vals:
            dataframe.drop(statfields, axis=0, inplace=True, errors='ignore')

    # decide whether the values are absolute counts: every value in the
    # first row must report is_integer()
    absolutes = True
    if type(dataframe) == DataFrame:
        try:
            if not all([s.is_integer() for s in dataframe.iloc[0, :].values]):
                absolutes = False
        except Exception:
            pass
    else:
        if not all([s.is_integer() for s in dataframe.values]):
            absolutes = False

    ##########################################
    ################ COLOURS #################
    ##########################################

    # set defaults, with nothing for heatmap yet
    if colours is True or colours == 'default':
        if kind != 'heatmap':
            colours = 'viridis'
        else:
            colours = 'default'

    # assume it's a single color, unless string denoting map
    cmap_or_c = 'color'
    if colours is not False and isinstance(colours, str):
        cmap_or_c = 'colormap'
    from matplotlib.colors import LinearSegmentedColormap
    if type(colours) == LinearSegmentedColormap:
        cmap_or_c = 'colormap'

    # for heatmaps, it's always a colormap
    if kind == 'heatmap':
        cmap_or_c = 'cmap'
        # if it's a defaulty string, set accordingly
        if isinstance(colours, str):
            if colours.lower().startswith('diverg'):
                colours = sns.diverging_palette(10, 133, as_cmap=True)

            # if default not set, do diverge for any df with a number < 0
            elif colours.lower() == 'default':
                mn = dataframe.min()
                if type(mn) == Series:
                    mn = mn.min()
                if mn < 0:
                    colours = sns.diverging_palette(10, 133, as_cmap=True)
                else:
                    colours = sns.light_palette("green", as_cmap=True)

    if 'seaborn' not in style:
        kwargs[cmap_or_c] = colours

    # reversing legend option: explicit True wins, 'guess' reverses for the
    # stacked-looking kinds, everything else leaves the order alone
    rev_leg = reverse_legend is True or (
        reverse_legend == 'guess' and kind in ['barh', 'area'])

    # show legend or don't
    if kind in ['bar', 'barh', 'area', 'line', 'pie']:
        if was_series:
            legend = False
        if kind == 'pie':
            legend = bool(pie_legend)

    # the default legend placement
    if legend_pos is True:
        legend_pos = 'best'

    # cut dataframe if just_totals
    if 'Combined total' in dataframe.columns:
        dataframe = dataframe.head(num_to_plot)

    # no title for subplots because ugly,
    if title and not sbplt:
        kwargs['title'] = title

    # no interactive subplots yet:
    if sbplt and interactive:
        interactive = False
        warnings.warn('No interactive subplots yet, sorry.')
        return

    if legend:
        if num_to_plot > 6:
            if not kwargs.get('ncol'):
                # integer division: the legend needs a whole column count
                kwargs['ncol'] = num_to_plot // 7
        # kwarg options go in leg_options
        leg_options = {'framealpha': leg_alpha,
                       'shadow': kwargs.get('shadow', False),
                       'ncol': kwargs.pop('ncol', 1)}

        # determine legend position based on this dict
        if legend_pos:
            possible = {'best': 0,
                        'upper right': 1,
                        'upper left': 2,
                        'lower left': 3,
                        'lower right': 4,
                        'right': 5,
                        'center left': 6,
                        'center right': 7,
                        'lower center': 8,
                        'upper center': 9,
                        'center': 10,
                        'o r': 2,
                        'outside right': 2,
                        'outside upper right': 2,
                        'outside center right': 'center left',
                        'outside lower right': 'lower left'}

            if isinstance(legend_pos, int):
                the_loc = legend_pos
            elif isinstance(legend_pos, str):
                try:
                    the_loc = possible[legend_pos]
                except KeyError:
                    raise KeyError('legend_pos value must be one of:\n%s\n or an int between 0-10.' %', '.join(list(possible.keys())))
            else:
                # hand anything else to matplotlib unchanged
                the_loc = legend_pos
            leg_options['loc'] = the_loc
            # weirdness needed for outside plot
            if legend_pos in ['o r', 'outside right', 'outside upper right']:
                leg_options['bbox_to_anchor'] = (1.02, 1)
            if legend_pos == 'outside center right':
                leg_options['bbox_to_anchor'] = (1.02, 0.5)
            if legend_pos == 'outside lower right':
                # NOTE(review): this line is a comparison, not an
                # assignment, so 'loc' stays 'lower left'. Left as found
                # because switching it would move existing legends;
                # confirm the intended placement before changing it.
                leg_options['loc'] == 'upper right'
                leg_options['bbox_to_anchor'] = (0.5, 0.5)

        # a bit of distance between legend and plot for outside legends
        if isinstance(legend_pos, str):
            if legend_pos.startswith('o'):
                leg_options['borderaxespad'] = 1

    totals_in_legend = (show_totals.endswith('both')
                        or show_totals.endswith('legend'))
    if totals_in_legend and (not piemode or pie_legend):
        dataframe = rename_data_with_total(dataframe,
                                           was_series=was_series,
                                           using_tex=using_tex,
                                           absolutes=absolutes)

    if piemode:
        if partial_pie:
            dataframe = dataframe / 100.0
        # some pie things
        if not sbplt:
            kwargs['y'] = list(dataframe.columns)[0]

    areamode = kind == 'area'

    if legend is False:
        kwargs['legend'] = False

    # line highlighting option for interactive!
    if interactive:
        if 2 in interactive_types:
            if kind == 'line':
                kwargs['marker'] = ','
        if not piemode:
            kwargs['alpha'] = 0.1

    # convert dates --- works only in my current case!
    if plotting_a_totals_column or not was_series:
        try:
            first_label = int(list(dataframe.index)[0])
        except Exception:
            first_label = None
        if first_label is not None and 1500 < first_label < 2050:
            n = pandas.PeriodIndex([d for d in list(dataframe.index)], freq='A')
            dataframe = dataframe.set_index(n)

    # always removed from kwargs so it never reaches DataFrame.plot
    if kwargs.pop('filled', None):
        if areamode or kind.startswith('bar'):
            dataframe = filler(dataframe)

    # black and white line styles, cycled through in order
    MARKERSIZE = 4
    COLORMAP = {
        0: {'marker': None, 'dash': (None, None)},
        1: {'marker': None, 'dash': [5, 5]},
        2: {'marker': "o", 'dash': (None, None)},
        3: {'marker': None, 'dash': [1, 3]},
        4: {'marker': "s", 'dash': [5, 2, 5, 2, 5, 10]},
        5: {'marker': None, 'dash': [5, 3, 1, 2, 1, 10]},
        6: {'marker': 'o', 'dash': (None, None)},
        7: {'marker': None, 'dash': [5, 3, 1, 3]},
        8: {'marker': "1", 'dash': [1, 3]},
        9: {'marker': "*", 'dash': [5, 5]},
        10: {'marker': "2", 'dash': [5, 2, 5, 2, 5, 10]},
        11: {'marker': "s", 'dash': (None, None)}
    }

    if black_and_white:
        if kind == 'line':
            kwargs['linewidth'] = 1

        cmap = plt.get_cmap('Greys')
        new_cmap = truncate_colormap(cmap, 0.25, 0.95)
        if kind == 'bar':
            # darker if just one entry
            if len(dataframe.columns) == 1:
                new_cmap = truncate_colormap(cmap, 0.70, 0.90)
        kwargs[cmap_or_c] = new_cmap

    # remove things from kwargs if heatmap
    if kind == 'heatmap':
        hmargs = {'annot': kwargs.pop('annot', True),
                  cmap_or_c: kwargs.pop(cmap_or_c, None),
                  'fmt': kwargs.pop('fmt', ".2f"),
                  'cbar': kwargs.pop('cbar', False)}

        for i in ['vmin', 'vmax', 'linewidths', 'linecolor',
                  'robust', 'center', 'cbar_kws', 'cbar_ax',
                  'square', 'mask']:
            if i in kwargs:
                hmargs[i] = kwargs.pop(i, None)

    class dummy_context_mgr():
        """a fake context for plotting without style
        perhaps made obsolete by 'classic' style in new mpl"""
        def __enter__(self):
            return None

        def __exit__(self, one, two, three):
            return False

    # stays None unless a legend is actually drawn below; the save step
    # checks this before handing it to savefig
    lgd = None

    with plt.style.context((style)) if style != 'matplotlib' else dummy_context_mgr():

        if not sbplt:
            # check if negative values, no stacked if so
            if areamode:
                if not kwargs.get('ax'):
                    kwargs['legend'] = False
                if dataframe.applymap(lambda x: x < 0.0).any().any():
                    kwargs['stacked'] = False
                    rev_leg = False
            if kind != 'heatmap':
                # turn off pie labels at the last minute
                if kind == 'pie' and pie_legend:
                    kwargs['labels'] = None
                    kwargs['autopct'] = '%.2f'
                if kind == 'pie':
                    kwargs.pop('color', None)
                ax = dataframe.plot(figsize=figsize, **kwargs)
            else:
                plt.figure(figsize=figsize)
                if title:
                    plt.title(title)
                ax = kwargs.get('ax', plt.axes())
                sns.heatmap(dataframe, ax=ax, **hmargs)
                plt.yticks(rotation=0)

            if areamode and not kwargs.get('ax'):
                handles, labels = plt.gca().get_legend_handles_labels()
                del handles
                del labels

            if x_label:
                ax.set_xlabel(x_label)

            if y_label:
                ax.set_ylabel(y_label)

        else:
            if not kwargs.get('layout'):
                plt.gcf().set_tight_layout(False)

            if kind != 'heatmap':
                ax = dataframe.plot(figsize=figsize, **kwargs)
            else:
                plt.figure(figsize=figsize)
                if title:
                    plt.title(title)
                ax = plt.axes()
                sns.heatmap(dataframe, ax=ax, **hmargs)
                plt.xticks(rotation=0)
                plt.yticks(rotation=0)

        # x tick label rotation
        if sbplt:
            for a in flatten_axes(ax):
                labels = [item.get_text() for item in a.get_xticklabels()]
                rotation = rotate_degrees(the_rotation, labels)
                a.set_xticklabels(labels, rotation=rotation, ha='right')
        else:
            if kind == 'heatmap':
                labels = [item.get_text() for item in ax.get_xticklabels()]
                rotation = rotate_degrees(the_rotation, labels)
                ax.set_xticklabels(labels, rotation=rotation, ha='right')

        if transparent:
            plt.gcf().patch.set_facecolor('white')
            plt.gcf().patch.set_alpha(0)

        if black_and_white:
            if kind == 'line':
                # white background
                # change everything to black and white with interesting
                # dashes and markers
                for c, line in enumerate(ax.get_lines()):
                    line_style = COLORMAP[c % len(COLORMAP)]
                    line.set_color('black')
                    line.set_dashes(line_style['dash'])
                    line.set_marker(line_style['marker'])
                    line.set_markersize(MARKERSIZE)

        # draw legend with proper placement etc
        if legend:
            if not piemode and not sbplt and kind != 'heatmap':
                if 3 not in interactive_types:
                    handles, labels = plt.gca().get_legend_handles_labels()
                    if rev_leg:
                        handles = handles[::-1]
                        labels = labels[::-1]
                    if kwargs.get('ax'):
                        lgd = plt.gca().legend(handles, labels, **leg_options)
                        ax.get_legend().draw_frame(leg_frame)
                    else:
                        lgd = plt.legend(handles, labels, **leg_options)
                        lgd.draw_frame(leg_frame)

    if interactive:
        # 1 = highlight lines
        # 2 = line labels
        # 3 = legend switches
        ax = plt.gca()
        # fails for piemode
        lines = ax.lines
        handles, labels = plt.gca().get_legend_handles_labels()
        if 1 in interactive_types:
            plugins.connect(plt.gcf(), HighlightLines(lines))

        if 3 in interactive_types:
            plugins.connect(plt.gcf(), InteractiveLegendPlugin(lines, labels, alpha_unsel=0.0))

        for i, l in enumerate(lines):
            y_vals = l.get_ydata()
            x_vals = l.get_xdata()
            x_vals = [str(x) for x in x_vals]
            if absolutes:
                ls = ['%s (%s: %d)' % (labels[i], x_val, y_val) for x_val, y_val in zip(x_vals, y_vals)]
            else:
                ls = ['%s (%s: %.2f%%)' % (labels[i], x_val, y_val) for x_val, y_val in zip(x_vals, y_vals)]
            if 2 in interactive_types:
                tooltip_line = mpld3.plugins.LineLabelTooltip(lines[i], labels[i])
                mpld3.plugins.connect(plt.gcf(), tooltip_line)
                if kind == 'line':
                    tooltip_point = mpld3.plugins.PointLabelTooltip(l, labels=ls)
                    mpld3.plugins.connect(plt.gcf(), tooltip_point)

    if piemode:
        if not sbplt:
            plt.axis('equal')
            ax.get_xaxis().set_visible(False)
            ax.get_yaxis().set_visible(False)

    # hacky: turn legend into subplot titles :)
    if sbplt:
        column_names = list(dataframe.columns)
        # when there are more axes than columns, the last title is reused
        titletext = ''
        for index, a in enumerate(flatten_axes(ax)):
            if index < len(column_names):
                titletext = column_names[index]
            a.set_title(titletext)
            try:
                a.legend_.remove()
            except AttributeError:
                pass
            # remove axis labels for pie plots
            if piemode:
                a.axes.get_xaxis().set_visible(False)
                a.axes.get_yaxis().set_visible(False)
                a.axis('equal')

            # positional: the keyword for this argument differs between
            # matplotlib releases
            a.grid(show_grid)

    # add sums to bar graphs and pie graphs
    # doubled right now, no matter
    if not sbplt:

        # show grid
        ax.grid(show_grid)

        if show_totals.endswith('plot') or show_totals.endswith('both'):
            if was_series:
                # make plot a bit higher if putting these totals on it
                the_y_limit = plt.ylim()[1]
                plt.ylim([0, the_y_limit * 1.05])
                entries = [dataframe.iloc[i] for i in range(len(dataframe.index))]
            else:
                entries = [dataframe.iloc[:, i] for i in range(len(dataframe.columns))]

            for i, entry in enumerate(entries):
                if len(entry) == 1:
                    score = entry.iloc[0]
                elif absolutes:
                    score = entry.sum()
                else:
                    # a total percentage cannot be derived from
                    # individual percentages
                    continue
                if not absolutes:
                    plt.annotate('%.2f' % score, (i, score), ha='center', va='bottom')
                else:
                    plt.annotate(score, (i, score), ha='center', va='bottom')

    if not kwargs.get('layout') and not sbplt and not kwargs.get('ax'):
        plt.tight_layout()
    if kwargs.get('ax'):
        try:
            plt.gcf().set_tight_layout(False)
        except Exception:
            pass

    if save:
        if running_python_tex:
            imagefolder = '../images'
        else:
            imagefolder = 'images'

        savename = get_savename(imagefolder, save=save, title=title, ext=output_format)

        if not os.path.isdir(imagefolder):
            os.makedirs(imagefolder)

        # save image and get on with our lives; an outside legend has to be
        # passed explicitly so it is not cropped, but only if one was drawn
        outside_legend = (isinstance(legend_pos, str)
                          and legend_pos.startswith('o')
                          and not sbplt
                          and lgd is not None)
        if outside_legend:
            plt.gcf().savefig(savename, dpi=150, bbox_extra_artists=(lgd,),
                              bbox_inches='tight', format=output_format)
        else:
            plt.gcf().savefig(savename, dpi=150, format=output_format)
        time = strftime("%H:%M:%S", localtime())
        if os.path.isfile(savename):
            print('\n' + time + ": " + savename + " created.")
        else:
            raise ValueError("Error making %s." % savename)

    if dragmode:
        plt.legend().draggable()

    if sbplt:
        plt.subplots_adjust(right=.8)
        plt.subplots_adjust(left=.1)

    # add DataCursor to notebook backend if possible
    if have_mpldc:
        if kind == 'line':
            HighlightingDataCursor(plt.gca().get_lines(), highlight_width=4, highlight_color=False,
                                   formatter=lambda **kwargs: '%s: %s' % (kwargs['label'], "{0:.3f}".format(kwargs['y'])))
        else:
            datacursor(formatter=lambda **kwargs: '%s: %s' % (kwargs['label'], "{0:.3f}".format(kwargs['height'])))

    if interactive:
        plt.subplots_adjust(right=.8)
        plt.subplots_adjust(left=.1)
        try:
            ax.legend_.remove()
        except AttributeError:
            pass
        return mpld3.display()
    else:
        return plt
class KMeansPlusPlus:
    """Cluster the rows of a pandas DataFrame with k-means and k-means++ seeding.

    After :meth:`cluster` has run:

    * ``centers`` is a k x n DataFrame; entry (i, j) is the j-th coordinate
      of center i.
    * ``distance_matrix`` is an m x k DataFrame; entry (i, j) is the squared
      Euclidean distance from row i to center j.
    * ``clusters`` is a Series of length m holding labels 0 .. k-1, indexed
      like ``data_frame``.
    """

    def __init__(self, data_frame, k, columns=None, max_iterations=None,
                 appended_column_name=None):
        """Validate the arguments and initialise empty clustering state.

        Args:
            data_frame: Non-empty pandas DataFrame holding the points (m x n).
            k: Number of clusters; a positive integer.
            columns: Column labels to cluster on. ``None`` uses every column
                of ``data_frame`` (no numeric check is done in that case).
            max_iterations: Optional positive cap on refinement iterations.
            appended_column_name: If given, :meth:`cluster` writes the labels
                into ``data_frame`` under this column name.

        Raises:
            Exception: If any argument is invalid, a requested column is
                missing, or a requested column is non-numeric / contains NaN.
        """
        if not isinstance(data_frame, DataFrame):
            raise Exception("data_frame argument is not a pandas DataFrame")
        elif data_frame.empty:
            raise Exception("The given data frame is empty")
        if max_iterations is not None and max_iterations <= 0:
            raise Exception("max_iterations must be positive!")
        if not isinstance(k, Integral) or k <= 0:
            raise Exception("The value of k must be a positive integer")

        self.data_frame = data_frame  # m x n
        self.numRows = data_frame.shape[0]  # m

        # k x n, the i,j entry being the jth coordinate of center i
        self.centers = None
        # m x k, the i,j entry is the distance from point i to center j
        # (where i and j start at 0)
        self.distance_matrix = None
        # Series of length m, consisting of integers 0,1,...,k-1
        self.clusters = None
        # To keep track of clusters in the previous iteration
        self.previous_clusters = None

        self.max_iterations = max_iterations
        self.appended_column_name = appended_column_name
        self.k = k

        if columns is None:
            self.columns = data_frame.columns
        else:
            for col in columns:
                if col not in data_frame.columns:
                    raise Exception(
                        "Column '%s' not found in the given DataFrame" % col)
                if not self._is_numeric(col):
                    raise Exception(
                        "The column '%s' is either not numeric or contains NaN values" % col)
            self.columns = columns

    def _populate_initial_centers(self):
        """Pick the k starting centers using k-means++ (D^2-weighted) sampling.

        The first center is a uniformly random row; every further center is
        a row drawn with probability proportional to its squared distance
        from the nearest center chosen so far.
        """
        features = self.data_frame[self.columns]
        rows = [self._grab_random_point()]
        nearest = None  # squared distance of each row to its closest center
        while len(rows) < self.k:
            # Only the newest center can shrink the running minimum, so
            # there is no need to recompute against every earlier center.
            latest = self._distances_from_point(rows[-1]).values.astype(float)
            nearest = latest if nearest is None else np.minimum(nearest, latest)

            total = nearest.sum()
            if total > 0:
                # Positional draw: safe for any DataFrame index.
                position = np.random.choice(self.numRows, p=nearest / total)
            else:
                # Every row coincides with an already chosen center, so no
                # row is preferable; fall back to a uniform draw.
                position = np.random.randint(0, self.numRows)
            rows.append(features.iloc[position, :].values)

        self.centers = DataFrame(np.vstack(rows), columns=self.columns)

    def _compute_distances(self):
        """Fill ``distance_matrix`` with row-to-center squared distances.

        Raises:
            Exception: If the centers have not been populated yet.
        """
        if self.centers is None:
            raise Exception(
                "Must populate centers before distances can be calculated!")

        per_center = [
            self._distances_from_point(self.centers.iloc[i, :].values).values
            for i in range(self.k)
        ]
        self.distance_matrix = DataFrame(
            np.column_stack(per_center),
            index=self.data_frame.index,
            columns=list(range(self.k)))

    def _get_clusters(self):
        """Assign every row to its closest center and store it in ``clusters``.

        Ties are resolved in favour of the lowest center number, so exactly
        one label is produced per row.

        Raises:
            Exception: If the distance matrix has not been computed yet.
        """
        if self.distance_matrix is None:
            raise Exception(
                "Must compute distances before closest centers can be calculated")

        closest = self.distance_matrix.values.argmin(axis=1)
        self.clusters = Series(closest, index=self.data_frame.index)

    def _compute_new_centers(self):
        """Move every center to the mean of the rows currently assigned to it.

        A center whose cluster is empty keeps its previous position instead
        of turning into NaN.

        Raises:
            Exception: If centers or clusters are not available yet.
        """
        if self.centers is None:
            raise Exception("Centers not initialized!")
        if self.clusters is None:
            raise Exception("Clusters not computed!")

        features = self.data_frame[self.columns]
        labels = self.clusters.values
        updated = []
        for i in range(self.k):
            members = features[labels == i]
            if len(members) > 0:
                updated.append(members.mean().values)
            else:
                updated.append(self.centers.iloc[i, :].values)
        # Rebuild the frame rather than assigning row by row: avoids
        # writing float means into integer-typed columns.
        self.centers = DataFrame(np.vstack(updated), columns=self.columns)

    def cluster(self):
        """Run k-means until the labels stop changing or the cap is reached.

        The result is available in ``clusters``; if ``appended_column_name``
        was given it is also written into ``data_frame`` (a warning is issued
        when that assignment fails).
        """
        self._populate_initial_centers()
        self._compute_distances()
        self._get_clusters()

        counter = 0
        while True:
            counter += 1
            self.previous_clusters = self.clusters.copy()
            self._compute_new_centers()
            self._compute_distances()
            self._get_clusters()
            if self.max_iterations is not None and counter >= self.max_iterations:
                break
            if np.array_equal(self.clusters.values,
                              self.previous_clusters.values):
                break

        if self.appended_column_name is not None:
            try:
                self.data_frame[self.appended_column_name] = self.clusters
            except Exception:
                # Best effort only: the labels stay available on the object.
                warnings.warn(
                    "Unable to append a column named %s to your data."
                    % self.appended_column_name)
                warnings.warn(
                    "However, the clusters are available via the cluster attribute")

    def _distances_from_point(self, point):
        """Return a Series of squared Euclidean distances from each row to ``point``."""
        return np.power(self.data_frame[self.columns] - point, 2).sum(axis=1)

    def _distances_from_point_list(self, point_list):
        """Return each row's squared distance to the nearest point in ``point_list``.

        Returns ``None`` when ``point_list`` is empty.
        """
        result = None
        for point in point_list:
            current = self._distances_from_point(point)
            result = current if result is None else np.minimum(result, current)
        return result

    def _grab_random_point(self):
        """Return the selected columns of one uniformly random row as a NumPy array."""
        index = np.random.randint(0, self.numRows)
        return self.data_frame[self.columns].iloc[index, :].values

    def _is_numeric(self, col):
        """Return True when column ``col`` is numeric, real-valued and NaN-free."""
        series = self.data_frame[col]
        # Check the dtype first: np.isnan raises TypeError on object columns.
        if not pd.api.types.is_numeric_dtype(series):
            return False
        return bool(np.isreal(series.values).all()
                    and not series.isnull().any())
'pastrami': 'cow', 'corned beef': 'cow', 'honey ham': 'pig', 'nova lox': 'salmon' } data['animal'] = data['food'].map(str.lower).map(meat_to_animal) data data['food'].map(lambda x: meat_to_animal[x.lower()]) # 数据标准化 datafile = 'd:/data/normalization_data.xls' #参数初始化 data = pd.read_excel(datafile, header = None) #读取数据 (data - data.min())/(data.max() - data.min()) #最小-最大规范化 (data - data.mean())/data.std() #零-均值规范化 data/10**np.ceil(np.log10(data.abs().max())) #小数定标规范化 ###替换值 data = Series([1., -999., 2., -999., -1000., 3.]) data data.replace(-999, np.nan) data.replace([-999, -1000], np.nan) data.replace([-999, -1000], [np.nan, 0]) data.replace({-999: np.nan, -1000: 0})
import numpy as np
from pandas import Series, DataFrame
import pandas as pd

# Small 2 x 3 sample frame with one missing value (NaN) in each row,
# used to show how the summary methods below treat missing data.
arr = np.array([[1, 2, np.nan], [np.nan, 3, 4]])
dframe1 = DataFrame(arr, index=["A", "B"], columns=["One", "Two", "Three"])
dframe1

# Sum method
dframe1.sum()  # column totals; NaN entries are skipped (effectively treated as 0)
dframe1.sum(axis=1)  # total of each row (summing across its columns)

# Min method
dframe1.min()  # minimum value in each column
dframe1.min(axis=1)  # minimum value in each row
dframe1.idxmin()  # index label of the row holding each column's minimum

# Max method
dframe1.max()  # maximum value in each column
dframe1.idxmax()  # index label of the row holding each column's maximum

# Cumulative sum
dframe1.cumsum()  # running total down each column

# Describe method
dframe1.describe()  # summary statistics of the dataframe (computed per column)

# correlation and covariance