def data_sample(X: pd.DataFrame, y: pd.Series, nrows: int = 5000, method: int = 2, random_state=SEED):  # -> (pd.DataFrame, pd.Series)
    """Down-sample features ``X`` and labels ``y`` to at most ``nrows`` rows.

    :param X: feature frame.
    :param y: label series aligned with ``X``; methods 1 and 2 assume the
        labels are binary 0/1 (they sum ``y`` to count positives).
    :param nrows: maximum number of rows to keep.
    :param method: sampling strategy
        0 - plain random sample;
        1 - weighted sample that counteracts class imbalance (each row is
            weighted by the size of the *other* class);
        any other value - stratified sample that keeps the positive rate.
    :param random_state: seed forwarded to every pandas sampling call.
    :return: ``(X_sample, y_sample)``. Inputs are returned unchanged when
        ``len(X) <= nrows``. Methods 1 and 2 return rows with a fresh
        integer index (merge / ``ignore_index`` discard the original one).
    """
    if len(X) <= nrows:
        # Nothing to down-sample.
        return X, y

    if method == 0:
        X_sample = X.sample(nrows, random_state=random_state)
        y_sample = y.loc[X_sample.index]
    elif method == 1:
        # for unbalanced data - take care of imbalance
        n_pos = np.sum(y)
        rate = pd.DataFrame(data=[[1, len(y) - n_pos], [0, n_pos]],
                            columns=['label', 'rate'])
        labelled = X.assign(label=y).merge(rate, on='label')
        labelled = labelled.sample(nrows, random_state=random_state, weights='rate')
        y_sample = labelled['label']
        X_sample = labelled.drop(['label', 'rate'], axis=1)
    else:
        # for unbalanced data - keep imbalance
        nrows_1 = int(np.ceil(np.sum(y) / len(y) * nrows))
        labelled = X.assign(label=y)
        positives = labelled.query("label == 1").sample(nrows_1, random_state=random_state)
        negatives = labelled.query("label == 0").sample(nrows - nrows_1, random_state=random_state)
        combined = pd.concat([negatives, positives], sort=False, ignore_index=True)
        # Shuffle with the caller's seed so the whole result is reproducible.
        combined = combined.sample(frac=1, random_state=random_state)
        y_sample = combined['label']
        X_sample = combined.drop(['label'], axis=1)

    return X_sample, y_sample
def test_sample_axis1(self): # Check weights with axis = 1 easy_weight_list = [0] * 3 easy_weight_list[2] = 1 df = DataFrame({ "col1": range(10, 20), "col2": range(20, 30), "colString": ["a"] * 10 }) sample1 = df.sample(n=1, axis=1, weights=easy_weight_list) tm.assert_frame_equal(sample1, df[["colString"]]) # Test default axes tm.assert_frame_equal(df.sample(n=3, random_state=42), df.sample(n=3, axis=0, random_state=42))
def get_cubic_fit(data: pd.DataFrame):
    """Fit a cubic spline through the (x, y) points and sample the fitted curve.

    :param data: frame with numeric ``x`` and ``y`` columns.
    :return: a random 10% of ``NUM_INTERPOLATED_POINTS`` evenly spaced points
        on the spline, drawn with probability proportional to the fitted
        ``y`` value.
    """
    # Drop incomplete rows jointly: dropping NaNs from x and y separately
    # would leave the two columns with different lengths / pairings.
    data = data.dropna(subset=['x', 'y']).sort_values(by='x')
    data_x, data_y = data['x'], data['y']
    f = scipy.interpolate.CubicSpline(data_x, data_y)
    x = list(np.linspace(data_x.min(), data_x.max(), NUM_INTERPOLATED_POINTS))
    data = pd.DataFrame(dict(x=x, y=f(x)))
    return data.sample(frac=0.1, weights=data.y)
def find_risk_profile(df: pd.DataFrame, feature: str, topk_ratio: float,
                      adj: float, option: str) -> "list | dict":
    """Derive a risk profile for ``feature`` from the ``illicit`` column.

    The per-group score is a smoothed mean: ``sum / (count + adj)``.

    :param df: frame containing ``feature`` and an ``illicit`` column.
    :param feature: name of the column to group by.
    :param topk_ratio: fraction (0-1) of groups to flag; 'topk' only.
    :param adj: smoothing term added to the group count.
    :param option: ``'topk'`` or ``'ratio'``.
    :return: for ``'topk'`` a list of the highest-scoring group keys
        (``int(topk_ratio * n_groups) + 1`` of them); for ``'ratio'`` a dict
        mapping group key to smoothed score (mean target encoding).
    :raises ValueError: if ``option`` is neither ``'topk'`` nor ``'ratio'``.

    The option topk is usually better than the ratio because of overfitting.
    """
    if option == 'topk':
        source = df
    elif option == 'ratio':
        # Illicit-ratio encoding (Mean target encoding)
        # Refer: http://docs.h2o.ai/h2o/latest-stable/h2o-docs/data-munging/target-encoding.html
        # Refer: https://towardsdatascience.com/why-you-should-try-mean-encoding-17057262cd0
        # For target encoding, we just use 70% of train data to avoid
        # overfitting (otherwise, test AUC drops significantly)
        source = df.sample(frac=0.7)
    else:
        raise ValueError(
            "option must be 'topk' or 'ratio', got {!r}".format(option))

    total_cnt = source.groupby([feature])['illicit']
    adj_prob_illicit = total_cnt.sum() / (total_cnt.count() + adj)  # Smoothed mean

    if option == 'ratio':
        return adj_prob_illicit.to_dict()

    # Top-k suspicious item flagging
    nrisky_profile = int(topk_ratio * len(total_cnt)) + 1
    ranked = adj_prob_illicit.sort_values(ascending=False)
    return list(ranked.head(nrisky_profile).index)
def create_rankings(ex_mtx: pd.DataFrame, seed=None) -> pd.DataFrame:
    """
    Create a whole genome rankings dataframe from a single cell expression profile dataframe.

    :param ex_mtx: The expression profile matrix. The rows should correspond to different cells, the columns to different
        genes (n_cells x n_genes).
    :param seed: Random state used for the column shuffle.
    :return: A genome rankings dataframe (n_cells x n_genes); columns are left in shuffled order.
    """
    # Shuffling mirrors the behaviour of the R implementation.
    # 1. Ranks are assigned in the range of 1 to n, therefore we subtract 1.
    # 2. Ties use the 'first' method, i.e. the order in the array decides. To
    #    remove that bias the columns are shuffled before ranking, at a
    #    performance cost.
    # 3. Genes are ranked by expression in descending order, i.e. from highly
    #    expressed (0) to low expression (n).
    # 4. NAs get the highest rank numbers. Checked with this snippet:
    #
    #    import pandas as pd
    #    import numpy as np
    #    df = pd.DataFrame(data=[4, 1, 3, np.nan, 2, 3], columns=['values'])
    #    # Run below statement multiple times to see effect of shuffling in case of a tie.
    #    df.sample(frac=1.0, replace=False).rank(ascending=False, method='first', na_option='bottom').sort_index() - 1
    #
    shuffled_genes = ex_mtx.sample(frac=1.0, replace=False, axis=1, random_state=seed)
    one_based = shuffled_genes.rank(axis=1, ascending=False, method='first', na_option='bottom')
    return one_based.astype(DTYPE) - 1
def pareto_and_runtimes_by_task(df: pd.DataFrame) -> alt.Chart:
    """Creates an interactive Pareto curve and scatter plot of task runtimes.

    Tracing each curve shows to what extent a small proportion of long-running
    regions contribute disproportionately to the overall runtime. That is,
    "The longest-running X% of regions account for Y% of the total runtime."
    There is a curve for each task.

    Args:
      df: A dataframe of all regions.

    Returns:
      An altair chart.
    """
    df = df.groupby(df['Task'], sort=False).apply(calculate_pareto_metrics)

    # Sample along the Pareto curve, ensuring the longest regions are shown.
    if len(df) > 5000:
        x = 1000
        longest_regions = df.nlargest(x, 'total runtime')
        random_regions = df.sample(5000 - x)
        df = pd.concat([longest_regions, random_regions])

    # Limit columns to greatly reduce the size of the html report.
    columns_used = [
        'task cumsum order', 'task cumsum fraction', 'tooltip', 'Task',
        'task total runtime', 'task num examples', 'Runtime for task'
    ]
    df = df[columns_used]

    # Brushing on the task_scatter plot highlights the same tasks in the Pareto
    # curve.
    brush = alt.selection_interval()

    def _highlight_by_task():
        # Colour by task inside the brush, gray everything else.
        return alt.condition(brush, 'Task:N', alt.value('lightgray'))

    pareto_x = alt.X(
        'task cumsum order',
        title='The longest-runtime X% of regions',
        axis=alt.Axis(format='%'))
    pareto_y = alt.Y(
        'task cumsum fraction',
        title='Account for Y% of the total runtime',
        axis=alt.Axis(format='%'))
    pareto_by_task = (
        alt.Chart(df)
        .mark_line(size=2)
        .encode(x=pareto_x, y=pareto_y, tooltip='tooltip',
                color=_highlight_by_task())
        .properties(title='Pareto curve for each task')
        .interactive()
    )

    # This chart needs to use the same dataframe as the first chart to enable the
    # brushing on one to affect the other. Using max(task) for 'text' is a
    # trick that causes bundling by task to avoid showing multiple overlapping
    # points which otherwise make the text look funky.
    scatter_x = alt.X('max(task total runtime)', title='Runtime (seconds)')
    scatter_y = alt.Y('task num examples:Q', title='Number of examples')
    task_scatter = (
        alt.Chart(df)
        .mark_point(size=10)
        .encode(x=scatter_x, y=scatter_y, color=_highlight_by_task(),
                tooltip=['Task', 'Runtime for task'])
        .properties(title='Total runtime for each task (drag to highlight)')
        .add_selection(brush)
    )

    return pareto_by_task | task_scatter
def accumulateAll(self, app: pd.DataFrame, tracker: pd.DataFrame) -> pd.DataFrame:
    """
    Function call to prepare the dataset. (Needed application and tracker data set)
    call :func:`prepareData` before calling this function.

    A random ``self.sample`` rows of ``app`` are taken; for each sampled
    handle the first entry of its ``reports`` is extracted. Handles whose
    extraction raises ``KeyError`` are skipped.

    :param app: Application data
    :param tracker: tracker data
    :return: A dataset joined with ``tracker`` (on ``tracker_id``) and the
        sampled application rows (on ``handle``), sorted by ``handle``.
    """
    print(
        "Considering only {} sample of applications. (Might take some time ! ! !)"
        .format(self.sample))
    app_data = app.sample(self.sample)

    ## detailed extraction
    # Collect per-handle frames and concatenate once; concatenating inside
    # the loop copies the accumulated frame on every iteration.
    frames = []
    for x in tqdm(app_data.index):
        try:
            data = self.jsonExtractor(handle=x)[x]
            temp = pd.DataFrame(data.get('reports')[0])[[
                'version', 'updated_at', 'trackers', 'downloads'
            ]]
        except KeyError:
            continue
        temp['handle'] = x
        frames.append(temp)
    df = pd.concat(frames, axis=0) if frames else pd.DataFrame()

    df.rename(columns={'trackers': 'tracker_id'}, inplace=True)
    df['tracker_id'] = df.tracker_id.astype('int')
    df = df.set_index('tracker_id').join(tracker, how='left').reset_index()
    final = df.set_index('handle').join(
        app_data, how='left').reset_index().sort_values(by='handle')
    return final
def _samplepointwisedepth(data: pd.DataFrame,
                          to_compute: pd.Index = None,
                          K=2,
                          containment='simplex',
                          quiet=True) -> pd.Series:
    """
    Compute sample pointwise depth for n points in R^p, where data is an nxp matrix of points.
    If to_compute is not None, only compute depth for the given points (should be a subset of data.index)

    Parameters:
    ----------
    data: pd.DataFrame
        n x d DataFrame, where we have n points in d dimensional space.
    to_compute: list, pd.Index
        The particular points (indices) we would like to calculate depth for.
        If None, we calculate depth for all points.
    K=2: Number of blocks to compute sample depth with.
    containment: str
        Definition of containment.
    quiet: bool
        Passed to tqdm as ``disable``; True hides the progress bars.

    Returns:
    ----------
    pd.Series: Depth values for the given points with respect to the data. Index of Series are indices of points in the original data, and the values are the depths
    """
    # If K=1, don't bother splitting the data. Just return regular depth.
    if K == 1:
        return _pointwisedepth(data=data, to_compute=to_compute, containment=containment)

    n = data.shape[0]
    depths = []
    if to_compute is None:
        to_compute = data.index

    # Size of each random subsample (n points split into K blocks).
    ss = n // K

    # Compute sample depth of each point, should be containment agnostic
    # since the computation is being done in _pointwisedepth, which will call
    # the appropriate depth measure.
    for time in tqdm(to_compute, disable=quiet):
        cd = []
        for _ in tqdm(range(ss), disable=quiet):
            sdata = data.sample(n=ss, axis=0)
            # The point whose depth we want must be part of the subsample;
            # add it when the draw missed it. (pd.concat replaces
            # DataFrame.append, which was removed in pandas 2.0.)
            if time not in sdata.index:
                sdata = pd.concat([sdata, data.loc[[time], :]])
            cd.append(
                _pointwisedepth(data=sdata, to_compute=[time], containment=containment))
        depths.append(np.mean(cd))

    return pd.Series(index=to_compute, data=depths)
def test_sample_ignore_index(self): # GH 38581 df = DataFrame( {"col1": range(10, 20), "col2": range(20, 30), "colString": ["a"] * 10} ) result = df.sample(3, ignore_index=True) expected_index = Index([0, 1, 2]) tm.assert_index_equal(result.index, expected_index)
def split(df: pd.DataFrame):
    """Randomly split ``df`` 80/20 into train and test and print a summary.

    Prints the size of each subset and how many of its rows are faults
    (``y_final_result == 1``).

    :param df: frame with a ``y_final_result`` column and a unique index
        (the test set is built by dropping the sampled index labels).
    :return: ``(train, test)``
    """
    train = df.sample(frac=0.8)
    test = df.drop(train.index)
    # Count faults inside each subset itself; previously the training line
    # reported the number of non-fault rows of the *test* set.
    test_faults = len(test.loc[test["y_final_result"] == 1])
    train_faults = len(train.loc[train["y_final_result"] == 1])
    print("测试集大小:", len(test))
    print("测试集故障数据量", test_faults)
    print("训练集大小:", len(train))
    print("训练集故障数据量", train_faults)
    return train, test
def get_geometries(shp, **filter_attrs):
    """Read ``shp`` and return its ``geometry`` column in random order.

    :param shp: path handed to ``read_file``.
    :param filter_attrs: optional ``select`` entry listing the columns to
        keep before shuffling; every other column is dropped.
    :return: the shuffled ``geometry`` column.
    """
    df = DataFrame(read_file(shp))
    if 'select' in filter_attrs:
        unwanted = [col for col in df.columns
                    if col not in filter_attrs['select']]
        df.drop(columns=unwanted, inplace=True)
    shuffled = df.sample(frac=1.)
    return shuffled['geometry']
def sample(df: pd.DataFrame, args: dict) -> pd.DataFrame:
    """Return a seeded random subsample of ``df``.

    ``args['subsample_ratio']`` is an absolute row count when greater than 1,
    otherwise a fraction of the rows. ``args['seed']`` is the random state.
    """
    requested = args['subsample_ratio']
    n_rows = requested if requested > 1 else int(requested * df.shape[0])
    return df.sample(n=int(n_rows), random_state=args['seed'])
def _pick_random_missions(missions: pd.DataFrame) -> list[dict[str, Any]]:
    """Return up to ``num_missions`` random missions as plain record dicts.

    All missions are returned when there are no more than ``num_missions``.
    Each record holds the association name, title, link and description.
    """
    needs_sampling = len(missions) > num_missions
    samples = missions.sample(num_missions) if needs_sampling else missions
    records = samples[['associationName', 'title', 'link', 'description']].to_dict('records')
    return typing.cast(list[dict[str, Any]], records)
def get_feature_importance(df: pd.DataFrame, feat: str, n: int = 5000):
    """Get the importance of each feature wrt the specified target feature.

    An ``AdaBoostClassifier`` is fitted on a random sample of rows to
    predict ``feat`` (cast to int) from all other columns.

    :param df: input frame; every column except ``feat`` is a predictor.
    :param feat: name of the target column.
    :param n: maximum number of rows to sample; frames with fewer rows are
        used in full instead of raising.
    :return: the fitted model's ``feature_importances_``.
    """
    df = df.sample(min(n, len(df)), axis=0)
    predictors = df.loc[:, df.columns != feat]
    target = df[feat].map(int)
    model = AdaBoostClassifier().fit(predictors, target)
    return model.feature_importances_
def create_train_test_datasets(df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Shuffle ``df``, scale ``price`` by 1000 and split off 10% for testing.

    The input frame is not modified (the shuffle returns a new frame).

    :param df: frame with a numeric ``price`` column.
    :return: ``(df_test, df_train)`` - note the test set comes first.
    """
    df = df.sample(frac=1)  # shuffle
    df["price"] = df["price"] * 1000
    test_size = int(df.shape[0] * 0.1)
    # Positional slicing: the shuffled index labels are in random order.
    df_test = df.iloc[:test_size]
    df_train = df.iloc[test_size:]
    return df_test, df_train
def test_sample_does_not_modify_weights(self): # GH-42843 result = np.array([np.nan, 1, np.nan]) expected = result.copy() ser = Series([1, 2, 3]) # Test numpy array weights won't be modified in place ser.sample(weights=result) tm.assert_numpy_array_equal(result, expected) # Test DataFrame column won't be modified in place df = DataFrame({"values": [1, 1, 1], "weights": [1, np.nan, np.nan]}) expected = df["weights"].copy() df.sample(frac=1.0, replace=True, weights="weights") result = df["weights"] tm.assert_series_equal(result, expected)
def assign_validation_split(dataset: pd.DataFrame, split: float,
                            random_state=None) -> pd.DataFrame:
    """Return a copy of ``dataset`` with a boolean ``train`` column.

    A random ``split`` fraction of the rows is marked ``train = False``
    (validation); all other rows are ``True``. The input is left untouched.
    """
    flagged = dataset.copy()
    flagged["train"] = True
    validation_labels = flagged.sample(frac=split, random_state=random_state).index
    flagged.loc[validation_labels, "train"] = False
    return flagged
def divide_datasets(df_merged: pd.DataFrame, percentage: float = 0.67) -> (pd.DataFrame, pd.DataFrame):
    """Shuffle ``df_merged`` and cut it into a train and a test part.

    The first ``int(len * percentage)`` shuffled rows form the training set,
    the remainder the test set. Returns ``(df_train, df_test)``.
    """
    shuffled = df_merged.sample(frac=1)
    cut = int(len(shuffled) * percentage)
    return shuffled[:cut], shuffled[cut:]
def test_sample_is_copy(self): # GH#27357, GH#30784: ensure the result of sample is an actual copy and # doesn't track the parent dataframe / doesn't give SettingWithCopy warnings df = DataFrame(np.random.randn(10, 3), columns=["a", "b", "c"]) df2 = df.sample(3) with tm.assert_produces_warning(None): df2["d"] = 1
def get_predictions_500(data: DataFrame, model) -> Series:
    """Predict on a fixed-seed bootstrap sample of 500 rows.

    Rows are drawn with replacement (seed 12345), the ``product`` column is
    removed, and ``model.predict`` is applied to what is left.
    """
    bootstrap = data.sample(500, replace=True, random_state=12345)
    bootstrap = bootstrap.reset_index(drop=True)
    features = bootstrap.drop(columns=['product']).reset_index(drop=True)
    return pd.Series(model.predict(features))
def fetch_covid_data():
    """Download state-wise COVID figures and write them to ``file1.csv``.

    The first ``statewise`` entry is skipped when writing the file
    (presumably an aggregate row - verify against the API payload).

    :return: the number of ``statewise`` entries received, as a float;
        ``0.0`` when the response contains an ``error`` key.
    """
    JSONContent = requests.get("https://api.covid19india.org/data.json").json()
    # Initialised up front so the error path has a defined return value.
    c = 0.0
    if 'error' not in JSONContent:
        channels_list = JSONContent["statewise"]
        c = float(len(channels_list))
        channel = [
            [i['state'], i['lastupdatedtime'], i['confirmed']]
            for i in channels_list[1:]
        ]
        dataset = DataFrame(channel, columns=['State', 'Lastupdatedtime', 'Confirmed'])
        dataset.to_csv('file1.csv', encoding='utf-8', sep='\t', index=False)
    return c
def gmm_model_selection(
    x: pd.DataFrame,
    n_components_range: range,
    part_size: int,
    n_runs: int = 100,
    n_cores: int = False,
    cv_types: Tuple = ("spherical", "tied", "diag", "full"),
) -> Tuple[List[list], List[np.ndarray], Union[int, Any]]:
    """
    Runs GMM clustering model selection on the specified X dataframe, outputs the bic distribution per model,
    a vector with the median BICs and an object with the overall best model.

    Args:
        x (pandas.DataFrame): Data matrix to train the models
        n_components_range (range): Generator with numbers of components to evaluate
        part_size (int): Size of bootstrap samples for each model
        n_runs (int): Number of bootstraps for each model
        n_cores (int): Number of cores to use for computation; a falsy value
            selects min(cpu count, n_runs)
        cv_types (tuple): Covariance Matrices to try. All four available by default

    Returns:
        - bic (list): All recorded BIC values for all attempted parameter combinations
          (useful for plotting)
        - m_bic (list): Median BIC of every parameter combination, in evaluation
          order (useful for plotting)
        - best_bic_gmm: First element of the first bootstrap result for the
          combination with the lowest median BIC (0 if none was evaluated)
    """
    # Set the default of n_cores to the most efficient value
    if not n_cores:
        n_cores = min(multiprocessing.cpu_count(), n_runs)

    bic = []
    m_bic = []
    lowest_bic = np.inf
    best_bic_gmm = 0

    # The context manager closes the progress bar even if a fit raises.
    with tqdm(total=len(cv_types) * len(n_components_range)) as pbar:
        for cv_type in cv_types:
            for n_components in n_components_range:
                res = Parallel(n_jobs=n_cores, prefer="threads")(
                    delayed(gmm_compute)(
                        x.sample(part_size, replace=True), n_components, cv_type)
                    for _ in range(n_runs))

                run_bics = [i[1] for i in res]
                bic.append(run_bics)
                pbar.update(1)

                m_bic.append(np.median(run_bics))
                if m_bic[-1] < lowest_bic:
                    lowest_bic = m_bic[-1]
                    best_bic_gmm = res[0][0]

    return bic, m_bic, best_bic_gmm
def feature_selection(df: pd.DataFrame, config: Config):
    """Select features on small samples and drop unselected columns from ``df``.

    In training mode (``config.is_train()``) and only when ``df`` occupies at
    least 2 * 1024 MB, up to ten 1000-row samples are preprocessed and passed
    to ``select_features``; the resulting drop lists are stored in ``config``
    under ``drop_number_columns``, ``date_columns`` and
    ``drop_datetime_columns``. In every mode, any drop list present in
    ``config`` is then applied to ``df`` in place.

    Nothing is returned; ``df`` and ``config`` are mutated.
    """
    if config.is_train():
        df_size_mb = df.memory_usage(deep=True).sum() / 1024 / 1024
        # Frames below 2 * 1024 MB skip selection entirely (and, because of
        # the early return, also skip the in-place drops at the bottom).
        if df_size_mb < 2 * 1024:
            return

        selected_columns = []
        # Preprocess against a deep copy so the real config is not touched
        # by the per-sample preprocessing.
        config_sample = copy.deepcopy(config)
        for i in range(10):
            # Round index doubles as the seed: each round sees a different,
            # reproducible sample.
            df_sample = df.sample(min(1000, len(df)), random_state=i).copy(deep=True)
            preprocess_pipeline(df_sample, config_sample)
            y = df_sample["target"]
            X = df_sample.drop("target", axis=1)

            # Only offer columns that earlier rounds have not picked yet.
            if len(selected_columns) > 0:
                X = X.drop(selected_columns, axis=1)

            if len(X.columns) > 0:
                selected_columns += select_features(X, y, config["mode"])
            else:
                # Every column has been selected already.
                break

        log("Selected columns: {}".format(selected_columns))

        # Numeric / id columns that were never selected are scheduled for removal.
        drop_number_columns = [
            c for c in df
            if (c.startswith("number_") or c.startswith("id_")) and c not in selected_columns
        ]
        if len(drop_number_columns) > 0:
            config["drop_number_columns"] = drop_number_columns

        # Map each selected datetime column (first two "_"-separated parts of
        # the name) to the list of selected parts (third part of the name).
        config["date_columns"] = {}
        for c in [c for c in selected_columns if c.startswith("datetime_")]:
            d = c.split("_")
            date_col = d[0] + "_" + d[1]
            date_part = d[2]

            if date_col not in config["date_columns"]:
                config["date_columns"][date_col] = []

            config["date_columns"][date_col].append(date_part)

        # Datetime columns of df that are not keys of date_columns are dropped.
        drop_datetime_columns = [
            c for c in df
            if c.startswith("datetime_") and c not in config["date_columns"]
        ]
        if len(drop_datetime_columns) > 0:
            config["drop_datetime_columns"] = drop_datetime_columns

    # Apply whatever drop lists the config carries (set above, or already
    # present in the config passed in).
    if "drop_number_columns" in config:
        log("Drop number columns: {}".format(config["drop_number_columns"]))
        df.drop(config["drop_number_columns"], axis=1, inplace=True)

    if "drop_datetime_columns" in config:
        log("Drop datetime columns: {}".format(
            config["drop_datetime_columns"]))
        df.drop(config["drop_datetime_columns"], axis=1, inplace=True)
def split_data(df: pd.DataFrame):
    """Shuffle ``df`` and split it 80/20 into training and validation rows.

    The shuffled frame gets a fresh 0..n-1 index before slicing.
    Returns ``(train_data, val_data)``.
    """
    shuffled = df.sample(frac=1).reset_index(drop=True)
    n_train = int(df.shape[0] * 0.8)
    return (shuffled[:n_train], shuffled[n_train:])
def train(sets: DataFrame):
    """Deduplicate ``sets``, split 80/20 and train both classifiers.

    The ``upset`` column is removed from each split and used, cast to int,
    as the target for ``train_gnb`` and ``train_sgdc``.
    """
    print(f"Processing {len(sets)} sets\n")
    unique_sets = sets.drop_duplicates()
    train_sets = unique_sets.sample(frac=0.8)
    # Rows appearing twice in the concatenation are the training rows;
    # dropping every duplicated row leaves the held-out ones.
    stacked = concat([unique_sets, train_sets])
    test_sets = stacked.drop_duplicates(keep=False)

    train_upset = train_sets.pop("upset").astype("int")
    test_upset = test_sets.pop("upset").astype("int")

    for fit in (train_gnb, train_sgdc):
        fit(train_sets, train_upset, test_sets, test_upset)
def data_split_pandas(data: pd.DataFrame, train_ratio, test_ratio):
    """Shuffle ``data`` and split it into disjoint train and test frames.

    :param data: frame to split.
    :param train_ratio: fraction of rows for the training set.
    :param test_ratio: fraction of rows for the test set; the two ratios
        must sum to 1 (within floating-point tolerance).
    :return: ``(train_data, test_data)``, each with a fresh 0..n-1 index.
    """
    import math

    # Tolerant comparison: e.g. 0.7 + 0.3 is not exactly 1.0 in binary floats.
    assert math.isclose(train_ratio + test_ratio, 1.), 'ratio sum != 1'
    data = data.sample(frac=1).reset_index(drop=True)  # shuffle
    train_index = int(len(data) * train_ratio)
    # Positional slices: a label slice (.loc) is end-inclusive and would put
    # the boundary row into both sets.
    train_data = data.iloc[:train_index].reset_index(drop=True)
    test_data = data.iloc[train_index:].reset_index(drop=True)
    return train_data, test_data
def split_train_test(df: pd.DataFrame, test_size: float = 0.2)\
        -> Tuple[np.array, np.array, np.array, np.array]:
    """Randomly split ``df`` and return features/targets as numpy arrays.

    ``int(test_size * len(df))`` rows are sampled for the test set; the rest
    (selected by dropping the sampled index labels) is the training set.

    :return: ``(X_train, y_train, X_test, y_test)`` where the target is the
        ``SURVIVED`` column.
    """
    # TODO: make smart split (choose not randomly, but equally from each group)
    assert test_size < 1, f"too big test_size = {test_size}"
    n_test = int(test_size * len(df))
    test = df.sample(n_test)
    train = df.drop(test.index)
    assert len(test) + len(train) == len(df), f"wrong size test + train != df"

    X_train = train.drop(SURVIVED, axis=1).to_numpy()
    y_train = train[SURVIVED].to_numpy()
    X_test = test.drop(SURVIVED, axis=1).to_numpy()
    y_test = test[SURVIVED].to_numpy()
    return X_train, y_train, X_test, y_test
def data_sample(X: pd.DataFrame, y: pd.Series, nrows: int=5000) -> (pd.DataFrame, pd.Series):
    """Cap ``X`` / ``y`` at ``nrows`` rows using a fixed-seed random sample.

    Inputs that already fit are returned as-is (same objects). Otherwise
    ``nrows`` rows of ``X`` are drawn with ``random_state=1`` and ``y`` is
    looked up by the sampled index labels.
    """
    if len(X) <= nrows:
        return X, y
    X_sample = X.sample(nrows, random_state=1)
    return X_sample, y[X_sample.index]
def fetch_card_images(cards_df: pd.DataFrame, limit_n=None, limit_frac=None, max_workers=5, delay=0.1):  #, i=None):
    '''
    Fetch the image of every card in `cards_df` (or of a random subset).

    ---
    `cards_df` cards dataframe; each row must provide a `name` entry

    `limit_n:int` (optional) how many cards to pool from `cards_df`

    `limit_frac:float[0-1]` (optional) fraction of cards to pool from
    `cards_df`; ignored when `limit_n` is given

    `max_workers` will be passed to TaskExecutor

    `delay` will be passed to TaskExecutor

    Returns the list of task results. A card whose task fails with a
    `TypeError` mentioning `NoneType` is reported on stdout and skipped;
    any other error propagates.
    '''
    if limit_n is not None:
        cards_df = cards_df.sample(n=limit_n)
    elif limit_frac is not None:
        cards_df = cards_df.sample(frac=limit_frac)

    # setup queue for fetching requested card images
    # added delay to workers as requested by scryfall,
    # https://scryfall.com/docs/api#rate-limits-and-good-citizenship
    task_master = TaskExecutor(max_workers=max_workers, delay=delay)
    futures = [
        (card['name'],
         task_master.submit(task=fetch_card_img, card=card, to_file=True))
        for _i, card in cards_df.iterrows()
    ]

    # get results from futures
    res = []
    for card_name, future in futures:
        try:
            res.append(future.result())
        except TypeError as err:
            if 'NoneType' not in str(err):
                raise
            print(
                f'#### TypeError(NoneType) while retrieving results from {card_name} ####'
            )
    return res
def data_sample(X: pd.DataFrame, nrows: int = 5000):
    """
    Cap a frame at ``nrows`` rows.

    :param X: frame to down-sample
    :param nrows: maximum number of rows to keep
    :return: a copy of ``X`` when it has at most ``nrows`` rows, otherwise a
        fixed-seed (``random_state=1``) sample of ``nrows`` rows with a fresh
        0..n-1 index
    """
    if len(X.index) <= nrows:
        return X.copy()
    picked = X.sample(nrows, random_state=1)
    return picked.reset_index(drop=True)
class GroupStrings(object):
    """Benchmark: grouped ``max`` keyed on four string columns."""

    def setup(self):
        """Build a shuffled 200k-row frame of string keys plus a float column."""
        n = 2 * 10**5
        alpha = [''.join(letters) for letters in product(ascii_letters, repeat=4)]
        # n // 5 distinct key rows, each repeated five times.
        keys = np.random.choice(alpha, (n // 5, 4), replace=False)
        keys = np.repeat(keys, 5, axis=0)
        frame = DataFrame(keys, columns=list('abcd'))
        frame['joe'] = (np.random.randn(len(frame)) * 10).round(3)
        self.df = frame.sample(frac=1).reset_index(drop=True)

    def time_multi_columns(self):
        """Timed operation: group by all four key columns and take the max."""
        self.df.groupby(list('abcd')).max()
class I8Merge(object):
    """Benchmark: merging two million-row integer frames for each join type."""

    params = ['inner', 'outer', 'left', 'right']
    param_names = ['how']

    def setup(self, how):
        """Create ``left`` and a shuffled, sign-flipped counterpart ``right``."""
        low, high, n = -1000, 1000, 10**6
        left = DataFrame(np.random.randint(low, high, (n, 7)),
                         columns=list('ABCDEFG'))
        left['left'] = left.sum(axis=1)

        right = left.sample(frac=1).rename({'left': 'right'}, axis=1)
        right = right.reset_index(drop=True)
        right['right'] *= -1

        self.left = left
        self.right = right

    def time_i8merge(self, how):
        """Timed operation: merge the two frames with the given join type."""
        merge(self.left, self.right, how=how)
def split_train_test(df: pd.DataFrame, percent: float=0.8):
    """
    Creates a train and test set by random sampling where 'percent' of the initial data is used for training.

    Every input row ends up in exactly one of the two returned frames.

    :param df: The DataFrame to split.
    :param percent: The fraction (0-1) of data to use for training.
    :return: A DataFrame consisting of train data, and a DataFrame consisting of test/validation data.
    """
    df = df.sample(frac=1).reset_index(drop=True)
    split_index = int(percent * len(df))

    train_df = df.iloc[:split_index]
    # Start exactly at split_index: starting one later silently discarded a row.
    test_df = df.iloc[split_index:]

    return train_df, test_df
def build_zip_data(where_inner="", where_outer=""):
    """
    Builds the data for a scatter plot of complaint counts vs median income
    per zip code.

    :param where_inner: text substituted into the first placeholder of the
        COMPLAINTS_WITH_MEDIAN_INCOME query.
    :param where_outer: text substituted into the second placeholder.
    :return: DataFrame with columns zip_code, complaint_count and
        median_income (default integer index), capped at 5000 random rows,
        with complaint counts more than 10 standard deviations from the
        mean removed.
    """
    # NOTE(review): the two fragments are spliced into the SQL text with
    # str.format; they must never carry untrusted input.
    query = COMPLAINTS_WITH_MEDIAN_INCOME.format(where_inner, where_outer)

    cur.execute(query)
    cc_by_zip = DataFrame(cur.fetchall(), columns=[
        'zip_code', 'complaint_count', 'median_income'])
    # A set_index('zip_code', drop=False) call used to sit here, but its
    # result was discarded, so it never changed the frame; it is removed
    # rather than "fixed" to keep the returned index as callers see it today.

    # There are over 20,000 zip codes, so let's just take a sample, if needed
    if len(cc_by_zip.index) > 5000:
        cc_by_zip = cc_by_zip.sample(5000)

    # Remove outliers to make for an easier to read plot
    counts = cc_by_zip.complaint_count
    within_range = numpy.abs(counts - counts.mean()) <= (10 * counts.std())
    return cc_by_zip[within_range]
def test_sample(sel):
    """Object-specific checks for ``sample``: weights, axis handling, alignment."""
    # Fixes issue: 2419
    # additional specific object based tests

    # A few dataframe test with degenerate weights: only row 5 can be drawn.
    only_row_5 = [1 if pos == 5 else 0 for pos in range(10)]
    df = pd.DataFrame({'col1': range(10, 20),
                       'col2': range(20, 30),
                       'colString': ['a'] * 10,
                       'easyweights': only_row_5})
    assert_frame_equal(df.sample(n=1, weights='easyweights'), df.iloc[5:6])

    # Ensure proper error if string given as weight for Series or
    # DataFrame with axis = 1.
    s = Series(range(10))
    with pytest.raises(ValueError):
        s.sample(n=3, weights='weight_column')
    with pytest.raises(ValueError):
        df.sample(n=1, weights='weight_column', axis=1)

    # Check weighting key error
    with pytest.raises(KeyError):
        df.sample(n=3, weights='not_a_real_column_name')

    # Check that re-normalizes weights that don't sum to one.
    weights_less_than_1 = [0.5] + [0] * 9
    tm.assert_frame_equal(
        df.sample(n=1, weights=weights_less_than_1), df.iloc[:1])

    ###
    # Test axis argument
    ###
    df = pd.DataFrame({'col1': range(10), 'col2': ['a'] * 10})
    second_column_weight = [0, 1]
    # Different axis arg types (positional number and name) for columns.
    for column_axis in (1, 'columns'):
        assert_frame_equal(
            df.sample(n=1, axis=column_axis, weights=second_column_weight),
            df[['col2']])

    weight = [0.5 if pos == 5 else 0 for pos in range(10)]
    for row_axis in ('rows', 'index'):
        assert_frame_equal(
            df.sample(n=1, axis=row_axis, weights=weight), df.iloc[5:6])

    # Check out of range axis values
    for bad_axis in (2, 'not_a_name'):
        with pytest.raises(ValueError):
            df.sample(n=1, axis=bad_axis)

    with pytest.raises(ValueError):
        s = pd.Series(range(10))
        s.sample(n=1, axis=1)

    # Test weight length compared to correct axis
    with pytest.raises(ValueError):
        df.sample(n=1, axis=1, weights=[0.5] * 10)

    # Check weights with axis = 1
    easy_weight_list = [0, 0, 1]
    df = pd.DataFrame({'col1': range(10, 20),
                       'col2': range(20, 30),
                       'colString': ['a'] * 10})
    sample1 = df.sample(n=1, axis=1, weights=easy_weight_list)
    assert_frame_equal(sample1, df[['colString']])

    # Test default axes
    assert_frame_equal(
        df.sample(n=3, random_state=42),
        df.sample(n=3, axis=0, random_state=42))

    # Test that function aligns weights with frame
    df = DataFrame(
        {'col1': [5, 6, 7], 'col2': ['a', 'b', 'c']}, index=[9, 5, 3])
    s = Series([1, 0, 0], index=[3, 5, 9])
    assert_frame_equal(df.loc[[3]], df.sample(1, weights=s))

    # Weights have index values to be dropped because not in
    # sampled DataFrame
    s2 = Series([0.001, 0, 10000], index=[3, 5, 10])
    assert_frame_equal(df.loc[[3]], df.sample(1, weights=s2))

    # Weights have empty values to be filled with zeros
    s3 = Series([0.01, 0], index=[3, 5])
    assert_frame_equal(df.loc[[3]], df.sample(1, weights=s3))

    # No overlap in weight and sampled DataFrame indices
    s4 = Series([1, 0], index=[1, 2])
    with pytest.raises(ValueError):
        df.sample(1, weights=s4)