Example #1
def data_sample(X: pd.DataFrame, y: pd.Series, nrows: int = 5000, method: int = 2, random_state=SEED):
    # -> (pd.DataFrame, pd.Series):
    if len(X) > nrows:
        if method == 0:
            X_sample = X.sample(nrows, random_state=random_state)
            y_sample = y[X_sample.index]
        elif method == 1:
            # for unbalanced data - take care of imbalance
            X_sample = X.assign(label=y)
            rate = pd.DataFrame(data=[[1, len(y) - np.sum(y)], [0, np.sum(y)]],
                                columns=['label', 'rate'])
            X_sample = X_sample.merge(rate, on='label') \
                .sample(nrows, random_state=random_state, weights='rate')
            y_sample = X_sample['label']
            X_sample = X_sample.drop(['label', 'rate'], axis=1)
        else:
            # for unbalanced data - keep imbalance
            nrows_1 = int(np.ceil(np.sum(y) / len(y) * nrows))
            X_sample = X.assign(label=y)
            X_sample_1 = X_sample \
                .query("label == 1") \
                .sample(nrows_1, random_state=random_state)
            X_sample_0 = X_sample \
                .query("label == 0") \
                .sample(nrows - nrows_1, random_state=random_state)
            X_sample = pd.concat([X_sample_0, X_sample_1],
                                 sort=False,
                                 ignore_index=True)
            X_sample = shuffle(X_sample)  # e.g. sklearn.utils.shuffle, to mix the two classes
            y_sample = X_sample['label']
            X_sample = X_sample.drop(['label'], axis=1)
    else:
        X_sample, y_sample = X, y

    return X_sample, y_sample
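A minimal, self-contained sketch (not part of the original example) contrasting the two imbalance-handling strategies above: inverse-frequency weights roughly rebalance the classes, while sampling each class separately preserves the original class ratio.

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
X = pd.DataFrame({"f": rng.normal(size=10_000)})
y = pd.Series((rng.random(10_000) < 0.05).astype(int))  # ~5% positive labels

# method=1 idea: weight every row by the size of the *other* class
weights = y.map({0: y.sum(), 1: (y == 0).sum()})
balanced = X.sample(1000, weights=weights, random_state=0)
print(y[balanced.index].mean())  # roughly 0.5

# method=2 idea: sample each class on its own to keep the original imbalance
n1 = int(np.ceil(y.mean() * 1000))
stratified_idx = pd.concat([y[y == 1].sample(n1, random_state=0),
                            y[y == 0].sample(1000 - n1, random_state=0)]).index
print(y[stratified_idx].mean())  # roughly 0.05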
Example #2
    def test_sample_axis1(self):
        # Check weights with axis = 1
        easy_weight_list = [0] * 3
        easy_weight_list[2] = 1

        df = DataFrame({
            "col1": range(10, 20),
            "col2": range(20, 30),
            "colString": ["a"] * 10
        })
        sample1 = df.sample(n=1, axis=1, weights=easy_weight_list)
        tm.assert_frame_equal(sample1, df[["colString"]])

        # Test default axes
        tm.assert_frame_equal(df.sample(n=3, random_state=42),
                              df.sample(n=3, axis=0, random_state=42))
Example #3
def get_cubic_fit(data: pd.DataFrame):
    # Drop rows where x or y is missing so the two columns stay aligned, then sort by x for CubicSpline.
    data = data.dropna(subset=['x', 'y']).sort_values(by='x')
    data_x, data_y = data['x'], data['y']
    f = scipy.interpolate.CubicSpline(data_x, data_y)
    x = list(np.linspace(min(data_x), max(data_x), NUM_INTERPOLATED_POINTS))
    data = pd.DataFrame(dict(x=x, y=f(x)))
    return data.sample(frac=0.1, weights=data.y)
Example #4
def find_risk_profile(df: pd.DataFrame, feature: str, topk_ratio: float,
                      adj: float, option: str) -> Union[list, dict]:
    """
    dtype feature: str
    dtype topk_ratio: float (range: 0-1)
    dtype adj: float (to modify the mean)
    dtype option: str ('topk', 'ratio')
    rtype: list(option='topk') or dict(option='ratio')
    
    The 'topk' option usually works better than 'ratio', which is more prone to overfitting.
    """

    # Top-k suspicious item flagging
    if option == 'topk':
        total_cnt = df.groupby([feature])['illicit']
        nrisky_profile = int(topk_ratio * len(total_cnt)) + 1
        # prob_illicit = total_cnt.mean()  # Simple mean
        adj_prob_illicit = total_cnt.sum() / (total_cnt.count() + adj)  # Smoothed mean
        return list(
            adj_prob_illicit.sort_values(
                ascending=False).head(nrisky_profile).index)

    # Illicit-ratio encoding (Mean target encoding)
    # Refer: http://docs.h2o.ai/h2o/latest-stable/h2o-docs/data-munging/target-encoding.html
    # Refer: https://towardsdatascience.com/why-you-should-try-mean-encoding-17057262cd0
    elif option == 'ratio':
        # For target encoding, we just use 70% of train data to avoid overfitting (otherwise, test AUC drops significantly)
        total_cnt = df.sample(frac=0.7).groupby([feature])['illicit']
        nrisky_profile = int(topk_ratio * len(total_cnt)) + 1
        # prob_illicit = total_cnt.mean()  # Simple mean
        adj_prob_illicit = total_cnt.sum() / (total_cnt.count() + adj)  # Smoothed mean
        return adj_prob_illicit.to_dict()
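The 'ratio' branch above is a smoothed variant of mean target encoding. A toy, self-contained illustration of the same sum / (count + adj) smoothing (not taken from the original code):

import pandas as pd

toy = pd.DataFrame({"importer": ["a", "a", "a", "b", "c", "c"],
                    "illicit":  [1,   0,   1,   1,   0,   0]})
grouped = toy.groupby("importer")["illicit"]
adj = 2.0  # the smoothing term shrinks categories with few rows toward zero
smoothed = grouped.sum() / (grouped.count() + adj)
print(smoothed.to_dict())  # {'a': 0.4, 'b': 0.333..., 'c': 0.0}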
Example #5
def create_rankings(ex_mtx: pd.DataFrame, seed=None) -> pd.DataFrame:
    """
    Create a whole genome rankings dataframe from a single cell expression profile dataframe.

    :param ex_mtx: The expression profile matrix. The rows should correspond to different cells, the columns to different
        genes (n_cells x n_genes).
    :param seed: Optional random seed for the shuffle, for reproducibility.
    :return: A genome rankings dataframe (n_cells x n_genes).
    """
    # Shuffling gives behaviour closely matching the R implementation.
    # 1. Ranks are assigned in the range of 1 to n, therefore we need to subtract 1.
    # 2. In case of a tie the 'first' method is used, i.e. we keep the order in the original array. To remove any
    #    bias we shuffle the dataframe before ranking it. This introduces a performance penalty!
    # 3. Genes are ranked according to gene expression in descending order, i.e. from highly expressed (0) to low expression (n).
    # 4. NAs should be given the highest rank numbers. Documentation is bad, so tested implementation via code snippet:
    #
    #    import pandas as pd
    #    import numpy as np
    #    df = pd.DataFrame(data=[4, 1, 3, np.nan, 2, 3], columns=['values'])
    #    # Run below statement multiple times to see effect of shuffling in case of a tie.
    #    df.sample(frac=1.0, replace=False).rank(ascending=False, method='first', na_option='bottom').sort_index() - 1
    #
    return (
        ex_mtx.sample(frac=1.0, replace=False, axis=1, random_state=seed).rank(
            axis=1, ascending=False, method='first',
            na_option='bottom').astype(DTYPE) - 1)
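The tie-breaking and NA handling described in the comments can be checked with the snippet they reference, reproduced here in runnable form:

import numpy as np
import pandas as pd

df = pd.DataFrame(data=[4, 1, 3, np.nan, 2, 3], columns=['values'])
# Shuffle, rank in descending order with method='first' (ties keep the shuffled
# order), push NaNs to the bottom, then subtract 1 for 0-based ranks.
ranks = (df.sample(frac=1.0, replace=False)
           .rank(ascending=False, method='first', na_option='bottom')
           .sort_index() - 1)
print(ranks)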
Example #6
def pareto_and_runtimes_by_task(df: pd.DataFrame) -> alt.Chart:
  """Creates an interactive Pareto curve and scatter plot of task runtimes.

  Tracing each curve shows to what extent a small proportion of long-running
  regions contribute disproportionately to the overall runtime. That is,
  "The longest-running X% of regions account for Y% of the total runtime."
  There is a curve for each task.

  Args:
    df: A dataframe of all regions.

  Returns:
    An altair chart.
  """
  grouped = df.groupby(df['Task'], sort=False)
  df = grouped.apply(calculate_pareto_metrics)

  # Sample along the Pareto curve, ensuring the longest regions are shown.
  if len(df) > 5000:
    x = 1000
    df = pd.concat([df.nlargest(x, 'total runtime'), df.sample(5000 - x)])

  # Limit columns to greatly reduce the size of the html report.
  columns_used = [
      'task cumsum order', 'task cumsum fraction', 'tooltip', 'Task',
      'task total runtime', 'task num examples', 'Runtime for task'
  ]
  df = df[columns_used]

  # Brushing on the task_scatter plot highlights the same tasks in the Pareto
  # curve.
  brush = alt.selection_interval()

  pareto_by_task = alt.Chart(df).mark_line(size=2).encode(
      x=alt.X(
          'task cumsum order',
          title='The longest-runtime X% of regions',
          axis=alt.Axis(format='%')),
      y=alt.Y(
          'task cumsum fraction',
          title='Account for Y% of the total runtime',
          axis=alt.Axis(format='%')),
      tooltip='tooltip',
      color=alt.condition(brush, 'Task:N', alt.value('lightgray'))).properties(
          title='Pareto curve for each task').interactive()

  # This chart needs to use the same dataframe as the first chart to enable the
  # brushing on one to affect the other. Using max(task) for 'text' is a
  # trick that causes bundling by task to avoid showing multiple overlapping
  # points which otherwise make the text look funky.
  task_scatter = alt.Chart(df).mark_point(size=10).encode(
      x=alt.X('max(task total runtime)', title='Runtime (seconds)'),
      y=alt.Y('task num examples:Q', title='Number of examples'),
      color=alt.condition(brush, 'Task:N', alt.value('lightgray')),
      tooltip=['Task', 'Runtime for task']
    ) \
    .properties(title='Total runtime for each task (drag to highlight)') \
    .add_selection(brush)

  return pareto_by_task | task_scatter
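The helper calculate_pareto_metrics is not shown. A hypothetical sketch of the per-task cumulative columns the chart encodes ('task cumsum order' and 'task cumsum fraction'), assuming a per-region 'Runtime' column, could look like this:

import pandas as pd

def calculate_pareto_metrics(task_df: pd.DataFrame) -> pd.DataFrame:
    # Hypothetical reconstruction: sort regions from longest to shortest, then
    # record what fraction of regions and of the total runtime each prefix covers.
    task_df = task_df.sort_values('Runtime', ascending=False)
    n = len(task_df)
    task_df['task cumsum order'] = [(i + 1) / n for i in range(n)]
    task_df['task cumsum fraction'] = task_df['Runtime'].cumsum() / task_df['Runtime'].sum()
    return task_df

toy = pd.DataFrame({'Task': ['t1'] * 4, 'Runtime': [8.0, 1.0, 0.5, 0.5]})
# The longest 25% of regions account for 80% of this toy task's runtime.
print(toy.groupby('Task', sort=False, group_keys=False).apply(calculate_pareto_metrics))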
Example #7
    def accumulateAll(self, app: pd.DataFrame,
                      tracker: pd.DataFrame) -> pd.DataFrame:
        """ Function call to prepare the dataset. (Needed application and tracker data set)
        call :func:`prepareData` before calling this function.

        :param app: Application data
        :param tracker: tracker data
        :return: A dataset.

        """

        print(
            "Considering only a sample of {} applications (this might take some time!)"
            .format(self.sample))
        app_data = app.sample(self.sample)

        ## detailed extraction
        df = pd.DataFrame()
        for x in tqdm(app_data.index):
            try:
                data = self.jsonExtractor(handle=x)[x]
                temp = pd.DataFrame(data.get('reports')[0])[[
                    'version', 'updated_at', 'trackers', 'downloads'
                ]]
                temp['handle'] = x
            except KeyError:
                continue
            df = pd.concat([df, temp], axis=0)
        df.rename(columns={'trackers': 'tracker_id'}, inplace=True)
        df['tracker_id'] = df.tracker_id.astype('int')

        df = df.set_index('tracker_id').join(tracker, how='left').reset_index()
        final = df.set_index('handle').join(
            app_data, how='left').reset_index().sort_values(by='handle')
        return final
Example #8
def _samplepointwisedepth(data: pd.DataFrame,
                          to_compute: pd.Index = None,
                          K=2,
                          containment='simplex',
                          quiet=True) -> pd.Series:
    """
    Compute sample pointwise depth for n points in R^p, where data is an nxp matrix of points. If to_compute is not None,
    only compute depth for the given points (should be a subset of data.index).
    
    Parameters:
    ----------
    data: pd.DataFrame
        n x d DataFrame, where we have n points in d dimensional space.
    to_compute: list, pd.Index
        The particular points (indices) for which to calculate depth. If None, depth is calculated for all points.
    K: int (default 2)
        Number of blocks to compute sample depth with.
    containment: str
        Definition of containment.

    Returns:
    ----------
    pd.Series: Depth values for the given points with respect to the data. The Series is indexed by the points' indices in the original data, and the values are their depths.

    """

    # If K=1, don't bother splitting the data. Just return regular depth.
    if K == 1:
        return _pointwisedepth(data=data,
                               to_compute=to_compute,
                               containment=containment)

    n, d = data.shape
    depths = []

    if to_compute is None:
        to_compute = data.index

    # K blocks of points (indices)
    ss = n // K

    # Compute the sample depth of each point. This should be containment-agnostic,
    # since the computation is done in _pointwisedepth, which calls the appropriate depth measure.
    for time in tqdm(to_compute, disable=quiet):
        cd = []
        for _ in tqdm(range(ss), disable=quiet):
            sdata = data.sample(n=ss, axis=0)

            # If our current datapoint isn't in the sampled data, add its row, since we need its depth
            if time not in sdata.index:
                sdata = pd.concat([sdata, data.loc[[time]]])  # DataFrame.append was removed in pandas 2.0

            cd.append(
                _pointwisedepth(data=sdata,
                                to_compute=[time],
                                containment=containment))
        depths.append(np.mean(cd))

    return pd.Series(index=to_compute, data=depths)
Example #9
    def test_sample_ignore_index(self):
        # GH 38581
        df = DataFrame(
            {"col1": range(10, 20), "col2": range(20, 30), "colString": ["a"] * 10}
        )
        result = df.sample(3, ignore_index=True)
        expected_index = Index([0, 1, 2])
        tm.assert_index_equal(result.index, expected_index)
Example #10
def split(df: pd.DataFrame):
    train = df.sample(frac=0.8)
    test = df.drop(train.index)
    print("测试集大小:", len(test))
    print("测试集故障数据量", len(test.loc[df["y_final_result"] == 1]))
    print("训练集大小:", len(train))
    print("训练集故障数据量", len(test.loc[df["y_final_result"] == 0]))
    return train, test
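The sample(frac=0.8) / drop(train.index) pattern used above (and again in Example #27) relies on the frame having a unique index; a minimal sketch of that assumption:

import pandas as pd

df = pd.DataFrame({"x": range(10), "y_final_result": [1, 0] * 5})
df = df.reset_index(drop=True)   # make sure the index is unique before splitting
train = df.sample(frac=0.8, random_state=0)
test = df.drop(train.index)      # the complement of the sampled rows
assert len(train) + len(test) == len(df)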
Example #11
def get_geometries(shp, **filter_attrs):
    gdf = read_file(shp)
    df = DataFrame(gdf)
    if 'select' in filter_attrs.keys():
        _drop = [x for x in df.columns if x not in filter_attrs['select']]
        df.drop(columns=_drop, inplace=True)
    df = df.sample(frac=1.)
    return df['geometry']
Example #12
def sample(df: pd.DataFrame, args: dict) -> pd.DataFrame:
    if args['subsample_ratio'] > 1:
        ratio = args['subsample_ratio']
    else:
        ratio = int(args['subsample_ratio'] * df.shape[0])

    df = df.sample(n=int(ratio), random_state=args['seed'])
    return df
Example #13
    def _pick_random_missions(missions: pd.DataFrame) -> list[dict[str, Any]]:
        if len(missions) > num_missions:
            samples = missions.sample(num_missions)
        else:
            samples = missions
        return typing.cast(
            list[dict[str, Any]],
            samples[['associationName', 'title', 'link', 'description']].to_dict('records'))
Example #14
def get_feature_importance(df: pd.DataFrame, feat: str, n: int = 5000):
    """Get the importance of each figure wrt the specified feature"""

    df = df.sample(n, axis=0)
    model = AdaBoostClassifier().fit(df.loc[:, df.columns != feat],
                                     df[feat].transform(lambda x: int(x)))

    return model.feature_importances_
Example #15
def create_train_test_datasets(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    df = df.sample(frac=1)  # shuffle
    df["price"] = df["price"] * 1000
    test_size = int(df.shape[0] * 0.1)
    df_test = df[:test_size]
    df_train = df[test_size:]

    return df_test, df_train
Example #16
    def test_sample_does_not_modify_weights(self):
        # GH-42843
        result = np.array([np.nan, 1, np.nan])
        expected = result.copy()
        ser = Series([1, 2, 3])

        # Test numpy array weights won't be modified in place
        ser.sample(weights=result)
        tm.assert_numpy_array_equal(result, expected)

        # Test DataFrame column won't be modified in place
        df = DataFrame({"values": [1, 1, 1], "weights": [1, np.nan, np.nan]})
        expected = df["weights"].copy()

        df.sample(frac=1.0, replace=True, weights="weights")
        result = df["weights"]
        tm.assert_series_equal(result, expected)
Example #17
def assign_validation_split(dataset: pd.DataFrame,
                            split: float,
                            random_state=None) -> pd.DataFrame:
    dataset = dataset.copy()
    dataset["train"] = True
    val_subset = dataset.sample(frac=split, random_state=random_state)
    dataset.loc[val_subset.index, "train"] = False
    return dataset
Example #18
def divide_datasets(df_merged: pd.DataFrame,
                    percentage: float = 0.67) -> (pd.DataFrame, pd.DataFrame):

    df_divide = df_merged.sample(frac=1)
    df_train = df_divide[:int((len(df_divide)) * percentage)]
    df_test = df_divide[int((len(df_divide)) * percentage):]

    return df_train, df_test
Example #19
    def test_sample_is_copy(self):
        # GH#27357, GH#30784: ensure the result of sample is an actual copy and
        # doesn't track the parent dataframe / doesn't give SettingWithCopy warnings
        df = DataFrame(np.random.randn(10, 3), columns=["a", "b", "c"])
        df2 = df.sample(3)

        with tm.assert_produces_warning(None):
            df2["d"] = 1
Example #20
def get_predictions_500(data: DataFrame, model) -> Series:
    data_500 = data.sample(500, replace=True,
                           random_state=12345).reset_index(drop=True)
    features_train_500 = data_500.drop(['product'],
                                       axis=1).reset_index(drop=True)
    predictions = model.predict(features_train_500)

    return pd.Series(predictions)
Example #21
def fetch_covid_data():
    JSONContent = requests.get("https://api.covid19india.org/data.json").json()

    channels_list = []  # avoid a NameError if the API response reports an error
    if 'error' not in JSONContent:
        channels_list = JSONContent["statewise"]
    channel = []
    c = 0.0
    for i in channels_list:
        c = c + 1
        channel.append([i['state'], i['lastupdatedtime'], i['confirmed']])
    channel = channel[1:]

    dataset = DataFrame(channel,
                        columns=['State', 'Lastupdatedtime', 'Confirmed'])
    dataset.sample(5)  # preview of 5 random rows; the result is not used here
    dataset.to_csv('file1.csv', encoding='utf-8', sep='\t', index=False)
    return c
Example #22
def gmm_model_selection(
    x: pd.DataFrame,
    n_components_range: range,
    part_size: int,
    n_runs: int = 100,
    n_cores: int = False,
    cv_types: Tuple = ("spherical", "tied", "diag", "full"),
) -> Tuple[List[list], List[np.ndarray], Union[int, Any]]:
    """

    Runs GMM clustering model selection on the specified X dataframe, outputs the bic distribution per model,
    a vector with the median BICs and an object with the overall best model.

    Args:
        x (pandas.DataFrame): Data matrix to train the models
        n_components_range (range): Generator with numbers of components to evaluate
        n_runs (int): Number of bootstraps for each model
        part_size (int): Size of bootstrap samples for each model
        n_cores (int): Number of cores to use for computation
        cv_types (tuple): Covariance Matrices to try. All four available by default

    Returns:
        - bic (list): All recorded BIC values for all attempted parameter combinations
        (useful for plotting)
        - m_bic(list): All minimum BIC values recorded throughout the process
        (useful for plotting)
        - best_bic_gmm (sklearn.GMM): Unfitted version of the best found model

    """

    # Set the default of n_cores to the most efficient value
    if not n_cores:
        n_cores = min(multiprocessing.cpu_count(), n_runs)

    bic = []
    m_bic = []
    lowest_bic = np.inf
    best_bic_gmm = 0

    pbar = tqdm(total=len(cv_types) * len(n_components_range))

    for cv_type in cv_types:

        for n_components in n_components_range:

            res = Parallel(n_jobs=n_cores,
                           prefer="threads")(delayed(gmm_compute)(x.sample(
                               part_size, replace=True), n_components, cv_type)
                                             for _ in range(n_runs))
            bic.append([i[1] for i in res])

            pbar.update(1)
            m_bic.append(np.median([i[1] for i in res]))
            if m_bic[-1] < lowest_bic:
                lowest_bic = m_bic[-1]
                best_bic_gmm = res[0][0]

    return bic, m_bic, best_bic_gmm
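The gmm_compute helper is not shown here. A self-contained sketch of the same idea (scoring bootstrap samples by BIC and keeping the best model), using scikit-learn's GaussianMixture as an assumed stand-in for what gmm_compute wraps:

import numpy as np
import pandas as pd
from sklearn.mixture import GaussianMixture

rng = np.random.default_rng(0)
x = pd.DataFrame(rng.normal(size=(500, 3)))

lowest_bic, best_gmm, bics = np.inf, None, []
for cv_type in ("spherical", "tied", "diag", "full"):
    for n_components in range(1, 4):
        boot = x.sample(200, replace=True, random_state=0)  # bootstrap sample
        gmm = GaussianMixture(n_components=n_components, covariance_type=cv_type).fit(boot)
        bics.append(gmm.bic(boot))
        if bics[-1] < lowest_bic:
            lowest_bic, best_gmm = bics[-1], gmm
print(best_gmm)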
Example #23
def feature_selection(df: pd.DataFrame, config: Config):
    if config.is_train():
        df_size_mb = df.memory_usage(deep=True).sum() / 1024 / 1024
        if df_size_mb < 2 * 1024:
            return

        selected_columns = []
        config_sample = copy.deepcopy(config)
        for i in range(10):
            df_sample = df.sample(min(1000, len(df)),
                                  random_state=i).copy(deep=True)
            preprocess_pipeline(df_sample, config_sample)
            y = df_sample["target"]
            X = df_sample.drop("target", axis=1)

            if len(selected_columns) > 0:
                X = X.drop(selected_columns, axis=1)

            if len(X.columns) > 0:
                selected_columns += select_features(X, y, config["mode"])
            else:
                break

        log("Selected columns: {}".format(selected_columns))

        drop_number_columns = [
            c for c in df if (c.startswith("number_") or c.startswith("id_"))
            and c not in selected_columns
        ]
        if len(drop_number_columns) > 0:
            config["drop_number_columns"] = drop_number_columns

        config["date_columns"] = {}
        for c in [c for c in selected_columns if c.startswith("datetime_")]:
            d = c.split("_")
            date_col = d[0] + "_" + d[1]
            date_part = d[2]

            if date_col not in config["date_columns"]:
                config["date_columns"][date_col] = []

            config["date_columns"][date_col].append(date_part)

        drop_datetime_columns = [
            c for c in df
            if c.startswith("datetime_") and c not in config["date_columns"]
        ]
        if len(drop_datetime_columns) > 0:
            config["drop_datetime_columns"] = drop_datetime_columns

    if "drop_number_columns" in config:
        log("Drop number columns: {}".format(config["drop_number_columns"]))
        df.drop(config["drop_number_columns"], axis=1, inplace=True)

    if "drop_datetime_columns" in config:
        log("Drop datetime columns: {}".format(
            config["drop_datetime_columns"]))
        df.drop(config["drop_datetime_columns"], axis=1, inplace=True)
Example #24
def split_data(df: pd.DataFrame):
    df_shuffled = df.sample(frac=1).reset_index(drop=True)
    num_rows = df.shape[0]
    num_rows_train_data = int(num_rows * 0.8)

    train_data = df_shuffled[:num_rows_train_data]
    val_data = df_shuffled[num_rows_train_data:]

    return (train_data, val_data)
Example #25
def train(sets: DataFrame):
    print(f"Processing {len(sets)} sets\n")
    sets = sets.drop_duplicates()
    train_sets = sets.sample(frac=0.8)
    test_sets = concat([sets, train_sets]).drop_duplicates(keep=False)
    train_upset = train_sets.pop("upset").astype("int")
    test_upset = test_sets.pop("upset").astype("int")
    train_gnb(train_sets, train_upset, test_sets, test_upset)
    train_sgdc(train_sets, train_upset, test_sets, test_upset)
Example #26
def data_split_pandas(data: pd.DataFrame, train_ratio, test_ratio):
    assert (train_ratio + test_ratio) == 1., 'ratio sum != 1'
    data = data.sample(frac=1).reset_index(drop=True)  # shuffle
    train_index = int(len(data) * train_ratio)

    train_data = data.iloc[:train_index].reset_index(drop=True)  # iloc avoids the row at train_index appearing in both splits
    test_data = data.iloc[train_index:].reset_index(drop=True)

    return train_data, test_data
Example #27
def split_train_test(df: pd.DataFrame, test_size: float = 0.2)\
        -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    # TODO: make smart split (choose not randomly, but equally from each group)
    assert test_size < 1, f"too big test_size = {test_size}"
    test = df.sample(int(test_size * len(df)))
    train = df.drop(test.index)
    assert len(test) + len(train) == len(df), "wrong size: test + train != df"
    return train.drop(SURVIVED, axis=1).to_numpy(), train[SURVIVED].to_numpy(),\
           test.drop(SURVIVED, axis=1).to_numpy(), test[SURVIVED].to_numpy()
Example #28
def data_sample(X: pd.DataFrame, y: pd.Series, nrows: int=5000) -> (pd.DataFrame, pd.Series):
    if len(X) > nrows:
        X_sample = X.sample(nrows, random_state=1)
        y_sample = y[X_sample.index]
    else:
        X_sample = X
        y_sample = y

    return X_sample, y_sample
Example #29
def fetch_card_images(cards_df: pd.DataFrame,
                      limit_n=None,
                      limit_frac=None,
                      max_workers=5,
                      delay=0.1):  #, i=None):
    '''
    Return n card images from `cards_df`.\n
    ---
    `cards_df` cards dataframe\n
    `limit_n:int` (optional) how many cards to pool from `cards_df`\n
    `limit_frac:float[0-1]` (optional) how many cards to pool from `cards_df`\n
    `max_workers` will be passed to TaskExecutor\n
    `delay` will be passed to TaskExecutor
    '''
    if limit_n is not None:
        cards_df = cards_df.sample(n=limit_n)
    elif limit_frac is not None:
        cards_df = cards_df.sample(frac=limit_frac)

    # setup queue for fetching requested card images
    # added delay to workers as requested by scryfall,
    # https://scryfall.com/docs/api#rate-limits-and-good-citizenship
    task_master = TaskExecutor(max_workers=max_workers, delay=delay)
    futures = []
    for (_i, card) in cards_df.iterrows():
        future = task_master.submit(task=fetch_card_img,
                                    card=card,
                                    to_file=True)
        futures += [(card['name'], future)]

    # get results from futures
    res = []
    for (card_name, future) in futures:
        try:
            res += [future.result()]
        except TypeError as err:
            if 'NoneType' in str(err):
                print(
                    f'#### TypeError(NoneType) while retrieving results from {card_name} ####'
                )
                # print(err)
            else:
                raise err
    return res
Example #30
def data_sample(X: pd.DataFrame, nrows: int = 5000):
    # -> (pd.DataFrame, pd.Series):
    """
    zypang change to one line
    :param X:
    :param nrows:
    :return:
    """
    return X.copy() if len(X.index) <= nrows else X.sample(
        nrows, random_state=1).reset_index(drop=True)
Example #31
class GroupStrings(object):

    def setup(self):
        n = 2 * 10**5
        alpha = list(map(''.join, product(ascii_letters, repeat=4)))
        data = np.random.choice(alpha, (n // 5, 4), replace=False)
        data = np.repeat(data, 5, axis=0)
        self.df = DataFrame(data, columns=list('abcd'))
        self.df['joe'] = (np.random.randn(len(self.df)) * 10).round(3)
        self.df = self.df.sample(frac=1).reset_index(drop=True)

    def time_multi_columns(self):
        self.df.groupby(list('abcd')).max()
Example #32
class I8Merge(object):

    params = ['inner', 'outer', 'left', 'right']
    param_names = ['how']

    def setup(self, how):
        low, high, n = -1000, 1000, 10**6
        self.left = DataFrame(np.random.randint(low, high, (n, 7)),
                              columns=list('ABCDEFG'))
        self.left['left'] = self.left.sum(axis=1)
        self.right = self.left.sample(frac=1).rename({'left': 'right'}, axis=1)
        self.right = self.right.reset_index(drop=True)
        self.right['right'] *= -1

    def time_i8merge(self, how):
        merge(self.left, self.right, how=how)
Example #33
def split_train_test(df: pd.DataFrame, percent: float = 0.8):
    """
    Creates a train and test set by random sampling where 'percent' of the initial
    data is used for training.
    :param df: The DataFrame to split.
    :param percent: The percentage of data to use for training.
    :return: A DataFrame consisting of train data, and a DataFrame consisting of test/validation data.
    """
    df = df.sample(frac=1).reset_index(drop=True)
    num_rows = len(df)

    split_index = int(percent * num_rows)

    train_df = df.iloc[:split_index]
    test_df = df.iloc[split_index:]  # start at split_index so no row is dropped from both sets

    return train_df, test_df
Example #34
def build_zip_data(where_inner="", where_outer=""):
    """
    Generates a scatter plot of complaint counts vs median income per zip code
    """
    query = COMPLAINTS_WITH_MEDIAN_INCOME.format(where_inner, where_outer)
    cur.execute(query)
    cc_by_zip = DataFrame(cur.fetchall(), columns = [
                          'zip_code', 'complaint_count', 'median_income'])
    cc_by_zip = cc_by_zip.set_index('zip_code', drop=False)

    # There are over 20,000 zip codes, so let's just take a sample, if needed
    if len(cc_by_zip.index) > 5000:
        cc_by_zip = cc_by_zip.sample(5000)

    # Remove outliers to make for an easier-to-read plot
    cc_by_zip = cc_by_zip[
        numpy.abs(
            cc_by_zip.complaint_count - cc_by_zip.complaint_count.mean()
        ) <= (10*cc_by_zip.complaint_count.std())
    ]

    return cc_by_zip
Example #35
    def test_sample(self):
        # Fixes issue: 2419
        # additional specific object based tests

        # A few dataframe test with degenerate weights.
        easy_weight_list = [0] * 10
        easy_weight_list[5] = 1

        df = pd.DataFrame({'col1': range(10, 20),
                           'col2': range(20, 30),
                           'colString': ['a'] * 10,
                           'easyweights': easy_weight_list})
        sample1 = df.sample(n=1, weights='easyweights')
        assert_frame_equal(sample1, df.iloc[5:6])

        # Ensure proper error if string given as weight for Series or
        # DataFrame with axis = 1.
        s = Series(range(10))
        with pytest.raises(ValueError):
            s.sample(n=3, weights='weight_column')

        with pytest.raises(ValueError):
            df.sample(n=1, weights='weight_column', axis=1)

        # Check weighting key error
        with pytest.raises(KeyError):
            df.sample(n=3, weights='not_a_real_column_name')

        # Check that sample re-normalizes weights that don't sum to one.
        weights_less_than_1 = [0] * 10
        weights_less_than_1[0] = 0.5
        tm.assert_frame_equal(
            df.sample(n=1, weights=weights_less_than_1), df.iloc[:1])

        ###
        # Test axis argument
        ###

        # Test axis argument
        df = pd.DataFrame({'col1': range(10), 'col2': ['a'] * 10})
        second_column_weight = [0, 1]
        assert_frame_equal(
            df.sample(n=1, axis=1, weights=second_column_weight), df[['col2']])

        # Different axis arg types
        assert_frame_equal(df.sample(n=1, axis='columns',
                                     weights=second_column_weight),
                           df[['col2']])

        weight = [0] * 10
        weight[5] = 0.5
        assert_frame_equal(df.sample(n=1, axis='rows', weights=weight),
                           df.iloc[5:6])
        assert_frame_equal(df.sample(n=1, axis='index', weights=weight),
                           df.iloc[5:6])

        # Check out of range axis values
        with pytest.raises(ValueError):
            df.sample(n=1, axis=2)

        with pytest.raises(ValueError):
            df.sample(n=1, axis='not_a_name')

        with pytest.raises(ValueError):
            s = pd.Series(range(10))
            s.sample(n=1, axis=1)

        # Test weight length compared to correct axis
        with pytest.raises(ValueError):
            df.sample(n=1, axis=1, weights=[0.5] * 10)

        # Check weights with axis = 1
        easy_weight_list = [0] * 3
        easy_weight_list[2] = 1

        df = pd.DataFrame({'col1': range(10, 20),
                           'col2': range(20, 30),
                           'colString': ['a'] * 10})
        sample1 = df.sample(n=1, axis=1, weights=easy_weight_list)
        assert_frame_equal(sample1, df[['colString']])

        # Test default axes
        assert_frame_equal(
            df.sample(n=3, random_state=42), df.sample(n=3, axis=0,
                                                       random_state=42))

        # Test that function aligns weights with frame
        df = DataFrame(
            {'col1': [5, 6, 7],
             'col2': ['a', 'b', 'c'], }, index=[9, 5, 3])
        s = Series([1, 0, 0], index=[3, 5, 9])
        assert_frame_equal(df.loc[[3]], df.sample(1, weights=s))

        # Weights with index values not present in the sampled DataFrame are dropped
        s2 = Series([0.001, 0, 10000], index=[3, 5, 10])
        assert_frame_equal(df.loc[[3]], df.sample(1, weights=s2))

        # Missing weight entries are filled with zeros
        s3 = Series([0.01, 0], index=[3, 5])
        assert_frame_equal(df.loc[[3]], df.sample(1, weights=s3))

        # No overlap in weight and sampled DataFrame indices
        s4 = Series([1, 0], index=[1, 2])
        with pytest.raises(ValueError):
            df.sample(1, weights=s4)