Example #1
0
class GroupStrings(object):
    """Benchmark grouping a frame by four string key columns."""

    def setup(self):
        """Build ``self.df``: shuffled string keys 'a'..'d' plus a float column.

        Each distinct key row is repeated five times before shuffling, so
        every group formed on the four key columns has five members.
        """
        total_rows = 2 * 10**5
        repeats = 5

        # Every 4-character combination of ASCII letters.
        words = [''.join(chars) for chars in product(ascii_letters, repeat=4)]

        # Distinct words (drawn without replacement) laid out as rows of
        # four keys, then each row duplicated `repeats` times.
        keys = np.random.choice(words, (total_rows // repeats, 4),
                                replace=False)
        keys = np.repeat(keys, repeats, axis=0)

        frame = DataFrame(keys, columns=list('abcd'))
        frame['joe'] = (np.random.randn(len(frame)) * 10).round(3)

        # Shuffle so that rows of the same group are not adjacent.
        self.df = frame.sample(frac=1).reset_index(drop=True)

    def time_multi_columns(self):
        """Time a max aggregation grouped on all four string columns."""
        key_columns = list('abcd')
        self.df.groupby(key_columns).max()
Example #2
0
class I8Merge(object):
    """Benchmark ``merge`` on two large integer frames for each join type."""

    param_names = ['how']
    params = ['inner', 'outer', 'left', 'right']

    def setup(self, how):
        """Create ``self.left`` and a shuffled, renamed copy ``self.right``.

        Both frames share the integer columns 'A'..'G'. The row-sum column is
        called 'left' in one frame and 'right' (negated) in the other.
        """
        n_rows = 10**6
        lo, hi = -1000, 1000

        left = DataFrame(np.random.randint(lo, hi, (n_rows, 7)),
                         columns=list('ABCDEFG'))
        left['left'] = left.sum(axis=1)

        # Same rows in random order, with the sum column renamed and negated.
        right = left.sample(frac=1).rename({'left': 'right'}, axis=1)
        right = right.reset_index(drop=True)
        right['right'] *= -1

        self.left = left
        self.right = right

    def time_i8merge(self, how):
        """Time merging the two frames with the parametrized join type."""
        merge(self.left, self.right, how=how)
def split_train_test(df: pd.DataFrame, percent: float=0.8):
    """
    Split a DataFrame into a train and a test set by random sampling.

    The rows are shuffled, the first ``percent`` fraction of them becomes the
    training set, and all remaining rows become the test/validation set. Every
    input row ends up in exactly one of the two results.

    :param df: The DataFrame to split.
    :param percent: The fraction of rows (e.g. 0.8 for 80%) to use for training.
    :return: A DataFrame consisting of train data, and a DataFrame consisting of
        test/validation data. Both carry a fresh 0-based index from the shuffle.
    """
    shuffled = df.sample(frac=1).reset_index(drop=True)
    split_index = int(percent * len(shuffled))

    train_df = shuffled.iloc[:split_index]
    # The test slice starts exactly at split_index; starting one row later
    # would silently drop the row at the boundary from both sets.
    test_df = shuffled.iloc[split_index:]

    return train_df, test_df
Example #4
0
def build_zip_data(where_inner="", where_outer=""):
    """
    Build the per-zip-code data set of complaint counts vs median income.

    Formats the COMPLAINTS_WITH_MEDIAN_INCOME query with the two fragments,
    executes it on the module-level cursor and returns the rows as a
    DataFrame with the columns 'zip_code', 'complaint_count' and
    'median_income'. Results with more than 5000 rows are randomly sampled
    down to 5000, and rows whose complaint count lies more than 10 standard
    deviations from the mean are dropped.

    :param where_inner: Text substituted into the first placeholder of the query.
    :param where_outer: Text substituted into the second placeholder of the query.
    :return: The filtered DataFrame.
    """
    # NOTE(review): both fragments are interpolated into the SQL text
    # verbatim, so callers must only pass trusted, pre-validated fragments.
    query = COMPLAINTS_WITH_MEDIAN_INCOME.format(where_inner, where_outer)
    cur.execute(query)
    cc_by_zip = DataFrame(
        cur.fetchall(),
        columns=['zip_code', 'complaint_count', 'median_income'])

    # The previous `cc_by_zip.set_index('zip_code', drop=False)` call was
    # removed: its result was discarded, so it only produced a throw-away
    # copy. The returned frame keeps its default integer index, as before.

    # There are over 20,000 zip codes, so let's just take a sample, if needed
    if len(cc_by_zip) > 5000:
        cc_by_zip = cc_by_zip.sample(5000)

    # Remove outliers to make for an easier to read plot
    counts = cc_by_zip.complaint_count
    within_range = numpy.abs(counts - counts.mean()) <= (10 * counts.std())
    cc_by_zip = cc_by_zip[within_range]

    return cc_by_zip
Example #5
0
    def test_sample(self):
        """Object-specific checks for ``sample``: weights and the axis argument.

        Every weighted draw below uses degenerate weights (a single non-zero
        entry), so the expected result is deterministic.
        """
        # Fixes issue: 2419
        # additional specific object based tests

        # A few dataframe test with degenerate weights.
        easy_weight_list = [0] * 10
        easy_weight_list[5] = 1

        df = pd.DataFrame({'col1': range(10, 20),
                           'col2': range(20, 30),
                           'colString': ['a'] * 10,
                           'easyweights': easy_weight_list})
        sample1 = df.sample(n=1, weights='easyweights')
        assert_frame_equal(sample1, df.iloc[5:6])

        # Ensure proper error if string given as weight for Series or
        # DataFrame with axis = 1.
        s = Series(range(10))
        with pytest.raises(ValueError):
            s.sample(n=3, weights='weight_column')

        with pytest.raises(ValueError):
            df.sample(n=1, weights='weight_column', axis=1)

        # Check weighting key error
        with pytest.raises(KeyError):
            df.sample(n=3, weights='not_a_real_column_name')

        # Check that re-normalizes weights that don't sum to one.
        weights_less_than_1 = [0] * 10
        weights_less_than_1[0] = 0.5
        assert_frame_equal(
            df.sample(n=1, weights=weights_less_than_1), df.iloc[:1])

        ###
        # Test axis argument
        ###

        df = pd.DataFrame({'col1': range(10), 'col2': ['a'] * 10})
        second_column_weight = [0, 1]
        assert_frame_equal(
            df.sample(n=1, axis=1, weights=second_column_weight), df[['col2']])

        # Different axis arg types
        assert_frame_equal(df.sample(n=1, axis='columns',
                                     weights=second_column_weight),
                           df[['col2']])

        weight = [0] * 10
        weight[5] = 0.5
        assert_frame_equal(df.sample(n=1, axis='rows', weights=weight),
                           df.iloc[5:6])
        assert_frame_equal(df.sample(n=1, axis='index', weights=weight),
                           df.iloc[5:6])

        # Check out of range axis values
        with pytest.raises(ValueError):
            df.sample(n=1, axis=2)

        with pytest.raises(ValueError):
            df.sample(n=1, axis='not_a_name')

        # Build the Series outside the context manager so that only the
        # sample() call itself can satisfy the expected ValueError.
        s = pd.Series(range(10))
        with pytest.raises(ValueError):
            s.sample(n=1, axis=1)

        # Test weight length compared to correct axis
        with pytest.raises(ValueError):
            df.sample(n=1, axis=1, weights=[0.5] * 10)

        # Check weights with axis = 1
        easy_weight_list = [0] * 3
        easy_weight_list[2] = 1

        df = pd.DataFrame({'col1': range(10, 20),
                           'col2': range(20, 30),
                           'colString': ['a'] * 10})
        sample1 = df.sample(n=1, axis=1, weights=easy_weight_list)
        assert_frame_equal(sample1, df[['colString']])

        # Test default axes
        assert_frame_equal(
            df.sample(n=3, random_state=42), df.sample(n=3, axis=0,
                                                       random_state=42))

        # Test that function aligns weights with frame
        df = DataFrame(
            {'col1': [5, 6, 7],
             'col2': ['a', 'b', 'c'], }, index=[9, 5, 3])
        s = Series([1, 0, 0], index=[3, 5, 9])
        assert_frame_equal(df.loc[[3]], df.sample(1, weights=s))

        # Weights have index values to be dropped because not in
        # sampled DataFrame
        s2 = Series([0.001, 0, 10000], index=[3, 5, 10])
        assert_frame_equal(df.loc[[3]], df.sample(1, weights=s2))

        # Weights have empty values to be filled with zeros
        s3 = Series([0.01, 0], index=[3, 5])
        assert_frame_equal(df.loc[[3]], df.sample(1, weights=s3))

        # No overlap in weight and sampled DataFrame indices
        s4 = Series([1, 0], index=[1, 2])
        with pytest.raises(ValueError):
            df.sample(1, weights=s4)