Example #1
 def test_frequency_is_original(self, num_cols):
     # GH 22150
     index = pd.DatetimeIndex(["1950-06-30", "1952-10-24", "1953-05-29"])
     original = index.copy()
     df = DataFrame(1, index=index, columns=range(num_cols))
     df.apply(lambda x: x)
     assert index.freq == original.freq
Example #2
    def test_apply(self, float_frame):
        with np.errstate(all='ignore'):
            # ufunc
            applied = float_frame.apply(np.sqrt)
            tm.assert_series_equal(np.sqrt(float_frame['A']), applied['A'])

            # aggregator
            applied = float_frame.apply(np.mean)
            assert applied['A'] == np.mean(float_frame['A'])

            d = float_frame.index[0]
            applied = float_frame.apply(np.mean, axis=1)
            assert applied[d] == np.mean(float_frame.xs(d))
            assert applied.index is float_frame.index  # want this

        # invalid axis
        df = DataFrame(
            [[1, 2, 3], [4, 5, 6], [7, 8, 9]], index=['a', 'a', 'c'])
        with pytest.raises(ValueError):
            df.apply(lambda x: x, 2)

        # GH 9573
        df = DataFrame({'c0': ['A', 'A', 'B', 'B'],
                        'c1': ['C', 'C', 'D', 'D']})
        df = df.apply(lambda ts: ts.astype('category'))

        assert df.shape == (4, 2)
        assert isinstance(df['c0'].dtype, CategoricalDtype)
        assert isinstance(df['c1'].dtype, CategoricalDtype)
Example #3
    def test_apply_modify_traceback(self):
        data = DataFrame({'A': ['foo', 'foo', 'foo', 'foo',
                                'bar', 'bar', 'bar', 'bar',
                                'foo', 'foo', 'foo'],
                          'B': ['one', 'one', 'one', 'two',
                                'one', 'one', 'one', 'two',
                                'two', 'two', 'one'],
                          'C': ['dull', 'dull', 'shiny', 'dull',
                                'dull', 'shiny', 'shiny', 'dull',
                                'shiny', 'shiny', 'shiny'],
                          'D': np.random.randn(11),
                          'E': np.random.randn(11),
                          'F': np.random.randn(11)})

        data.loc[4, 'C'] = np.nan

        def transform(row):
            if row['C'].startswith('shin') and row['A'] == 'foo':
                row['D'] = 7
            return row

        def transform2(row):
            if (notna(row['C']) and row['C'].startswith('shin') and
                    row['A'] == 'foo'):
                row['D'] = 7
            return row

        try:
            data.apply(transform, axis=1)
        except AttributeError as e:
            assert len(e.args) == 2
            assert e.args[1] == 'occurred at index 4'
            assert e.args[0] == "'float' object has no attribute 'startswith'"
        else:
            pytest.fail('AttributeError was not raised')
Example #4
    def test_with_dictlike_columns(self):
        # GH 17602
        df = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
        result = df.apply(lambda x: {'s': x['a'] + x['b']},
                          axis=1)
        expected = Series([{'s': 3} for t in df.itertuples()])
        assert_series_equal(result, expected)

        df['tm'] = [pd.Timestamp('2017-05-01 00:00:00'),
                    pd.Timestamp('2017-05-02 00:00:00')]
        result = df.apply(lambda x: {'s': x['a'] + x['b']},
                          axis=1)
        assert_series_equal(result, expected)

        # compose a series
        result = (df['a'] + df['b']).apply(lambda x: {'s': x})
        expected = Series([{'s': 3}, {'s': 3}])
        assert_series_equal(result, expected)

        # GH 18775
        df = DataFrame()
        df["author"] = ["X", "Y", "Z"]
        df["publisher"] = ["BBC", "NBC", "N24"]
        df["date"] = pd.to_datetime(['17-10-2010 07:15:30',
                                     '13-05-2011 08:20:35',
                                     '15-01-2013 09:09:09'])
        result = df.apply(lambda x: {}, axis=1)
        expected = Series([{}, {}, {}])
        assert_series_equal(result, expected)
Example #5
    def test_include_na(self, sparse, dtype):
        s = ['a', 'b', np.nan]
        res = get_dummies(s, sparse=sparse, dtype=dtype)
        exp = DataFrame({'a': [1, 0, 0],
                         'b': [0, 1, 0]},
                        dtype=self.effective_dtype(dtype))
        if sparse:
            exp = exp.apply(pd.SparseArray, fill_value=0.0)
        assert_frame_equal(res, exp)

        # Sparse dataframes do not allow nan labelled columns, see GH 8822
        res_na = get_dummies(s, dummy_na=True, sparse=sparse, dtype=dtype)
        exp_na = DataFrame({nan: [0, 0, 1],
                            'a': [1, 0, 0],
                            'b': [0, 1, 0]},
                           dtype=self.effective_dtype(dtype))
        exp_na = exp_na.reindex(['a', 'b', nan], axis=1)
        # hack (NaN handling in assert_index_equal)
        exp_na.columns = res_na.columns
        if sparse:
            exp_na = exp_na.apply(pd.SparseArray, fill_value=0.0)
        assert_frame_equal(res_na, exp_na)

        res_just_na = get_dummies([nan], dummy_na=True,
                                  sparse=sparse, dtype=dtype)
        exp_just_na = DataFrame(Series(1, index=[0]), columns=[nan],
                                dtype=self.effective_dtype(dtype))
        tm.assert_numpy_array_equal(res_just_na.values, exp_just_na.values)
Example #6
    def test_result_type_error(self, result_type):
        # allowed result_type
        df = DataFrame(
            np.tile(np.arange(3, dtype='int64'), 6).reshape(6, -1) + 1,
            columns=['A', 'B', 'C'])

        with pytest.raises(ValueError):
            df.apply(lambda x: [1, 2, 3],
                     axis=1,
                     result_type=result_type)
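A note on this test: the valid result_type values for DataFrame.apply are 'expand', 'reduce', 'broadcast', and None, so the parametrized result_type here is anything outside that set. A minimal sketch of the accepted modes, assuming the usual pandas import:

df = DataFrame([[1, 2, 3]], columns=['A', 'B', 'C'])
df.apply(lambda x: [1, 2, 3], axis=1, result_type='expand')     # DataFrame with columns 0, 1, 2
df.apply(lambda x: [1, 2, 3], axis=1, result_type='reduce')     # Series of lists
df.apply(lambda x: [1, 2, 3], axis=1, result_type='broadcast')  # DataFrame with columns A, B, C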
Example #7
    def test_apply_differently_indexed(self):
        df = DataFrame(np.random.randn(20, 10))

        result0 = df.apply(Series.describe, axis=0)
        expected0 = DataFrame(dict((i, v.describe())
                                   for i, v in compat.iteritems(df)),
                              columns=df.columns)
        assert_frame_equal(result0, expected0)

        result1 = df.apply(Series.describe, axis=1)
        expected1 = DataFrame(dict((i, v.describe())
                                   for i, v in compat.iteritems(df.T)),
                              columns=df.index).T
        assert_frame_equal(result1, expected1)
Example #8
def applyDataFrame():
    df = DataFrame(np.arange(12).reshape(4, 3), columns=list('bde'),
                   index=['Utah', 'Ohio', 'Texas', 'Oregon'])
    print(df)
    f = lambda x: x.max() - x.min()  # per-column / per-row range
    func1 = df.apply(f, axis=0)
    func2 = df.apply(f, axis=1)
    print(func1)
    print(func2)

    f2 = lambda x: '%.2f' % x  # element-wise formatter
    df.applymap(f2)  # returns a new frame; df itself is unchanged
    print(df.applymap(f2))
Example #9
    def test_apply_non_numpy_dtype(self):
        df = DataFrame({"dt": pd.date_range("2015-01-01", periods=3, tz="Europe/Brussels")})
        result = df.apply(lambda x: x)
        assert_frame_equal(result, df)

        result = df.apply(lambda x: x + pd.Timedelta("1day"))
        expected = DataFrame({"dt": pd.date_range("2015-01-02", periods=3, tz="Europe/Brussels")})
        assert_frame_equal(result, expected)

        df = DataFrame({"dt": ["a", "b", "c", "a"]}, dtype="category")
        result = df.apply(lambda x: x)
        assert_frame_equal(result, df)
Example #10
 def __init__(self, background: pd.DataFrame, permutations: int = 100):
     """
     :param background: A data frame containing all the observations as binary data (1/0 or
             True/False), where rows represent observations and columns represent samples.
     :param permutations: how many permutations by default
     :return:
     """
     self.permutations = permutations
     self.background = background
     # column sums computed once, then normalized into sampling weights
     column_sums = background.apply(sum)
     self.sample_weights = column_sums / column_sums.sum()
     self.cummulative_sum = np.cumsum(self.sample_weights)
     self.sample_indices = list(range(background.shape[1]))
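The cummulative_sum attribute above is a CDF over columns, which supports inverse-CDF sampling of a column index. A minimal sketch, assuming the object is named sampler (the helper name draw_index is hypothetical):

def draw_index(sampler):
    # draw u uniformly in [0, 1) and map it to the first column whose
    # cumulative weight reaches u
    u = np.random.uniform()
    return int(np.searchsorted(sampler.cummulative_sum, u))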
Example #11
def peakToTroughs(dailyret, dates):
    '''
    Example:
        sr = s['retdat']
        stkd = s['stockData']
        dt = stkd['Date']
        ptk = peakToTroughs(sr, dt)
    '''
    # get cumulative percent changes
    drs = Series(dailyret)
    soc1dr = drs+1
    soc1cumdr = soc1dr.cumprod()
    localPeaksPairs = peakdetect(y_axis=soc1cumdr,lookahead=1)[0]
    indexOfLocalPeaks = np.empty(len(localPeaksPairs))
    for i in range(len(indexOfLocalPeaks)):
        indexOfLocalPeaks[i] = localPeaksPairs[i][0]
    # data frame with 2 columns, where column 1 is a peak, and column 2 is the next peak that follows it
    dd = DataFrame({'a': indexOfLocalPeaks[0:(len(indexOfLocalPeaks) - 1)],
                    'b': indexOfLocalPeaks[1:len(indexOfLocalPeaks)]})
    # add one more row to dd to represent the last peak and the last row of soc1cumdr, so
    #   that you calculate the last possible trough, if there was one between the last peak
    #   and the last day of data
    lastDdValue = dd.iloc[len(dd) - 1, 1]
    lastValueInData = len(soc1cumdr) - 1
    dd = rbind(dd, [lastDdValue, lastValueInData])
    def minBetween2Peaks(x):
        lowindex = int(x[0])
        highindex = int(x[1])
        minval = min(soc1cumdr[lowindex:(highindex+1)])
        return minval
    localMins = dd.apply(minBetween2Peaks, axis=1)
    localMins.index = range(len(localMins))
    localPeaks = soc1cumdr[indexOfLocalPeaks.astype(int)]
    localPeaks.index = range(len(localPeaks))
    diffs = (localMins - localPeaks)/localPeaks
    
    # get indices of localMins in soc1cumdr so that you can get their dates
    def ff(x):
        ''' this function gets the index of soc1cumdr whose value = x'''
        r = soc1cumdr[soc1cumdr==x].index[0]
        return r
    indexOfLocalMins = list(map(ff, localMins))
    datesOfLocalMins = Series(dates)[indexOfLocalMins]
    datesOfLocalMins.index = range(len(datesOfLocalMins))
    # calculate peak to end of data
    def minBetweenPeakAndEnd(x):
        arr = soc1cumdr.iloc[x[0]:len(soc1cumdr)]
        return min(arr)
    absMinsToEnd = dd.apply(minBetweenPeakAndEnd, axis=1)
    absMinsToEnd.index = range(len(absMinsToEnd))
    diffsToEnd = (absMinsToEnd - localPeaks)/localPeaks
    ret = DataFrame({'Date': datesOfLocalMins, 'Peak': localPeaks,
                     'Valley': localMins, 'Diff': diffs, 'DiffToEnd': diffsToEnd})

    return ret
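For comparison, the worst peak-to-trough move can also be computed without explicit peak detection by tracking the running maximum; a minimal vectorized sketch (it yields only the single worst drawdown, not the per-peak table built above):

def max_drawdown(dailyret):
    cum = (Series(dailyret) + 1).cumprod()
    running_peak = cum.cummax()
    return ((cum - running_peak) / running_peak).min()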
Example #12
    def test_consistent_coerce_for_shapes(self):
        # we want column names to NOT be propagated
        # just because the shape matches the input shape
        df = DataFrame(np.random.randn(4, 3), columns=['A', 'B', 'C'])

        result = df.apply(lambda x: [1, 2, 3], axis=1)
        expected = Series([[1, 2, 3] for t in df.itertuples()])
        assert_series_equal(result, expected)

        result = df.apply(lambda x: [1, 2], axis=1)
        expected = Series([[1, 2] for t in df.itertuples()])
        assert_series_equal(result, expected)
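The flip side of this behaviour: with result_type='expand' the list elements do become (integer-labelled) columns rather than a Series of lists. A one-line sketch:

result = df.apply(lambda x: [1, 2, 3], axis=1, result_type='expand')  # DataFrame with columns 0, 1, 2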
Example #13
def data_prep(input_file, bad_samples_file, freq_dict=None):
    '''prepare the ibdhmm file by removing sites that are too close to each other, and calculate major and minor alleles.
    If specified, freq_dict should be a JSON file that contains the frequencies; it is created by freq_parse.py'''
    min_snpD = 10
    tri_allele= 0
    
    output_file = '.'.join(input_file.split('.')[0:-2]) + '_cleaned.txt'

    # relaxing conditions because we only have 3000 SNPs to begin with
    bad_samples = [sample.strip() for sample in open(bad_samples_file)]
    df = read_csv(input_file, sep='\t')
    # remove bad samples
    df.drop(bad_samples, inplace=True, axis=1)
    # remove non-biallelic alleles
    #df.drop(df[df.apply(allele_count, axis=1) != 2].index, inplace=True)
    
    
    #relaxing conditions because we only have 3000 SNPs to begin with
    '''#remove SNPs that are too close to one another
    df['diff'] = df.groupby('chrom')['pos'].diff()
    df.fillna('first', inplace = True)
    #df.to_csv('test_df.txt', sep = '\t')
    # BUG NOTE MUST FIX THE DAISY CHAIN PROBLEM
    df = df.query('diff > 10 or diff == "first"')
    df.drop('diff', axis = 1, inplace = True)'''
    
    if not freq_dict:
        # calculate the major and minor allele
        major = df.apply(major_find, axis=1)
        minor = df.apply(minor_find, axis=1)
        major_prop = df.apply(major_prop_find, axis=1)
        minor_prop = df.apply(minor_prop_find, axis=1)
    else:
        snp_dict = json.load(open(freq_dict))
        df['keys'] = df['chrom'].map(str) + ':' + df['pos'].map(str)
        major = df['keys'].apply(lambda x: snp_dict[x]['major'])
        major_prop = df['keys'].apply(lambda x: snp_dict[x]['major_freq'])
        minor = df['keys'].apply(lambda x: snp_dict[x]['minor'])
        minor_prop = df['keys'].apply(lambda x: snp_dict[x]['minor_freq'])

        df.drop('keys', inplace=True, axis=1)
    # inserting this stuff into the dataframe for future use
    df.insert(3, 'minor_prop', minor_prop)
    df.insert(3, 'minor', minor)
    df.insert(3, 'major_prop', major_prop)
    df.insert(3, 'major', major)

    df.to_csv(output_file, sep='\t', index=False)
    return df
Example #14
def test():
    frame = DataFrame(numpy.random.randn(4, 3), columns=list('bde'),
                      index=['Utah', 'Ohio', 'Texas', 'Oregon'])
    fmt = lambda x: '%.2f' % x            # element-wise formatter
    spread = lambda x: x.max() - x.min()  # per-column/row range

    # http://stackoverflow.com/questions/19798153/difference-between-map-applymap-and-apply-methods-in-pandas/19798528#19798528
    print(frame.apply(spread))
    print("")
    print(frame.applymap(fmt))
    print("")
    print(frame.apply(spread).map(fmt))

    return frame
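Note that in pandas 2.1+, DataFrame.applymap is deprecated in favour of DataFrame.map, which is element-wise in the same way:

frame.map(fmt)  # pandas >= 2.1 replacement for frame.applymap(fmt)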
Example #15
    def test_with_dictlike_columns_with_infer(self):
        # GH 17602
        df = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
        result = df.apply(lambda x: {'s': x['a'] + x['b']},
                          axis=1, result_type='expand')
        expected = DataFrame({'s': [3, 3]})
        assert_frame_equal(result, expected)

        df['tm'] = [pd.Timestamp('2017-05-01 00:00:00'),
                    pd.Timestamp('2017-05-02 00:00:00')]
        result = df.apply(lambda x: {'s': x['a'] + x['b']},
                          axis=1, result_type='expand')
        assert_frame_equal(result, expected)
Example #16
    def test_apply_non_numpy_dtype(self):
        df = DataFrame({'dt': pd.date_range(
            "2015-01-01", periods=3, tz='Europe/Brussels')})
        result = df.apply(lambda x: x)
        assert_frame_equal(result, df)

        result = df.apply(lambda x: x + pd.Timedelta('1day'))
        expected = DataFrame({'dt': pd.date_range(
            "2015-01-02", periods=3, tz='Europe/Brussels')})
        assert_frame_equal(result, expected)

        df = DataFrame({'dt': ['a', 'b', 'c', 'a']}, dtype='category')
        result = df.apply(lambda x: x)
        assert_frame_equal(result, df)
Example #17
    def test_consistency_for_boxed(self, box):
        # passing an array or list should not affect the output shape
        df = DataFrame(
            np.tile(np.arange(3, dtype='int64'), 6).reshape(6, -1) + 1,
            columns=['A', 'B', 'C'])

        result = df.apply(lambda x: box([1, 2]), axis=1)
        expected = Series([box([1, 2]) for t in df.itertuples()])
        assert_series_equal(result, expected)

        result = df.apply(lambda x: box([1, 2]), axis=1, result_type='expand')
        expected = DataFrame(
            np.tile(np.arange(2, dtype='int64'), 6).reshape(6, -1) + 1)
        assert_frame_equal(result, expected)
Example #18
def compute_confusion_matrix(target, predicted, normalize=True, sort=True):
    """ returns a confusion matrix as a data frame with labels
    Parameters:
        target (array): the true values.
        predicted (array): the predicted values.
        normalize (bool): if True, normalize each row to sum to 1.
        sort (bool): if True, sort rows and columns by their max value.
    Returns (DataFrame): df with the confusion matrix.
    """

    # Determine the unique values in the target list, sort them and assign as labels.
    labels = np.unique(list(target))
    labels.sort()

    # Compute the confusion matrix, place into a data frame and normalize if desired.
    confusion = metrics.confusion_matrix(target, predicted, labels=labels)
    confusion = DataFrame(confusion, index=labels, columns=labels)
    if normalize:
        confusion = confusion.apply(lambda x: x / np.sum(x), axis=1)

    # if sort is true: find the max value of each row, then use that order
    # on both axes of the confusion matrix
    if sort:
        max_values = confusion.max(axis=1)
        max_values = max_values.sort_values(ascending=False)
        order = max_values.index
        confusion = confusion.loc[order, order]
    return confusion
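A hypothetical usage sketch with toy labels (all values invented for illustration):

y_true = ['cat', 'dog', 'cat', 'dog', 'dog']
y_pred = ['cat', 'cat', 'cat', 'dog', 'dog']
cm = compute_confusion_matrix(y_true, y_pred, normalize=True, sort=False)
print(cm)  # each row sums to 1.0 because normalize=True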
Example #19
 def test_apply_mixed_datetimelike(self):
     # mixed datetimelike
     # GH 7778
     df = DataFrame({'A': date_range('20130101', periods=3),
                     'B': pd.to_timedelta(np.arange(3), unit='s')})
     result = df.apply(lambda x: x, axis=1)
     assert_frame_equal(result, df)
Example #20
    def test_apply(self):
        with np.errstate(all='ignore'):
            # ufunc
            applied = self.frame.apply(np.sqrt)
            assert_series_equal(np.sqrt(self.frame['A']), applied['A'])

            # aggregator
            applied = self.frame.apply(np.mean)
            self.assertEqual(applied['A'], np.mean(self.frame['A']))

            d = self.frame.index[0]
            applied = self.frame.apply(np.mean, axis=1)
            self.assertEqual(applied[d], np.mean(self.frame.xs(d)))
            self.assertIs(applied.index, self.frame.index)  # want this

        # invalid axis
        df = DataFrame(
            [[1, 2, 3], [4, 5, 6], [7, 8, 9]], index=['a', 'a', 'c'])
        self.assertRaises(ValueError, df.apply, lambda x: x, 2)

        # GH9573
        df = DataFrame({'c0': ['A', 'A', 'B', 'B'],
                        'c1': ['C', 'C', 'D', 'D']})
        df = df.apply(lambda ts: ts.astype('category'))
        self.assertEqual(df.shape, (4, 2))
        self.assertTrue(isinstance(df['c0'].dtype, CategoricalDtype))
        self.assertTrue(isinstance(df['c1'].dtype, CategoricalDtype))
Example #21
    def test_apply_empty_infer_type(self):
        no_cols = DataFrame(index=['a', 'b', 'c'])
        no_index = DataFrame(columns=['a', 'b', 'c'])

        def _check(df, f):
            with warnings.catch_warnings(record=True):
                test_res = f(np.array([], dtype='f8'))
            is_reduction = not isinstance(test_res, np.ndarray)

            def _checkit(axis=0, raw=False):
                res = df.apply(f, axis=axis, raw=raw)
                if is_reduction:
                    agg_axis = df._get_agg_axis(axis)
                    tm.assertIsInstance(res, Series)
                    self.assertIs(res.index, agg_axis)
                else:
                    tm.assertIsInstance(res, DataFrame)

            _checkit()
            _checkit(axis=1)
            _checkit(raw=True)
            _checkit(axis=0, raw=True)

        with np.errstate(all='ignore'):
            _check(no_cols, lambda x: x)
            _check(no_cols, lambda x: x.mean())
            _check(no_index, lambda x: x)
            _check(no_index, lambda x: x.mean())

        result = no_cols.apply(lambda x: x.mean(), broadcast=True)
        tm.assertIsInstance(result, DataFrame)
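The broadcast=True keyword used on the last line was deprecated in pandas 0.23 in favour of result_type; the equivalent modern spelling is:

result = no_cols.apply(lambda x: x.mean(), result_type='broadcast')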
Example #22
    def _grading_policy(self):
        '''
        Gets the grading policy from the course policy.

        Returns
        -------
        grading_policy : DataFrame
            Information about how grades are determined in a course.
        '''
        course_policy = self._xd.get('grading_policy')
        grading_policy = DataFrame(course_policy.iloc[0,:]['GRADER'])

        # type == the gformat of sequences
        grading_policy = grading_policy.set_index('type')

        def max_seqs(seq_type):
            '''
            Determines the max number of sequences that should contribute to
            a user's final grade for each gformat
            '''
            return seq_type.get('min_count', 1) - seq_type.get('drop_count', 0)

        grading_policy['max_seqs'] = grading_policy.apply(max_seqs, axis=1)

        return grading_policy
Example #23
def get_flights_from_route(cur, origin, destination):
    """
    Returns a dataframe for all flights matching origin, destination.
    """

    import time
    
    ### MySQL query
    time0 = time.time()
    cur.execute("SELECT Year, Month, DayofMonth, DayOfWeek, CRSDepTime, UniqueCarrier, ArrDelay FROM flights_100000 WHERE Origin = %s and Dest = %s;", (origin, destination))
    rows = cur.fetchall()
    td = time.time() - time0
    print('Database query took %.2f seconds.' % td)
    
    ### Convert to dataframe
    df = DataFrame(list(rows), columns=['Year', 'Month', 'DayOfMonth', 'DayOfWeek', 'CRSDepTime', 'Carrier', 'ArrDelay'])

    ### Drop columns without delays (cancellations)
    df = df.dropna()
    
    ### Create some auxiliary columns
    df['DayOfYear'] = df.apply(lambda x: datetime.datetime(x['Year'], x['Month'], x['DayOfMonth']).timetuple().tm_yday, axis=1)
    df['Week'] = df['DayOfYear'] // 7 + 1    # integer week number
    df['DepHour'] = df['CRSDepTime'] // 100  # scheduled hour from HHMM

    ### Drop unused columns
    df = df.drop(['DayOfMonth','CRSDepTime'],axis=1).sort_index(axis=1)

    ## df.head()
    
    return df
Example #24
def avg_medal_count():
    '''
    Using the dataframe's apply method, create a new Series called 
    avg_medal_count that indicates the average number of gold, silver,
    and bronze medals earned amongst countries who earned at 
    least one medal of any kind at the 2014 Sochi olympics.  Note that
    the countries list already only includes countries that have earned
    at least one medal. No additional filtering is necessary.
    
    You do not need to call the function in your code when running it in the
    browser - the grader will do that automatically when you submit or test it.
    '''

    countries = ['Russian Fed.', 'Norway', 'Canada', 'United States',
                 'Netherlands', 'Germany', 'Switzerland', 'Belarus',
                 'Austria', 'France', 'Poland', 'China', 'Korea', 
                 'Sweden', 'Czech Republic', 'Slovenia', 'Japan',
                 'Finland', 'Great Britain', 'Ukraine', 'Slovakia',
                 'Italy', 'Latvia', 'Australia', 'Croatia', 'Kazakhstan']

    gold = [13, 11, 10, 9, 8, 8, 6, 5, 4, 4, 4, 3, 3, 2, 2, 2, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]
    silver = [11, 5, 10, 7, 7, 6, 3, 0, 8, 4, 1, 4, 3, 7, 4, 2, 4, 3, 1, 0, 0, 2, 2, 2, 1, 0]
    bronze = [9, 10, 5, 12, 9, 5, 2, 1, 5, 7, 1, 2, 2, 6, 2, 4, 3, 1, 2, 1, 0, 6, 2, 1, 0, 1]
    
    olympic_medal_counts = {'gold': Series(gold),
                            'silver': Series(silver),
                            'bronze': Series(bronze)}    
    df = DataFrame(olympic_medal_counts)
    
    # YOUR CODE HERE
    avg_medal_count = df.apply(numpy.mean, axis=0)
    print(avg_medal_count)
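For a column-wise aggregation like this, apply is equivalent to the built-in reduction; a one-line alternative that produces the same values and index:

avg_medal_count = df.mean(axis=0)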
Example #25
    def test_apply_mixed_dtype_corner(self):
        df = DataFrame({"A": ["foo"], "B": [1.0]})
        result = df[:0].apply(np.mean, axis=1)
        # the result here is actually kind of ambiguous, should it be a Series
        # or a DataFrame?
        expected = Series(np.nan, index=pd.Index([], dtype="int64"))
        assert_series_equal(result, expected)

        df = DataFrame({"A": ["foo"], "B": [1.0]})
        result = df.apply(lambda x: x["A"], axis=1)
        expected = Series(["foo"], index=[0])
        assert_series_equal(result, expected)

        result = df.apply(lambda x: x["B"], axis=1)
        expected = Series([1.0], index=[0])
        assert_series_equal(result, expected)
Example #26
    def test_apply(self):
        with np.errstate(all="ignore"):
            # ufunc
            applied = self.frame.apply(np.sqrt)
            assert_series_equal(np.sqrt(self.frame["A"]), applied["A"])

            # aggregator
            applied = self.frame.apply(np.mean)
            self.assertEqual(applied["A"], np.mean(self.frame["A"]))

            d = self.frame.index[0]
            applied = self.frame.apply(np.mean, axis=1)
            self.assertEqual(applied[d], np.mean(self.frame.xs(d)))
            self.assertIs(applied.index, self.frame.index)  # want this

        # invalid axis
        df = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], index=["a", "a", "c"])
        self.assertRaises(ValueError, df.apply, lambda x: x, 2)

        # GH9573
        df = DataFrame({"c0": ["A", "A", "B", "B"], "c1": ["C", "C", "D", "D"]})
        df = df.apply(lambda ts: ts.astype("category"))
        self.assertEqual(df.shape, (4, 2))
        self.assertTrue(isinstance(df["c0"].dtype, CategoricalDtype))
        self.assertTrue(isinstance(df["c1"].dtype, CategoricalDtype))
Example #27
    def test_apply_modify_traceback(self):
        data = DataFrame(
            {
                "A": ["foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar", "foo", "foo", "foo"],
                "B": ["one", "one", "one", "two", "one", "one", "one", "two", "two", "two", "one"],
                "C": ["dull", "dull", "shiny", "dull", "dull", "shiny", "shiny", "dull", "shiny", "shiny", "shiny"],
                "D": np.random.randn(11),
                "E": np.random.randn(11),
                "F": np.random.randn(11),
            }
        )

        data.loc[4, "C"] = np.nan

        def transform(row):
            if row["C"].startswith("shin") and row["A"] == "foo":
                row["D"] = 7
            return row

        def transform2(row):
            if notnull(row["C"]) and row["C"].startswith("shin") and row["A"] == "foo":
                row["D"] = 7
            return row

        try:
            transformed = data.apply(transform, axis=1)  # noqa
        except AttributeError as e:
            self.assertEqual(len(e.args), 2)
            self.assertEqual(e.args[1], "occurred at index 4")
            self.assertEqual(e.args[0], "'float' object has no attribute 'startswith'")
        else:
            self.fail("AttributeError was not raised")
Example #28
    def customer_lifetime_value(self, transaction_prediction_model, frequency, recency, T, monetary_value, time=12, discount_rate=1):
        """
        This method computes the average lifetime value for a group of one or more customers.
            transaction_prediction_model: the model to predict future transactions; the literature uses
                Pareto/NBD, but we can also use a different model such as BG/NBD
            frequency: the frequency vector of customers' purchases (denoted x in the literature).
            recency: the recency vector of customers' purchases (denoted t_x in the literature).
            T: the vector of customers' ages (time since first purchase)
            monetary_value: the monetary value vector of customers' purchases (denoted m in the literature).
            time: the lifetime expected for the user in months. Default: 12
            discount_rate: the monthly adjusted discount rate. Default: 1
        Returns:
            the conditional expectation of the average profit per transaction.
            Also creates a discounted_monthly_cash_flows attribute
        """
        df = DataFrame()
        df['frequency'] = frequency
        df['recency'] = recency
        df['T'] = T

        d = discount_rate
        m = self.conditional_expected_average_profit()
        discounted_monthly_cash_flows = []

        for i in range(30, (time*30)+1, 30):
            df['expected_revenues_period_'+str(i)] = df.apply(
                lambda r: (m*transaction_prediction_model.predict(i, r['frequency'], r['recency'], r['T'])/(1+d)**(i/30)),
                axis=1
            )
            discounted_monthly_cash_flows.append(df['expected_revenues_period_'+str(i)].sum())

        return sum(discounted_monthly_cash_flows)
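The discounting inside the loop is the standard DCF form: predicted transactions over the 30-day period, times average profit m, divided by (1 + d) raised to the number of elapsed months (i/30). A standalone sketch of that arithmetic with invented numbers:

m = 25.0             # average profit per transaction (invented)
d = 0.01             # monthly discount rate (invented)
expected_txns = 1.4  # model-predicted transactions in the first 30 days (invented)
period_value = m * expected_txns / (1 + d) ** (30 / 30)  # value of month 1, discounted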
Example #29
    def test_apply_bug(self):

        # GH 6125
        positions = pd.DataFrame(
            [[1, "ABC0", 50], [1, "YUM0", 20], [1, "DEF0", 20], [2, "ABC1", 50], [2, "YUM1", 20], [2, "DEF1", 20]],
            columns=["a", "market", "position"],
        )

        def f(r):
            return r["market"]

        expected = positions.apply(f, axis=1)

        positions = DataFrame(
            [
                [datetime(2013, 1, 1), "ABC0", 50],
                [datetime(2013, 1, 2), "YUM0", 20],
                [datetime(2013, 1, 3), "DEF0", 20],
                [datetime(2013, 1, 4), "ABC1", 50],
                [datetime(2013, 1, 5), "YUM1", 20],
                [datetime(2013, 1, 6), "DEF1", 20],
            ],
            columns=["a", "market", "position"],
        )
        result = positions.apply(f, axis=1)
        assert_series_equal(result, expected)
Example #30
File: pas.py Project: pastas/pasta
def pastas_hook(obj):
    for key, value in obj.items():
        if key in ["tmin", "tmax", "date_modified", "date_created"]:
            val = Timestamp(value)
            if val is NaT:
                val = None
            obj[key] = val
        elif key == "series":
            try:
                obj[key] = read_json(value, typ='series', orient="split")
            except:
                try:
                    obj[key] = TimeSeries(**value)
                except:
                    obj[key] = value
        elif key == "time_offset":
            obj[key] = Timedelta(value)
        elif key == "parameters":
            # Necessary to maintain order when using the JSON format!
            value = json.loads(value, object_pairs_hook=OrderedDict)
            param = DataFrame(data=value, columns=value.keys()).T
            obj[key] = param.apply(to_numeric, errors="ignore")
        else:
            try:
                obj[key] = json.loads(value, object_hook=pastas_hook)
            except Exception:
                obj[key] = value
    return obj
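A usage sketch, assuming a project serialized to a JSON file named project.pas (the filename is hypothetical):

with open('project.pas') as f:
    project = json.load(f, object_hook=pastas_hook)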