Example #1
 def test_diff_axis(self):
     # GH 9727
     df = DataFrame([[1., 2.], [3., 4.]])
     assert_frame_equal(df.diff(axis=1), DataFrame(
         [[np.nan, 1.], [np.nan, 1.]]))
     assert_frame_equal(df.diff(axis=0), DataFrame(
         [[np.nan, np.nan], [2., 2.]]))
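For quick experimentation, the axis semantics exercised by this test can be reproduced standalone (a minimal sketch, needing only pandas and NumPy):

import numpy as np
from pandas import DataFrame

df = DataFrame([[1., 2.], [3., 4.]])

print(df.diff(axis=1))  # column-wise: first column NaN, then col1 - col0
print(df.diff(axis=0))  # row-wise: first row NaN, then row1 - row0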
Example #2
 def test_diff_datetime_axis1(self, tz):
     # GH 18578
     df = DataFrame({0: date_range('2010', freq='D', periods=2, tz=tz),
                     1: date_range('2010', freq='D', periods=2, tz=tz)})
     if tz is None:
         result = df.diff(axis=1)
         expected = DataFrame({0: pd.TimedeltaIndex(['NaT', 'NaT']),
                               1: pd.TimedeltaIndex(['0 days',
                                                     '0 days'])})
         assert_frame_equal(result, expected)
     else:
         with pytest.raises(NotImplementedError):
             result = df.diff(axis=1)
Example #3
    def test_diff_timedelta(self):
        # GH 4533
        df = DataFrame(dict(time=[Timestamp("20130101 9:01"), Timestamp("20130101 9:02")], value=[1.0, 2.0]))

        res = df.diff()
        exp = DataFrame([[pd.NaT, np.nan], [pd.Timedelta("00:01:00"), 1]], columns=["time", "value"])
        assert_frame_equal(res, exp)
Example #4
    def test_diff_datetime_axis0(self, tz):
        # GH 18578
        df = DataFrame({0: date_range('2010', freq='D', periods=2, tz=tz),
                        1: date_range('2010', freq='D', periods=2, tz=tz)})

        result = df.diff(axis=0)
        expected = DataFrame({0: pd.TimedeltaIndex(['NaT', '1 days']),
                              1: pd.TimedeltaIndex(['NaT', '1 days'])})
        assert_frame_equal(result, expected)
Example #5
    def test_diff_timedelta(self):
        # GH 4533
        df = DataFrame(
            dict(time=[Timestamp('20130101 9:01'),
                       Timestamp('20130101 9:02')],
                 value=[1.0, 2.0]))

        res = df.diff()
        exp = DataFrame([[pd.NaT, np.nan], [pd.Timedelta('00:01:00'), 1]],
                        columns=['time', 'value'])
        assert_frame_equal(res, exp)
Example #6
def rsi(x: pd.DataFrame, d: int = 14) -> pd.DataFrame:
    """Return Relative Strength Index indicator over the past d days."""
    change = x.diff()
    upward, downward = change.copy(), change.copy()
    upward[change <= 0] = 0
    downward[change > 0] = 0
    avg_gain = upward.ewm(d, adjust=False).mean()
    avg_loss = abs(downward.ewm(d, adjust=False).mean())
    rs = avg_gain / avg_loss
    rsi = 100 - 100 / (1 + rs)
    return rsi
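A short usage sketch for the rsi helper above; the closing prices are made up, and note that ewm(d, adjust=False) interprets d as the center of mass (com), not a span:

import pandas as pd

# Illustrative closing prices.
prices = pd.DataFrame({"close": [44.3, 44.1, 44.9, 45.6, 45.4,
                                 46.2, 46.0, 46.6, 47.2, 46.8]})

print(rsi(prices, d=3).tail())  # assumes rsi from Example #6 is in scope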
Example #7
    def test_diff_timedelta(self):
        # GH 4533
        df = DataFrame(dict(time=[Timestamp('20130101 9:01'),
                                  Timestamp('20130101 9:02')],
                            value=[1.0, 2.0]))

        res = df.diff()
        exp = DataFrame([[pd.NaT, np.nan],
                         [pd.Timedelta('00:01:00'), 1]],
                        columns=['time', 'value'])
        assert_frame_equal(res, exp)
Example #8
    def test_diff_axis1_mixed_dtypes_negative_periods(self):
        # GH#32995 operate column-wise when we have mixed dtypes and axis=1
        df = DataFrame({
            "A": range(3),
            "B": 2 * np.arange(3, dtype=np.float64)
        })

        expected = DataFrame({"A": -1.0 * df["A"], "B": df["B"] * np.nan})

        result = df.diff(axis=1, periods=-1)
        tm.assert_frame_equal(result, expected)
Example #9
    def test_diff_axis1_mixed_dtypes_large_periods(self):
        # GH#32995 operate column-wise when we have mixed dtypes and axis=1
        df = DataFrame({
            "A": range(3),
            "B": 2 * np.arange(3, dtype=np.float64)
        })

        expected = df * np.nan

        result = df.diff(axis=1, periods=3)
        tm.assert_frame_equal(result, expected)
Example #10
 def test_diff_datetime_axis1(self, tz):
     # GH 18578
     df = DataFrame(
         {
             0: date_range("2010", freq="D", periods=2, tz=tz),
             1: date_range("2010", freq="D", periods=2, tz=tz),
         }
     )
     if tz is None:
         result = df.diff(axis=1)
         expected = DataFrame(
             {
                 0: pd.TimedeltaIndex(["NaT", "NaT"]),
                 1: pd.TimedeltaIndex(["0 days", "0 days"]),
             }
         )
         assert_frame_equal(result, expected)
     else:
         with pytest.raises(NotImplementedError):
             result = df.diff(axis=1)
Example #11
    def test_diff_axis1_mixed_dtypes(self):
        # GH#32995 operate column-wise when we have mixed dtypes and axis=1
        df = DataFrame({
            "A": range(3),
            "B": 2 * np.arange(3, dtype=np.float64)
        })

        expected = DataFrame({"A": [np.nan, np.nan, np.nan], "B": df["B"] / 2})

        result = df.diff(axis=1)
        tm.assert_frame_equal(result, expected)

        # GH#21437 mixed-float-dtypes
        df = DataFrame({
            "a": np.arange(3, dtype="float32"),
            "b": np.arange(3, dtype="float64")
        })
        result = df.diff(axis=1)
        expected = DataFrame({"a": df["a"] * np.nan, "b": df["b"] * 0})
        tm.assert_frame_equal(result, expected)
Example #12
    def trending_up(cls, ohlc: DataFrame, col: str, period: int) -> pd.Series:
        """
        Return a boolean Series that is True where the input column has
        been trending up over the last n periods.

        :param ohlc: data
        :param col: column to evaluate
        :param period: lookback range
        :return: result Series
        """
        return pd.Series(ohlc[col].diff(period) > 0,
                         name="trending_up {}".format(period))
Example #13
    def test_diff_timedelta(self):
        # GH#4533
        df = DataFrame({
            "time": [Timestamp("20130101 9:01"),
                     Timestamp("20130101 9:02")],
            "value": [1.0, 2.0],
        })

        res = df.diff()
        exp = DataFrame([[pd.NaT, np.nan], [pd.Timedelta("00:01:00"), 1]],
                        columns=["time", "value"])
        tm.assert_frame_equal(res, exp)
Example #14
    def __get_an_attitude(df: pd.DataFrame) -> pd.DataFrame:
        """
        The method calculates (df[n] - df[n-1]) / df[n-1], i.e. the
        relative change between consecutive rows.

        This ratio is needed to calculate R[i,t], CR[i,t], TR[i,t].

        :param df: DataFrame to compute the relative change for
        (R[i,t], CR[i,t], TR[i,t]).
        :return: Calculated DataFrame.
        """
        attitude = df.diff().div(df.shift(1))
        return attitude
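For reference, this row-over-row relative change matches pandas' built-in pct_change when there are no missing values (a minimal sketch):

import pandas as pd

df = pd.DataFrame({"a": [1.0, 2.0, 4.0], "b": [10.0, 5.0, 5.0]})

attitude = df.diff().div(df.shift(1))    # (df[n] - df[n-1]) / df[n-1]
assert attitude.equals(df.pct_change())  # built-in equivalent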
Example #15
 def obtain_jerk_signal(self, signal: pd.DataFrame) -> pd.DataFrame:
     """Differentiate a signal to obtain the jerk signal.

     Args:
         signal (pd.DataFrame): signal to differentiate
     Returns:
         jerk_signal (pd.DataFrame): time derivative of the input signal
     """
     jerk_signal = signal.diff(periods=1)  # Calculate difference
     jerk_signal.iloc[0] = jerk_signal.iloc[1]  # Backfill the NaN first row
     jerk_signal = jerk_signal / (
         1 / self.fs)  # Derive in time (1 / sampling frequency)
     return jerk_signal
Example #16
 def transform(self, data: pd.DataFrame) -> np.ndarray:
     # TODO: scale data
     # scaler = MinMaxScaler(feature_range=(-1, 1))
     # scaler = scaler.fit(train_diff)
     # train_scaled = scaler.transform(train_diff)
     # train_unscaled = scaler.inverse_transform(train_scaled)
     diffed = data.diff(self.diff_lag).dropna()
     supervised = series.dataframe_to_supervised(diffed,
                                                 self.sequence_length)
     rnn_examples = series.supervised_to_rnn_examples(
         supervised)  # type: np.ndarray
     return rnn_examples
Example #17
def processUScovidtracking_data(data: pd.DataFrame, run_date: pd.Timestamp):
    """ Processes raw COVIDTracking data to be in a form for the GenerativeModel.
        In many cases, we need to correct data errors or obvious outliers."""
    data["region"] = 'USA'
    #data = data.rename(columns={"state": "region"})
    data["date"] = pd.to_datetime(data["date"], format="%Y%m%d")
    data['total'] = data['positive'] + data['negative']
    data = data.set_index(["region", "date"]).sort_index()
    data = data[["positive", "total"]]
    # Now work with daily counts
    data = data.diff().dropna().clip(0, None).sort_index()
    return data.loc[idx[:, :(run_date - pd.DateOffset(1))],
                    ["positive", "total"]]
Example #18
    def test_diff_datetime_axis0(self, tz):
        # GH 18578
        df = DataFrame({
            0: date_range('2010', freq='D', periods=2, tz=tz),
            1: date_range('2010', freq='D', periods=2, tz=tz)
        })

        result = df.diff(axis=0)
        expected = DataFrame({
            0: pd.TimedeltaIndex(['NaT', '1 days']),
            1: pd.TimedeltaIndex(['NaT', '1 days'])
        })
        assert_frame_equal(result, expected)
Example #19
    def test_diff_datetime_axis0(self, tz):
        # GH 18578
        df = DataFrame({
            0: date_range("2010", freq="D", periods=2, tz=tz),
            1: date_range("2010", freq="D", periods=2, tz=tz),
        })

        result = df.diff(axis=0)
        expected = DataFrame({
            0: pd.TimedeltaIndex(["NaT", "1 days"]),
            1: pd.TimedeltaIndex(["NaT", "1 days"]),
        })
        assert_frame_equal(result, expected)
Example #20
def process_covidtracking_data(data: pd.DataFrame, run_date: pd.Timestamp):
    """ Processes raw COVIDTracking data to be in a form for the GenerativeModel.
        In many cases, we need to correct data errors or obvious outliers."""
    data = data.rename(columns={"state": "region"})
    data["date"] = pd.to_datetime(data["date"], format="%Y%m%d")
    data = data.set_index(["region", "date"]).sort_index()
    data = data.loc[idx[:, :run_date], ["positive", "total"]]

    # Too little data or unreliable reporting in the data source.
    data = data.drop(["MP", "GU", "AS", "PR", "VI"])

    # On Jun 5 Covidtracking started counting probable cases too
    # which increases the amount by 5014.
    # https://covidtracking.com/screenshots/MI/MI-20200605-184320.png
    data.loc[idx["MI", pd.Timestamp("2020-06-05"):], "positive"] -= 5014

    # From CT: On June 19th, LDH removed 1666 duplicate and non-resident cases
    # after implementing a new de-duplication process.
    data.loc[idx["LA", pd.Timestamp("2020-06-19"):], :] += 1666

    # Now work with daily counts
    data = data.diff().dropna().clip(0, None)

    # Michigan missed 6/18 totals and lumped them into 6/19 so we've
    # divided the totals in two and equally distributed to both days.
    data.loc[idx["MI", pd.Timestamp("2020-06-18")], "total"] = 14871
    data.loc[idx["MI", pd.Timestamp("2020-06-19")], "total"] = 14871

    # Note that when we set total to zero, the model ignores that date. See
    # the likelihood function in GenerativeModel.build

    # Huge outlier in NJ causing sampling issues.
    data.loc[idx["NJ", pd.Timestamp("2020-05-11")], :] = 0

    # Huge outlier in CA causing sampling issues.
    data.loc[idx["CA", pd.Timestamp("2020-04-22")], :] = 0

    # A bunch of incorrect values for WA data so nulling them out.
    data.loc[idx["WA",
                 pd.Timestamp("2020-06-05"):pd.Timestamp("2020-06-07")], :] = 0
    data.loc[idx["WA",
                 pd.Timestamp("2020-06-20"):pd.Timestamp("2020-06-21")], :] = 0

    # Outlier dates in PA
    data.loc[idx["PA", [
        pd.Timestamp("2020-06-03"),
        pd.Timestamp("2020-04-21"),
        pd.Timestamp("2020-05-20"),
    ], ], :, ] = 0

    return data
Example #21
    def test_diff_timedelta64_with_nat(self):
        # GH#32441
        arr = np.arange(6).reshape(3, 2).astype("timedelta64[ns]")
        arr[:, 0] = np.timedelta64("NaT", "ns")

        df = DataFrame(arr)
        result = df.diff(1, axis=0)

        expected = DataFrame({
            0: df[0],
            1: [pd.NaT, pd.Timedelta(2),
                pd.Timedelta(2)]
        })
        tm.assert_equal(result, expected)

        result = df.diff(0)
        expected = df - df
        assert expected[0].isna().all()
        tm.assert_equal(result, expected)

        result = df.diff(-1, axis=1)
        expected = df * np.nan
        tm.assert_equal(result, expected)
Example #22
def rsi(values, period):
    """
    Compute Wilder's RSI.
    * values: adjusted close prices.
    * period: lookback period.
    * return: Wilder's RSI values.
    """
    _values = DataFrame(values)
    # Change versus the previous day
    _diff = _values.diff(1)
    # Upward moves (clip_lower/clip_upper were removed from pandas; use clip)
    _posi = _diff.clip(lower=0).ewm(alpha=1 / period).mean()
    # Downward moves
    _nega = _diff.clip(upper=0).ewm(alpha=1 / period).mean()
    return _posi / (_posi - _nega)
Example #23
    def test_diff_integer_na(self, axis, expected):
        # GH#24171 IntegerNA Support for DataFrame.diff()
        df = DataFrame(
            {
                "a": np.repeat([0, 1, np.nan, 2], 2),
                "b": np.tile([0, 1, np.nan, 2], 2),
                "c": np.repeat(np.nan, 8),
                "d": np.arange(1, 9) ** 2,
            },
            dtype="Int64",
        )

        # Test case for default behaviour of diff
        result = df.diff(axis=axis)
        tm.assert_frame_equal(result, expected)
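The nullable-integer behavior under test can be seen directly; a minimal sketch (assumes a pandas version with the Int64 extension dtype, 0.24+):

import pandas as pd

df = pd.DataFrame({"a": pd.array([0, 1, None, 2], dtype="Int64")})

print(df.diff())  # <NA>, 1, <NA>, <NA>; the result keeps the nullable Int64 dtype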
Example #24
 def makePortfolio(self, start=None, end=None, capital=None):
     if not start:
         start = self.signal.index[0]
     if not end:
         end = self.signal.index[-1]
     if not capital:
         capital = self.signal.loc[start, 'Price']
     positions = DataFrame(index=self.signal.index).fillna(0.0)
     portfolio = DataFrame(index=self.signal.index).fillna(0.0)
     positions[self.symbol] = self.signal['Signal']
     portfolio['positions'] = (positions.multiply(self.signal['Price'],
                                                  axis=0))
     portfolio['cash'] = capital - (positions.diff().multiply(
         self.signal['Price'], axis=0)).cumsum()
     portfolio['total'] = portfolio['positions'] + portfolio['cash']
     self.portfolio = portfolio
Example #25
def data_preprocessing(table: pd.DataFrame,
                       country_set: Set[str],
                       test_split: float = 0.1):
    """
    split train test set

    return a numpy array which each row are the data difference of each country
    """

    country_to_drop = set(table.index) - country_set
    table.drop(country_to_drop, inplace=True)
    diff_table = table.diff(axis=1).dropna(axis=1)
    all_data = np.array(diff_table)

    X_train, X_test = train_test_split(all_data,
                                       test_size=test_split,
                                       random_state=RANDOM_SEED,
                                       shuffle=False)

    return X_train, X_test, all_data, diff_table.index
Example #26
def create_parent_draws(parent_draws: pd.DataFrame) -> pd.DataFrame:
    n_child_locations = parent_draws.reset_index()['location_id'].unique().size
    parent_id = parent_draws['parent_id'].unique().item()
    del parent_draws['parent_id']
    if parent_draws.index.names != ['location_id', 'date']:
        raise ValueError(
            "Multi-index differs from expected (['location_id', 'date']).")
    parent_draws_count = parent_draws.groupby(level=1).count().iloc[:, 0]
    keep_idx = parent_draws_count[parent_draws_count ==
                                  n_child_locations].index
    nulls = parent_draws.isnull().groupby(level=1).sum() > 0
    parent_draws = parent_draws.groupby(level=1).sum()
    parent_draws = parent_draws.cumsum()
    parent_draws[nulls] = np.nan
    parent_draws = parent_draws.loc[keep_idx]
    parent_draws = parent_draws.diff().fillna(parent_draws)
    parent_draws['location_id'] = parent_id
    parent_draws = (parent_draws.reset_index().set_index(
        ['location_id', 'date']).sort_index())

    return parent_draws
Example #27
def dataFrameMathTest():
    # Note: the methods that return a Series default to working on columns.
    df = DataFrame()
    # Load a DataFrame from a CSV file    
    org_df = pd.read_csv('mlg.csv')
    df = org_df.iloc[:,1:7]
    
    resAbs = df.abs() # absolute values
    print(resAbs)
    #resAdd = df.add(o) # add df, Series or value
    #print(resAdd)
    resCount = df.count() # non NA/null values
    print(resCount)
    resCumMax = df.cummax() # (cols default axis)
    print(resCumMax)
    resCumMin = df.cummin() # (cols default axis)
    print(resCumMin)
    resCumSum = df.cumsum() # (cols default axis)
    print(resCumSum)
    resDiff = df.diff() # 1st diff (col def axis)
    print(resDiff)
    resDiv = df.div(12) # div by df, Series, value
    print(resDiv)
    #resDot = df.dot(13) # matrix dot product
    #print(resDot)
    resMax = df.max() # max of axis (col def)
    print(resMax)
    resMean = df.mean() # mean (col default axis)
    print(resMean)
    resMedian = df.median()# median (col default)
    print(resMedian)
    resMin = df.min() # min of axis (col def)
    print(resMin)
    resMul = df.mul(2) # mul by df Series val
    print(resMul)
    resSum = df.sum() # sum axis (cols default)
    print(resSum)
    resWhere = df.where(df > 0.5, other=np.nan)
    print(resWhere)
Example #28
def process_covidtracking_data(data: pd.DataFrame, run_date: pd.Timestamp):
    """ Processes raw COVIDTracking data to be in a form for the GenerativeModel.
        In many cases, we need to correct data errors or obvious outliers."""
    data = data.rename(columns={"provincia": "region"})
    data = data.rename(columns={"casos_acum": "positive"})
    data = data.rename(columns={"fecha": "date"})
    data = data.rename(columns={"procesadas": "total"})
    data["date"] = pd.to_datetime(data["date"], format="%d/%m/%Y")
    data = data.set_index(["region", "date"]).sort_index()
    data = data[["positive", "total"]]

    # Now work with daily counts
    data = data.diff().dropna().clip(0, None).sort_index()

    zero_filter = (data.positive >= data.total)
    data.loc[zero_filter, :] = 0
    data.loc[idx["La Romana", pd.Timestamp("2020-12-02")], :] = 0

    # At the real time of `run_date`, the data for `run_date` is not yet available!
    # Cutting it away is important for backtesting!
    return data.loc[idx[:, :(run_date - pd.DateOffset(1))],
                    ["positive", "total"]]
Example #29
def create_energy_dataframe(acm_dataframe: pd.DataFrame,
                            aggregation_count_threshold: int,
                            max_successive_time_diff: str,
                            aggregation_time: str) -> pd.DataFrame:
    """
    Creates energy feature from raw accelerometer data and returns result as a dataframe.

    :param acm_dataframe: raw accelerometer dataframe
    :param aggregation_count_threshold: sample-count threshold above which we compute energy.
    :param max_successive_time_diff: maximum difference between successive timestamps below which we compute energy
    :param aggregation_time: time window by which we aggregate the energy result.
    :return energy_dataframe: pandas dataframe object with 1 column, the energy, indexed by time
    """
    max_successive_time_diff_boolean_mask = acm_dataframe["time"].diff(
        periods=1) < max_successive_time_diff
    consecutive_differences_dataframe = acm_dataframe.diff(
        periods=1)[max_successive_time_diff_boolean_mask].drop(["time"],
                                                               axis=1)

    squared_differences_dataframe = consecutive_differences_dataframe**2
    triaxial_sum_series = squared_differences_dataframe.apply(sum, axis=1)
    triaxial_sqrt_dataframe = triaxial_sum_series.apply(np.sqrt).to_frame()

    acm_dataframe_index = acm_dataframe[max_successive_time_diff_boolean_mask][
        "time"]
    triaxial_sqrt_dataframe.index = acm_dataframe_index
    triaxial_sqrt_dataframe.index.name = "timestamp"

    count_threshold_boolean_mask = triaxial_sqrt_dataframe.resample(
        aggregation_time, label="right").count() > aggregation_count_threshold
    energy_dataframe = triaxial_sqrt_dataframe.resample(aggregation_time,
                                                        label="right").sum()
    energy_dataframe = energy_dataframe[count_threshold_boolean_mask].dropna()
    energy_dataframe = energy_dataframe.rename(
        columns={0: "energy_by_{}".format(aggregation_time)})

    return energy_dataframe
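A usage sketch for the helper above; the sampling rate, thresholds, and column names (x, y, z) are illustrative assumptions:

import numpy as np
import pandas as pd

# Hypothetical 50 Hz tri-axial recording, two minutes long.
times = pd.date_range("2021-01-01", periods=6000, freq="20ms")
acm = pd.DataFrame({"time": times,
                    "x": np.random.randn(6000),
                    "y": np.random.randn(6000),
                    "z": np.random.randn(6000)})

energy = create_energy_dataframe(
    acm,
    aggregation_count_threshold=100,  # require >100 samples per window
    max_successive_time_diff="40ms",  # skip gaps wider than two sample periods
    aggregation_time="10s",           # one energy value per 10 s window
)
print(energy.head())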
Example #31
def precip_hyetograph_nrcs(df: pd.DataFrame) -> pd.DataFrame:
    """This function takes the dataframe precipitation table extracted from NOAA Atlas 14 and calculates the nested 
       hyetograph for storm events classified by recurrence intervals. The function first retrieves the ratio of 
       rainfall and incremental intensity; then proceeds to get the ratio, slope, and slope difference; and finally fits 
       a parabolic curve from 0 to 9 hours that passes through the ratios at 0, 6, and 9 hours. The function then fits 
       curves for the remaining data until 12 hours. NOTE: this function is limited to 24 hours and needs to be updated
       to be flexible for dfferent storm durations.
    """
    ratio_to_24h = pd.DataFrame(np.arange(start=0, stop=241, step=1), columns = ['time']).set_index(['time'])
    dif = df.diff()
    dif.at['05m','value'] = df.at['05m','value']
    df['ratio'] = df/df.at['24h','value']
    i_val = {'05m': 12, '10m': 12, '15m': 12, '30m': 4, '60m': 2, '02h': 1, '03h': 1, '06h': 1./3., '12h': 1./6., 
             '24h': 1./12.}
    intensity_val = pd.DataFrame.from_dict(i_val, orient='index')
    df.insert(1, 'increm_intensity', dif['value']*intensity_val[0], True)
    raw_rf = {'time':[0, 6, 9, 10.5, 11, 11.5, 11.75, 11.875, 11.917, 12, 12.083, 12.125, 12.25, 12.5, 13, 13.5, 15, 18, 
                      24]}
    raw_df = pd.DataFrame(raw_rf, columns = ['time'])
    temp_0 = 0.5 - df.sort_values('ratio', ascending=False)['ratio']*0.5  
    temp_12 = 0.5
    temp_24 = 1 - temp_0.sort_values(0, ascending=False)
    raw_df.loc[0:9, 'ratio']= temp_0.values
    raw_df.loc[9:18, 'ratio'] = temp_24.values
    raw_df.loc[9, 'ratio'] = temp_12
    raw_df['slope_raw'] = raw_df['ratio'].diff()/raw_df['time'].diff()
    raw_df.loc[0, 'slope_raw'] = 0
    raw_df['slope_dif'] = raw_df.loc[0:9]['slope_raw'].diff() 
    df2 = raw_df.set_index(['time'])
    a = ((2.0/3.0)*df2.at[9.0, 'ratio']-df2.at[6.0, 'ratio'])/18.0
    b = (df2.at[6.0,'ratio']-36.0*a)/6.0
    low_12h = 4.0*df.loc['24h','value']*(1.0/36.0+2.0/9.0*df.loc['06h','value']/df.loc['24h','value'])
    up_12h = 2.0/3.0*df.loc['24h','value']*(5.0/6.0+2.0/3.0*df.loc['06h','value']/df.loc['24h','value'])
    if b < 0.0:
        a=df2.at[9.0,'ratio']/81.0
        b=0.0
    if 18.0*a+b<0:
        a=(-1.0*b/18.0)
        b=df2.at[9.0,'ratio']/4.5       
    a2 = (9.0/10.5*df2.at[10.5,'ratio']-df2.at[9.0,'ratio'])/13.5
    b2 = (df2.at[9.0,'ratio']-81.0*a2)/9.0
    up_2 = 2.0*df.loc['24h','value']*(0.5-(df2.at[11.5, 'ratio']+3.0*df2.at[10.5, 'ratio'])/4.0)+0.01
    low_2 = 2.0*df.loc['24h','value']*(0.5-(3.0*df2.at[11.5, 'ratio']+df2.at[10.5, 'ratio'])/4.0)+0.01
    if df.loc['02h', 'value']<low_2:
        test1 = low_2
    else:
        test1 = df.loc['02h', 'value']
    if df.loc['02h', 'value']> up_2:
        test2 = up_2
    else:
        test2 = df.loc['02h','value']
    if test1 > test2:
        test3 = test1
    else:
        test3 = test2
    if test2 > test3:
        test4 = test2
    else:
        test4 = test3
    if test4>up_2:
        test_f = up_2
    else:
        test_f = test4
    a3 = 2.0*(df2.at[11.5, 'ratio']-2*(0.5-0.5*test_f/df.loc['24h', 'value'])+ df2.at[10.5, 'ratio'])
    b3 = df2.at[11.5, 'ratio']-df2.at[10.5, 'ratio']-22.0*a3
    c3 = (0.5-0.5*test_f/df.loc['24h','value'])-121.0*a3-11.0*b3  
    ratio_to_24h.loc[0:90, 'ratio'] = a*np.power(ratio_to_24h.loc[0:90].index/10.0, 2)+\
                                      b*ratio_to_24h.loc[0:90].index/10.0
    ratio_to_24h.loc[91:105, 'ratio'] = a2*np.power(ratio_to_24h.loc[91:105].index/10.0, 2)+\
                                        b2*ratio_to_24h.loc[91:105].index/10.0
    ratio_to_24h.loc[106:115, 'ratio'] = a3*np.power(ratio_to_24h.loc[106:115].index/10.0, 2)+\
                                         b3*ratio_to_24h.loc[106:115].index/10.0 + c3
    ratio_to_24h['slope'] = ratio_to_24h['ratio'].diff()/0.1                                                               
    if -0.867*ratio_to_24h.loc[115, 'slope']+0.4337 < 0.399: 
        fac_116 = -0.867*ratio_to_24h.loc[115, 'slope']+0.4337
    else:
        fac_116 = 0.399
    if -0.4917*ratio_to_24h.loc[115,'slope']+0.8182 < 0.799: 
        fac_117 = -0.4917*ratio_to_24h.loc[115,'slope']+0.8182
    else:
        fac_117 = 0.799
    ratio_to_24h.at[116, 'ratio'] = df2.at[11.5, 'ratio']+fac_116*(df2.at[11.75,'ratio']-df2.at[11.5, 'ratio'])
    ratio_to_24h.at[117, 'ratio'] = df2.at[11.5, 'ratio']+fac_117*(df2.at[11.75,'ratio']-df2.at[11.5, 'ratio'])                                                                                                                
    ratio_to_24h.at[118, 'ratio'] = df2.at[11.75, 'ratio']+0.4*(df2.at[11.875,'ratio']-df2.at[11.75, 'ratio'])
    ratio_to_24h.at[119, 'ratio'] = df2.at[11.875, 'ratio']+0.6*(df2.at[11.917,'ratio']-df2.at[11.875, 'ratio'])
    ratio_to_24h.loc[121:240, 'ratio'] = 1-ratio_to_24h.loc[0:119, 'ratio'].sort_index(ascending=False).values
    ratio_to_24h.loc[120, 'ratio'] = ratio_to_24h.at[121, 'ratio']-(df.at['05m', 'ratio']+1.0/5.0*
                                                                   (df.at['10m','ratio']-df.at['05m','ratio']))
    ratio_to_24h.loc[0, 'ratio'] = 0
    ratio_to_24h['slope'] = ratio_to_24h['ratio'].diff()/0.1
    ratio_to_24h.at[0, 'slope'] = 0
    ratio_to_24h['t_step'] = ratio_to_24h.index*0.1
    ratio_to_24h.index = ratio_to_24h.index*0.1
    return ratio_to_24h
Example #32
    def test_diff_mixed_dtype(self):
        df = DataFrame(np.random.randn(5, 3))
        df['A'] = np.array([1, 2, 3, 4, 5], dtype=object)

        result = df.diff()
        self.assertEqual(result[0].dtype, np.float64)
Example #33
class Portfolio:
    """ Class for storing portfolio information

        Parameters
        ----------
        tickers : string
            asset ticker
        earnings_dir: string (default = None)
            address of .xls file storing earnings information
        vol_params : dict with volatility params

        Attributes
        ----------
        open_, close_, high_, low_, volume_ : pandas dataframes
            Data on daily prices
        open_to_open_, open_to_close_, close_to_open_: pandas dataframes
        vol_ : pandas dataframe
            daily vol
        positions : pandas dataframe
            columns = tickers
        trades : pandas dataframe
            columns = ['date', 'ticker', 'transaction'], transaction > 0 for buy and < 0 for sell

        """
    def __init__(self, tickers, vol_params=VOL_PARAMS):
        self.tickers = array(tickers)

        #self.dates = Series(self.OpenPrices.index)

        #self.positions = DataFrame(index = self.dates, columns = tickers, data = 0)

        #self.vol = self.returns.ewm(alpha = volAlpha).std()
        #self.volNormRets = self.returns / self.vol.shift(-1)
        #self.cov = self.returns.rolling(window = covWindow)

    def load_data(self, db_file=None):
        """ Loads online data by default, or loads from database if db_file is specified. """
        open, close, high, low, volume = dict(), dict(), dict(), dict(), dict()
        tickers_loaded = []
        for ticker in self.tickers:
            try:
                if db_file:
                    data = getDataFromDB(ticker, db_file)['daily_data']
                    tickers_loaded.append(ticker)
                else:
                    data = getYahooData(ticker).rename(columns={
                        'Adj Close': 'Close',
                        'Close': 'Unadj Close'
                    })
                    tickers_loaded.append(ticker)
                open[ticker], close[ticker], high[ticker], low[ticker], volume[ticker] = \
                    data['Open'], data['Close'], data['High'], data['Low'], data['Volume']
            except Exception as e:
                print('Could not load data for {0}: {1}'.format(ticker, e))
        print('Loaded data for:\n {}'.format(tickers_loaded))
        self.open_, self.close_, self.high_, self.low_, self.volume_ = \
            DataFrame(open), DataFrame(close), DataFrame(high), DataFrame(low), DataFrame(volume)
        self.dates_ = Series(self.open_.index)

    def compute_returns(self):
        """ Computes open_to_open, close_to_close, open_to_close and overnight returns. """
        self.open_to_open_ = self.open_.diff()
        self.close_to_close_ = self.close_.diff()
        self.open_to_close_ = self.close_ - self.open_
        self.overnight_ = self.open_ - self.close_.shift(1)

    def compute_vol(self,
                    method='exponential',
                    alpha=0.05,
                    window=20,
                    returns='open'):
        """ Computes vol of assets in portfolio. """
        if (method not in ['exponential', 'rolling'
                           ]) or (returns not in ['open', 'close']):
            raise ValueError('Wrong inputs for computing vol!')
        self.vol_params_ = {'method': method, 'returns': returns}
        returns = self.open_to_open_ if returns == 'open' else self.close_to_close_
        if method == 'exponential':
            self.vol_ = returns.ewm(alpha=alpha).std()
            self.vol_params_['alpha'] = alpha
        elif method == 'rolling':
            self.vol_ = returns.rolling(window=window).std()
            self.vol_params_['window'] = window

    def compute_cov(self,
                    method='rolling',
                    alpha=0.025,
                    window=50,
                    returns='open'):
        """ Computes vol of assets in portfolio. """
        if (method not in ['exponential', 'rolling'
                           ]) or (returns not in ['open', 'close']):
            raise ValueError('Wrong inputs for computing vol!')
        self.cov_params_ = {'method': method, 'returns': returns}
        returns = self.open_to_open_ if returns == 'open' else self.close_to_close_
        if method == 'exponential':
            self.cov_ = returns.ewm(alpha=alpha).cov()
            self.corr_ = returns.ewm(alpha=alpha).corr()
            self.cov_params_['alpha'] = alpha
        elif method == 'rolling':
            self.cov_ = returns.rolling(window=window).cov()
            self.corr_ = returns.rolling(window=window).corr()
            self.cov_params_['window'] = window

    def load_earnings_data(self, earnings_dir=EARNINGS_DIR):
        """ Loads earnings data. """
        self.eps_, self.revenue_ = dict(), dict()
        tickers_loaded = []
        for ticker in self.tickers:
            try:
                earnings_file = fullfile(earnings_dir, ticker + '.xls')
                df = read_excel(earnings_file)
                # split dataframe into ETS and load info
                self.eps_[ticker] = df.loc[:'Revenue'].loc[
                    'Wall St.':'Actual'].T
                self.revenue_[ticker] = df.loc['Revenue':].loc[
                    'Wall St.':'Actual'].T
                tickers_loaded.append(ticker)
            except Exception as e:
                print('Could not load earnings data for {0}: {1}'.format(
                    ticker, e))
        print('Loaded earnings data for:\n {}'.format(tickers_loaded))

    def load_transactions(self,
                          transactions,
                          augment_tickers=False,
                          date_format='%Y%m%d'):
        # Loads trade and dividend info from transactions dataframe
        trades, dividends, cash = format_transactions(transactions,
                                                      date_format=date_format)
        # Handle ticker info
        trades_tickers = unique(trades['ticker'])
        if augment_tickers:
            self.tickers = union1d(self.tickers, trades_tickers)
            print('The portfolio was augmented with the following tickers:')
            print(setdiff1d(trades_tickers, self.tickers))
        else:
            print(
                'The trades for the following assets were not included in the portfolio:'
            )
            print(setdiff1d(trades_tickers, self.tickers))

        # Compute positions from trade info.
        df = trades.groupby('ticker').apply(lambda x: x.resample('D').sum())
        self.positions_ = df['shares'].unstack(
            level='ticker').shift(1).fillna(0).cumsum().applymap(int)

        # Load cash flow and cash dataframes
        self.cash_flow_ = cash.resample('D').sum().fillna(0)
        self.cash_ = self.cash_flow_.cumsum()

        # Load dividend dataframe
        self.dividends_ = dividends.resample('D').sum()
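A usage sketch; getYahooData/getDataFromDB come from the surrounding module, so this assumes they are importable and the data source is reachable:

portfolio = Portfolio(["AAPL", "MSFT"])
portfolio.load_data()        # fetches daily OHLCV per ticker
portfolio.compute_returns()  # open-to-open, close-to-close, overnight
portfolio.compute_vol(method="exponential", alpha=0.05, returns="close")
print(portfolio.vol_.tail())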
Example #34
def process_covidtracking_data(data: pd.DataFrame, run_date: pd.Timestamp):
    """ Processes raw COVIDTracking data to be in a form for the GenerativeModel.
        In many cases, we need to correct data errors or obvious outliers."""
    data = data.rename(columns={"state": "region"})
    data["date"] = pd.to_datetime(data["date"], format="%Y%m%d")
    data = data.set_index(["region", "date"]).sort_index()
    data = data[["positive", "total"]]

    # Too little data or unreliable reporting in the data source.
    data = data.drop(["MP", "GU", "AS", "PR", "VI"])

    # On Jun 5 Covidtracking started counting probable cases too
    # which increases the amount by 5014.
    # https://covidtracking.com/screenshots/MI/MI-20200605-184320.png
    data.loc[idx["MI", pd.Timestamp("2020-06-05") :], "positive"] -= 5014

    # From CT: On June 19th, LDH removed 1666 duplicate and non-resident cases
    # after implementing a new de-duplication process.
    data.loc[idx["LA", pd.Timestamp("2020-06-19") :], :] += 1666

    # Now work with daily counts
    data = data.diff().dropna().clip(0, None).sort_index()

    # Michigan missed 6/18 totals and lumped them into 6/19 so we've
    # divided the totals in two and equally distributed to both days.
    data.loc[idx["MI", pd.Timestamp("2020-06-18")], "total"] = 14871
    data.loc[idx["MI", pd.Timestamp("2020-06-19")], "total"] = 14871

    # Note that when we set total to zero, the model ignores that date. See
    # the likelihood function in GenerativeModel.build

    # Huge outlier in NJ causing sampling issues.
    data.loc[idx["NJ", pd.Timestamp("2020-05-11")], :] = 0
    # Same tests and positives, nulling out
    data.loc[idx["NJ", pd.Timestamp("2020-07-25")], :] = 0

    # Huge outlier in CA causing sampling issues.
    data.loc[idx["CA", pd.Timestamp("2020-04-22")], :] = 0

    # Huge outlier in CA causing sampling issues.
    # TODO: generally should handle when # tests == # positives and that
    # is not an indication of positive rate.
    data.loc[idx["SC", pd.Timestamp("2020-06-26")], :] = 0

    # Two days of no new data, then a lump sum on the third day without new total tests
    data.loc[idx["OR", pd.Timestamp("2020-06-26") : pd.Timestamp("2020-06-28")], 'positive'] = 174
    data.loc[idx["OR", pd.Timestamp("2020-06-26") : pd.Timestamp("2020-06-28")], 'total'] = 3296

    #https://twitter.com/OHdeptofhealth/status/1278768987292209154
    data.loc[idx["OH", pd.Timestamp("2020-07-01")], :] = 0
    data.loc[idx["OH", pd.Timestamp("2020-07-09")], :] = 0

    # Nevada didn't report total tests this day
    data.loc[idx["NV", pd.Timestamp("2020-07-02")], :] = 0

    # A bunch of incorrect values for WA data so nulling them out.
    data.loc[idx["WA", pd.Timestamp("2020-06-05") : pd.Timestamp("2020-06-07")], :] = 0
    data.loc[idx["WA", pd.Timestamp("2020-06-20") : pd.Timestamp("2020-06-21")], :] = 0

    # AL reported tests == positives
    data.loc[idx["AL", pd.Timestamp("2020-07-09")], :] = 0

    # Low reported tests
    data.loc[idx["AR", pd.Timestamp("2020-07-10")], :] = 0

    # Positives == tests
    data.loc[idx["MS", pd.Timestamp("2020-07-12")], :] = 0

    # Positive == Tests; lumpy reporting for CT
    data.loc[idx["CT", pd.Timestamp("2020-07-17")], :] = 0
    data.loc[idx["CT", pd.Timestamp("2020-07-21")], :] = 0

    data.loc[idx["DC", pd.Timestamp("2020-08-04")], :] = 0

    # Outlier dates in PA
    data.loc[
        idx[
            "PA",
            [
                pd.Timestamp("2020-06-03"),
                pd.Timestamp("2020-04-21"),
                pd.Timestamp("2020-05-20"),
            ],
        ],
        :,
    ] = 0

    data.loc[idx["HI", pd.Timestamp("2020-08-07")], :] = 0
    data.loc[idx["TX", pd.Timestamp("2020-08-08")], :] = 0
    data.loc[idx["TX", pd.Timestamp("2020-08-11")], :] = 0

    data.loc[idx["DE", pd.Timestamp("2020-08-14")], :] = 0

    data.loc[idx["SD", pd.Timestamp("2020-08-26")], :] = 0

    data.loc[idx["WA", pd.Timestamp("2020-09-22"):pd.Timestamp("2020-09-24")], :] = 0

    # Zero out any rows where positive tests equal or exceed total reported tests
    # Do not act on Wyoming as they report positive==total most days
    filtering_date = pd.Timestamp('2020-07-27')
    zero_filter = (data.positive >= data.total) & \
        (data.index.get_level_values('date') >= filtering_date) & \
        (~data.index.get_level_values('region').isin(['WY']))
    data.loc[zero_filter, :] = 0

    # At the real time of `run_date`, the data for `run_date` is not yet available!
    # Cutting it away is important for backtesting!
    return data.loc[idx[:, :(run_date - pd.DateOffset(1))], ["positive", "total"]]
Example #35
    def test_diff_mixed_dtype(self):
        df = DataFrame(np.random.randn(5, 3))
        df['A'] = np.array([1, 2, 3, 4, 5], dtype=object)

        result = df.diff()
        self.assertEqual(result[0].dtype, np.float64)
Example #36
# Plot buy and sell signals
# up arrow when we buy one share
plt.plot(buys.index[-100:], MLDataFrame.loc[buys.index]['close'][-100:], '^', markersize=10, color='red', lw=2., label='Buy');
# down arrow when we sell one share
plt.plot(sells.index[-100:], MLDataFrame.loc[sells.index]['close'][-100:], 'v', markersize = 10, color='green', lw=2., label='Sell');
plt.ylabel('Price (USD)'); plt.xlabel('Date');
plt.title('Last 100 Buy and Sell signals'); plt.legend(loc='best');
plt.show()

initial_capital = float(10000.0)
positions = DataFrame(index=MLDataFrame.index).fillna(0.0)
portfolio = DataFrame(index=MLDataFrame.index).fillna(0.0)

positions['bitcoin'] = MLDataFrame['positions']
portfolio['positions'] = (positions.multiply(MLDataFrame['close'],axis=0))
portfolio['cash'] = initial_capital - (positions.diff().multiply(MLDataFrame['close'], axis=0)).cumsum()
portfolio['total'] = portfolio['positions'] + portfolio['cash']

plt.plot(portfolio)
plt.legend()
plt.show()



...
prices = MLDataFrame.copy()

prices.drop(['signal'], axis=1, inplace=True)

OHLCV = ['open', 'high', 'low', 'close', 'volume']