Example #1
    def vintage_yr(self, df, x, y):
        df[y] = pd.DatetimeIndex(
            df[x]).year  # convert the column to datetime, then take the year component
        return df
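A minimal usage sketch for the helper above; the column names are made up for illustration, and since the method is shown detached from its class the same operation is written directly against pd.DatetimeIndex:

import pandas as pd

loans = pd.DataFrame({'origination_date': ['2019-03-01', '2020-07-15']})
# Equivalent to vintage_yr(df=loans, x='origination_date', y='vintage'):
loans['vintage'] = pd.DatetimeIndex(loans['origination_date']).year
print(loans)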
Example #2
import matplotlib.pyplot as plt
import mplleaflet
import pandas as pd

df = pd.read_csv(
    'data/C2A2_data/BinnedCsvs_d400/9ad03d45b78bef0fe159c09aef98bda55e72a4bc59168beba135db95.csv'
)
df.head()

# In[2]:

import datetime
df['Date'] = pd.to_datetime(df['Date'])

df['year'] = pd.DatetimeIndex(df['Date']).year
df['month'] = pd.DatetimeIndex(df['Date']).month
df['day'] = pd.DatetimeIndex(df['Date']).day
df['Data_Value'] = df['Data_Value'] * .1
df['Data_Value'].head()

# In[3]:

import numpy as np

days = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
months = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

minTemp = []
maxTemp = []
minX = []
Example #3
    def test_constructor_wrong_precision_raises(self):
        with pytest.raises(ValueError):
            pd.DatetimeIndex(["2000"], dtype="datetime64[us]")
Example #4
def f_dni_extra(times):
    times = pd.DatetimeIndex(times)
    return pvlib.irradiance.extraradiation(times)
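A hedged usage sketch for the wrapper above; pvlib.irradiance.extraradiation existed in older pvlib releases (newer versions renamed it get_extra_radiation), so the call only works against such a version:

import pandas as pd

# Plain date strings are fine: pd.DatetimeIndex inside f_dni_extra coerces them to timestamps.
times = ['2020-06-01 12:00', '2020-06-02 12:00']
dni_extra = f_dni_extra(times)  # requires an older pvlib exposing irradiance.extraradiation
print(dni_extra)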
Example #5
    def test_days_at_time(self, day, day_offset, time_offset, tz, expected):
        days = pd.DatetimeIndex([pd.Timestamp(day, tz=tz)])
        result = days_at_time(days, time_offset, tz, day_offset)[0]
        expected = pd.Timestamp(expected, tz=tz).tz_convert(UTC)
        self.assertEqual(result, expected)
usaGoodsAndServices.set_index('DATE', inplace=True)
usaGoodsAndServicesEXP = usaGoodsAndServices[usaGoodsAndServices['SUBJECT'] ==
                                             'EXP']
usaGoodsAndServicesIMP = usaGoodsAndServices[usaGoodsAndServices['SUBJECT'] ==
                                             'IMP']
usaGoodsAndServices = usaGoodsAndServicesEXP
usaGoodsAndServices.rename(columns={'Value': 'Exports'}, inplace=True)
usaGoodsAndServices['Imports'] = usaGoodsAndServicesIMP[['Value'
                                                         ]].values.flatten()
usaGoodsAndServices['Total Trade Volume'] = usaGoodsAndServices[
    'Exports'] + usaGoodsAndServices['Imports']
usaGoodsAndServices = usaGoodsAndServices[['Total Trade Volume']]

# INFLATION
inflation = pd.read_csv('./Data/Inflation Data (Long).csv')
inflation['DATE'] = pd.DatetimeIndex(inflation['DATE']).year
inflation.set_index('DATE', inplace=True)

# CORRELATIONS (-.63)
flatInflation = inflation.values.flatten()
corrDf = pd.DataFrame()
corrDf['Total Trade Volume'] = usaGoodsAndServices['Total Trade Volume']
corrDf['Inflation'] = flatInflation[10:]
goodsAndServicesInflationCorr = corrDf.corr().values[0][1]
print(goodsAndServicesInflationCorr)

# PLOT TRADE
fig = plt.figure()
ax = plt.gca()
ax2 = ax.twinx()
ax.plot(usaGoodsAndServices.index,
Example #7
    def plot(self, symbol=None, engine='plotly', notebook=False):
        if engine == 'plotly':
            if type(symbol) == str:
                df = pd.DataFrame(self.latest_bar_dict[symbol])
                df.set_index('date', inplace=True)
                df.index = pd.DatetimeIndex(df.index)
                p_symbol = go.Scatter(x=df.index,
                                      y=df.close,
                                      xaxis='x3',
                                      yaxis='y3',
                                      name=symbol)
                p_volume = go.Bar(x=df.index,
                                  y=df['volume'],
                                  xaxis='x3',
                                  yaxis='y5',
                                  opacity=0.5,
                                  name='volume')
                self.data.append(p_symbol)
                self.data.append(p_volume)

            if type(symbol) == list:
                for i in symbol:
                    df = pd.DataFrame(self.latest_bar_dict[i])
                    df.set_index('date', inplace=True)
                    df.index = pd.DatetimeIndex(df.index)
                    p_symbol = go.Scatter(x=df.index,
                                          y=df.close,
                                          xaxis='x3',
                                          yaxis='y3',
                                          name=i)
                    p_volume = go.Bar(x=df.index,
                                      y=df['volume'],
                                      xaxis='x3',
                                      yaxis='y5',
                                      opacity=0.5,
                                      name=i + 'volume')
                    self.data.append(p_symbol)
                    self.data.append(p_volume)

            for i in self.holdings:
                p_holdings = go.Scatter(x=self.holdings.index,
                                        y=self.holdings[i],
                                        xaxis='x2',
                                        yaxis='y2',
                                        name=i)
                self.data.append(p_holdings)

            p_returns = go.Scatter(x=self.enquity_curve.index,
                                   y=self.enquity_curve.returns,
                                   xaxis='x4',
                                   yaxis='y4',
                                   name='returns')
            self.data.append(p_returns)

            layout = go.Layout(xaxis2=dict(
                domain=[0, 1],
                anchor='y2',
            ),
                               xaxis3=dict(domain=[0, 1], anchor='y3'),
                               xaxis4=dict(domain=[0, 1], anchor='y4'),
                               yaxis2=dict(domain=[0, 0.2], ),
                               yaxis3=dict(domain=[0.2, 0.8]),
                               yaxis4=dict(domain=[0.8, 1], ),
                               yaxis5=dict(
                                   domain=[0.2, 0.8],
                                   side='right',
                                   range=[0, 10000000],
                                   overlaying='y3',
                                   tickvals=[0, 1000000, 2000000, 2500000],
                                   showgrid=False))
            fig = go.Figure(data=self.data, layout=layout)
            if notebook:
                import plotly
                plotly.offline.init_notebook_mode()
                py.iplot(fig, filename='testplot', validate=False)
            else:
                py.plot(fig, filename='testplot', validate=False)
Example #8
    def test_constructor_wrong_precision_raises(self):
        msg = "Unexpected value for 'dtype': 'datetime64\\[us\\]'"
        with pytest.raises(ValueError, match=msg):
            pd.DatetimeIndex(["2000"], dtype="datetime64[us]")
Example #9
    def current(self, security, field):
        now_secs = datetime.now().second
        if now_secs < 10:
            # we need to wait 10 seconds after the minute to load current data... this is so the source can be ready.
            time.sleep(10 - now_secs)

        if not isinstance(security, Iterable):
            if security not in self._current_security_bars:
                security_bars = self.history(security, bar_count=1, frequency=self._data_frequency, field=None)
                self._current_security_bars[security] = security_bars

            if self._current_security_bars[security] is None or self._current_security_bars[security].empty:
                quote_date = datetime.now()
                quote_date = quote_date.replace(second=0, microsecond=0)
                self._current_security_bars[security] = pd.DataFrame(index=pd.DatetimeIndex([quote_date]),
                                                                     data={'price': float("nan"),
                                                                           'open': float("nan"),
                                                                           'high': float("nan"),
                                                                           'low': float("nan"),
                                                                           'close': float("nan"),
                                                                           'volume': int(0)})

            # print("price %s " % self._current_security_bars[security].iloc[-1]["price"])
            if self._current_security_bars[security] is not None:  # and (not self._current_security_bars[security].empty or self._current_security_bars[security].iloc[-1]["price"] == float["nan"]):
                last_price_list = self.rh_session.get_quote_list(security.symbol, 'symbol,last_trade_price,bid_price,bid_size,ask_price,ask_size')
                if last_price_list and len(last_price_list) > 0:
                    self._current_security_bars[security]["price"] = float(last_price_list[0][1])
                    self._current_security_bars[security]["bid_price"] = float(last_price_list[0][2])
                    self._current_security_bars[security]["bid_size"] = float(last_price_list[0][3])
                    self._current_security_bars[security]["ask_price"] = float(last_price_list[0][4])
                    self._current_security_bars[security]["ask_size"] = float(last_price_list[0][5])
                else:
                    # self._current_security_bars[security]["price"] = float("nan")
                    self._current_security_bars[security]["bid_price"] = float("nan")
                    self._current_security_bars[security]["bid_size"] = float("nan")
                    self._current_security_bars[security]["ask_price"] = float("nan")
                    self._current_security_bars[security]["ask_size"] = float("nan")

            if not field:
                return self._current_security_bars[security].iloc[-1]

            # log.info("security_bars(%s): %s" % (security.symbol, self._current_security_bars[security]))
            return self._current_security_bars[security].iloc[-1][field]

        else:
            symbol_list_map = {}
            return_bars = {}
            for sec in security:
                symbol_list_map[sec.symbol] = sec
                if sec not in self._current_security_bars:
                    security_bars = self.history(sec, bar_count=1, frequency=self._data_frequency, field=None)

                    if not security_bars or sec not in security_bars:
                        quote_date = datetime.now()
                        quote_date = quote_date.replace(second=0, microsecond=0)
                        security_bars[sec] = pd.DataFrame(index=pd.DatetimeIndex([quote_date]),
                                                          data={'price': float("nan"),
                                                                'open': float("nan"),
                                                                'high': float("nan"),
                                                                'low': float("nan"),
                                                                'close': float("nan"),
                                                                'volume': int(0)})

                    self._current_security_bars[sec] = security_bars[sec]

                if self._current_security_bars[sec] is not None:  # and (not self._current_security_bars[sec].empty or self._current_security_bars[sec].iloc[-1]["price"] == float["nan"]):
                    last_price_list = self.rh_session.get_quote_list(sec.symbol, 'symbol,last_trade_price,bid_price,bid_size,ask_price,ask_size')
                    if last_price_list and len(last_price_list) > 0:
                        if sec in self._current_security_bars:
                            self._current_security_bars[sec]["price"] = float(last_price_list[0][1])
                            self._current_security_bars[sec]["bid_price"] = float(last_price_list[0][2])
                            self._current_security_bars[sec]["bid_size"] = float(last_price_list[0][3])
                            self._current_security_bars[sec]["ask_price"] = float(last_price_list[0][4])
                            self._current_security_bars[sec]["ask_size"] = float(last_price_list[0][5])

                if not field:
                    return_bars[sec] = self._current_security_bars[sec].iloc[-1]
                else:
                    return_bars[sec] = self._current_security_bars[sec].iloc[-1][field]
            return return_bars
    country, station = l[istation].split(',')[3], l[istation].split(',')[2]
    date, startTime = l[idate].split(',')[1], l[idate].split(',')[2]
    dateplus1 = str(pd.to_datetime(date) + pd.Timedelta('1 day'))[:10]
    datestr = date[:4]+date[5:7]+date[8:]
    datestrplus1 = dateplus1[:4]+dateplus1[5:7]+dateplus1[8:]
    
    try:
        ## Then use Pandas to read in the sonde profile
        if footer>0:
            df = pd.read_csv(osf, header=header, skipfooter=footer)
        else:
            df = pd.read_csv(osf, header=header)
        df['lat'], df['lon'] = lat, lon ## Appending to the df makes the lat-lon the right dimension for .nc input
        df = df[df['Duration']>-0.0001] ## Remove "pre-sonde" information
        df['Date_Time'] = list(map(calc_DT, df['Duration']))
        df.set_index(pd.DatetimeIndex(df['Date_Time'], dayfirst=True), inplace=True)
        mth = str(df.index.month[0])
        
        df['O3_sonde'] = 1e9*((df['O3PartialPressure']/1000)/(df['Pressure']*100))
        
        print('<><><><><> '+date+', '+station+', '+country+' <><><><><>')
        
        gcr1, gcr2 = pd.DataFrame(), pd.DataFrame()
        nr = Dataset(path+'ozonesondes/obsPack_output/no_rockets/GEOSChem.ObsPack.'+datestr+'_0000z.nc4')
        wr = Dataset(path+'ozonesondes/obsPack_output/with_rockets/GEOSChem.ObsPack.'+datestr+'_0000z.nc4')

        gcr1['P_gchem'] = wr.variables['pressure'][:]
        gcr1['O3_gcnr'] = nr.variables['O3'][:]*1e9
        gcr1['O3_gcwr'] = wr.variables['O3'][:]*1e9
        gcr1['lat'], gcr1['lon'] = wr.variables['lat'][:], wr.variables['lon'][:]
        gcr1 = gcr1.astype('float')
Example #11
def get_data(data_select, columns_select):
    # create connect
    mydb = sql.connect(host='10.120.14.100',
                       database='hospitaldb',
                       user='******',
                       password='')
    db_cursor = mydb.cursor(buffered=True)

    # get records
    db_cursor.execute(data_select)
    result = db_cursor.fetchall()

    # get column names
    db_cursor.execute(columns_select)
    columns_name = [column[0] for column in db_cursor.fetchall()]

    # turn into a DataFrame
    ctdata = pd.DataFrame(result, columns=columns_name)

    # close connect
    mydb.close()

    # --- ETL for chart---------------------------------------------------------#

    # --- Replace 上午/下午 markers with AM/PM so the strings parse as datetimes ---#
    ctdata['BDATE'] = ctdata['BDATE'].str.replace('上午', 'AM').str.replace(
        '下午', 'PM')
    ctdata['BDATE'] = pd.to_datetime(ctdata['BDATE'],
                                     format="%Y/%m/%d %p %I:%M:%S")
    ctdata['EDATE'] = ctdata['EDATE'].str.replace('上午', 'AM').str.replace(
        '下午', 'PM')
    ctdata['EDATE'] = pd.to_datetime(ctdata['EDATE'],
                                     format="%Y/%m/%d %p %I:%M:%S")
    # --- Replace 上午/下午 markers with AM/PM so the strings parse as datetimes ---#
    # --- Compute elapsed time in seconds ---#
    ctdata['second'] = ctdata['EDATE'] - ctdata['BDATE']
    ctdata['second'] = ctdata['second'].dt.total_seconds()  # timedelta to seconds
    # --- Compute elapsed time in seconds ---#

    # --- Extract YEAR, MONTH & DAY from the check date ---#
    ctdata['CDATE'] = ctdata['CDATE'].str.replace('上午', 'AM').str.replace(
        '下午', 'PM')
    ctdata['CDATE'] = pd.to_datetime(ctdata['CDATE'],
                                     format="%Y/%m/%d %p %I:%M:%S")
    ctdata['YEAR'] = pd.DatetimeIndex(ctdata['CDATE']).year
    ctdata['MONTH'] = pd.DatetimeIndex(ctdata['CDATE']).month
    ctdata['DAY'] = pd.DatetimeIndex(ctdata['CDATE']).day
    # --- Extract YEAR, MONTH & DAY from the check date ---#

    ctdata = ctdata[ctdata['AGE'] <= 100]  # drop records with age over 100
    ctdata = ctdata[(ctdata['second'] >= 300) &
                    (ctdata['second'] <= 6000)]  # keep records between 300 and 6000 seconds
    # ctdata = ctdata.dropna() # remove NULL values

    # --- ETL for chart---------------------------------------------------------#
    df = ctdata[[
        'YEAR', 'MONTH', 'DAY', 'ITEM', 'MODEL_NAME', 'AMOUNT', 'IO', 'SEX',
        'AGE', 'second'
    ]]
    df = pd.DataFrame(df)

    return df
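The AM/PM handling above can be exercised in isolation; a small sketch with made-up strings showing the 上午/下午 replacement followed by parsing with the %p/%I format:

import pandas as pd

s = pd.Series(['2020/01/05 上午 09:30:00', '2020/01/05 下午 02:15:00'])
s = s.str.replace('上午', 'AM').str.replace('下午', 'PM')
parsed = pd.to_datetime(s, format="%Y/%m/%d %p %I:%M:%S")
print(pd.DatetimeIndex(parsed).year, pd.DatetimeIndex(parsed).hour)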
Example #12
import pandas as pd
import lightgbm as lgbm
import warnings
warnings.filterwarnings("ignore")

# read train data file and holiday table
data_df = pd.read_csv("train.csv")
holiday_df = pd.read_csv("holiday.csv")

#Data preprocessing for train data
data_df = data_df.set_index('id')
data_df['date'] = pd.to_datetime((data_df['date']), format='%d/%m/%Y %H:%M')
data_df['hour'] = pd.DatetimeIndex(data_df['date']).hour
data_df['year'] = pd.DatetimeIndex(data_df['date']).year
data_df['month'] = pd.DatetimeIndex(data_df['date']).month
data_df['day'] = pd.DatetimeIndex(data_df['date']).day
data_df['weekday'] = (data_df['date'].dt.dayofweek)
data_df['workingday'] = (data_df['date'].dt.dayofweek < 5).astype(int)

#Import weather condition data from https://www.worldweatheronline.com/
weather_df = pd.read_csv('hkweather.csv')
weather_df['date_time'] = pd.to_datetime((weather_df['date_time']),
                                         format='%Y-%m-%d')
weather_df = weather_df[[
    'date_time', 'cloudcover', 'humidity', 'tempC', 'visibility',
    'winddirDegree', 'windspeedKmph', 'WindChillC'
]]
weather_df['year'] = pd.DatetimeIndex(weather_df['date_time']).year
weather_df['month'] = pd.DatetimeIndex(weather_df['date_time']).month
weather_df['day'] = pd.DatetimeIndex(weather_df['date_time']).day
weather_df = weather_df.drop(columns=['date_time'], axis=1)
    def cusum_filter(self, raw_time_series, threshold, time_stamps=True):
        """
        Snippet 2.4, page 39, The Symmetric Dynamic/Fixed CUSUM Filter.
        The CUSUM filter is a quality-control method, designed to detect a shift in the
        mean value of a measured quantity away from a target value. The filter is set up to
        identify a sequence of upside or downside divergences from any reset level zero.
        We sample a bar t if and only if S_t >= threshold, at which point S_t is reset to 0.
        One practical aspect that makes CUSUM filters appealing is that multiple events are not
        triggered by raw_time_series hovering around a threshold level, which is a flaw suffered by popular
        market signals such as Bollinger Bands. It will require a full run of length threshold for
        raw_time_series to trigger an event.
        Once we have obtained this subset of event-driven bars, we will let the ML algorithm determine
        whether the occurrence of such events constitutes actionable intelligence.
        Below is an implementation of the Symmetric CUSUM filter.
        Note: As per the book this filter is applied to closing prices but we extended it to also work on other
        time series such as volatility.
        :param raw_time_series: (series) of close prices (or other time series, e.g. volatility).
        :param threshold: (float or pd.Series) when the abs(change) is larger than the threshold, the function captures
        it as an event, can be dynamic if threshold is pd.Series
        :param time_stamps: (bool) default is to return a DateTimeIndex, change to false to have it return a list.
        :return: (datetime index vector) vector of datetimes when the events occurred. This is used later to sample.
        """

        t_events = []
        s_pos = 0
        s_neg = 0

        # log returns
        raw_time_series = pd.DataFrame(raw_time_series)  # Convert to DataFrame
        raw_time_series.columns = ['price']
        raw_time_series['log_ret'] = raw_time_series.price.apply(np.log).diff()
        if isinstance(threshold, (float, int)):
            raw_time_series['threshold'] = threshold
        elif isinstance(threshold, pd.Series):
            raw_time_series.loc[threshold.index, 'threshold'] = threshold
        else:
            raise ValueError('threshold is neither float nor pd.Series!')

        raw_time_series = raw_time_series.iloc[1:]  # Drop first na values

        # Get event time stamps for the entire series
        for tup in raw_time_series.itertuples():
            thresh = tup.threshold
            pos = float(s_pos + tup.log_ret)
            neg = float(s_neg + tup.log_ret)
            s_pos = max(0.0, pos)
            s_neg = min(0.0, neg)

            if s_neg < -thresh:
                s_neg = 0
                t_events.append(tup.Index)

            elif s_pos > thresh:
                s_pos = 0
                t_events.append(tup.Index)

        # Return DatetimeIndex or list
        if time_stamps:
            event_timestamps = pd.DatetimeIndex(t_events)
            return event_timestamps

        return t_events
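A usage sketch for the filter above on a synthetic price series; `labeler` is a hypothetical instance of whatever class carries cusum_filter:

import numpy as np
import pandas as pd

idx = pd.date_range('2021-01-01', periods=250, freq='D')
prices = pd.Series(100 * np.exp(np.random.normal(0, 0.01, 250).cumsum()), index=idx)
# labeler is assumed to expose the cusum_filter method shown above
events = labeler.cusum_filter(prices, threshold=0.02)
print(len(events), events[:3])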
Example #14
    def vintage_mnth(self, df, x, y):
        df[y] = pd.DatetimeIndex(
            df[x]
        ).month  # convert the column to datetime, then take the month component
        return df
Example #15
def main(args):

    # writing log file for reproducibility
    logfile = '%s_log.txt' % args.op[:-4]
    os.system("rm %s" % logfile)
    outF = open(logfile, 'w')
    outF.write('Input arguments to getminmax.py: ')
    outF.write('\n')
    outF.write('\n')
    print(args, file=outF)
    outF.close()

    # checking to see if we should use a logarithmic y scale
    plotlog = True
    if args.nl is not None:
        if args.nl == 'y':
            plotlog = False

    # checking see whether filtered quantities should be calculated
    p1dm = True
    p15dm = False
    p30dm = False
    if args.p15dm is not None:
        if args.p15dm == 'y':
            p15dm = True
    if args.p30dm is not None:
        if args.p30dm == 'y':
            p30dm = True

    # reading in file, converting datetime column to datetime type
    if args.infile_snotel is not None:
        dfstreams = pd.read_csv(args.infile_stream)
        dfsnowtel = pd.read_csv(args.infile_snotel)
        df1 = pd.merge(left=dfstreams,
                       right=dfsnowtel,
                       how='left',
                       left_on='datetime',
                       right_on='datetime')
    else:
        df1 = pd.read_csv(args.infile_stream)

    df1['datetime'] = df1['datetime'].astype('datetime64[ns]')
    df1['month'] = pd.DatetimeIndex(df1['datetime']).month
    df1['year'] = pd.DatetimeIndex(df1['datetime']).year
    df1['day'] = pd.DatetimeIndex(df1['datetime']).day
    df1['doy'] = pd.DatetimeIndex(df1['datetime']).dayofyear

    # trimming dataframe to only include bounding dates
    mindate = args.bounds[0]
    maxdate = args.bounds[1]

    minmonth = mindate[0:2]
    minday = mindate[3:5]
    minyear = mindate[6:10]

    maxmonth = maxdate[0:2]
    maxday = maxdate[3:5]
    maxyear = maxdate[6:10]

    years = np.arange(int(minyear), int(maxyear) + 1, 1)

    minvals = []
    maxvals = []
    mindates = []
    maxdates = []
    filter = []

    fig = plt.figure(figsize=(6.5, 5))

    if plotlog:
        plt.yscale('log')

    for i in range(len(years) - 1):
        mindatei = '%s-%s-%s' % (minmonth, minday, years[i])
        maxdatei = '%s-%s-%s' % (maxmonth, maxday, years[i + 1])

        print(mindatei, maxdatei)

        df = df1[(df1.datetime >= mindatei) & (df1.datetime <= maxdatei)]

        try:
            minyear = df['year'].min()
            maxyear = df['year'].max()
            print(minyear, maxyear)

            # restricting range to search for min/max values to avoid values in adjacent water years
            dfmaxsearch = df[((df.month > 10) & (df.year == minyear)) |
                             ((df.month < 9) & (df.year == maxyear))]
            dfminsearch = df[(df.month > 4) & (df.year == maxyear)]
            minval = dfminsearch[args.c1].min()
            maxval = dfmaxsearch[args.c1].max()

            minval_date = dfminsearch[dfminsearch[args.c1] ==
                                      minval]['datetime'].values[0]
            maxval_date = dfmaxsearch[dfmaxsearch[args.c1] ==
                                      maxval]['datetime'].values[0]
        except:
            print('insufficient data for this year: ', mindatei, maxdatei)
            print('Skipping .....')
            continue

        minvals.append(minval)
        maxvals.append(maxval)
        mindates.append(minval_date)
        maxdates.append(maxval_date)
        filter.append('1')

        # creating smoothed curve for entered parameter
        if p30dm:
            sigma = 30  # days
            gridsp = 1  # days
            npts = sigma / 2 / gridsp
            df['smooth_30'] = ndimage.filters.gaussian_filter(
                df[args.c1].values, npts)

            minval30 = df['smooth_30'].min()
            maxval30 = df['smooth_30'].max()
            minval_date30 = df[df['smooth_30'] ==
                               minval30]['datetime'].values[0]
            maxval_date30 = df[df['smooth_30'] ==
                               maxval30]['datetime'].values[0]

            minvals.append(minval30)
            maxvals.append(maxval30)
            mindates.append(minval_date30)
            maxdates.append(maxval_date30)
            filter.append('30')

        if p15dm:
            sigma = 15  # days
            gridsp = 1  # days
            npts = sigma / 2 / gridsp
            df['smooth_15'] = ndimage.filters.gaussian_filter(
                df[args.c1].values, npts)

            minval15 = df['smooth_15'].min()
            maxval15 = df['smooth_15'].max()
            minval_date15 = df[df['smooth_15'] ==
                               minval15]['datetime'].values[0]
            maxval_date15 = df[df['smooth_15'] ==
                               maxval15]['datetime'].values[0]

            minvals.append(minval15)
            maxvals.append(maxval15)
            mindates.append(minval_date15)
            maxdates.append(maxval_date15)
            filter.append('15')

        if plotlog:
            alphan = 0.05
        else:
            alphan = 0.5
        dosy = np.arange(0, len(df))
        df['day_of_snow_year'] = dosy
        if args.mm == '30d':
            if not p30dm:
                print('must include flag "-p30dm y" to plot 30 day mean')
                print('exiting ... ')
                exit()
            plt.plot(df.day_of_snow_year,
                     df.smooth_30,
                     color='gray',
                     linewidth=3,
                     alpha=alphan,
                     label='yearly data')
        elif args.mm == '15d':
            if not p15dm:
                print('must include flag "-p15dm y" to plot 15 day mean')
                print('exiting ... ')
                exit()
            plt.plot(df.day_of_snow_year,
                     df.smooth_15,
                     color='gray',
                     linewidth=3,
                     alpha=alphan,
                     label='yearly data')
        else:
            plt.plot(df.day_of_snow_year,
                     df[args.c1],
                     color='gray',
                     linewidth=3,
                     alpha=alphan,
                     label='yearly data')

    # calculating statistics for each day in the year
    min_vals = []
    max_vals = []
    median_vals = []
    doys = []
    for doy in set(list(df1['doy'].values)):
        print(doy)
        df3 = df1[df1.doy == doy]
        median_val = df3[args.c1].median()
        min_val = df3[args.c1].min()
        max_val = df3[args.c1].max()
        median_vals.append(median_val)
        min_vals.append(min_val)
        max_vals.append(max_val)
        doys.append(doy)

    stats_df = pd.DataFrame({
        'doy': doys,
        'min_vals': min_vals,
        'max_vals': max_vals,
        'median_vals': median_vals
    })

    # extracting a year we know has good/complete data
    mindatei = '%s-%s-%s' % (minmonth, minday, 2005)
    maxdatei = '%s-%s-%s' % (maxmonth, maxday, 2006)
    df = df1[(df1.datetime >= mindatei) & (df1.datetime <= maxdatei)]
    dosy = np.arange(0, len(df))
    df['day_of_snow_year'] = dosy

    min_dosys = []
    max_dosys = []
    med_dosys = []
    dosys = []
    doys = []
    for dosy in df['day_of_snow_year'].values:
        df2 = df[df.day_of_snow_year == dosy]
        doy = df2['doy'].values[0]
        stats2 = stats_df[stats_df.doy == doy]
        min_dosy = stats2['min_vals'].values[0]
        max_dosy = stats2['max_vals'].values[0]
        median_dosy = stats2['median_vals'].values[0]
        min_dosys.append(min_dosy)
        max_dosys.append(max_dosy)
        med_dosys.append(median_dosy)
        dosys.append(dosy)
        doys.append(doy)

    begmonths = df[df.day == 1]
    begs = begmonths['day_of_snow_year'].values
    begs = begs[::2]
    ends = begs + 30
    for i in range(len(begs)):
        plt.axvspan(begs[i], ends[i], alpha=0.05, color='gray')

    if args.plotlowyear is not None:
        if args.plotlowyear == 'y':
            # extracting a year we know had very low snow
            mindatei = '%s-%s-%s' % (minmonth, minday, 2014)
            maxdatei = '%s-%s-%s' % (maxmonth, maxday, 2015)
            df1415 = df1[(df1.datetime >= mindatei)
                         & (df1.datetime <= maxdatei)]
            dosy = np.arange(0, len(df1415))
            df1415['day_of_snow_year'] = dosy
            plt.plot(dosy,
                     df1415[args.c1].values,
                     color='red',
                     linewidth=2,
                     alpha=0.2,
                     label='2014-2015 snow year')
    if args.plothighyear is not None:
        if args.plothighyear == 'y':
            # extracting a year we know had very high snow
            mindatei = '%s-%s-%s' % (minmonth, minday, 2007)
            maxdatei = '%s-%s-%s' % (maxmonth, maxday, 2008)
            df0708 = df1[(df1.datetime >= mindatei)
                         & (df1.datetime <= maxdatei)]
            dosy = np.arange(0, len(df0708))
            df0708['day_of_snow_year'] = dosy
            plt.plot(dosy,
                     df0708[args.c1].values,
                     color='blue',
                     linewidth=2,
                     alpha=0.2,
                     label='2007-2008 snow year')
    if args.plotmedyear is not None:
        if args.plotmedyear == 'y':
            # extracting a year we know had very high snow
            mindatei = '%s-%s-%s' % (minmonth, minday, 2005)
            maxdatei = '%s-%s-%s' % (maxmonth, maxday, 2006)
            df0708 = df1[(df1.datetime >= mindatei)
                         & (df1.datetime <= maxdatei)]
            dosy = np.arange(0, len(df0708))
            df0708['day_of_snow_year'] = dosy
            plt.plot(dosy,
                     df0708[args.c1].values,
                     color='yellow',
                     linewidth=2,
                     alpha=0.4,
                     label='2005-2006 snow year')

    # trimming dataframe to only include bounding dates
    mindate = args.bounds[0]
    maxdate = args.bounds[1]
    minyear = int(mindate[6:10])
    maxyear = int(maxdate[6:10])

    minyear2, maxyear2 = int(df1['year'].min()), int(df1['year'].max())
    if maxyear2 < maxyear:
        maxyear = maxyear2
    if minyear2 > minyear:
        minyear = minyear2

    plt.title('All years %s - %s' % (minyear, maxyear))
    plt.xlabel('day of snow year')
    plt.ylabel('daily %s %s' % (args.c1, args.u1))
    plt.plot(dosys, med_dosys, color='black', linewidth=3, label='median')
    plt.legend(loc='best')
    fig.savefig('%s_all_years.png' % args.op[:-4],
                format='png',
                bbox_inches='tight',
                pad_inches=0.5,
                dpi=300)
    plt.close()

    # putting min max values in one dataframe
    minmaxs = pd.DataFrame({
        'minvals': minvals,
        'maxvals': maxvals,
        'mindates': mindates,
        'maxdates': maxdates,
        'filter': filter
    })
    print(minmaxs)

    if args.mm is not None:
        if args.mm == '30d':
            minmaxs = minmaxs[minmaxs['filter'] == '30']
        elif args.mm == '15d':
            minmaxs = minmaxs[minmaxs['filter'] == '15']
        else:
            minmaxs = minmaxs[minmaxs['filter'] == '1']
    else:
        minmaxs = minmaxs[minmaxs['filter'] == '1']

    # adding julian day values
    minjul = []
    maxjul = []
    yearmins = []
    yearmaxs = []
    for index, row in minmaxs.iterrows():
        mindate, maxdate = row['mindates'], row['maxdates']
        ttmin, ttmax = mindate.timetuple(), maxdate.timetuple()
        jmin = ttmin.tm_yday
        jmax = ttmax.tm_yday
        yearmin = ttmin.tm_year
        yearmax = ttmax.tm_year
        yearmins.append(yearmin)
        yearmaxs.append(yearmax)
        minjul.append(jmin)
        maxjul.append(jmax)

    print(len(minjul))
    print(len(minmaxs))
    minmaxs['minjul'] = minjul
    minmaxs['maxjul'] = maxjul
    minmaxs['yearmin'] = yearmins
    minmaxs['yearmax'] = yearmaxs

    minmaxs['minmonth'] = pd.DatetimeIndex(minmaxs['mindates']).month
    minmaxs['maxmonth'] = pd.DatetimeIndex(minmaxs['maxdates']).month
    minmaxs['minyear'] = pd.DatetimeIndex(minmaxs['mindates']).year
    minmaxs['maxyear'] = pd.DatetimeIndex(minmaxs['maxdates']).year
    minmaxs['minday'] = pd.DatetimeIndex(minmaxs['mindates']).day
    minmaxs['maxday'] = pd.DatetimeIndex(minmaxs['maxdates']).day

    print(minmaxs.info())
    print(minmaxs.mindates)
    print(minmaxs.minvals)

    fig = plt.figure(figsize=(8, 4))
    ax1 = fig.add_subplot(111)

    if plotlog:
        plt.yscale('log')

    ax1.plot_date(x=minmaxs.mindates,
                  y=minmaxs.minvals,
                  marker='o',
                  color='red',
                  label='minimums')
    ax1.plot_date(x=minmaxs.maxdates,
                  y=minmaxs.maxvals,
                  marker='o',
                  color='blue',
                  label='maximums')

    ax1.set_xlabel('date')
    ax1.set_ylabel('daily %s %s' % (args.c1, args.u1))
    ax1.legend(loc='best')
    fig.savefig(args.op,
                format='png',
                bbox_inches='tight',
                pad_inches=0.5,
                dpi=300)
    plt.close()

    fig = plt.figure(figsize=(8, 6))
    ax1 = fig.add_subplot(211)
    ax2 = fig.add_subplot(212)
    if plotlog:
        plt.yscale('log')

    fig = plt.figure(figsize=(8, 4))
    ax1 = fig.add_subplot(111)
    if plotlog:
        plt.yscale('log')

    ax1.scatter(minmaxs.minjul,
                minmaxs.minvals,
                marker='o',
                color='red',
                s=25,
                label='minimums')
    ax1.scatter(minmaxs.maxjul,
                minmaxs.maxvals,
                marker='o',
                color='blue',
                s=25,
                label='maximums')
    con = ax1.scatter(minmaxs.maxjul,
                      minmaxs.maxvals,
                      c=minmaxs.yearmax,
                      s=15,
                      edgecolors='none',
                      cmap='Greys')
    ax1.scatter(minmaxs.minjul,
                minmaxs.minvals,
                c=minmaxs.yearmin,
                s=15,
                edgecolors='none',
                cmap='Greys')

    cbar = fig.colorbar(con)
    cbar.set_label('year')
    ax1.set_xlabel('julian day (days since Jan 1st)')
    ax1.set_ylabel('daily %s %s' % (args.c1, args.u1))
    ax1.set_xlim([0, 365])
    ax1.legend(loc='best')
    fig.savefig('%s_juliandays.png' % args.op[:-4],
                format='png',
                bbox_inches='tight',
                pad_inches=0.5,
                dpi=300)
    plt.close()

    fig = plt.figure(figsize=(8, 4))
    ax1 = fig.add_subplot(121)
    ax2 = fig.add_subplot(122)
    #plt.yscale('log')

    ax1.scatter(minmaxs.minyear,
                minmaxs.minvals,
                marker='o',
                color='red',
                s=25,
                label='minimums')
    ax2.scatter(minmaxs.maxyear,
                minmaxs.maxvals,
                marker='o',
                color='blue',
                s=25,
                label='maximums')
    ax1.set_title('yearly minimum values')
    ax2.set_title('yearly maximum values')
    ax1.set_xlabel('year')
    ax2.set_xlabel('year')
    ax1.set_ylabel('daily %s %s' % (args.c1, args.u1))
    #ax2.set_ylabel('daily %s %s' % (args.c1, args.u1))
    fig.savefig('%s_by_year.png' % args.op[:-4],
                format='png',
                bbox_inches='tight',
                pad_inches=0.5,
                dpi=300)
    plt.close()

    fig = plt.figure(figsize=(8, 4))
    ax1 = fig.add_subplot(121)
    ax2 = fig.add_subplot(122)
    #plt.yscale('log')

    ax1.scatter(minmaxs.minyear,
                minmaxs.minjul,
                marker='o',
                color='red',
                s=25,
                label='minimums')
    ax2.scatter(minmaxs.maxyear,
                minmaxs.maxjul,
                marker='o',
                color='blue',
                s=25,
                label='maximums')
    ax1.set_title('julian day of minimum')
    ax2.set_title('julian day of maximum')
    ax1.set_xlabel('year')
    ax2.set_xlabel('year')
    ax1.set_ylabel('julian day (days since Jan 1st)')
    ax1.set_ylim([0, 360])
    ax2.set_ylim([0, 360])
    #ax2.set_ylabel('daily %s %s' % (args.c1, args.u1))
    fig.savefig('%s_JD_x_year.png' % args.op[:-4],
                format='png',
                bbox_inches='tight',
                pad_inches=0.5,
                dpi=300)
    plt.close()
Example #16
def cacu_BaseData(para_dict):
    '''
    Compute the daily industry weights and returns for the product and the benchmark.
    '''
    out = dict()
# =============================================================================
# # Compute product data
# =============================================================================
    st = datetime.datetime.strptime(para_dict['startdate'],'%Y-%m-%d')
    et = datetime.datetime.strptime(para_dict['enddate'],'%Y-%m-%d')


#    t0=time.time()
#    # Fetch the product NAV: look up the unit NAV by product code and start/end dates
#    fundnav = funddata.getFundNAV1([para_dict['port_code']],para_dict['startdate'],para_dict['enddate'])
#    if fundnav.empty:
#        return dict()
#    print(time.time()-t0)
##    Fetch holdings: look up the product's holdings by product code and start/end dates
#    fund_stk_hlds = funddata.getFundStockHoldings1([para_dict['port_code']],para_dict['startdate'],para_dict['enddate'])
####    # Skip the calculation when holdings are empty
#    if fund_stk_hlds.empty:
#        return dict()
###
#    # Decompose the product's asset-class allocation: A-shares, HK stocks, other
#     # New: use net assets instead of total assets
#    fund_assetallocation,fund_ag_stk_holdings,fund_hk_stk_holdings = funddata.cacu_FundAssetsAllocation(fundnav[['port_code','enddate','fund_cumulative_nav','total_asset']] ,fund_stk_hlds)
    FOFproductList = getfofproductlist()
    if para_dict['port_code'] in FOFproductList:   
        fund_assetallocation = derivativedata.getassetallocation(para_dict['port_code'], para_dict['startdate'], para_dict['enddate'], 'fes_mom_stock_weight')
    else:
        fund_assetallocation = derivativedata.getassetallocation(para_dict['port_code'], para_dict['startdate'], para_dict['enddate'], 'fes_stock_weight')
    if fund_assetallocation.empty:
        return dict()

    dateindex = pd.DatetimeIndex(fund_assetallocation.enddate)

    # Decompose the product's industry allocation and returns
    fund_ag_stock_hy_df = derivativedata.getFundClassWeight(para_dict['port_code'],st,et,para_dict['standard'],'AG')
    if not fund_ag_stock_hy_df.empty:

        # Catch exceptions in case HK or A-share positions are suddenly added or dropped on some day, or data is missing
        try:
            fund_ag_stock_hy_r = pd.pivot_table(fund_ag_stock_hy_df,index='enddate',columns='classname',values='rtn').loc[dateindex,:].replace(np.nan,0)
            fund_ag_stock_hy_w = pd.pivot_table(fund_ag_stock_hy_df,index='enddate',columns='classname',values='weight').loc[dateindex,:].replace(np.nan,0).ffill()
        except:
            fund_ag_stock_hy_r = pd.pivot_table(fund_ag_stock_hy_df,index='enddate',columns='classname',values='rtn').replace(np.nan,0)
            fund_ag_stock_hy_w = pd.pivot_table(fund_ag_stock_hy_df,index='enddate',columns='classname',values='weight').replace(np.nan,0).ffill()
    else:
        fund_ag_stock_hy_r = pd.DataFrame()
        fund_ag_stock_hy_w = pd.DataFrame()


    fund_hk_stock_hy_df = derivativedata.getFundClassWeight(para_dict['port_code'],st,et,para_dict['standard'],'HK')
    if not fund_hk_stock_hy_df.empty:
        # Catch exceptions in case HK or A-share positions are suddenly added or dropped on some day, or data is missing
        try:
            fund_hk_stock_hy_r = pd.pivot_table(fund_hk_stock_hy_df,index='enddate',columns='classname',values='rtn').loc[dateindex,:].replace(np.nan,0)
            fund_hk_stock_hy_w = pd.pivot_table(fund_hk_stock_hy_df,index='enddate',columns='classname',values='weight').loc[dateindex,:].replace(np.nan,0).ffill()
        except:
            fund_hk_stock_hy_r = pd.pivot_table(fund_hk_stock_hy_df,index='enddate',columns='classname',values='rtn').replace(np.nan,0)
            fund_hk_stock_hy_w = pd.pivot_table(fund_hk_stock_hy_df,index='enddate',columns='classname',values='weight').replace(np.nan,0).ffill()

    else:
        fund_hk_stock_hy_r = pd.DataFrame()
        fund_hk_stock_hy_w = pd.DataFrame()

    fund_assetsrtn = funddata.cacu_FundAssetsReturn(fund_assetallocation,fund_ag_stock_hy_w, fund_ag_stock_hy_r,fund_hk_stock_hy_w,fund_hk_stock_hy_r)

    out['fund_ag_stock_hy_r']=fund_ag_stock_hy_r
    out['fund_ag_stock_hy_w']=fund_ag_stock_hy_w
    out['fund_hk_stock_hy_r']=fund_hk_stock_hy_r
    out['fund_hk_stock_hy_w']=fund_hk_stock_hy_w


# =============================================================================
# # Construct a synthetic performance benchmark
# =============================================================================

    benchmarklist_ag = para_dict['benchmarklist_ag']
    benchmarklist_hk = para_dict['benchmarklist_hk']

    # Compute the indexes' industry allocation and returns

    benchmark_ag_stock_hy_df = derivativedata.getIndexClassWeight(benchmarklist_ag,dateindex[0],dateindex[-1],para_dict['standard'])
    if not benchmark_ag_stock_hy_df.empty:

        benchmark_ag_stock_hy_r = pd.pivot_table(benchmark_ag_stock_hy_df,index='enddate',columns='classname',values='rtn').loc[dateindex,:].replace(np.nan,0)
        benchmark_ag_stock_hy_w = pd.pivot_table(benchmark_ag_stock_hy_df,index='enddate',columns='classname',values='weight').loc[dateindex,:].replace(np.nan,0).ffill()

    else:
        benchmark_ag_stock_hy_r = pd.DataFrame()
        benchmark_ag_stock_hy_w = pd.DataFrame()

    

    benchmark_hk_stock_hy_df = derivativedata.getIndexClassWeight(benchmarklist_hk,dateindex[0],dateindex[-1],para_dict['standard'])
    if not benchmark_hk_stock_hy_df.empty:

        benchmark_hk_stock_hy_r = pd.pivot_table(benchmark_hk_stock_hy_df,index='enddate',columns='classname',values='rtn').loc[dateindex,:].replace(np.nan,0)
        benchmark_hk_stock_hy_w = pd.pivot_table(benchmark_hk_stock_hy_df,index='enddate',columns='classname',values='weight').loc[dateindex,:].replace(np.nan,0).ffill()
    else:
        benchmark_hk_stock_hy_r = pd.DataFrame()
        benchmark_hk_stock_hy_w = pd.DataFrame()


    out['benchmark_ag_stock_hy_r']=benchmark_ag_stock_hy_r
    out['benchmark_ag_stock_hy_w']=benchmark_ag_stock_hy_w
    out['benchmark_hk_stock_hy_r']=benchmark_hk_stock_hy_r
    out['benchmark_hk_stock_hy_w']=benchmark_hk_stock_hy_w





    # The benchmark's asset-class weights are kept identical to the product's.
    benchmark_nav = fund_assetallocation[['enddate','ag_stk_wght','hk_stk_wght','other_wght']]


    # Compute the benchmark's returns
    benchmark_ag_stock_hy_w = benchmark_ag_stock_hy_w.replace(np.nan,0)
    ag_stk_rtn=(benchmark_ag_stock_hy_r*benchmark_ag_stock_hy_w.shift(1)).sum(axis = 1).reset_index()
    ag_stk_rtn.columns=['enddate','ag_stock_rtn']

    benchmark_hk_stock_hy_w = benchmark_hk_stock_hy_w.replace(np.nan,0)
    hk_stk_rtn=(benchmark_hk_stock_hy_r*benchmark_hk_stock_hy_w.shift(1)).sum(axis = 1).reset_index()
    hk_stk_rtn.columns=['enddate','hk_stock_rtn']


    benchmark_nav = pd.merge(benchmark_nav,ag_stk_rtn,how='left',on=['enddate']).replace(np.nan,0)
    benchmark_nav = pd.merge(benchmark_nav,hk_stk_rtn,how='left',on=['enddate']).replace(np.nan,0)
    benchmark_nav = pd.merge(benchmark_nav,fund_assetsrtn[['enddate','other_rtn']],how='left',on=['enddate']).replace(np.nan,0)
    benchmark_nav['total_rtn']=benchmark_nav['ag_stk_wght'].shift(1)*benchmark_nav['ag_stock_rtn']+\
                                benchmark_nav['hk_stk_wght'].shift(1)*benchmark_nav['hk_stock_rtn']+\
                                benchmark_nav['other_wght'].shift(1)*benchmark_nav['other_rtn']
    benchmark_nav['total_rtn']=benchmark_nav['total_rtn'].replace(np.nan,0)


    # Construct the benchmark's NAV curve
    benchmark_nav['fund_cumulative_nav']=(benchmark_nav['total_rtn']+1).cumprod()
    benchmark_nav['total_asset']=benchmark_nav['fund_cumulative_nav']
    benchmark_nav['port_code']=benchmarklist_ag+'-'+benchmarklist_hk



    benchmark_assetallocation = benchmark_nav[['port_code', 'enddate', 'fund_cumulative_nav', 'total_asset',
        'ag_stk_wght', 'hk_stk_wght','other_wght', 'total_rtn']]

    benchmark_assetallocation.loc[:,'ag_stk_mkv']=benchmark_assetallocation['total_asset']*benchmark_assetallocation['ag_stk_wght']
    benchmark_assetallocation.loc[:,'hk_stk_mkv']=benchmark_assetallocation['total_asset']*benchmark_assetallocation['hk_stk_wght']
    benchmark_assetallocation.loc[:,'other_mkv']=benchmark_assetallocation['total_asset']*benchmark_assetallocation['other_wght']


    # Compute the benchmark product's returns
    benchmark_assetsrtn = funddata.cacu_FundAssetsReturn(benchmark_assetallocation,benchmark_ag_stock_hy_w, benchmark_ag_stock_hy_r,benchmark_hk_stock_hy_w,benchmark_hk_stock_hy_r)




    # Adjusted Brinson: compute the adjustment coefficient matrix
    kk = brison._cacu_k(fund_assetsrtn['total_rtn_cum'].iloc[-1],benchmark_assetsrtn['total_rtn_cum'].iloc[-1])
    kt=[]
    for i in range(0,len(fund_assetsrtn)):
        k1 = brison._cacu_k(fund_assetsrtn['total_rtn'][i],benchmark_assetsrtn['total_rtn'][i])
        k2 = fund_assetsrtn['enddate'][i]
        ki=[kk,k1,k2]
        kt.append(ki)
    kt_df = pd.DataFrame(kt,columns=['kk','k_t','enddate'])

    out['kt_df']=kt_df

    # Merge the product's and benchmark index's basic data
    out['fund_ctr_df']=pd.merge(fund_assetallocation,fund_assetsrtn,on=['port_code','enddate','total_rtn'])
    out['benchmark_ctr_df']=pd.merge(benchmark_assetallocation,benchmark_assetsrtn,on=['port_code','enddate','total_rtn'])
    return  out
Example #17
    def QueryOrLoad(self, start_date='01-01-2015', end_date='01-01-2017'):

        if path.exists('keys/pecanstkey.txt'):
            initial_path = ''
        else:
            initial_path = '../'

        fp = initial_path + 'data/netloadsolaridentify_{}_{}.csv'.format(
            start_date, end_date)
        fw = initial_path + 'data/weather_netloadsolaridentify_{}_{}.csv'.format(
            start_date, end_date)

        ## Close any open connections.
        import gc
        for obj in gc.get_objects():
            if isinstance(obj, sq.engine.base.Engine):
                obj.dispose()

        # Read the keys
        with open(initial_path + 'keys/pecanstkey.txt', 'r') as f:
            key = f.read().strip()
            f.close()

        # Mayank:
        engine = sq.create_engine(
            "postgresql+psycopg2://{}@dataport.pecanstreet.org:5434/postgres".
            format(key))

        if not path.exists(fp):
            ti = t_clock()
            # Find sites with complete data for the requested time period and join
            print('determining sites with full data...')
            query = """
                SELECT e.dataid
                FROM university.electricity_egauge_15min e
                WHERE local_15min
                BETWEEN '{}' AND '{}'
                AND e.dataid IN (
                    SELECT m.dataid
                    FROM university.metadata m
                    WHERE m.city = 'Austin'
                )
        
                GROUP BY dataid
                HAVING count(e.use) = (
                    SELECT MAX(A.CNT)
                    FROM (
                        SELECT dataid, COUNT(use) as CNT
                        FROM university.electricity_egauge_15min
                        WHERE local_15min
                        BETWEEN '{}' AND '{}'
                        GROUP BY dataid
                    ) AS A
                );
            """.format(start_date, end_date, start_date, end_date)
            metadata = pd.read_sql_query(query, engine)
            duse = metadata.values.squeeze()
            print('querying load and generation data...')
            query = """
                SELECT dataid, local_15min, use, gen 
                FROM university.electricity_egauge_15min
                WHERE local_15min
                BETWEEN '{}' AND '{}'
                AND electricity_egauge_15min.dataid in (
            """.format(start_date, end_date) + ','.join([str(d)
                                                         for d in duse]) + """)
                ORDER BY local_15min;
            """
            load_data = pd.read_sql_query(query, engine)
            tf = t_clock()
            deltat = (tf - ti) / 60.
            print('query of {} values took {:.2f} minutes'.format(
                load_data.size, deltat))
            load_data.to_csv(fp)

            # Weather data
            print('querying ambient temperature data from weather table...')
            locs = pd.read_sql_query(
                """
                SELECT distinct(latitude,longitude), latitude
                FROM university.weather
                ORDER BY latitude
                LIMIT 10;
                """, engine)
            locs['location'] = ['Austin', 'San Diego',
                                'Boulder']  # Ascending order by latitude
            locs.set_index('location', inplace=True)
            weather = pd.read_sql_query(
                """
                SELECT localhour, temperature
                FROM university.weather
                WHERE localhour
                BETWEEN '{}' and '{}'
                AND latitude = {}
                ORDER BY localhour;
                """.format(start_date, end_date,
                           locs.loc['Austin']['latitude']), engine)
            weather.rename(columns={'localhour': 'time'},
                           inplace=True)  # Rename
            weather['time'] = weather['time'].map(
                lambda x: x.replace(tzinfo=None))
            weather['time'] = pd.to_datetime(weather['time'])
            weather.set_index('time', inplace=True)
            weather = weather[~weather.index.duplicated(keep='first')]
            weather = weather.asfreq('15Min').interpolate(
                'linear')  # Upsample from 1hr to 15min to match load data
            weather.to_csv(fw)
        else:
            ti = t_clock()
            load_data = pd.read_csv(fp)
            weather = pd.read_csv(fw, index_col='time')
            tf = t_clock()
            deltat = (tf - ti)
            print('reading {} values from csv took {:.2f} seconds'.format(
                load_data.size, deltat))

        #Load Setup - set index and fill na
        load_data.rename(columns={'local_15min': 'time'}, inplace=True)
        load_data['time'] = pd.DatetimeIndex(load_data['time'])
        load_data.set_index('time', inplace=True)
        load_data.fillna(value=0, inplace=True)
        if 'Unnamed: 0' in load_data.columns:
            del load_data['Unnamed: 0']  # useless column


#        # Weather Setup
#        weather['time'] = pd.DatetimeIndex(weather['time'])
        weather.set_index(pd.DatetimeIndex(weather.index), inplace=True)

        # Redefine start_date and end_date so the weather and load_data datasets share the same time stamps, keeping only the range common to both.
        start_date = max(weather.index[0], load_data.index[0])
        end_date = min(weather.index[-1], load_data.index[-1])
        weather = weather[(weather.index >= pd.to_datetime(start_date))
                          & (weather.index <= pd.to_datetime(end_date))]
        lst = list(
            set(weather.index) - set(load_data['use'].index)
        )  # Interpolating hourly data to 15-minute resolution also interpolates across the DST change hours; this identifies those timestamps so they can be dropped.
        weather = weather.drop(lst)
        load_data = load_data[(load_data.index >= pd.to_datetime(start_date))
                              & (load_data.index <= pd.to_datetime(end_date))]

        # NetLoad
        load_data['netload'] = load_data['use'] - load_data['gen']
        load_data.head()

        self.load_data = load_data
        self.weather = weather
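The final alignment step can be tested on its own; a minimal sketch with synthetic indexes of upsampling hourly data to 15 minutes and dropping the timestamps that the load index never contained:

import numpy as np
import pandas as pd

hourly = pd.Series(np.arange(5.0), index=pd.date_range('2016-01-01', periods=5, freq='H'))
weather_15 = hourly.asfreq('15Min').interpolate('linear')  # upsample to the load resolution
load_idx = pd.date_range('2016-01-01', periods=17, freq='15Min').delete([3, 7])  # pretend two stamps are missing
extra = list(set(weather_15.index) - set(load_idx))  # stamps created only by interpolation
weather_15 = weather_15.drop(extra)
print(len(weather_15))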
Example #18
import dash
import dash_core_components as dcc
import dash_html_components as html
import pandas as pd
import plotly.graph_objs as go
from dash.dependencies import Input, Output


df_aapl_raw = pd.read_csv("data/AAPL.csv")
df_spc_raw = pd.read_csv("data/GSPC.csv")

df = df_aapl_raw[2:].reset_index()
df1 = df_spc_raw[:-3].reset_index()

df['Year'] = pd.DatetimeIndex(df['Date']).year
df1['Year'] = pd.DatetimeIndex(df1['Date']).year

app = dash.Dash(__name__)


app.layout = html.Div([
    html.Div([html.H1("Moving Average Crossover Strategy For Apple Stocks ")], style={'textAlign': "center"}),
    html.Div([
        html.Div([
            html.Div([dcc.Graph(id="my-graph")], className="row", style={"margin": "auto"}),
            html.Div([html.Div(dcc.RangeSlider(id="year selection", updatemode='drag',
                                               marks={i: '{}'.format(i) for i in df.Year.unique().tolist()},
                                               min=df1.Year.min(), max=df1.Year.max(), value=[2014, 2019]),
                               className="row", style={"padding-bottom": 30,"width":"60%","margin":"auto"}),
                      html.Span("Moving Average : Select Window Interval", className="row",
                                style={"padding-top": 30,"padding-left": 40,"display":"block",
Example #19
# Decompose & visualize this time series
# It looks like this may or may not have captured the seasonality
# We will have to see how the models turn out.
# Residuals plot looks pretty good, however.
ts_2014_copy = ts['2015-05-01 00:00:00':'2015-05-09 23:59:59']
plt.rc("figure", figsize=(32, 30))
result = seasonal.seasonal_decompose(ts_2014_copy, model='additive')
result.plot()
plt.show()

# Format ts for forecasting
admits_df = ts_2014_copy.reset_index()
admits_df.columns = ["Time", "Admits"]
admits_df["trend"] = admits_df['Time'].map(result.trend)
admits_df["month_name"] = pd.DatetimeIndex(admits_df['Time']).month_name()
admits_df["month_name"] = admits_df.month_name.astype("category")
admits_df["month"] = pd.DatetimeIndex(admits_df['Time']).month
admits_df["day"] = pd.DatetimeIndex(admits_df["Time"]).day
admits_df["hour"] = pd.DatetimeIndex(admits_df["Time"]).hour

# Allocate 20% of the data for testing
admits_train, admits_test = split_train_test_df(admits_df, 0.2)
ts_train, ts_test = split_train_test_ts(ts_2014, 0.2)

##################################
#  TREND MODEL
##################################
trend_model = api.ols('Admits ~ trend', data=admits_train).fit()
p = trend_model.params
print(trend_model.summary())
Example #20
import numpy as np
import statsmodels.api as sm
import pandas as pd
from datetime import datetime
from statsmodels.tsa.vector_ar.svar_model import SVAR

mdatagen = sm.datasets.macrodata.load().data
mdata = mdatagen[['realgdp','realcons','realinv']]
names = mdata.dtype.names
start = datetime(1959, 3, 31)  # pd.datetime has been removed from recent pandas
end = datetime(2009, 9, 30)
#qtr = pd.DatetimeIndex(start=start, end=end, freq=pd.datetools.BQuarterEnd())
# The DatetimeIndex(start=..., end=..., freq=...) constructor was removed; pd.date_range builds the same index.
qtr = pd.date_range(start=start, end=end, freq='BQ-MAR')
data = pd.DataFrame(mdata, index=qtr)
data = (np.log(data)).diff().dropna()

#define structural inputs
A = np.asarray([[1, 0, 0],['E', 1, 0],['E', 'E', 1]])
B = np.asarray([['E', 0, 0], [0, 'E', 0], [0, 0, 'E']])
A_guess = np.asarray([0.5, 0.25, -0.38])
B_guess = np.asarray([0.5, 0.1, 0.05])
mymodel = SVAR(data, svar_type='AB', A=A, B=B, freq='Q')
res = mymodel.fit(maxlags=3, maxiter=10000, maxfun=10000, solver='bfgs')
res.irf(periods=30).plot(impulse='realgdp', plot_stderr=True,
                         stderr_type='mc', repl=100)
Example #21
def f_linketurbidity(times, latitude, longitude):
    times = pd.DatetimeIndex(times)
    # latitude and longitude must be scalar or else linke turbidity lookup fails
    latitude, longitude = latitude.item(), longitude.item()
    tl = pvlib.clearsky.lookup_linke_turbidity(times, latitude, longitude)
    return tl.values.reshape(1, -1)
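A usage sketch for the wrapper above; latitude and longitude are wrapped as 0-d numpy arrays so that .item() applies, and pvlib.clearsky.lookup_linke_turbidity reads the LinkeTurbidities file bundled with pvlib:

import numpy as np
import pandas as pd

times = pd.date_range('2020-06-01', periods=4, freq='6H', tz='UTC')
lat, lon = np.array(37.8), np.array(-122.3)  # 0-d arrays, so .item() in the wrapper works
tl = f_linketurbidity(times, lat, lon)       # returns an array of shape (1, len(times))
print(tl)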
Example #22
class TestSeriesMap:
    def test_map(self, datetime_series):
        index, data = tm.getMixedTypeDict()

        source = Series(data["B"], index=data["C"])
        target = Series(data["C"][:4], index=data["D"][:4])

        merged = target.map(source)

        for k, v in merged.items():
            assert v == source[target[k]]

        # input could be a dict
        merged = target.map(source.to_dict())

        for k, v in merged.items():
            assert v == source[target[k]]

        # function
        result = datetime_series.map(lambda x: x * 2)
        tm.assert_series_equal(result, datetime_series * 2)

        # GH 10324
        a = Series([1, 2, 3, 4])
        b = Series(["even", "odd", "even", "odd"], dtype="category")
        c = Series(["even", "odd", "even", "odd"])

        exp = Series(["odd", "even", "odd", np.nan], dtype="category")
        tm.assert_series_equal(a.map(b), exp)
        exp = Series(["odd", "even", "odd", np.nan])
        tm.assert_series_equal(a.map(c), exp)

        a = Series(["a", "b", "c", "d"])
        b = Series([1, 2, 3, 4],
                   index=pd.CategoricalIndex(["b", "c", "d", "e"]))
        c = Series([1, 2, 3, 4], index=Index(["b", "c", "d", "e"]))

        exp = Series([np.nan, 1, 2, 3])
        tm.assert_series_equal(a.map(b), exp)
        exp = Series([np.nan, 1, 2, 3])
        tm.assert_series_equal(a.map(c), exp)

        a = Series(["a", "b", "c", "d"])
        b = Series(
            ["B", "C", "D", "E"],
            dtype="category",
            index=pd.CategoricalIndex(["b", "c", "d", "e"]),
        )
        c = Series(["B", "C", "D", "E"], index=Index(["b", "c", "d", "e"]))

        exp = Series(
            pd.Categorical([np.nan, "B", "C", "D"],
                           categories=["B", "C", "D", "E"]))
        tm.assert_series_equal(a.map(b), exp)
        exp = Series([np.nan, "B", "C", "D"])
        tm.assert_series_equal(a.map(c), exp)

    @pytest.mark.parametrize("index", tm.all_index_generator(10))
    def test_map_empty(self, index):
        s = Series(index)
        result = s.map({})

        expected = pd.Series(np.nan, index=s.index)
        tm.assert_series_equal(result, expected)

    def test_map_compat(self):
        # related GH 8024
        s = Series([True, True, False], index=[1, 2, 3])
        result = s.map({True: "foo", False: "bar"})
        expected = Series(["foo", "foo", "bar"], index=[1, 2, 3])
        tm.assert_series_equal(result, expected)

    def test_map_int(self):
        left = Series({"a": 1.0, "b": 2.0, "c": 3.0, "d": 4})
        right = Series({1: 11, 2: 22, 3: 33})

        assert left.dtype == np.float_
        assert issubclass(right.dtype.type, np.integer)

        merged = left.map(right)
        assert merged.dtype == np.float_
        assert isna(merged["d"])
        assert not isna(merged["c"])

    def test_map_type_inference(self):
        s = Series(range(3))
        s2 = s.map(lambda x: np.where(x == 0, 0, 1))
        assert issubclass(s2.dtype.type, np.integer)

    def test_map_decimal(self, string_series):
        from decimal import Decimal

        result = string_series.map(lambda x: Decimal(str(x)))
        assert result.dtype == np.object_
        assert isinstance(result[0], Decimal)

    def test_map_na_exclusion(self):
        s = Series([1.5, np.nan, 3, np.nan, 5])

        result = s.map(lambda x: x * 2, na_action="ignore")
        exp = s * 2
        tm.assert_series_equal(result, exp)

    def test_map_dict_with_tuple_keys(self):
        """
        Due to new MultiIndex-ing behaviour in v0.14.0,
        dicts with tuple keys passed to map were being
        converted to a multi-index, preventing tuple values
        from being mapped properly.
        """
        # GH 18496
        df = pd.DataFrame({"a": [(1, ), (2, ), (3, 4), (5, 6)]})
        label_mappings = {(1, ): "A", (2, ): "B", (3, 4): "A", (5, 6): "B"}

        df["labels"] = df["a"].map(label_mappings)
        df["expected_labels"] = pd.Series(["A", "B", "A", "B"], index=df.index)
        # All labels should be filled now
        tm.assert_series_equal(df["labels"],
                               df["expected_labels"],
                               check_names=False)

    def test_map_counter(self):
        s = Series(["a", "b", "c"], index=[1, 2, 3])
        counter = Counter()
        counter["b"] = 5
        counter["c"] += 1
        result = s.map(counter)
        expected = Series([0, 5, 1], index=[1, 2, 3])
        tm.assert_series_equal(result, expected)

    def test_map_defaultdict(self):
        s = Series([1, 2, 3], index=["a", "b", "c"])
        default_dict = defaultdict(lambda: "blank")
        default_dict[1] = "stuff"
        result = s.map(default_dict)
        expected = Series(["stuff", "blank", "blank"], index=["a", "b", "c"])
        tm.assert_series_equal(result, expected)

    def test_map_dict_subclass_with_missing(self):
        """
        Test Series.map with a dictionary subclass that defines __missing__,
        i.e. sets a default value (GH #15999).
        """
        class DictWithMissing(dict):
            def __missing__(self, key):
                return "missing"

        s = Series([1, 2, 3])
        dictionary = DictWithMissing({3: "three"})
        result = s.map(dictionary)
        expected = Series(["missing", "missing", "three"])
        tm.assert_series_equal(result, expected)

    def test_map_dict_subclass_without_missing(self):
        class DictWithoutMissing(dict):
            pass

        s = Series([1, 2, 3])
        dictionary = DictWithoutMissing({3: "three"})
        result = s.map(dictionary)
        expected = Series([np.nan, np.nan, "three"])
        tm.assert_series_equal(result, expected)

    def test_map_box(self):
        vals = [pd.Timestamp("2011-01-01"), pd.Timestamp("2011-01-02")]
        s = pd.Series(vals)
        assert s.dtype == "datetime64[ns]"
        # boxed value must be Timestamp instance
        res = s.map(
            lambda x: "{0}_{1}_{2}".format(x.__class__.__name__, x.day, x.tz))
        exp = pd.Series(["Timestamp_1_None", "Timestamp_2_None"])
        tm.assert_series_equal(res, exp)

        vals = [
            pd.Timestamp("2011-01-01", tz="US/Eastern"),
            pd.Timestamp("2011-01-02", tz="US/Eastern"),
        ]
        s = pd.Series(vals)
        assert s.dtype == "datetime64[ns, US/Eastern]"
        res = s.map(
            lambda x: "{0}_{1}_{2}".format(x.__class__.__name__, x.day, x.tz))
        exp = pd.Series(["Timestamp_1_US/Eastern", "Timestamp_2_US/Eastern"])
        tm.assert_series_equal(res, exp)

        # timedelta
        vals = [pd.Timedelta("1 days"), pd.Timedelta("2 days")]
        s = pd.Series(vals)
        assert s.dtype == "timedelta64[ns]"
        res = s.map(lambda x: "{0}_{1}".format(x.__class__.__name__, x.days))
        exp = pd.Series(["Timedelta_1", "Timedelta_2"])
        tm.assert_series_equal(res, exp)

        # period
        vals = [
            pd.Period("2011-01-01", freq="M"),
            pd.Period("2011-01-02", freq="M")
        ]
        s = pd.Series(vals)
        assert s.dtype == "Period[M]"
        res = s.map(
            lambda x: "{0}_{1}".format(x.__class__.__name__, x.freqstr))
        exp = pd.Series(["Period_M", "Period_M"])
        tm.assert_series_equal(res, exp)

    def test_map_categorical(self):
        values = pd.Categorical(list("ABBABCD"),
                                categories=list("DCBA"),
                                ordered=True)
        s = pd.Series(values, name="XX", index=list("abcdefg"))

        result = s.map(lambda x: x.lower())
        exp_values = pd.Categorical(list("abbabcd"),
                                    categories=list("dcba"),
                                    ordered=True)
        exp = pd.Series(exp_values, name="XX", index=list("abcdefg"))
        tm.assert_series_equal(result, exp)
        tm.assert_categorical_equal(result.values, exp_values)

        result = s.map(lambda x: "A")
        exp = pd.Series(["A"] * 7, name="XX", index=list("abcdefg"))
        tm.assert_series_equal(result, exp)
        assert result.dtype == np.object

        with pytest.raises(NotImplementedError):
            s.map(lambda x: x, na_action="ignore")

    def test_map_datetimetz(self):
        values = pd.date_range("2011-01-01", "2011-01-02",
                               freq="H").tz_localize("Asia/Tokyo")
        s = pd.Series(values, name="XX")

        # keep tz
        result = s.map(lambda x: x + pd.offsets.Day())
        exp_values = pd.date_range("2011-01-02", "2011-01-03",
                                   freq="H").tz_localize("Asia/Tokyo")
        exp = pd.Series(exp_values, name="XX")
        tm.assert_series_equal(result, exp)

        # change dtype
        # GH 14506 : Returned dtype changed from int32 to int64
        result = s.map(lambda x: x.hour)
        exp = pd.Series(list(range(24)) + [0], name="XX", dtype=np.int64)
        tm.assert_series_equal(result, exp)

        with pytest.raises(NotImplementedError):
            s.map(lambda x: x, na_action="ignore")

        # not vectorized
        def f(x):
            if not isinstance(x, pd.Timestamp):
                raise ValueError
            return str(x.tz)

        result = s.map(f)
        exp = pd.Series(["Asia/Tokyo"] * 25, name="XX")
        tm.assert_series_equal(result, exp)

    @pytest.mark.parametrize(
        "vals,mapping,exp",
        [
            (list("abc"), {
                np.nan: "not NaN"
            }, [np.nan] * 3 + ["not NaN"]),
            (list("abc"), {
                "a": "a letter"
            }, ["a letter"] + [np.nan] * 3),
            (list(range(3)), {
                0: 42
            }, [42] + [np.nan] * 3),
        ],
    )
    def test_map_missing_mixed(self, vals, mapping, exp):
        # GH20495
        s = pd.Series(vals + [np.nan])
        result = s.map(mapping)

        tm.assert_series_equal(result, pd.Series(exp))

    @pytest.mark.parametrize(
        "dti,exp",
        [
            (
                Series([1, 2], index=pd.DatetimeIndex([0, 31536000000])),
                DataFrame(np.repeat([[1, 2]], 2, axis=0), dtype="int64"),
            ),
            (
                tm.makeTimeSeries(nper=30),
                DataFrame(np.repeat([[1, 2]], 30, axis=0), dtype="int64"),
            ),
        ],
    )
    def test_apply_series_on_date_time_index_aware_series(self, dti, exp):
        # GH 25959
        # Calling apply on a localized time series should not cause an error
        index = dti.tz_localize("UTC").index
        result = pd.Series(index).apply(lambda x: pd.Series([1, 2]))
        tm.assert_frame_equal(result, exp)

    def test_apply_scaler_on_date_time_index_aware_series(self):
        # GH 25959
        # Calling apply on a localized time series should not cause an error
        series = tm.makeTimeSeries(nper=30).tz_localize("UTC")
        result = pd.Series(series.index).apply(lambda x: 1)
        tm.assert_series_equal(result, pd.Series(np.ones(30), dtype="int64"))
Ejemplo n.º 23
0
    variable_df2 = pd.concat([counter, variable_df], axis=1)

    return variable_df2


#%% SWE observed data T4
with open("input_SWE_T4.csv") as scvd:
    reader = csv.reader(scvd)
    raw_swe = [r for r in reader]
sc_swe_column = []
for csv_counter1 in range(len(raw_swe)):
    for csv_counter2 in range(2):
        sc_swe_column.append(raw_swe[csv_counter1][csv_counter2])
sc_swe = np.reshape(sc_swe_column, (len(raw_swe), 2))
sc_swe = sc_swe[1:]
sc_swe_obs_date = pd.DatetimeIndex(sc_swe[:, 0])
sc_swe_obs = [float(value) for value in sc_swe[:, 1]]
swe_obs_df = pd.DataFrame(sc_swe_obs, columns=['observed swe'])
swe_obs_df.set_index(sc_swe_obs_date, inplace=True)
#counter = pd.DataFrame(np.arange(0,len(swe_obs_df)), columns = ['counter']); counter.set_index(sc_swe_obs_date,inplace=True)
#swe_obs_df2 = pd.concat([counter, swe_obs_df], axis=1)
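# A hedged alternative (not in the original): the same indexed DataFrame can be built in
# one call, assuming input_SWE_T4.csv keeps its two-column layout (date, SWE) with a
# single header row; 'swe_obs_df_alt' is an illustrative name.
swe_obs_df_alt = pd.read_csv("input_SWE_T4.csv", header=0, names=["date", "observed swe"],
                             parse_dates=["date"], index_col="date")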

maxSwe1 = swe_obs_df['observed swe'][0:50000].max()
maxSwe2 = swe_obs_df['observed swe'][50000:].max()
maxSwe_date1 = swe_obs_df['observed swe'][0:50000].idxmax()
maxSwe_date2 = swe_obs_df['observed swe'][50000:].idxmax()

#%% open scenario-Params test
p1 = [273.16]  #tempCritRain

p2 = [4]  #mw_exp exponent for meltwater flow
Ejemplo n.º 24
0
Archivo: d_atr.py Proyecto: dxcv/qqqhhh
# entry conditions
df = df.dropna(axis=0)
df['高于前两天高点'] = np.where(df.h >= df.nhh, 1, None)   # above the previous two days' high, judged on the current bar
df['低于前两天低点'] = np.where(df.l <= df.nll, 1, None)   # below the previous two days' low
df['开仓'] = np.where(df['高于前两天高点'] == 1, 'bk', None)   # open long ('bk')
df['开仓'] = np.where(df['低于前两天低点'] == 1, 'sk', df['开仓'])   # open short ('sk')
#
'''
-------------------------- end of trend detection ---------------------------------
'''

# do not open a reverse position at the same time as closing an existing one
#df['开仓'] = np.where(df['平仓'].isnull(), df['开仓'], None)

# this step is required: index the frame by date
dates = pd.DatetimeIndex(df.date)
df.index = dates
df = df.drop('date', axis=1)

df['bk总手数'] = 0
df['bkprice'] = 0
#df['b持仓均价'] = 0  # 
#df['b保证金'] = 0  #
#df['b合约金额'] = 0  # e.g. rebar bought at 3000 points: one contract is 10 tons, about 30,000 yuan
df['是b止损'] = None
df['b止损'] = None
df['sk总手数'] = 0
df['skprice'] = 0 
#df['s保证金'] = 0
#df['s持仓均价'] = 0  #
#df['s合约金额'] = 0
Ejemplo n.º 25
0
def dekad_index(begin, end=None):
    """Creates a pandas datetime index on a decadal basis.

    Parameters
    ----------
    begin : datetime
        Datetime index start date.
    end : datetime, optional
        Datetime index end date, set to current date if None.

    Returns
    -------
    dtindex : pandas.DatetimeIndex
        Dekadal datetime index.
    """

    if end is None:
        end = datetime.now()

    mon_begin = datetime(begin.year, begin.month, 1)
    mon_end = datetime(end.year, end.month, 1)

    daterange = pd.date_range(mon_begin, mon_end, freq='MS')

    dates = []

    for i, dat in enumerate(daterange):
        lday = calendar.monthrange(dat.year, dat.month)[1]
        if i == 0 and begin.day > 1:
            if begin.day < 11:
                if daterange.size == 1:
                    if end.day < 11:
                        dekads = [10]
                    elif end.day >= 11 and end.day < 21:
                        dekads = [10, 20]
                    else:
                        dekads = [10, 20, lday]
                else:
                    dekads = [10, 20, lday]
            elif begin.day >= 11 and begin.day < 21:
                if daterange.size == 1:
                    if end.day < 21:
                        dekads = [20]
                    else:
                        dekads = [20, lday]
                else:
                    dekads = [20, lday]
            else:
                dekads = [lday]
        elif i == (len(daterange) - 1) and end.day < 21:
            if end.day < 11:
                dekads = [10]
            else:
                dekads = [10, 20]
        else:
            dekads = [10, 20, lday]

        for j in dekads:
            dates.append(datetime(dat.year, dat.month, j))

    dtindex = pd.DatetimeIndex(dates)

    return dtindex
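# A usage sketch (not part of the original excerpt); the imports below are the ones
# the function itself relies on (calendar is used inside dekad_index).
from datetime import datetime
import calendar
import pandas as pd

example_idx = dekad_index(datetime(2020, 1, 5), end=datetime(2020, 3, 31))
# -> dekad end dates: 2020-01-10/20/31, 2020-02-10/20/29, 2020-03-10/20/31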
Ejemplo n.º 26
0
ax = sns.barplot(x="birthState", y="points", hue="playerID", data=asd)
plt.legend('')
plt.show()

#.sort_values(["score"],ascending = False).reset_index()
#pd.set_option('display.max_columns', None)
#asd = asd[asd.birthState == "NC"]
#print(asd)
"""
"""
nba_df = nba_df[nba_df.birthCountry == 'USA']
nba_df = nba_df[nba_df.birthDate != "0000-00-00"]
nba_df = nba_df[nba_df.height > 0]
print(nba_df)
nba_df['born_year'] = pd.DatetimeIndex(nba_df['birthDate']).year
nba_df['age'] = nba_df['year'] - nba_df['born_year']
print(nba_df)
nba_ht = nba_df[[
    'points', 'rebounds', 'assists', 'age', 'playerID', 'pos', 'birthState',
    'year', 'height', 'threeMade', 'steals'
]]
print(nba_ht)
print(nba_ht.corr(method='pearson'))

#ax = sns.heatmap(heat_data,cmap="YlGnBu",linewidths=.5)

test = nba_df[['points', 'rebounds', 'assists', 'height']]
"""
corr = test.corr()
corr[np.abs(corr)<.2] = 0
dataframe['consumption_dayahead_increase_lastWeek'] = np.nan
dataframe['price_intraday_increase_lastWeek'] = np.nan
dataframe['consumption_intraday_increase_lastWeek'] = np.nan

for index, row in dataframe.iterrows():
    # skip the first 8 days
    daysToSkip = 8
    rowsToSkip = (96 * daysToSkip)
    if index > rowsToSkip - 1:
        hour = dataframe.iloc[index]['datetime'].hour
        minute = dataframe.iloc[index]['datetime'].minute
        # go to the index of the last day at 11:30
        lastAuctionDate = pd.to_datetime(
            dataframe.iloc[index]['datetime'].date() - datetime.timedelta(1))
        indexBeforePrediction = dataframe.loc[
            (pd.DatetimeIndex(dataframe['datetime']).hour == 11)
            & (pd.DatetimeIndex(dataframe['datetime']).minute == 30) &
            (pd.DatetimeIndex(dataframe['datetime']).day
             == lastAuctionDate.day) & (pd.DatetimeIndex(
                 dataframe['datetime']).month == lastAuctionDate.month) &
            (pd.DatetimeIndex(dataframe['datetime']).year
             == lastAuctionDate.year)].index[0]
        indexBeforePrediction = indexBeforePrediction.astype(np.int32)

        lastWeekdf = dataframe[indexBeforePrediction -
                               671:indexBeforePrediction + 1]
        lastWeekdf = lastWeekdf.loc[
            (pd.DatetimeIndex(lastWeekdf['datetime']).hour == hour)
            & (pd.DatetimeIndex(lastWeekdf['datetime']).minute == minute)]
        lastDaydf = lastWeekdf.iloc[-1]
        # last 24 hours before 11:45 on the day of the last auction
Ejemplo n.º 28
0
def clean(output, data, kind=None, debug=True):
    """
    Checks the output and fix common errors:
        - liquidity
        - missed dates
        - exposure
        - normalization
    :param output:
    :param data:
    :param kind:
    :return:
    """
    import qnt.stats as qns
    import qnt.exposure as qne
    from qnt.data.common import ds, f, track_event

    if kind is None:
        kind = data.name

    output = output.drop(ds.FIELD, errors='ignore')

    with LogSettings(err2info=True):
        log_info("Output cleaning...")

        single_day = ds.TIME not in output.dims

        if not single_day:
            track_event("OUTPUT_CLEAN")

        log_info("fix uniq")
        if not single_day:
            # uniq time fix
            val, idx = np.unique(output.time, return_index=True)
            output = output.isel(time=idx)
        # uniq asset fix
        val, idx = np.unique(output.asset, return_index=True)
        output = output.isel(asset=idx)

        if single_day:
            output = output.drop(ds.TIME, errors='ignore')
            output = xr.concat([output],
                               pd.Index([data.coords[ds.TIME].values.max()],
                                        name=ds.TIME))
        else:
            log_info("ffill if the current price is None...")
            output = output.fillna(0)
            output = output.where(np.isfinite(data.sel(field='close')))
            output = output.ffill('time')
            output = output.fillna(0)

        if kind == "stocks" or kind == "stocks_long" \
                or kind == 'crypto_daily' or kind == 'cryptodaily'\
                or kind == 'crypto_daily_long' or kind == 'crypto_daily_long_short':
            log_info("Check liquidity...")
            non_liquid = qns.calc_non_liquid(data, output)
            if len(non_liquid.coords[ds.TIME]) > 0:
                log_info("WARNING! Strategy trades non-liquid assets.")
                log_info("Fix liquidity...")
                is_liquid = data.sel(field=f.IS_LIQUID)
                is_liquid = xr.align(is_liquid, output, join='right')[0]
                output = xr.where(is_liquid == 0, 0, output)
            log_info("Ok.")

        if not single_day:
            log_info("Check missed dates...")
            missed_dates = qns.find_missed_dates(output, data)
            if len(missed_dates) > 0:
                log_info("WARNING! Output contain missed dates.")
                log_info("Adding missed dates and set zero...")
                add = xr.concat([output.isel(time=-1)] * len(missed_dates),
                                pd.DatetimeIndex(missed_dates, name="time"))
                add = xr.full_like(add, np.nan)
                output = xr.concat([output, add], dim='time')
                output = output.fillna(0)
                if kind == "stocks" or kind == "stocks_long" \
                        or kind == 'crypto_daily' or kind == 'cryptodaily' \
                        or kind == 'crypto_daily_long' or kind == 'crypto_daily_long_short':
                    output = output.where(data.sel(field='is_liquid') > 0)
                output = output.dropna('asset', 'all').dropna('time',
                                                              'all').fillna(0)
                output = normalize(output)
            else:
                log_info("Ok.")

        if kind == 'stocks_long' or kind == 'crypto_daily_long':
            log_info("Check positive positions...")
            neg = output.where(output < 0).dropna(ds.TIME, 'all')
            if len(neg.time) > 0:
                log_info(
                    "WARNING! Output contains negative positions. Clean...")
                output = output.where(output >= 0).fillna(0)
            else:
                log_info("Ok.")

        if kind == "stocks" or kind == "stocks_long":
            log_info("Check exposure...")
            if not qns.check_exposure(output):
                log_info("Cut big positions...")
                output = qne.cut_big_positions(output)
                log_info("Check exposure...")
                if not qns.check_exposure(output):
                    log_info("Drop bad days...")
                    output = qne.drop_bad_days(output)

        if kind == "crypto":
            log_info("Check BTC...")
            if output.where(output != 0).dropna(
                    "asset",
                    "all").coords[ds.ASSET].values.tolist() != ['BTC']:
                log_info("WARNING! Output contains not only BTC.")
                log_info("Fixing...")
                output = output.sel(asset=['BTC'])
            else:
                log_info("Ok.")

        log_info("Normalization...")
        output = normalize(output)
        log_info("Output cleaning is complete.")

    return output
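# A minimal usage sketch (not from the original): `weights` and `market_data` below are
# hypothetical xarray.DataArrays in the layout this function expects (shared time/asset
# dims, with `market_data` carrying 'close' and 'is_liquid' fields), so the call is shown
# commented out rather than as runnable code.
#
# cleaned = clean(weights, market_data, kind="stocks")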
Ejemplo n.º 29
0
df.loc[::3, ::3]
df.iloc[::3, ::3]
df.iloc[::3, ['mpg', 'cyl']]  # error: iloc takes integer positions, not column labels
df.iloc[-1:, -1:]  #last row & last col
df.iloc[-1:, ]  #last row
df.iloc[:, -1:]  #last col
df.iloc[[3, 4], [1, 2]]  #access by integer pos
# retrieving all rows and some columns by iloc method
df.iloc[:, [1, 2]]
df.ix["Car1"]
df.ix[["Car1", 'Car5']]
df.ix[["Car1", 'Car5'], ['mpg', 'cyl']]

#%%
#index
df2 = df.set_index(pd.DatetimeIndex(df['mDate']), drop=False, inplace=False)
df2
#between certain dates
df2['2018-1-1':'2020-1-1']

df.columns
df.index
df3 = df.set_index(['mDate'], append=True, inplace=False)
df3.head()

#%%%
#df.isin()
filter1 = df["gear"].isin([4])
filter2 = df["cyl"].isin([4, 6])  #4,6,8
df[filter1]
df[filter1 & filter2]
Ejemplo n.º 30
0
    def test_combine_first_timezone(self):
        # see gh-7630
        data1 = pd.to_datetime('20100101 01:01').tz_localize('UTC')
        df1 = pd.DataFrame(columns=['UTCdatetime', 'abc'],
                           data=data1,
                           index=pd.date_range('20140627', periods=1))
        data2 = pd.to_datetime('20121212 12:12').tz_localize('UTC')
        df2 = pd.DataFrame(columns=['UTCdatetime', 'xyz'],
                           data=data2,
                           index=pd.date_range('20140628', periods=1))
        res = df2[['UTCdatetime']].combine_first(df1)
        exp = pd.DataFrame(
            {
                'UTCdatetime': [
                    pd.Timestamp('2010-01-01 01:01', tz='UTC'),
                    pd.Timestamp('2012-12-12 12:12', tz='UTC')
                ],
                'abc': [pd.Timestamp('2010-01-01 01:01:00', tz='UTC'), pd.NaT]
            },
            columns=['UTCdatetime', 'abc'],
            index=pd.date_range('20140627', periods=2, freq='D'))
        tm.assert_frame_equal(res, exp)
        assert res['UTCdatetime'].dtype == 'datetime64[ns, UTC]'
        assert res['abc'].dtype == 'datetime64[ns, UTC]'

        # see gh-10567
        dts1 = pd.date_range('2015-01-01', '2015-01-05', tz='UTC')
        df1 = pd.DataFrame({'DATE': dts1})
        dts2 = pd.date_range('2015-01-03', '2015-01-05', tz='UTC')
        df2 = pd.DataFrame({'DATE': dts2})

        res = df1.combine_first(df2)
        tm.assert_frame_equal(res, df1)
        assert res['DATE'].dtype == 'datetime64[ns, UTC]'

        dts1 = pd.DatetimeIndex(
            ['2011-01-01', 'NaT', '2011-01-03', '2011-01-04'], tz='US/Eastern')
        df1 = pd.DataFrame({'DATE': dts1}, index=[1, 3, 5, 7])
        dts2 = pd.DatetimeIndex(['2012-01-01', '2012-01-02', '2012-01-03'],
                                tz='US/Eastern')
        df2 = pd.DataFrame({'DATE': dts2}, index=[2, 4, 5])

        res = df1.combine_first(df2)
        exp_dts = pd.DatetimeIndex([
            '2011-01-01', '2012-01-01', 'NaT', '2012-01-02', '2011-01-03',
            '2011-01-04'
        ],
                                   tz='US/Eastern')
        exp = pd.DataFrame({'DATE': exp_dts}, index=[1, 2, 3, 4, 5, 7])
        tm.assert_frame_equal(res, exp)

        # different tz
        dts1 = pd.date_range('2015-01-01', '2015-01-05', tz='US/Eastern')
        df1 = pd.DataFrame({'DATE': dts1})
        dts2 = pd.date_range('2015-01-03', '2015-01-05')
        df2 = pd.DataFrame({'DATE': dts2})

        # if df1 doesn't have NaN, keep its dtype
        res = df1.combine_first(df2)
        tm.assert_frame_equal(res, df1)
        assert res['DATE'].dtype == 'datetime64[ns, US/Eastern]'

        dts1 = pd.date_range('2015-01-01', '2015-01-02', tz='US/Eastern')
        df1 = pd.DataFrame({'DATE': dts1})
        dts2 = pd.date_range('2015-01-01', '2015-01-03')
        df2 = pd.DataFrame({'DATE': dts2})

        res = df1.combine_first(df2)
        exp_dts = [
            pd.Timestamp('2015-01-01', tz='US/Eastern'),
            pd.Timestamp('2015-01-02', tz='US/Eastern'),
            pd.Timestamp('2015-01-03')
        ]
        exp = pd.DataFrame({'DATE': exp_dts})
        tm.assert_frame_equal(res, exp)
        assert res['DATE'].dtype == 'object'