def vintage_yr(self, df, x, y):
    # Convert column x to a datetime index and extract the year into column y.
    df[y] = pd.DatetimeIndex(df[x]).year
    return df
import matplotlib.pyplot as plt
import mplleaflet
import pandas as pd

df = pd.read_csv('data/C2A2_data/BinnedCsvs_d400/9ad03d45b78bef0fe159c09aef98bda55e72a4bc59168beba135db95.csv')
df.head()

# In[2]:

import datetime

df['Date'] = pd.to_datetime(df['Date'])
df['year'] = pd.DatetimeIndex(df['Date']).year
df['month'] = pd.DatetimeIndex(df['Date']).month
df['day'] = pd.DatetimeIndex(df['Date']).day
df['Data_Value'] = df['Data_Value'] * .1
df['Data_Value'].head()

# In[3]:

import numpy as np

days = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
months = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
minTemp = []
maxTemp = []
minX = []
def test_constructor_wrong_precision_raises(self):
    with pytest.raises(ValueError):
        pd.DatetimeIndex(["2000"], dtype="datetime64[us]")
def f_dni_extra(times):
    times = pd.DatetimeIndex(times)
    return pvlib.irradiance.extraradiation(times)
def test_days_at_time(self, day, day_offset, time_offset, tz, expected):
    days = pd.DatetimeIndex([pd.Timestamp(day, tz=tz)])
    result = days_at_time(days, time_offset, tz, day_offset)[0]
    expected = pd.Timestamp(expected, tz=tz).tz_convert(UTC)
    self.assertEqual(result, expected)
usaGoodsAndServices.set_index('DATE', inplace=True)
usaGoodsAndServicesEXP = usaGoodsAndServices[usaGoodsAndServices['SUBJECT'] == 'EXP']
usaGoodsAndServicesIMP = usaGoodsAndServices[usaGoodsAndServices['SUBJECT'] == 'IMP']
usaGoodsAndServices = usaGoodsAndServicesEXP
usaGoodsAndServices.rename(columns={'Value': 'Exports'}, inplace=True)
usaGoodsAndServices['Imports'] = usaGoodsAndServicesIMP[['Value']].values.flatten()
usaGoodsAndServices['Total Trade Volume'] = usaGoodsAndServices['Exports'] + usaGoodsAndServices['Imports']
usaGoodsAndServices = usaGoodsAndServices[['Total Trade Volume']]

# INFLATION
inflation = pd.read_csv('./Data/Inflation Data (Long).csv')
inflation['DATE'] = pd.DatetimeIndex(inflation['DATE']).year
inflation.set_index('DATE', inplace=True)

# CORRELATIONS (-.63)
flatInflation = inflation.values.flatten()
corrDf = pd.DataFrame()
corrDf['Total Trade Volume'] = usaGoodsAndServices['Total Trade Volume']
corrDf['Inflation'] = flatInflation[10:]
goodsAndServicesInflationCorr = corrDf.corr().values[0][1]
print(goodsAndServicesInflationCorr)

# PLOT TRADE
fig = plt.figure()
ax = plt.gca()
ax2 = ax.twinx()
ax.plot(usaGoodsAndServices.index,
def plot(self, symbol=None, engine='plotly', notebook=False):
    if engine == 'plotly':
        if isinstance(symbol, str):
            df = pd.DataFrame(self.latest_bar_dict[symbol])
            df.set_index('date', inplace=True)
            df.index = pd.DatetimeIndex(df.index)
            p_symbol = go.Scatter(x=df.index, y=df.close,
                                  xaxis='x3', yaxis='y3', name=symbol)
            p_volume = go.Bar(x=df.index, y=df['volume'],
                              xaxis='x3', yaxis='y5', opacity=0.5,
                              name='volume')
            self.data.append(p_symbol)
            self.data.append(p_volume)
        if isinstance(symbol, list):
            for i in symbol:
                df = pd.DataFrame(self.latest_bar_dict[i])
                df.set_index('date', inplace=True)
                df.index = pd.DatetimeIndex(df.index)
                p_symbol = go.Scatter(x=df.index, y=df.close,
                                      xaxis='x3', yaxis='y3', name=i)
                p_volume = go.Bar(x=df.index, y=df['volume'],
                                  xaxis='x3', yaxis='y5', opacity=0.5,
                                  name=i + 'volume')
                self.data.append(p_symbol)
                self.data.append(p_volume)
        for i in self.holdings:
            p_holdings = go.Scatter(x=self.holdings.index, y=self.holdings[i],
                                    xaxis='x2', yaxis='y2', name=i)
            self.data.append(p_holdings)
        p_returns = go.Scatter(x=self.enquity_curve.index,
                               y=self.enquity_curve.returns,
                               xaxis='x4', yaxis='y4', name='returns')
        self.data.append(p_returns)
        layout = go.Layout(
            xaxis2=dict(domain=[0, 1], anchor='y2'),
            xaxis3=dict(domain=[0, 1], anchor='y3'),
            xaxis4=dict(domain=[0, 1], anchor='y4'),
            yaxis2=dict(domain=[0, 0.2]),
            yaxis3=dict(domain=[0.2, 0.8]),
            yaxis4=dict(domain=[0.8, 1]),
            yaxis5=dict(domain=[0.2, 0.8], side='right',
                        range=[0, 10000000], overlaying='y3',
                        tickvals=[0, 1000000, 2000000, 2500000],
                        showgrid=False))
        fig = go.Figure(data=self.data, layout=layout)
        if notebook:
            import plotly
            plotly.offline.init_notebook_mode()
            py.iplot(fig, filename='testplot', validate=False)
        else:
            py.plot(fig, filename='testplot', validate=False)
def test_constructor_wrong_precision_raises(self):
    msg = "Unexpected value for 'dtype': 'datetime64\\[us\\]'"
    with pytest.raises(ValueError, match=msg):
        pd.DatetimeIndex(["2000"], dtype="datetime64[us]")
def current(self, security, field):
    now_secs = datetime.now().second
    if now_secs < 10:
        # Wait until 10 seconds past the minute to load current data,
        # so the source can be ready.
        time.sleep(10 - now_secs)
    if not isinstance(security, Iterable):
        if security not in self._current_security_bars:
            security_bars = self.history(security, bar_count=1,
                                         frequency=self._data_frequency,
                                         field=None)
            self._current_security_bars[security] = security_bars
            if self._current_security_bars[security] is None \
                    or self._current_security_bars[security].empty:
                quote_date = datetime.now().replace(second=0, microsecond=0)
                self._current_security_bars[security] = pd.DataFrame(
                    index=pd.DatetimeIndex([quote_date]),
                    data={'price': float("nan"), 'open': float("nan"),
                          'high': float("nan"), 'low': float("nan"),
                          'close': float("nan"), 'volume': int(0)})
        if self._current_security_bars[security] is not None:
            last_price_list = self.rh_session.get_quote_list(
                security.symbol,
                'symbol,last_trade_price,bid_price,bid_size,ask_price,ask_size')
            if last_price_list and len(last_price_list) > 0:
                self._current_security_bars[security]["price"] = float(last_price_list[0][1])
                self._current_security_bars[security]["bid_price"] = float(last_price_list[0][2])
                self._current_security_bars[security]["bid_size"] = float(last_price_list[0][3])
                self._current_security_bars[security]["ask_price"] = float(last_price_list[0][4])
                self._current_security_bars[security]["ask_size"] = float(last_price_list[0][5])
            else:
                # self._current_security_bars[security]["price"] = float("nan")
                self._current_security_bars[security]["bid_price"] = float("nan")
                self._current_security_bars[security]["bid_size"] = float("nan")
                self._current_security_bars[security]["ask_price"] = float("nan")
                self._current_security_bars[security]["ask_size"] = float("nan")
        if not field:
            return self._current_security_bars[security].iloc[-1]
        return self._current_security_bars[security].iloc[-1][field]
    else:
        symbol_list_map = {}
        return_bars = {}
        for sec in security:
            symbol_list_map[sec.symbol] = sec
            if sec not in self._current_security_bars:
                security_bars = self.history(sec, bar_count=1,
                                             frequency=self._data_frequency,
                                             field=None)
                if not security_bars or sec not in security_bars:
                    quote_date = datetime.now().replace(second=0, microsecond=0)
                    security_bars[sec] = pd.DataFrame(
                        index=pd.DatetimeIndex([quote_date]),
                        data={'price': float("nan"), 'open': float("nan"),
                              'high': float("nan"), 'low': float("nan"),
                              'close': float("nan"), 'volume': int(0)})
                self._current_security_bars[sec] = security_bars[sec]
            if self._current_security_bars[sec] is not None:
                last_price_list = self.rh_session.get_quote_list(
                    sec.symbol,
                    'symbol,last_trade_price,bid_price,bid_size,ask_price,ask_size')
                if last_price_list and len(last_price_list) > 0:
                    if sec in self._current_security_bars:
                        self._current_security_bars[sec]["price"] = float(last_price_list[0][1])
                        self._current_security_bars[sec]["bid_price"] = float(last_price_list[0][2])
                        self._current_security_bars[sec]["bid_size"] = float(last_price_list[0][3])
                        self._current_security_bars[sec]["ask_price"] = float(last_price_list[0][4])
                        self._current_security_bars[sec]["ask_size"] = float(last_price_list[0][5])
            if not field:
                return_bars[sec] = self._current_security_bars[sec].iloc[-1]
            else:
                return_bars[sec] = self._current_security_bars[sec].iloc[-1][field]
        return return_bars
country, station = l[istation].split(',')[3], l[istation].split(',')[2]
date, startTime = l[idate].split(',')[1], l[idate].split(',')[2]
dateplus1 = str(pd.to_datetime(date) + pd.Timedelta('1 day'))[:10]
datestr = date[:4] + date[5:7] + date[8:]
datestrplus1 = dateplus1[:4] + dateplus1[5:7] + dateplus1[8:]
try:
    ## Use pandas to read in the sonde profile
    if footer > 0:
        df = pd.read_csv(osf, header=header, skipfooter=footer)
    else:
        df = pd.read_csv(osf, header=header)
    ## Appending to the df makes the lat-lon the right dimension for .nc input
    df['lat'], df['lon'] = lat, lon
    df = df[df['Duration'] > -0.0001]  ## Remove "pre-sonde" information
    df['Date_Time'] = list(map(calc_DT, df['Duration']))
    df.set_index(pd.DatetimeIndex(df['Date_Time'], dayfirst=True), inplace=True)
    mth = str(df.index.month[0])
    df['O3_sonde'] = 1e9 * ((df['O3PartialPressure'] / 1000) / (df['Pressure'] * 100))
    print('<><><><><> ' + date + ', ' + station + ', ' + country + ' <><><><><>')
    gcr1, gcr2 = pd.DataFrame(), pd.DataFrame()
    nr = Dataset(path + 'ozonesondes/obsPack_output/no_rockets/GEOSChem.ObsPack.' + datestr + '_0000z.nc4')
    wr = Dataset(path + 'ozonesondes/obsPack_output/with_rockets/GEOSChem.ObsPack.' + datestr + '_0000z.nc4')
    gcr1['P_gchem'] = wr.variables['pressure'][:]
    gcr1['O3_gcnr'] = nr.variables['O3'][:] * 1e9
    gcr1['O3_gcwr'] = wr.variables['O3'][:] * 1e9
    gcr1['lat'], gcr1['lon'] = wr.variables['lat'][:], wr.variables['lon'][:]
    gcr1 = gcr1.astype('float')
def get_data(data_select, columns_select):
    # create connection
    mydb = sql.connect(host='10.120.14.100', database='hospitaldb',
                       user='******', password='')
    db_cursor = mydb.cursor(buffered=True)
    # get records
    db_cursor.execute(data_select)
    result = db_cursor.fetchall()
    # get column names
    db_cursor.execute(columns_select)
    columns_name = [column[0] for column in db_cursor.fetchall()]
    # turn into a dataframe
    ctdata = pd.DataFrame(result, columns=columns_name)
    # close connection
    mydb.close()

    # --- ETL for chart ----------------------------------------------------#
    # --- convert 上午/下午 markers to AM/PM, then parse as datetime ---#
    ctdata['BDATE'] = ctdata['BDATE'].str.replace('上午', 'AM').str.replace('下午', 'PM')
    ctdata['BDATE'] = pd.to_datetime(ctdata['BDATE'], format="%Y/%m/%d %p %I:%M:%S")
    ctdata['EDATE'] = ctdata['EDATE'].str.replace('上午', 'AM').str.replace('下午', 'PM')
    ctdata['EDATE'] = pd.to_datetime(ctdata['EDATE'], format="%Y/%m/%d %p %I:%M:%S")
    # --- create spend time ---#
    ctdata['second'] = ctdata['EDATE'] - ctdata['BDATE']
    ctdata['second'] = ctdata['second'].dt.total_seconds()  # timedelta -> seconds
    # --- create check YEAR & MONTH & DAY ---#
    ctdata['CDATE'] = ctdata['CDATE'].str.replace('上午', 'AM').str.replace('下午', 'PM')
    ctdata['CDATE'] = pd.to_datetime(ctdata['CDATE'], format="%Y/%m/%d %p %I:%M:%S")
    ctdata['YEAR'] = pd.DatetimeIndex(ctdata['CDATE']).year
    ctdata['MONTH'] = pd.DatetimeIndex(ctdata['CDATE']).month
    ctdata['DAY'] = pd.DatetimeIndex(ctdata['CDATE']).day

    ctdata = ctdata[ctdata['AGE'] <= 100]  # remove records over 100 years old
    ctdata = ctdata[(ctdata['second'] >= 300) & (ctdata['second'] <= 6000)]  # keep 300-6000 second records
    # ctdata = ctdata.dropna()  # remove NULL values
    # --- ETL for chart ----------------------------------------------------#
    df = ctdata[['YEAR', 'MONTH', 'DAY', 'ITEM', 'MODEL_NAME', 'AMOUNT',
                 'IO', 'SEX', 'AGE', 'second']]
    df = pd.DataFrame(df)
    return df
import pandas as pd
import lightgbm as lgbm
import warnings

warnings.filterwarnings("ignore")

# read train data file and holiday table
data_df = pd.read_csv("train.csv")
holiday_df = pd.read_csv("holiday.csv")

# data preprocessing for train data
data_df = data_df.set_index('id')
data_df['date'] = pd.to_datetime(data_df['date'], format='%d/%m/%Y %H:%M')
data_df['hour'] = pd.DatetimeIndex(data_df['date']).hour
data_df['year'] = pd.DatetimeIndex(data_df['date']).year
data_df['month'] = pd.DatetimeIndex(data_df['date']).month
data_df['day'] = pd.DatetimeIndex(data_df['date']).day
data_df['weekday'] = data_df['date'].dt.dayofweek
data_df['workingday'] = (data_df['date'].dt.dayofweek < 5).astype(int)

# import weather condition data from https://www.worldweatheronline.com/
weather_df = pd.read_csv('hkweather.csv')
weather_df['date_time'] = pd.to_datetime(weather_df['date_time'], format='%Y-%m-%d')
weather_df = weather_df[['date_time', 'cloudcover', 'humidity', 'tempC',
                         'visibility', 'winddirDegree', 'windspeedKmph',
                         'WindChillC']]
weather_df['year'] = pd.DatetimeIndex(weather_df['date_time']).year
weather_df['month'] = pd.DatetimeIndex(weather_df['date_time']).month
weather_df['day'] = pd.DatetimeIndex(weather_df['date_time']).day
weather_df = weather_df.drop(columns=['date_time'], axis=1)
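# The snippet stops before the two frames are combined. A minimal sketch of
# the likely next step, joining the daily weather features onto the training
# rows via the shared year/month/day keys; the join type and keys are
# assumptions, not taken from the original source.
data_df = data_df.merge(weather_df, on=['year', 'month', 'day'], how='left')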
def cusum_filter(self, raw_time_series, threshold, time_stamps=True):
    """
    Snippet 2.4, page 39, The Symmetric Dynamic/Fixed CUSUM Filter.

    The CUSUM filter is a quality-control method, designed to detect a shift
    in the mean value of a measured quantity away from a target value. The
    filter is set up to identify a sequence of upside or downside divergences
    from any reset level zero. We sample a bar t if and only if
    S_t >= threshold, at which point S_t is reset to 0.

    One practical aspect that makes CUSUM filters appealing is that multiple
    events are not triggered by raw_time_series hovering around a threshold
    level, which is a flaw suffered by popular market signals such as
    Bollinger Bands. It will require a full run of length threshold for
    raw_time_series to trigger an event.

    Once we have obtained this subset of event-driven bars, we will let the
    ML algorithm determine whether the occurrence of such events constitutes
    actionable intelligence.

    Below is an implementation of the Symmetric CUSUM filter.

    Note: As per the book this filter is applied to closing prices, but we
    extended it to also work on other time series such as volatility.

    :param raw_time_series: (series) of close prices (or other time series,
        e.g. volatility).
    :param threshold: (float or pd.Series) when the abs(change) is larger
        than the threshold, the function captures it as an event; can be
        dynamic if threshold is a pd.Series.
    :param time_stamps: (bool) default is to return a DatetimeIndex; change
        to False to have it return a list.
    :return: (datetime index vector) vector of datetimes when the events
        occurred. This is used later to sample.
    """
    t_events = []
    s_pos = 0
    s_neg = 0

    # log returns
    raw_time_series = pd.DataFrame(raw_time_series)  # convert to DataFrame
    raw_time_series.columns = ['price']
    raw_time_series['log_ret'] = raw_time_series.price.apply(np.log).diff()
    if isinstance(threshold, (float, int)):
        raw_time_series['threshold'] = threshold
    elif isinstance(threshold, pd.Series):
        raw_time_series.loc[threshold.index, 'threshold'] = threshold
    else:
        raise ValueError('threshold is neither float nor pd.Series!')

    raw_time_series = raw_time_series.iloc[1:]  # drop first NaN values

    # Get event time stamps for the entire series
    for tup in raw_time_series.itertuples():
        thresh = tup.threshold
        pos = float(s_pos + tup.log_ret)
        neg = float(s_neg + tup.log_ret)
        s_pos = max(0.0, pos)
        s_neg = min(0.0, neg)
        if s_neg < -thresh:
            s_neg = 0
            t_events.append(tup.Index)
        elif s_pos > thresh:
            s_pos = 0
            t_events.append(tup.Index)

    # Return DatetimeIndex or list
    if time_stamps:
        event_timestamps = pd.DatetimeIndex(t_events)
        return event_timestamps
    return t_events
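# A minimal usage sketch for cusum_filter on a synthetic random-walk price
# series. The method body never touches self, so None is passed in its place;
# the series and threshold here are illustrative assumptions.
import numpy as np
import pandas as pd

close = pd.Series(
    100 * np.exp(np.cumsum(np.random.normal(0, 0.01, 500))),
    index=pd.date_range('2020-01-01', periods=500, freq='B'))
events = cusum_filter(None, close, threshold=0.02)
print(len(events), events[:3])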
def vintage_mnth(self, df, x, y):
    # Convert column x to a datetime index and extract the month into column y.
    df[y] = pd.DatetimeIndex(df[x]).month
    return df
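# A usage sketch for the vintage_yr/vintage_mnth helpers above. They are
# written as instance methods but never use self, so None is passed in its
# place; the sample frame and column names are illustrative only.
import pandas as pd

df = pd.DataFrame({'orig_date': ['2018-03-01', '2019-11-15']})
df = vintage_yr(None, df, 'orig_date', 'vintage_year')     # 2018, 2019
df = vintage_mnth(None, df, 'orig_date', 'vintage_month')  # 3, 11
print(df)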
def main(args):
    # writing log file for reproducibility
    logfile = '%s_log.txt' % args.op[:-4]
    os.system("rm %s" % logfile)
    outF = open(logfile, 'w')
    outF.write('Input arguments to getminmax.py: ')
    outF.write('\n')
    outF.write('\n')
    print(args, file=outF)
    outF.close()

    # checking to see if we should use a logarithmic y scale
    plotlog = True
    if args.nl is not None:
        if args.nl == 'y':
            plotlog = False

    # checking whether filtered quantities should be calculated
    p1dm = True
    p15dm = False
    p30dm = False
    if args.p15dm is not None:
        if args.p15dm == 'y':
            p15dm = True
    if args.p30dm is not None:
        if args.p30dm == 'y':
            p30dm = True

    # reading in file, converting datetime column to datetime type
    if args.infile_snotel is not None:
        dfstreams = pd.read_csv(args.infile_stream)
        dfsnowtel = pd.read_csv(args.infile_snotel)
        df1 = pd.merge(left=dfstreams, right=dfsnowtel, how='left',
                       left_on='datetime', right_on='datetime')
    else:
        df1 = pd.read_csv(args.infile_stream)
    df1['datetime'] = df1['datetime'].astype('datetime64[ns]')
    df1['month'] = pd.DatetimeIndex(df1['datetime']).month
    df1['year'] = pd.DatetimeIndex(df1['datetime']).year
    df1['day'] = pd.DatetimeIndex(df1['datetime']).day
    df1['doy'] = pd.DatetimeIndex(df1['datetime']).dayofyear

    # trimming dataframe to only include bounding dates
    mindate = args.bounds[0]
    maxdate = args.bounds[1]
    minmonth = mindate[0:2]
    minday = mindate[3:5]
    minyear = mindate[6:10]
    maxmonth = maxdate[0:2]
    maxday = maxdate[3:5]
    maxyear = maxdate[6:10]
    years = np.arange(int(minyear), int(maxyear) + 1, 1)
    minvals = []
    maxvals = []
    mindates = []
    maxdates = []
    filter = []
    fig = plt.figure(figsize=(6.5, 5))
    if plotlog:
        plt.yscale('log')
    for i in range(len(years) - 1):
        mindatei = '%s-%s-%s' % (minmonth, minday, years[i])
        maxdatei = '%s-%s-%s' % (maxmonth, maxday, years[i + 1])
        print(mindatei, maxdatei)
        df = df1[(df1.datetime >= mindatei) & (df1.datetime <= maxdatei)]
        try:
            minyear = df['year'].min()
            maxyear = df['year'].max()
            print(minyear, maxyear)
            # restricting the range to search for min/max values to avoid
            # values in adjacent water years
            dfmaxsearch = df[((df.month > 10) & (df.year == minyear)) |
                             ((df.month < 9) & (df.year == maxyear))]
            dfminsearch = df[(df.month > 4) & (df.year == maxyear)]
            minval = dfminsearch[args.c1].min()
            maxval = dfmaxsearch[args.c1].max()
            minval_date = dfminsearch[dfminsearch[args.c1] == minval]['datetime'].values[0]
            maxval_date = dfmaxsearch[dfmaxsearch[args.c1] == maxval]['datetime'].values[0]
        except:
            print('insufficient data for this year: ', mindatei, maxdatei)
            print('Skipping .....')
            continue
        minvals.append(minval)
        maxvals.append(maxval)
        mindates.append(minval_date)
        maxdates.append(maxval_date)
        filter.append('1')

        # creating smoothed curves for the entered parameter
        if p30dm:
            sigma = 30  # days
            gridsp = 1  # days
            npts = sigma / 2 / gridsp
            df['smooth_30'] = ndimage.filters.gaussian_filter(df[args.c1].values, npts)
            minval30 = df['smooth_30'].min()
            maxval30 = df['smooth_30'].max()
            minval_date30 = df[df['smooth_30'] == minval30]['datetime'].values[0]
            maxval_date30 = df[df['smooth_30'] == maxval30]['datetime'].values[0]
            minvals.append(minval30)
            maxvals.append(maxval30)
            mindates.append(minval_date30)
            maxdates.append(maxval_date30)
            filter.append('30')
        if p15dm:
            sigma = 15  # days
            gridsp = 1  # days
            npts = sigma / 2 / gridsp
            df['smooth_15'] = ndimage.filters.gaussian_filter(df[args.c1].values, npts)
            minval15 = df['smooth_15'].min()
            maxval15 = df['smooth_15'].max()
            minval_date15 = df[df['smooth_15'] == minval15]['datetime'].values[0]
            maxval_date15 = df[df['smooth_15'] == maxval15]['datetime'].values[0]
            minvals.append(minval15)
            maxvals.append(maxval15)
            mindates.append(minval_date15)
            maxdates.append(maxval_date15)
            filter.append('15')

        if plotlog:
            alphan = 0.05
        else:
            alphan = 0.5
        dosy = np.arange(0, len(df))
        df['day_of_snow_year'] = dosy
        if args.mm == '30d':
            if not p30dm:
                print('must include flag "-p30dm y" to plot 30 day mean')
                print('exiting ... ')
                exit()
            plt.plot(df.day_of_snow_year, df.smooth_30, color='gray',
                     linewidth=3, alpha=alphan, label='yearly data')
        elif args.mm == '15d':
            if not p15dm:
                print('must include flag "-p15dm y" to plot 15 day mean')
                print('exiting ... ')
                exit()
            plt.plot(df.day_of_snow_year, df.smooth_15, color='gray',
                     linewidth=3, alpha=alphan, label='yearly data')
        else:
            plt.plot(df.day_of_snow_year, df[args.c1], color='gray',
                     linewidth=3, alpha=alphan, label='yearly data')

    # calculating statistics for each day in the year
    min_vals = []
    max_vals = []
    median_vals = []
    doys = []
    for doy in set(list(df1['doy'].values)):
        print(doy)
        df3 = df1[df1.doy == doy]
        median_val = df3[args.c1].median()
        min_val = df3[args.c1].min()
        max_val = df3[args.c1].max()
        median_vals.append(median_val)
        min_vals.append(min_val)
        max_vals.append(max_val)
        doys.append(doy)
    stats_df = pd.DataFrame({
        'doy': doys,
        'min_vals': min_vals,
        'max_vals': max_vals,
        'median_vals': median_vals
    })

    # extracting a year we know has good/complete data
    mindatei = '%s-%s-%s' % (minmonth, minday, 2005)
    maxdatei = '%s-%s-%s' % (maxmonth, maxday, 2006)
    df = df1[(df1.datetime >= mindatei) & (df1.datetime <= maxdatei)]
    dosy = np.arange(0, len(df))
    df['day_of_snow_year'] = dosy
    min_dosys = []
    max_dosys = []
    med_dosys = []
    dosys = []
    doys = []
    for dosy in df['day_of_snow_year'].values:
        df2 = df[df.day_of_snow_year == dosy]
        doy = df2['doy'].values[0]
        stats2 = stats_df[stats_df.doy == doy]
        min_dosy = stats2['min_vals'].values[0]
        max_dosy = stats2['max_vals'].values[0]
        median_dosy = stats2['median_vals'].values[0]
        min_dosys.append(min_dosy)
        max_dosys.append(max_dosy)
        med_dosys.append(median_dosy)
        dosys.append(dosy)
        doys.append(doy)
    begmonths = df[df.day == 1]
    begs = begmonths['day_of_snow_year'].values
    begs = begs[::2]
    ends = begs + 30
    for i in range(len(begs)):
        plt.axvspan(begs[i], ends[i], alpha=0.05, color='gray')

    if args.plotlowyear is not None:
        if args.plotlowyear == 'y':
            # extracting a year we know had very low snow
            mindatei = '%s-%s-%s' % (minmonth, minday, 2014)
            maxdatei = '%s-%s-%s' % (maxmonth, maxday, 2015)
            df1415 = df1[(df1.datetime >= mindatei) & (df1.datetime <= maxdatei)]
            dosy = np.arange(0, len(df1415))
            df1415['day_of_snow_year'] = dosy
            plt.plot(dosy, df1415[args.c1].values, color='red', linewidth=2,
                     alpha=0.2, label='2014-2015 snow year')
    if args.plothighyear is not None:
        if args.plothighyear == 'y':
            # extracting a year we know had very high snow
            mindatei = '%s-%s-%s' % (minmonth, minday, 2007)
            maxdatei = '%s-%s-%s' % (maxmonth, maxday, 2008)
            df0708 = df1[(df1.datetime >= mindatei) & (df1.datetime <= maxdatei)]
            dosy = np.arange(0, len(df0708))
            df0708['day_of_snow_year'] = dosy
            plt.plot(dosy, df0708[args.c1].values, color='blue', linewidth=2,
                     alpha=0.2, label='2007-2008 snow year')
    if args.plotmedyear is not None:
        if args.plotmedyear == 'y':
            # extracting a year we know had near-median snow
            mindatei = '%s-%s-%s' % (minmonth, minday, 2005)
            maxdatei = '%s-%s-%s' % (maxmonth, maxday, 2006)
            df0708 = df1[(df1.datetime >= mindatei) & (df1.datetime <= maxdatei)]
            dosy = np.arange(0, len(df0708))
            df0708['day_of_snow_year'] = dosy
            plt.plot(dosy, df0708[args.c1].values, color='yellow', linewidth=2,
                     alpha=0.4, label='2005-2006 snow year')

    # trimming dataframe to only include bounding dates
    mindate = args.bounds[0]
    maxdate = args.bounds[1]
    minyear = int(mindate[6:10])
    maxyear = int(maxdate[6:10])
    minyear2, maxyear2 = int(df1['year'].min()), int(df1['year'].max())
    if maxyear2 < maxyear:
        maxyear = maxyear2
    if minyear2 > minyear:
        minyear = minyear2
    plt.title('All years %s - %s' % (minyear, maxyear))
    plt.xlabel('day of snow year')
    plt.ylabel('daily %s %s' % (args.c1, args.u1))
    plt.plot(dosys, med_dosys, color='black', linewidth=3, label='median')
    plt.legend(loc='best')
    fig.savefig('%s_all_years.png' % args.op[:-4], format='png',
                bbox_inches='tight', pad_inches=0.5, dpi=300)
    plt.close()

    # putting min/max values in one dataframe
    minmaxs = pd.DataFrame({
        'minvals': minvals,
        'maxvals': maxvals,
        'mindates': mindates,
        'maxdates': maxdates,
        'filter': filter
    })
    print(minmaxs)
    if args.mm is not None:
        if args.mm == '30d':
            minmaxs = minmaxs[minmaxs['filter'] == '30']
        elif args.mm == '15d':
            minmaxs = minmaxs[minmaxs['filter'] == '15']
        else:
            minmaxs = minmaxs[minmaxs['filter'] == '1']
    else:
        minmaxs = minmaxs[minmaxs['filter'] == '1']

    # adding julian day values
    minjul = []
    maxjul = []
    yearmins = []
    yearmaxs = []
    for index, row in minmaxs.iterrows():
        mindate, maxdate = row['mindates'], row['maxdates']
        ttmin, ttmax = mindate.timetuple(), maxdate.timetuple()
        jmin = ttmin.tm_yday
        jmax = ttmax.tm_yday
        yearmin = ttmin.tm_year
        yearmax = ttmax.tm_year
        yearmins.append(yearmin)
        yearmaxs.append(yearmax)
        minjul.append(jmin)
        maxjul.append(jmax)
    print(len(minjul))
    print(len(minmaxs))
    minmaxs['minjul'] = minjul
    minmaxs['maxjul'] = maxjul
    minmaxs['yearmin'] = yearmins
    minmaxs['yearmax'] = yearmaxs
    minmaxs['minmonth'] = pd.DatetimeIndex(minmaxs['mindates']).month
    minmaxs['maxmonth'] = pd.DatetimeIndex(minmaxs['maxdates']).month
    minmaxs['minyear'] = pd.DatetimeIndex(minmaxs['mindates']).year
    minmaxs['maxyear'] = pd.DatetimeIndex(minmaxs['maxdates']).year
    minmaxs['minday'] = pd.DatetimeIndex(minmaxs['mindates']).day
    minmaxs['maxday'] = pd.DatetimeIndex(minmaxs['maxdates']).day
    print(minmaxs.info())
    print(minmaxs.mindates)
    print(minmaxs.minvals)

    fig = plt.figure(figsize=(8, 4))
    ax1 = fig.add_subplot(111)
    if plotlog:
        plt.yscale('log')
    ax1.plot_date(x=minmaxs.mindates, y=minmaxs.minvals, marker='o',
                  color='red', label='minimums')
    ax1.plot_date(x=minmaxs.maxdates, y=minmaxs.maxvals, marker='o',
                  color='blue', label='maximums')
    ax1.set_xlabel('date')
    ax1.set_ylabel('daily %s %s' % (args.c1, args.u1))
    ax1.legend(loc='best')
    fig.savefig(args.op, format='png', bbox_inches='tight',
                pad_inches=0.5, dpi=300)
    plt.close()

    fig = plt.figure(figsize=(8, 6))
    ax1 = fig.add_subplot(211)
    ax2 = fig.add_subplot(212)
    if plotlog:
        plt.yscale('log')

    fig = plt.figure(figsize=(8, 4))
    ax1 = fig.add_subplot(111)
    if plotlog:
        plt.yscale('log')
    ax1.scatter(minmaxs.minjul, minmaxs.minvals, marker='o', color='red',
                s=25, label='minimums')
    ax1.scatter(minmaxs.maxjul, minmaxs.maxvals, marker='o', color='blue',
                s=25, label='maximums')
    con = ax1.scatter(minmaxs.maxjul, minmaxs.maxvals, c=minmaxs.yearmax,
                      s=15, edgecolors='none', cmap='Greys')
    ax1.scatter(minmaxs.minjul, minmaxs.minvals, c=minmaxs.yearmin,
                s=15, edgecolors='none', cmap='Greys')
    cbar = fig.colorbar(con)
    cbar.set_label('year')
    ax1.set_xlabel('julian day (days since Jan 1st)')
    ax1.set_ylabel('daily %s %s' % (args.c1, args.u1))
    ax1.set_xlim([0, 365])
    ax1.legend(loc='best')
    fig.savefig('%s_juliandays.png' % args.op[:-4], format='png',
                bbox_inches='tight', pad_inches=0.5, dpi=300)
    plt.close()

    fig = plt.figure(figsize=(8, 4))
    ax1 = fig.add_subplot(121)
    ax2 = fig.add_subplot(122)
    #plt.yscale('log')
    ax1.scatter(minmaxs.minyear, minmaxs.minvals, marker='o', color='red',
                s=25, label='minimums')
    ax2.scatter(minmaxs.maxyear, minmaxs.maxvals, marker='o', color='blue',
                s=25, label='maximums')
    ax1.set_title('yearly minimum values')
    ax2.set_title('yearly maximum values')
    ax1.set_xlabel('year')
    ax2.set_xlabel('year')
    ax1.set_ylabel('daily %s %s' % (args.c1, args.u1))
    #ax2.set_ylabel('daily %s %s' % (args.c1, args.u1))
    fig.savefig('%s_by_year.png' % args.op[:-4], format='png',
                bbox_inches='tight', pad_inches=0.5, dpi=300)
    plt.close()

    fig = plt.figure(figsize=(8, 4))
    ax1 = fig.add_subplot(121)
    ax2 = fig.add_subplot(122)
    #plt.yscale('log')
    ax1.scatter(minmaxs.minyear, minmaxs.minjul, marker='o', color='red',
                s=25, label='minimums')
    ax2.scatter(minmaxs.maxyear, minmaxs.maxjul, marker='o', color='blue',
                s=25, label='maximums')
    ax1.set_title('julian day of minimum')
    ax2.set_title('julian day of maximum')
    ax1.set_xlabel('year')
    ax2.set_xlabel('year')
    ax1.set_ylabel('julian day (days since Jan 1st)')
    ax1.set_ylim([0, 360])
    ax2.set_ylim([0, 360])
    #ax2.set_ylabel('daily %s %s' % (args.c1, args.u1))
    fig.savefig('%s_JD_x_year.png' % args.op[:-4], format='png',
                bbox_inches='tight', pad_inches=0.5, dpi=300)
    plt.close()
def cacu_BaseData(para_dict):
    '''Compute the daily industry weights and returns for the product and its benchmark.'''
    out = dict()
    # =========================================================================
    # Product data
    # =========================================================================
    st = datetime.datetime.strptime(para_dict['startdate'], '%Y-%m-%d')
    et = datetime.datetime.strptime(para_dict['enddate'], '%Y-%m-%d')
    # t0 = time.time()
    # # Fetch the product NAV: look up the unit NAV by product code and date range.
    # fundnav = funddata.getFundNAV1([para_dict['port_code']], para_dict['startdate'], para_dict['enddate'])
    # if fundnav.empty:
    #     return dict()
    # print(time.time() - t0)
    # # Fetch the holdings by product code and date range.
    # fund_stk_hlds = funddata.getFundStockHoldings1([para_dict['port_code']], para_dict['startdate'], para_dict['enddate'])
    # # Skip the calculation when the holdings are empty.
    # if fund_stk_hlds.empty:
    #     return dict()
    # # Decompose the product's asset allocation: A-shares, HK stocks, other.
    # # Changed from total assets to net assets.
    # fund_assetallocation, fund_ag_stk_holdings, fund_hk_stk_holdings = funddata.cacu_FundAssetsAllocation(
    #     fundnav[['port_code', 'enddate', 'fund_cumulative_nav', 'total_asset']], fund_stk_hlds)
    FOFproductList = getfofproductlist()
    if para_dict['port_code'] in FOFproductList:
        fund_assetallocation = derivativedata.getassetallocation(
            para_dict['port_code'], para_dict['startdate'], para_dict['enddate'], 'fes_mom_stock_weight')
    else:
        fund_assetallocation = derivativedata.getassetallocation(
            para_dict['port_code'], para_dict['startdate'], para_dict['enddate'], 'fes_stock_weight')
    if fund_assetallocation.empty:
        return dict()
    dateindex = pd.DatetimeIndex(fund_assetallocation.enddate)

    # Decompose the product's industry allocation and returns.
    fund_ag_stock_hy_df = derivativedata.getFundClassWeight(para_dict['port_code'], st, et, para_dict['standard'], 'AG')
    if not fund_ag_stock_hy_df.empty:
        # The try/except guards against an A-share or HK position suddenly being
        # added or dropped on a given day, or against missing data.
        try:
            fund_ag_stock_hy_r = pd.pivot_table(fund_ag_stock_hy_df, index='enddate', columns='classname', values='rtn').loc[dateindex, :].replace(np.nan, 0)
            fund_ag_stock_hy_w = pd.pivot_table(fund_ag_stock_hy_df, index='enddate', columns='classname', values='weight').loc[dateindex, :].replace(np.nan, 0).ffill()
        except:
            fund_ag_stock_hy_r = pd.pivot_table(fund_ag_stock_hy_df, index='enddate', columns='classname', values='rtn').replace(np.nan, 0)
            fund_ag_stock_hy_w = pd.pivot_table(fund_ag_stock_hy_df, index='enddate', columns='classname', values='weight').replace(np.nan, 0).ffill()
    else:
        fund_ag_stock_hy_r = pd.DataFrame()
        fund_ag_stock_hy_w = pd.DataFrame()

    fund_hk_stock_hy_df = derivativedata.getFundClassWeight(para_dict['port_code'], st, et, para_dict['standard'], 'HK')
    if not fund_hk_stock_hy_df.empty:
        # Same guard as above for the HK leg.
        try:
            fund_hk_stock_hy_r = pd.pivot_table(fund_hk_stock_hy_df, index='enddate', columns='classname', values='rtn').loc[dateindex, :].replace(np.nan, 0)
            fund_hk_stock_hy_w = pd.pivot_table(fund_hk_stock_hy_df, index='enddate', columns='classname', values='weight').loc[dateindex, :].replace(np.nan, 0).ffill()
        except:
            fund_hk_stock_hy_r = pd.pivot_table(fund_hk_stock_hy_df, index='enddate', columns='classname', values='rtn').replace(np.nan, 0)
            fund_hk_stock_hy_w = pd.pivot_table(fund_hk_stock_hy_df, index='enddate', columns='classname', values='weight').replace(np.nan, 0).ffill()
    else:
        fund_hk_stock_hy_r = pd.DataFrame()
        fund_hk_stock_hy_w = pd.DataFrame()

    fund_assetsrtn = funddata.cacu_FundAssetsReturn(fund_assetallocation, fund_ag_stock_hy_w,
                                                    fund_ag_stock_hy_r, fund_hk_stock_hy_w, fund_hk_stock_hy_r)
    out['fund_ag_stock_hy_r'] = fund_ag_stock_hy_r
    out['fund_ag_stock_hy_w'] = fund_ag_stock_hy_w
    out['fund_hk_stock_hy_r'] = fund_hk_stock_hy_r
    out['fund_hk_stock_hy_w'] = fund_hk_stock_hy_w

    # =========================================================================
    # Construct a synthetic performance benchmark
    # =========================================================================
    benchmarklist_ag = para_dict['benchmarklist_ag']
    benchmarklist_hk = para_dict['benchmarklist_hk']

    # Industry allocation and returns of the benchmark indices.
    benchmark_ag_stock_hy_df = derivativedata.getIndexClassWeight(benchmarklist_ag, dateindex[0], dateindex[-1], para_dict['standard'])
    if not benchmark_ag_stock_hy_df.empty:
        benchmark_ag_stock_hy_r = pd.pivot_table(benchmark_ag_stock_hy_df, index='enddate', columns='classname', values='rtn').loc[dateindex, :].replace(np.nan, 0)
        benchmark_ag_stock_hy_w = pd.pivot_table(benchmark_ag_stock_hy_df, index='enddate', columns='classname', values='weight').loc[dateindex, :].replace(np.nan, 0).ffill()
    else:
        benchmark_ag_stock_hy_r = pd.DataFrame()
        benchmark_ag_stock_hy_w = pd.DataFrame()

    benchmark_hk_stock_hy_df = derivativedata.getIndexClassWeight(benchmarklist_hk, dateindex[0], dateindex[-1], para_dict['standard'])
    if not benchmark_hk_stock_hy_df.empty:
        benchmark_hk_stock_hy_r = pd.pivot_table(benchmark_hk_stock_hy_df, index='enddate', columns='classname', values='rtn').loc[dateindex, :].replace(np.nan, 0)
        benchmark_hk_stock_hy_w = pd.pivot_table(benchmark_hk_stock_hy_df, index='enddate', columns='classname', values='weight').loc[dateindex, :].replace(np.nan, 0).ffill()
    else:
        benchmark_hk_stock_hy_r = pd.DataFrame()
        benchmark_hk_stock_hy_w = pd.DataFrame()

    out['benchmark_ag_stock_hy_r'] = benchmark_ag_stock_hy_r
    out['benchmark_ag_stock_hy_w'] = benchmark_ag_stock_hy_w
    out['benchmark_hk_stock_hy_r'] = benchmark_hk_stock_hy_r
    out['benchmark_hk_stock_hy_w'] = benchmark_hk_stock_hy_w

    # The benchmark's asset-class weights match the product's.
    benchmark_nav = fund_assetallocation[['enddate', 'ag_stk_wght', 'hk_stk_wght', 'other_wght']]

    # Benchmark returns.
    benchmark_ag_stock_hy_w = benchmark_ag_stock_hy_w.replace(np.nan, 0)
    ag_stk_rtn = (benchmark_ag_stock_hy_r * benchmark_ag_stock_hy_w.shift(1)).sum(axis=1).reset_index()
    ag_stk_rtn.columns = ['enddate', 'ag_stock_rtn']
    benchmark_hk_stock_hy_w = benchmark_hk_stock_hy_w.replace(np.nan, 0)
    hk_stk_rtn = (benchmark_hk_stock_hy_r * benchmark_hk_stock_hy_w.shift(1)).sum(axis=1).reset_index()
    hk_stk_rtn.columns = ['enddate', 'hk_stock_rtn']
    benchmark_nav = pd.merge(benchmark_nav, ag_stk_rtn, how='left', on=['enddate']).replace(np.nan, 0)
    benchmark_nav = pd.merge(benchmark_nav, hk_stk_rtn, how='left', on=['enddate']).replace(np.nan, 0)
    benchmark_nav = pd.merge(benchmark_nav, fund_assetsrtn[['enddate', 'other_rtn']], how='left', on=['enddate']).replace(np.nan, 0)
    benchmark_nav['total_rtn'] = benchmark_nav['ag_stk_wght'].shift(1) * benchmark_nav['ag_stock_rtn'] + \
        benchmark_nav['hk_stk_wght'].shift(1) * benchmark_nav['hk_stock_rtn'] + \
        benchmark_nav['other_wght'].shift(1) * benchmark_nav['other_rtn']
    benchmark_nav['total_rtn'] = benchmark_nav['total_rtn'].replace(np.nan, 0)

    # Build the benchmark NAV curve.
    benchmark_nav['fund_cumulative_nav'] = (benchmark_nav['total_rtn'] + 1).cumprod()
    benchmark_nav['total_asset'] = benchmark_nav['fund_cumulative_nav']
    benchmark_nav['port_code'] = benchmarklist_ag + '-' + benchmarklist_hk
    benchmark_assetallocation = benchmark_nav[['port_code', 'enddate', 'fund_cumulative_nav', 'total_asset',
                                               'ag_stk_wght', 'hk_stk_wght', 'other_wght', 'total_rtn']]
    benchmark_assetallocation.loc[:, 'ag_stk_mkv'] = benchmark_assetallocation['total_asset'] * benchmark_assetallocation['ag_stk_wght']
    benchmark_assetallocation.loc[:, 'hk_stk_mkv'] = benchmark_assetallocation['total_asset'] * benchmark_assetallocation['hk_stk_wght']
    benchmark_assetallocation.loc[:, 'other_mkv'] = benchmark_assetallocation['total_asset'] * benchmark_assetallocation['other_wght']

    # Benchmark return series.
    benchmark_assetsrtn = funddata.cacu_FundAssetsReturn(benchmark_assetallocation, benchmark_ag_stock_hy_w,
                                                         benchmark_ag_stock_hy_r, benchmark_hk_stock_hy_w,
                                                         benchmark_hk_stock_hy_r)

    # Adjusted Brinson: compute the adjustment-coefficient matrix.
    kk = brison._cacu_k(fund_assetsrtn['total_rtn_cum'].iloc[-1], benchmark_assetsrtn['total_rtn_cum'].iloc[-1])
    kt = []
    for i in range(0, len(fund_assetsrtn)):
        k1 = brison._cacu_k(fund_assetsrtn['total_rtn'][i], benchmark_assetsrtn['total_rtn'][i])
        k2 = fund_assetsrtn['enddate'][i]
        kt.append([kk, k1, k2])
    kt_df = pd.DataFrame(kt, columns=['kk', 'k_t', 'enddate'])
    out['kt_df'] = kt_df

    # Merge the product and benchmark base data.
    out['fund_ctr_df'] = pd.merge(fund_assetallocation, fund_assetsrtn, on=['port_code', 'enddate', 'total_rtn'])
    out['benchmark_ctr_df'] = pd.merge(benchmark_assetallocation, benchmark_assetsrtn, on=['port_code', 'enddate', 'total_rtn'])
    return out
def QueryOrLoad(self, start_date='01-01-2015', end_date='01-01-2017'):
    if path.exists('keys/pecanstkey.txt'):
        initial_path = ''
    else:
        initial_path = '../'
    fp = initial_path + 'data/netloadsolaridentify_{}_{}.csv'.format(start_date, end_date)
    fw = initial_path + 'data/weather_netloadsolaridentify_{}_{}.csv'.format(start_date, end_date)

    # Close any open connections.
    import gc
    for obj in gc.get_objects():
        if isinstance(obj, sq.engine.base.Engine):
            obj.dispose()

    # Read the keys
    with open(initial_path + 'keys/pecanstkey.txt', 'r') as f:
        key = f.read().strip()

    engine = sq.create_engine(
        "postgresql+psycopg2://{}@dataport.pecanstreet.org:5434/postgres".format(key))

    if not path.exists(fp):
        ti = t_clock()
        # Find sites with complete data for the requested time period and join
        print('determining sites with full data...')
        query = """
            SELECT e.dataid
            FROM university.electricity_egauge_15min e
            WHERE local_15min BETWEEN '{}' AND '{}'
              AND e.dataid IN (
                SELECT m.dataid
                FROM university.metadata m
                WHERE m.city = 'Austin'
              )
            GROUP BY dataid
            HAVING count(e.use) = (
                SELECT MAX(A.CNT)
                FROM (
                    SELECT dataid, COUNT(use) as CNT
                    FROM university.electricity_egauge_15min
                    WHERE local_15min BETWEEN '{}' AND '{}'
                    GROUP BY dataid
                ) AS A
            );
        """.format(start_date, end_date, start_date, end_date)
        metadata = pd.read_sql_query(query, engine)
        duse = metadata.values.squeeze()

        print('querying load and generation data...')
        query = """
            SELECT dataid, local_15min, use, gen
            FROM university.electricity_egauge_15min
            WHERE local_15min BETWEEN '{}' AND '{}'
              AND electricity_egauge_15min.dataid in (
        """.format(start_date, end_date) + ','.join([str(d) for d in duse]) + """)
            ORDER BY local_15min;
        """
        load_data = pd.read_sql_query(query, engine)
        tf = t_clock()
        deltat = (tf - ti) / 60.
        print('query of {} values took {:.2f} minutes'.format(load_data.size, deltat))
        load_data.to_csv(fp)

        # Weather data
        print('querying ambient temperature data from weather table...')
        locs = pd.read_sql_query(
            """
            SELECT distinct(latitude,longitude), latitude
            FROM university.weather
            ORDER BY latitude
            LIMIT 10;
            """, engine)
        locs['location'] = ['Austin', 'San Diego', 'Boulder']  # ascending order by latitude
        locs.set_index('location', inplace=True)
        weather = pd.read_sql_query(
            """
            SELECT localhour, temperature
            FROM university.weather
            WHERE localhour BETWEEN '{}' and '{}'
              AND latitude = {}
            ORDER BY localhour;
            """.format(start_date, end_date, locs.loc['Austin']['latitude']), engine)
        weather.rename(columns={'localhour': 'time'}, inplace=True)
        weather['time'] = weather['time'].map(lambda x: x.replace(tzinfo=None))
        weather['time'] = pd.to_datetime(weather['time'])
        weather.set_index('time', inplace=True)
        weather = weather[~weather.index.duplicated(keep='first')]
        # Upsample from 1 hr to 15 min to match the load data.
        weather = weather.asfreq('15Min').interpolate('linear')
        weather.to_csv(fw)
    else:
        ti = t_clock()
        load_data = pd.read_csv(fp)
        weather = pd.read_csv(fw, index_col='time')
        tf = t_clock()
        deltat = (tf - ti)
        print('reading {} values from csv took {:.2f} seconds'.format(load_data.size, deltat))

    # Load setup: set the index and fill NaNs.
    load_data.rename(columns={'local_15min': 'time'}, inplace=True)
    load_data['time'] = pd.DatetimeIndex(load_data['time'])
    load_data.set_index('time', inplace=True)
    load_data.fillna(value=0, inplace=True)
    if 'Unnamed: 0' in load_data.columns:
        del load_data['Unnamed: 0']  # useless column

    # Weather setup
    weather.set_index(pd.DatetimeIndex(weather.index), inplace=True)

    # Redefine start_date and end_date so that the weather and load_data
    # datasets match in time stamps, keeping only the dates common to both.
    start_date = max(weather.index[0], load_data.index[0])
    end_date = min(weather.index[-1], load_data.index[-1])
    weather = weather[(weather.index >= pd.to_datetime(start_date)) &
                      (weather.index <= pd.to_datetime(end_date))]
    # Interpolating hourly data to 15-minute resolution also interpolates
    # across the clock-change hours; find those times and drop them.
    lst = list(set(weather.index) - set(load_data['use'].index))
    weather = weather.drop(lst)
    load_data = load_data[(load_data.index >= pd.to_datetime(start_date)) &
                          (load_data.index <= pd.to_datetime(end_date))]

    # Net load
    load_data['netload'] = load_data['use'] - load_data['gen']
    load_data.head()
    self.load_data = load_data
    self.weather = weather
import dash
import dash_core_components as dcc
import dash_html_components as html
import pandas as pd
import plotly.graph_objs as go
from dash.dependencies import Input, Output

df_aapl_raw = pd.read_csv("data/AAPL.csv")
df_spc_raw = pd.read_csv("data/GSPC.csv")
df = df_aapl_raw[2:].reset_index()
df1 = df_spc_raw[:-3].reset_index()
df['Year'] = pd.DatetimeIndex(df['Date']).year
df1['Year'] = pd.DatetimeIndex(df1['Date']).year

app = dash.Dash(__name__)
app.layout = html.Div([
    html.Div([html.H1("Moving Average Crossover Strategy For Apple Stocks")],
             style={'textAlign': "center"}),
    html.Div([
        html.Div([
            html.Div([dcc.Graph(id="my-graph")], className="row",
                     style={"margin": "auto"}),
            html.Div([
                html.Div(dcc.RangeSlider(
                    id="year selection", updatemode='drag',
                    marks={i: '{}'.format(i) for i in df.Year.unique().tolist()},
                    min=df1.Year.min(), max=df1.Year.max(),
                    value=[2014, 2019]),
                    className="row",
                    style={"padding-bottom": 30, "width": "60%", "margin": "auto"}),
                html.Span("Moving Average : Select Window Interval",
                          className="row",
                          style={"padding-top": 30, "padding-left": 40,
                                 "display": "block",
# Decompose & visualize this time series.
# It looks like this may or may not have captured the seasonality;
# we will have to see how the models turn out.
# The residuals plot looks pretty good, however.
ts_2014_copy = ts['2015-05-01 00:00:00':'2015-05-09 23:59:59']
plt.rc("figure", figsize=(32, 30))
result = seasonal.seasonal_decompose(ts_2014_copy, model='additive')
result.plot()
plt.show()

# Format ts for forecasting
admits_df = ts_2014_copy.reset_index()
admits_df.columns = ["Time", "Admits"]
admits_df["trend"] = admits_df['Time'].map(result.trend)
admits_df["month_name"] = pd.DatetimeIndex(admits_df['Time']).month_name()
admits_df["month_name"] = admits_df.month_name.astype("category")
admits_df["month"] = pd.DatetimeIndex(admits_df['Time']).month
admits_df["day"] = pd.DatetimeIndex(admits_df["Time"]).day
admits_df["hour"] = pd.DatetimeIndex(admits_df["Time"]).hour

# Allocate 20% of the data for testing
admits_train, admits_test = split_train_test_df(admits_df, 0.2)
ts_train, ts_test = split_train_test_ts(ts_2014, 0.2)

##################################
# TREND MODEL
##################################
trend_model = api.ols('Admits ~ trend', data=admits_train).fit()
p = trend_model.params
print(trend_model.summary())
import numpy as np
import statsmodels.api as sm
import pandas as pd
from datetime import datetime

mdatagen = sm.datasets.macrodata.load().data
mdata = mdatagen[['realgdp', 'realcons', 'realinv']]
names = mdata.dtype.names
start = datetime(1959, 3, 31)
end = datetime(2009, 9, 30)
# pd.datetime and the DatetimeIndex(start=..., end=...) constructor were
# removed from modern pandas; build the index with pd.date_range instead.
qtr = pd.date_range(start=start, end=end, freq='BQ-MAR')
data = pd.DataFrame(mdata, index=qtr)
data = (np.log(data)).diff().dropna()

# define structural inputs
A = np.asarray([[1, 0, 0], ['E', 1, 0], ['E', 'E', 1]])
B = np.asarray([['E', 0, 0], [0, 'E', 0], [0, 0, 'E']])
A_guess = np.asarray([0.5, 0.25, -0.38])
B_guess = np.asarray([0.5, 0.1, 0.05])
mymodel = sm.tsa.SVAR(data, svar_type='AB', A=A, B=B, freq='Q')
res = mymodel.fit(maxlags=3, maxiter=10000, maxfun=10000, solver='bfgs')
res.irf(periods=30).plot(impulse='realgdp', plot_stderr=True,
                         stderr_type='mc', repl=100)
def f_linketurbidity(times, latitude, longitude):
    times = pd.DatetimeIndex(times)
    # latitude and longitude must be scalar or else the Linke turbidity lookup fails
    latitude, longitude = latitude.item(), longitude.item()
    tl = pvlib.clearsky.lookup_linke_turbidity(times, latitude, longitude)
    return tl.values.reshape(1, -1)
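# A short usage sketch for the two pvlib wrappers above (f_dni_extra and
# f_linketurbidity). The sample times and coordinates are illustrative, and
# f_dni_extra targets the older pvlib name irradiance.extraradiation; newer
# releases expose it as irradiance.get_extra_radiation.
import numpy as np
import pandas as pd

times = pd.date_range('2019-06-21', periods=4, freq='6H', tz='UTC')
dni_extra = f_dni_extra(times)  # extraterrestrial DNI, W/m^2
tl = f_linketurbidity(times, np.array(40.0), np.array(-105.0))
print(dni_extra)
print(tl)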
class TestSeriesMap:
    def test_map(self, datetime_series):
        index, data = tm.getMixedTypeDict()
        source = Series(data["B"], index=data["C"])
        target = Series(data["C"][:4], index=data["D"][:4])
        merged = target.map(source)
        for k, v in merged.items():
            assert v == source[target[k]]

        # input could be a dict
        merged = target.map(source.to_dict())
        for k, v in merged.items():
            assert v == source[target[k]]

        # function
        result = datetime_series.map(lambda x: x * 2)
        tm.assert_series_equal(result, datetime_series * 2)

        # GH 10324
        a = Series([1, 2, 3, 4])
        b = Series(["even", "odd", "even", "odd"], dtype="category")
        c = Series(["even", "odd", "even", "odd"])
        exp = Series(["odd", "even", "odd", np.nan], dtype="category")
        tm.assert_series_equal(a.map(b), exp)
        exp = Series(["odd", "even", "odd", np.nan])
        tm.assert_series_equal(a.map(c), exp)

        a = Series(["a", "b", "c", "d"])
        b = Series([1, 2, 3, 4], index=pd.CategoricalIndex(["b", "c", "d", "e"]))
        c = Series([1, 2, 3, 4], index=Index(["b", "c", "d", "e"]))
        exp = Series([np.nan, 1, 2, 3])
        tm.assert_series_equal(a.map(b), exp)
        exp = Series([np.nan, 1, 2, 3])
        tm.assert_series_equal(a.map(c), exp)

        a = Series(["a", "b", "c", "d"])
        b = Series(
            ["B", "C", "D", "E"],
            dtype="category",
            index=pd.CategoricalIndex(["b", "c", "d", "e"]),
        )
        c = Series(["B", "C", "D", "E"], index=Index(["b", "c", "d", "e"]))
        exp = Series(
            pd.Categorical([np.nan, "B", "C", "D"], categories=["B", "C", "D", "E"]))
        tm.assert_series_equal(a.map(b), exp)
        exp = Series([np.nan, "B", "C", "D"])
        tm.assert_series_equal(a.map(c), exp)

    @pytest.mark.parametrize("index", tm.all_index_generator(10))
    def test_map_empty(self, index):
        s = Series(index)
        result = s.map({})
        expected = pd.Series(np.nan, index=s.index)
        tm.assert_series_equal(result, expected)

    def test_map_compat(self):
        # related GH 8024
        s = Series([True, True, False], index=[1, 2, 3])
        result = s.map({True: "foo", False: "bar"})
        expected = Series(["foo", "foo", "bar"], index=[1, 2, 3])
        tm.assert_series_equal(result, expected)

    def test_map_int(self):
        left = Series({"a": 1.0, "b": 2.0, "c": 3.0, "d": 4})
        right = Series({1: 11, 2: 22, 3: 33})
        assert left.dtype == np.float_
        assert issubclass(right.dtype.type, np.integer)
        merged = left.map(right)
        assert merged.dtype == np.float_
        assert isna(merged["d"])
        assert not isna(merged["c"])

    def test_map_type_inference(self):
        s = Series(range(3))
        s2 = s.map(lambda x: np.where(x == 0, 0, 1))
        assert issubclass(s2.dtype.type, np.integer)

    def test_map_decimal(self, string_series):
        from decimal import Decimal

        result = string_series.map(lambda x: Decimal(str(x)))
        assert result.dtype == np.object_
        assert isinstance(result[0], Decimal)

    def test_map_na_exclusion(self):
        s = Series([1.5, np.nan, 3, np.nan, 5])
        result = s.map(lambda x: x * 2, na_action="ignore")
        exp = s * 2
        tm.assert_series_equal(result, exp)

    def test_map_dict_with_tuple_keys(self):
        """
        Due to new MultiIndex-ing behaviour in v0.14.0, dicts with tuple keys
        passed to map were being converted to a multi-index, preventing tuple
        values from being mapped properly.
        """
        # GH 18496
        df = pd.DataFrame({"a": [(1,), (2,), (3, 4), (5, 6)]})
        label_mappings = {(1,): "A", (2,): "B", (3, 4): "A", (5, 6): "B"}
        df["labels"] = df["a"].map(label_mappings)
        df["expected_labels"] = pd.Series(["A", "B", "A", "B"], index=df.index)
        # All labels should be filled now
        tm.assert_series_equal(df["labels"], df["expected_labels"], check_names=False)

    def test_map_counter(self):
        s = Series(["a", "b", "c"], index=[1, 2, 3])
        counter = Counter()
        counter["b"] = 5
        counter["c"] += 1
        result = s.map(counter)
        expected = Series([0, 5, 1], index=[1, 2, 3])
        tm.assert_series_equal(result, expected)

    def test_map_defaultdict(self):
        s = Series([1, 2, 3], index=["a", "b", "c"])
        default_dict = defaultdict(lambda: "blank")
        default_dict[1] = "stuff"
        result = s.map(default_dict)
        expected = Series(["stuff", "blank", "blank"], index=["a", "b", "c"])
        tm.assert_series_equal(result, expected)

    def test_map_dict_subclass_with_missing(self):
        """
        Test Series.map with a dictionary subclass that defines __missing__,
        i.e. sets a default value (GH #15999).
        """
        class DictWithMissing(dict):
            def __missing__(self, key):
                return "missing"

        s = Series([1, 2, 3])
        dictionary = DictWithMissing({3: "three"})
        result = s.map(dictionary)
        expected = Series(["missing", "missing", "three"])
        tm.assert_series_equal(result, expected)

    def test_map_dict_subclass_without_missing(self):
        class DictWithoutMissing(dict):
            pass

        s = Series([1, 2, 3])
        dictionary = DictWithoutMissing({3: "three"})
        result = s.map(dictionary)
        expected = Series([np.nan, np.nan, "three"])
        tm.assert_series_equal(result, expected)

    def test_map_box(self):
        vals = [pd.Timestamp("2011-01-01"), pd.Timestamp("2011-01-02")]
        s = pd.Series(vals)
        assert s.dtype == "datetime64[ns]"
        # boxed value must be Timestamp instance
        res = s.map(lambda x: "{0}_{1}_{2}".format(x.__class__.__name__, x.day, x.tz))
        exp = pd.Series(["Timestamp_1_None", "Timestamp_2_None"])
        tm.assert_series_equal(res, exp)

        vals = [
            pd.Timestamp("2011-01-01", tz="US/Eastern"),
            pd.Timestamp("2011-01-02", tz="US/Eastern"),
        ]
        s = pd.Series(vals)
        assert s.dtype == "datetime64[ns, US/Eastern]"
        res = s.map(lambda x: "{0}_{1}_{2}".format(x.__class__.__name__, x.day, x.tz))
        exp = pd.Series(["Timestamp_1_US/Eastern", "Timestamp_2_US/Eastern"])
        tm.assert_series_equal(res, exp)

        # timedelta
        vals = [pd.Timedelta("1 days"), pd.Timedelta("2 days")]
        s = pd.Series(vals)
        assert s.dtype == "timedelta64[ns]"
        res = s.map(lambda x: "{0}_{1}".format(x.__class__.__name__, x.days))
        exp = pd.Series(["Timedelta_1", "Timedelta_2"])
        tm.assert_series_equal(res, exp)

        # period
        vals = [pd.Period("2011-01-01", freq="M"), pd.Period("2011-01-02", freq="M")]
        s = pd.Series(vals)
        assert s.dtype == "Period[M]"
        res = s.map(lambda x: "{0}_{1}".format(x.__class__.__name__, x.freqstr))
        exp = pd.Series(["Period_M", "Period_M"])
        tm.assert_series_equal(res, exp)

    def test_map_categorical(self):
        values = pd.Categorical(list("ABBABCD"), categories=list("DCBA"), ordered=True)
        s = pd.Series(values, name="XX", index=list("abcdefg"))
        result = s.map(lambda x: x.lower())
        exp_values = pd.Categorical(list("abbabcd"), categories=list("dcba"), ordered=True)
        exp = pd.Series(exp_values, name="XX", index=list("abcdefg"))
        tm.assert_series_equal(result, exp)
        tm.assert_categorical_equal(result.values, exp_values)

        result = s.map(lambda x: "A")
        exp = pd.Series(["A"] * 7, name="XX", index=list("abcdefg"))
        tm.assert_series_equal(result, exp)
        assert result.dtype == np.object

        with pytest.raises(NotImplementedError):
            s.map(lambda x: x, na_action="ignore")

    def test_map_datetimetz(self):
        values = pd.date_range("2011-01-01", "2011-01-02", freq="H").tz_localize("Asia/Tokyo")
        s = pd.Series(values, name="XX")

        # keep tz
        result = s.map(lambda x: x + pd.offsets.Day())
        exp_values = pd.date_range("2011-01-02", "2011-01-03", freq="H").tz_localize("Asia/Tokyo")
        exp = pd.Series(exp_values, name="XX")
        tm.assert_series_equal(result, exp)

        # change dtype
        # GH 14506 : Returned dtype changed from int32 to int64
        result = s.map(lambda x: x.hour)
        exp = pd.Series(list(range(24)) + [0], name="XX", dtype=np.int64)
        tm.assert_series_equal(result, exp)

        with pytest.raises(NotImplementedError):
            s.map(lambda x: x, na_action="ignore")

        # not vectorized
        def f(x):
            if not isinstance(x, pd.Timestamp):
                raise ValueError
            return str(x.tz)

        result = s.map(f)
        exp = pd.Series(["Asia/Tokyo"] * 25, name="XX")
        tm.assert_series_equal(result, exp)

    @pytest.mark.parametrize(
        "vals,mapping,exp",
        [
            (list("abc"), {np.nan: "not NaN"}, [np.nan] * 3 + ["not NaN"]),
            (list("abc"), {"a": "a letter"}, ["a letter"] + [np.nan] * 3),
            (list(range(3)), {0: 42}, [42] + [np.nan] * 3),
        ],
    )
    def test_map_missing_mixed(self, vals, mapping, exp):
        # GH20495
        s = pd.Series(vals + [np.nan])
        result = s.map(mapping)
        tm.assert_series_equal(result, pd.Series(exp))

    @pytest.mark.parametrize(
        "dti,exp",
        [
            (
                Series([1, 2], index=pd.DatetimeIndex([0, 31536000000])),
                DataFrame(np.repeat([[1, 2]], 2, axis=0), dtype="int64"),
            ),
            (
                tm.makeTimeSeries(nper=30),
                DataFrame(np.repeat([[1, 2]], 30, axis=0), dtype="int64"),
            ),
        ],
    )
    def test_apply_series_on_date_time_index_aware_series(self, dti, exp):
        # GH 25959
        # Calling apply on a localized time series should not cause an error
        index = dti.tz_localize("UTC").index
        result = pd.Series(index).apply(lambda x: pd.Series([1, 2]))
        tm.assert_frame_equal(result, exp)

    def test_apply_scaler_on_date_time_index_aware_series(self):
        # GH 25959
        # Calling apply on a localized time series should not cause an error
        series = tm.makeTimeSeries(nper=30).tz_localize("UTC")
        result = pd.Series(series.index).apply(lambda x: 1)
        tm.assert_series_equal(result, pd.Series(np.ones(30), dtype="int64"))
    variable_df2 = pd.concat([counter, variable_df], axis=1)
    return variable_df2

#%% SWE observed data T4
with open("input_SWE_T4.csv") as scvd:
    reader = csv.reader(scvd)
    raw_swe = [r for r in reader]

sc_swe_column = []
for csv_counter1 in range(len(raw_swe)):
    for csv_counter2 in range(2):
        sc_swe_column.append(raw_swe[csv_counter1][csv_counter2])
sc_swe = np.reshape(sc_swe_column, (len(raw_swe), 2))
sc_swe = sc_swe[1:]  # drop the header row
sc_swe_obs_date = pd.DatetimeIndex(sc_swe[:, 0])
sc_swe_obs = [float(value) for value in sc_swe[:, 1]]
swe_obs_df = pd.DataFrame(sc_swe_obs, columns=['observed swe'])
swe_obs_df.set_index(sc_swe_obs_date, inplace=True)
#counter = pd.DataFrame(np.arange(0, len(swe_obs_df)), columns=['counter']); counter.set_index(sc_swe_obs_date, inplace=True)
#swe_obs_df2 = pd.concat([counter, swe_obs_df], axis=1)
maxSwe1 = swe_obs_df['observed swe'][0:50000].max()
maxSwe2 = swe_obs_df['observed swe'][50000:].max()
maxSwe_date1 = swe_obs_df['observed swe'][0:50000].idxmax()
maxSwe_date2 = swe_obs_df['observed swe'][50000:].idxmax()

#%% open scenario-Params test
p1 = [273.16]  # tempCritRain
p2 = [4]       # mw_exp, exponent for meltwater flow
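# The csv-module loop above can be collapsed into a single pd.read_csv call.
# A sketch assuming the layout described above: a header row, the observation
# date in the first column and SWE in the second.
swe_obs_df = pd.read_csv("input_SWE_T4.csv", header=0,
                         names=["date", "observed swe"],
                         index_col="date", parse_dates=True)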
# Entry conditions
df = df.dropna(axis=0)
df['高于前两天高点'] = np.where(df.h >= df.nhh, 1, None)  # based on the current day
df['低于前两天低点'] = np.where(df.l <= df.nll, 1, None)
df['开仓'] = np.where(df['高于前两天高点'] == 1, 'bk', None)
df['开仓'] = np.where(df['低于前两天低点'] == 1, 'sk', df['开仓'])
''' -------------------------- end of trend detection --------------------------- '''
# Do not open a reverse position while closing one
#df['开仓'] = np.where(df['平仓'].isnull(), df['开仓'], None)

# This step is required
dates = pd.DatetimeIndex(df.date)
df.index = dates
df = df.drop('date', axis=1)

df['bk总手数'] = 0
df['bkprice'] = 0
#df['b持仓均价'] = 0
#df['b保证金'] = 0
#df['b合约金额'] = 0  # e.g. rebar bought at 3000 points: the actual contract is 10 tons, worth 30,000 yuan
df['是b止损'] = None
df['b止损'] = None
df['sk总手数'] = 0
df['skprice'] = 0
#df['s保证金'] = 0
#df['s持仓均价'] = 0
#df['s合约金额'] = 0
def dekad_index(begin, end=None):
    """Creates a pandas datetime index on a dekadal basis.

    Parameters
    ----------
    begin : datetime
        Datetime index start date.
    end : datetime, optional
        Datetime index end date, set to current date if None.

    Returns
    -------
    dtindex : pandas.DatetimeIndex
        Dekadal datetime index.
    """
    if end is None:
        end = datetime.now()
    mon_begin = datetime(begin.year, begin.month, 1)
    mon_end = datetime(end.year, end.month, 1)
    daterange = pd.date_range(mon_begin, mon_end, freq='MS')
    dates = []
    for i, dat in enumerate(daterange):
        lday = calendar.monthrange(dat.year, dat.month)[1]
        if i == 0 and begin.day > 1:
            if begin.day < 11:
                if daterange.size == 1:
                    if end.day < 11:
                        dekads = [10]
                    elif end.day >= 11 and end.day < 21:
                        dekads = [10, 20]
                    else:
                        dekads = [10, 20, lday]
                else:
                    dekads = [10, 20, lday]
            elif begin.day >= 11 and begin.day < 21:
                if daterange.size == 1:
                    if end.day < 21:
                        dekads = [20]
                    else:
                        dekads = [20, lday]
                else:
                    dekads = [20, lday]
            else:
                dekads = [lday]
        elif i == (len(daterange) - 1) and end.day < 21:
            if end.day < 11:
                dekads = [10]
            else:
                dekads = [10, 20]
        else:
            dekads = [10, 20, lday]
        for j in dekads:
            dates.append(datetime(dat.year, dat.month, j))
    dtindex = pd.DatetimeIndex(dates)
    return dtindex
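# A quick check of the behavior described in the docstring; the dates here
# are arbitrary examples.
from datetime import datetime

idx = dekad_index(datetime(2020, 1, 5), end=datetime(2020, 2, 15))
print(idx)
# DatetimeIndex(['2020-01-10', '2020-01-20', '2020-01-31',
#                '2020-02-10', '2020-02-20'], dtype='datetime64[ns]', freq=None)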
ax = sns.barplot(x="birthState", y="points", hue="playerID", data=asd)
plt.legend('')
plt.show()
#.sort_values(["score"], ascending=False).reset_index()
#pd.set_option('display.max_columns', None)
#asd = asd[asd.birthState == "NC"]
#print(asd)
"""
"""
nba_df = nba_df[nba_df.birthCountry == 'USA']
nba_df = nba_df[nba_df.birthDate != "0000-00-00"]
nba_df = nba_df[nba_df.height > 0]
print(nba_df)
nba_df['born_year'] = pd.DatetimeIndex(nba_df['birthDate']).year
nba_df['age'] = nba_df['year'] - nba_df['born_year']
print(nba_df)
nba_ht = nba_df[['points', 'rebounds', 'assists', 'age', 'playerID', 'pos',
                 'birthState', 'year', 'height', 'threeMade', 'steals']]
print(nba_ht)
print(nba_ht.corr(method='pearson'))
#ax = sns.heatmap(heat_data, cmap="YlGnBu", linewidths=.5)
test = nba_df[['points', 'rebounds', 'assists', 'height']]
"""
corr = test.corr()
corr[np.abs(corr) < .2] = 0
dataframe['consumption_dayahead_increase_lastWeek'] = np.nan
dataframe['price_intraday_increase_lastWeek'] = np.nan
dataframe['consumption_intraday_increase_lastWeek'] = np.nan

for index, row in dataframe.iterrows():
    # skip the first 8 days
    daysToSkip = 8
    rowsToSkip = 96 * daysToSkip
    if index > rowsToSkip - 1:
        hour = dataframe.iloc[index]['datetime'].hour
        minute = dataframe.iloc[index]['datetime'].minute
        # go to the index of the previous day at 11:30
        lastAuctionDate = pd.to_datetime(
            dataframe.iloc[index]['datetime'].date() - datetime.timedelta(1))
        indexBeforePrediction = dataframe.loc[
            (pd.DatetimeIndex(dataframe['datetime']).hour == 11)
            & (pd.DatetimeIndex(dataframe['datetime']).minute == 30)
            & (pd.DatetimeIndex(dataframe['datetime']).day == lastAuctionDate.day)
            & (pd.DatetimeIndex(dataframe['datetime']).month == lastAuctionDate.month)
            & (pd.DatetimeIndex(dataframe['datetime']).year == lastAuctionDate.year)].index[0]
        indexBeforePrediction = indexBeforePrediction.astype(np.int32)
        lastWeekdf = dataframe[indexBeforePrediction - 671:indexBeforePrediction + 1]
        lastWeekdf = lastWeekdf.loc[
            (pd.DatetimeIndex(lastWeekdf['datetime']).hour == hour)
            & (pd.DatetimeIndex(lastWeekdf['datetime']).minute == minute)]
        # last 24 hours before 11:45 on the day of the last auction
        lastDaydf = lastWeekdf.iloc[-1]
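# Rebuilding pd.DatetimeIndex(dataframe['datetime']) several times per row is
# expensive. A sketch of one way to hoist the conversion out of the loop; the
# variable names here are illustrative, not from the original source.
dt = pd.DatetimeIndex(dataframe['datetime'])
hours, minutes = dt.hour, dt.minute
days, months, years = dt.day, dt.month, dt.year
# Inside the loop the lookup then becomes a cheap boolean combination, e.g.:
# mask = ((hours == 11) & (minutes == 30) & (days == lastAuctionDate.day)
#         & (months == lastAuctionDate.month) & (years == lastAuctionDate.year))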
def clean(output, data, kind=None, debug=True):
    """
    Checks the output and fixes common errors:

    - liquidity
    - missed dates
    - exposure
    - normalization

    :param output: xarray with the strategy output (weights)
    :param data: market data the output was built from
    :param kind: strategy kind ('stocks', 'stocks_long', 'crypto', ...); taken from data.name if None
    :return: cleaned output
    """
    import qnt.stats as qns
    import qnt.exposure as qne
    from qnt.data.common import ds, f, track_event

    if kind is None:
        kind = data.name

    output = output.drop(ds.FIELD, errors='ignore')

    with LogSettings(err2info=True):
        log_info("Output cleaning...")

        single_day = ds.TIME not in output.dims
        if not single_day:
            track_event("OUTPUT_CLEAN")

        log_info("fix uniq")
        if not single_day:
            # deduplicate the time coordinate
            val, idx = np.unique(output.time, return_index=True)
            output = output.isel(time=idx)
        # deduplicate the asset coordinate
        val, idx = np.unique(output.asset, return_index=True)
        output = output.isel(asset=idx)

        if single_day:
            output = output.drop(ds.TIME, errors='ignore')
            output = xr.concat([output], pd.Index([data.coords[ds.TIME].values.max()], name=ds.TIME))
        else:
            log_info("ffill if the current price is None...")
            output = output.fillna(0)
            output = output.where(np.isfinite(data.sel(field='close')))
            output = output.ffill('time')
            output = output.fillna(0)

        if kind == "stocks" or kind == "stocks_long" \
                or kind == 'crypto_daily' or kind == 'cryptodaily' \
                or kind == 'crypto_daily_long' or kind == 'crypto_daily_long_short':
            log_info("Check liquidity...")
            non_liquid = qns.calc_non_liquid(data, output)
            if len(non_liquid.coords[ds.TIME]) > 0:
                log_info("WARNING! Strategy trades non-liquid assets.")
                log_info("Fix liquidity...")
                is_liquid = data.sel(field=f.IS_LIQUID)
                is_liquid = xr.align(is_liquid, output, join='right')[0]
                output = xr.where(is_liquid == 0, 0, output)
            log_info("Ok.")

        if not single_day:
            log_info("Check missed dates...")
            missed_dates = qns.find_missed_dates(output, data)
            if len(missed_dates) > 0:
                log_info("WARNING! Output contains missed dates.")
                log_info("Adding missed dates and setting them to zero...")
                add = xr.concat([output.isel(time=-1)] * len(missed_dates),
                                pd.DatetimeIndex(missed_dates, name="time"))
                add = xr.full_like(add, np.nan)
                output = xr.concat([output, add], dim='time')
                output = output.fillna(0)
                if kind == "stocks" or kind == "stocks_long" \
                        or kind == 'crypto_daily' or kind == 'cryptodaily' \
                        or kind == 'crypto_daily_long' or kind == 'crypto_daily_long_short':
                    output = output.where(data.sel(field='is_liquid') > 0)
                output = output.dropna('asset', 'all').dropna('time', 'all').fillna(0)
                output = normalize(output)
            else:
                log_info("Ok.")

        if kind == 'stocks_long' or kind == 'crypto_daily_long':
            log_info("Check positive positions...")
            neg = output.where(output < 0).dropna(ds.TIME, 'all')
            if len(neg.time) > 0:
                log_info("WARNING! Output contains negative positions. Cleaning...")
                output = output.where(output >= 0).fillna(0)
            else:
                log_info("Ok.")

        if kind == "stocks" or kind == "stocks_long":
            log_info("Check exposure...")
            if not qns.check_exposure(output):
                log_info("Cut big positions...")
                output = qne.cut_big_positions(output)
                log_info("Check exposure...")
                if not qns.check_exposure(output):
                    log_info("Drop bad days...")
                    output = qne.drop_bad_days(output)

        if kind == "crypto":
            log_info("Check BTC...")
            if output.where(output != 0).dropna(
                    "asset", "all").coords[ds.ASSET].values.tolist() != ['BTC']:
                log_info("WARNING! Output contains assets other than BTC.")
                log_info("Fixing...")
                output = output.sel(asset=['BTC'])
            else:
                log_info("Ok.")

        log_info("Normalization...")
        output = normalize(output)

        log_info("Output cleaning is complete.")

    return output
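# A minimal sketch of the missed-dates fix used in clean(), on synthetic
# xarray data rather than qnt's own structures: replicate the last row
# along the missing timestamps, blank it with NaN, concat, and zero-fill.
import numpy as np
import pandas as pd
import xarray as xr

times = pd.date_range('2024-01-01', periods=3)
weights = xr.DataArray(np.ones((3, 2)),
                       dims=('time', 'asset'),
                       coords={'time': times, 'asset': ['A', 'B']})
missed = pd.DatetimeIndex(['2024-01-04', '2024-01-05'], name='time')
add = xr.concat([weights.isel(time=-1)] * len(missed), missed)
add = xr.full_like(add, np.nan)
fixed = xr.concat([weights, add], dim='time').fillna(0)
print(fixed)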
df.loc[::3, ::3]
df.iloc[::3, ::3]
#df.iloc[::3, ['mpg', 'cyl']]  # error: iloc takes integer positions, not labels
df.iloc[-1:, -1:]  # last row & last col
df.iloc[-1:, ]  # last row
df.iloc[:, -1:]  # last col
df.iloc[[3, 4], [1, 2]]  # access by integer position

# retrieving all rows and some columns by iloc method
df.iloc[:, [1, 2]]

# .ix was removed in pandas 1.0; use label-based .loc instead
df.loc["Car1"]
df.loc[["Car1", 'Car5']]
df.loc[["Car1", 'Car5'], ['mpg', 'cyl']]

#%%
# index
df2 = df.set_index(pd.DatetimeIndex(df['mDate']), drop=False, inplace=False)
df2
# rows between certain dates (label slicing on a DatetimeIndex)
df2['2018-1-1':'2020-1-1']
df.columns
df.index
df3 = df.set_index(['mDate'], append=True, inplace=False)
df3.head()

#%%
# df.isin()
filter1 = df["gear"].isin([4])
filter2 = df["cyl"].isin([4, 6])  # cyl values are 4, 6, 8
df[filter1]
df[filter1 & filter2]
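# A short, self-contained illustration of the DatetimeIndex slicing used
# above ('mDate' and the values below are made up for the example):
import pandas as pd

cars = pd.DataFrame({
    'mDate': pd.to_datetime(['2017-03-01', '2018-06-01', '2019-01-15',
                             '2020-07-30', '2021-02-10']),
    'mpg': [21, 23, 19, 25, 22],
})
cars2 = cars.set_index(pd.DatetimeIndex(cars['mDate']), drop=False)
# partial-string label slicing is inclusive on both ends
print(cars2['2018':'2020'])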
def test_combine_first_timezone(self): # see gh-7630 data1 = pd.to_datetime('20100101 01:01').tz_localize('UTC') df1 = pd.DataFrame(columns=['UTCdatetime', 'abc'], data=data1, index=pd.date_range('20140627', periods=1)) data2 = pd.to_datetime('20121212 12:12').tz_localize('UTC') df2 = pd.DataFrame(columns=['UTCdatetime', 'xyz'], data=data2, index=pd.date_range('20140628', periods=1)) res = df2[['UTCdatetime']].combine_first(df1) exp = pd.DataFrame( { 'UTCdatetime': [ pd.Timestamp('2010-01-01 01:01', tz='UTC'), pd.Timestamp('2012-12-12 12:12', tz='UTC') ], 'abc': [pd.Timestamp('2010-01-01 01:01:00', tz='UTC'), pd.NaT] }, columns=['UTCdatetime', 'abc'], index=pd.date_range('20140627', periods=2, freq='D')) tm.assert_frame_equal(res, exp) assert res['UTCdatetime'].dtype == 'datetime64[ns, UTC]' assert res['abc'].dtype == 'datetime64[ns, UTC]' # see gh-10567 dts1 = pd.date_range('2015-01-01', '2015-01-05', tz='UTC') df1 = pd.DataFrame({'DATE': dts1}) dts2 = pd.date_range('2015-01-03', '2015-01-05', tz='UTC') df2 = pd.DataFrame({'DATE': dts2}) res = df1.combine_first(df2) tm.assert_frame_equal(res, df1) assert res['DATE'].dtype == 'datetime64[ns, UTC]' dts1 = pd.DatetimeIndex( ['2011-01-01', 'NaT', '2011-01-03', '2011-01-04'], tz='US/Eastern') df1 = pd.DataFrame({'DATE': dts1}, index=[1, 3, 5, 7]) dts2 = pd.DatetimeIndex(['2012-01-01', '2012-01-02', '2012-01-03'], tz='US/Eastern') df2 = pd.DataFrame({'DATE': dts2}, index=[2, 4, 5]) res = df1.combine_first(df2) exp_dts = pd.DatetimeIndex([ '2011-01-01', '2012-01-01', 'NaT', '2012-01-02', '2011-01-03', '2011-01-04' ], tz='US/Eastern') exp = pd.DataFrame({'DATE': exp_dts}, index=[1, 2, 3, 4, 5, 7]) tm.assert_frame_equal(res, exp) # different tz dts1 = pd.date_range('2015-01-01', '2015-01-05', tz='US/Eastern') df1 = pd.DataFrame({'DATE': dts1}) dts2 = pd.date_range('2015-01-03', '2015-01-05') df2 = pd.DataFrame({'DATE': dts2}) # if df1 doesn't have NaN, keep its dtype res = df1.combine_first(df2) tm.assert_frame_equal(res, df1) assert res['DATE'].dtype == 'datetime64[ns, US/Eastern]' dts1 = pd.date_range('2015-01-01', '2015-01-02', tz='US/Eastern') df1 = pd.DataFrame({'DATE': dts1}) dts2 = pd.date_range('2015-01-01', '2015-01-03') df2 = pd.DataFrame({'DATE': dts2}) res = df1.combine_first(df2) exp_dts = [ pd.Timestamp('2015-01-01', tz='US/Eastern'), pd.Timestamp('2015-01-02', tz='US/Eastern'), pd.Timestamp('2015-01-03') ] exp = pd.DataFrame({'DATE': exp_dts}) tm.assert_frame_equal(res, exp) assert res['DATE'].dtype == 'object'
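# A quick illustration (synthetic frames, not part of the test above) of
# the headline behavior these tests pin down: when the calling frame has
# no NaN to fill, combine_first preserves its tz-aware dtype.
import pandas as pd

a = pd.DataFrame({'DATE': pd.date_range('2015-01-01', periods=3, tz='UTC')})
b = pd.DataFrame({'DATE': pd.date_range('2015-01-02', periods=3)})
res = a.combine_first(b)
print(res['DATE'].dtype)  # datetime64[ns, UTC]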