def add_bulk_data_values(session, series, dvs_size): """ Load up exampleData.csv into a series' datavalues field """ assert 10000 >= dvs_size > 0 path = os.path.dirname(os.path.realpath(__file__)) filepath = os.path.join(path, 'example_files', 'exampleData.csv') df = pd.read_csv(filepath) df['LocalDateTime'] = pd.to_datetime(df['LocalDateTime']).astype(datetime.datetime) df['DateTimeUTC'] = pd.to_datetime(df['DateTimeUTC']).astype(datetime.datetime) dvs = [] for record in df.to_dict('records')[:dvs_size]: dv = DataValue() dv.data_value = record['DataValue'] dv.local_date_time = record['LocalDateTime'] dv.utc_offset = record['UTCOffset'] dv.date_time_utc = record['DateTimeUTC'] dv.site_id = series.site_id dv.variable_id = series.variable_id dv.censor_code = record['CensorCode'] dv.method_id = series.method_id dv.source_id = series.source_id dv.quality_control_level_id = series.quality_control_level_id dvs.append(dv) series.data_values = dvs session.add_all(dvs) session.commit() return df
def parse(self, files): block_groups = self.group_files(files) for file_grp in block_groups.values(): files, mdicts = zip(*file_grp) blk = Block() blk.name = self.get_name(mdicts[0]["name"]) date = pd.to_datetime(mdicts[0]["date"], format="%y%m%d").date() time = pd.to_datetime(mdicts[0]["time"], format="%H%M%S").time() file_types = [m["file"] for m in mdicts] if "parameters" in file_types: fname = files[file_types.index("parameters")] blk.start, blk.first_peck, blk.end = self.parse_time_file(fname, date, time) else: blk.start = pd.Timestamp(pd.datetime.combine(date, time)) if not blk.is_complete: continue if "timestamp" in file_types: fname = files[file_types.index("timestamp")] blk.data = self.get_block_data(fname, start=blk.start) if (blk.data is None) or (len(blk.data) <= 1): continue blk.compute_statistics() blk.files = files self.blocks.append(blk) return self.blocks
def process_df(self, df): print("df_before_processing = ") print(str(df)) # df['Datetime'] = [datetime.strptime(date_string, '%d %b %Y %H:%M') for date_string in df['Date']]# datetime.strptime(df['Date'], "%d %m %Y %H:%M") df['FromDate'] = pd.to_datetime(df['FromDate']) df['Date'] = pd.to_datetime(df['Date']) # df.Date.dt.hour df['DayDiff'] = (df['Date'] - df['FromDate']).astype('timedelta64[h]')/24 df['HoursOfDay'] = df['Date'].dt.hour + df['Date'].dt.minute/60 df['DayOfWeek'] = df['Date'].dt.weekday # df['MinutesOfDay'] = df['Date'].dt.minute # df['Minutes'] = [date_time.hours*60 + date_time.minutes # for date_time in df['Datetime']] # datestring = '01 Jan 2016 10:00' # my_datetime = datetime.strptime(datestring, '%d %b %Y %H:%M') # print("my_datetme = " + str(my_datetime)) print(str(df)) # date_df = df["Date"] # print('date_df = ') # print(str(date_df)) return df # TO add in the from date: # Firstly do group by project and issue number, then sort by date, # and within each section add a counter # Or can we use the index which is already a counter?
def _get_dollar_values(self, group=False): """Calculate the value of portfolio holdings using closing prices. Optionally aggregate the values into groups provided in config. """ dates = sorted(self._config['dates']) # Copy dataframe and zero data before earliest portfolio date. dollar_values = self._daily['close'].copy() dollar_values.ix[ dollar_values.index < pd.to_datetime(str(dates[0])), :] = 0.0 # Loop thru dates and calculate each date range using bitmask index. for i, item in enumerate(dates): index = dollar_values.index >= pd.to_datetime(str(item)) if i < (len(dates) - 1): index = index & ( dollar_values.index < pd.to_datetime(str(dates[i + 1]))) for key in list(dollar_values.columns.values): value = self._config['dates'][item]['symbols'].get(key) if value is None: dollar_values.ix[index, key] = 0.0 else: dollar_values.ix[index, key] *= value * self._config[ 'value_ratio'] if group is True: dollar_values = self._sum_symbol_groups(dollar_values) return dollar_values
def __init__(self, sids, fields, start=None, end=None, period=None, ignore_security_error=0, ignore_field_error=0, period_adjustment=None, currency=None, override_option=None, pricing_option=None, non_trading_day_fill_option=None, non_trading_day_fill_method=None, max_data_points=None, adjustment_normal=None, adjustment_abnormal=None, adjustment_split=None, adjustment_follow_DPDF=None, calendar_code_override=None, **overrides): Request.__init__(self, '//blp/refdata', ignore_security_error=ignore_security_error, ignore_field_error=ignore_field_error) period = period or 'DAILY' assert period in ('DAILY', 'WEEKLY', 'MONTHLY', 'QUARTERLY', 'SEMI-ANNUAL', 'YEARLY') self.is_single_sid = is_single_sid = isinstance(sids, basestring) self.is_single_field = is_single_field = isinstance(fields, basestring) self.sids = is_single_sid and [sids] or list(sids) self.fields = is_single_field and [fields] or list(fields) self.end = end = pd.to_datetime(end) if end else pd.Timestamp.now() self.start = pd.to_datetime(start) if start else end + pd.datetools.relativedelta(years=-1) self.period = period self.period_adjustment = period_adjustment self.currency = currency self.override_option = override_option self.pricing_option = pricing_option self.non_trading_day_fill_option = non_trading_day_fill_option self.non_trading_day_fill_method = non_trading_day_fill_method self.max_data_points = max_data_points self.adjustment_normal = adjustment_normal self.adjustment_abnormal = adjustment_abnormal self.adjustment_split = adjustment_split self.adjustment_follow_DPDF = adjustment_follow_DPDF self.calendar_code_override = calendar_code_override self.overrides = overrides
def get_histdata(symbol,startDate="1990-01-01",endDate=None): """ symbol: string AXS code for a stock startDate,endDate: date string or datetime object this can be neglected """ startDate = pd.to_datetime(startDate) if endDate is not None: endDate = pd.to_datetime(endDate) else: endDate = pd.datetime.now() url = urlYahooFinance.format( symbol,startDate.month-1,startDate.day,startDate.year, endDate.month-1,endDate.day,endDate.year ) # print url try: df = pd.read_csv(url) df.set_index("Date",inplace=True) df.index = pd.to_datetime(df.index) df.sort_index(inplace=True) return df except: print "Couldn't find this stock at Yahoo Finance or banned by Yahoo somehow!" return None
def build_sonar_json(**kwargs): """Provide JSON file""" prod_file = conf['prod_data_dir'] + '/sonar_payload.json' keen_collection = 'sonar_pings_{}'.format(conf['env']).lower() sonar_data = get_keen_sonar(keen_collection, 'this_30_days') df = pd.DataFrame(sonar_data) df['keen_timestamp'] = df['keen'].map(lambda x: x['created_at']) df['exec_date'] = pd.to_datetime(df['exec_date'], format=conf['date_format_keen']) df['keen_timestamp'] = pd.to_datetime(df['keen_timestamp'], format=conf['date_format_keen']) df.sort_values(by=['exec_date', 'keen_timestamp'], ascending=False, inplace=True) df.drop_duplicates(subset='value_key', keep='first', inplace=True) df.reset_index(inplace=True) df.drop(['index', 'keen'], 1, inplace=True) df['exec_date'] = df['exec_date'].dt.strftime(conf['date_format_keen']) df['keen_timestamp'] = df['keen_timestamp'].dt.strftime(conf[ 'date_format_keen']) sonar_payload = json.loads(df.to_json(orient='records')) notify_keen( { 'sonar_payload': sonar_payload }, 'sonar_payloads_{}'.format(conf['env']).lower(), raise_for_status=True) with open(prod_file, 'w') as outfile: json.dump(sonar_payload, outfile, indent=4, sort_keys=True) return sonar_payload
def main():
    df = pd.read_csv(args.file)
    # Convert column types
    df['time_x'], df['time_y'], df['duration'] = \
        pd.to_datetime(df['time_x']), \
        pd.to_datetime(df['time_y']), \
        pd.to_timedelta(df['duration'])
    df['duration'] = df['duration'] / np.timedelta64(1, 's')
    df['time_group'] = ((df['time_x'].dt.hour * 60 + df['time_x'].dt.minute) / 5).apply(math.ceil)
    print("Original length: {}".format(len(df)))
    grouped = df.groupby('time_group')
    statBefore = pd.DataFrame({'q1': grouped['duration'].quantile(.25),
                               'q3': grouped['duration'].quantile(.75)})
    df['outlier'] = df.apply(is_outlier, axis=1, args=(statBefore,))
    df = df[~(df.outlier)]
    del df['outlier']
    print("Filtered length: {}".format(len(df)))
    df_weekday = df[df['time_x'].dt.weekday < 6]
    df_weekend = df[df['time_x'].dt.weekday >= 6]
    draw_weekdays_plot(df_weekday)
    draw_weekdays_plot(df_weekend)
    df_weekday.boxplot(column='duration', by='time_group')
    plt.show()
    df_weekend.boxplot(column='duration', by='time_group')
    plt.show()
def test_frame_add_datetime64_col_other_units(self): n = 100 units = ['h', 'm', 's', 'ms', 'D', 'M', 'Y'] ns_dtype = np.dtype('M8[ns]') for unit in units: dtype = np.dtype('M8[%s]' % unit) vals = np.arange(n, dtype=np.int64).view(dtype) df = DataFrame({'ints': np.arange(n)}, index=np.arange(n)) df[unit] = vals ex_vals = to_datetime(vals.astype('O')).values self.assertEqual(df[unit].dtype, ns_dtype) self.assertTrue((df[unit].values == ex_vals).all()) # Test insertion into existing datetime64 column df = DataFrame({'ints': np.arange(n)}, index=np.arange(n)) df['dates'] = np.arange(n, dtype=np.int64).view(ns_dtype) for unit in units: dtype = np.dtype('M8[%s]' % unit) vals = np.arange(n, dtype=np.int64).view(dtype) tmp = df.copy() tmp['dates'] = vals ex_vals = to_datetime(vals.astype('O')).values self.assertTrue((tmp['dates'].values == ex_vals).all())
def test_allow_exact_matches_and_tolerance2(self): # GH 13695 df1 = pd.DataFrame({ 'time': pd.to_datetime(['2016-07-15 13:30:00.030']), 'username': ['bob']}) df2 = pd.DataFrame({ 'time': pd.to_datetime(['2016-07-15 13:30:00.000', '2016-07-15 13:30:00.030']), 'version': [1, 2]}) result = pd.merge_asof(df1, df2, on='time') expected = pd.DataFrame({ 'time': pd.to_datetime(['2016-07-15 13:30:00.030']), 'username': ['bob'], 'version': [2]}) assert_frame_equal(result, expected) result = pd.merge_asof(df1, df2, on='time', allow_exact_matches=False) expected = pd.DataFrame({ 'time': pd.to_datetime(['2016-07-15 13:30:00.030']), 'username': ['bob'], 'version': [1]}) assert_frame_equal(result, expected) result = pd.merge_asof(df1, df2, on='time', allow_exact_matches=False, tolerance=pd.Timedelta('10ms')) expected = pd.DataFrame({ 'time': pd.to_datetime(['2016-07-15 13:30:00.030']), 'username': ['bob'], 'version': [np.nan]}) assert_frame_equal(result, expected)
def merge(self, weather, price):
    weather['date'] = pd.to_datetime(weather['date'])
    price['日期'] = pd.to_datetime(price['日期'])
    weather = weather.set_index('date')
    m = pd.merge(price, weather, left_on='日期', right_index=True, how='left')
    return m
def get_absolute_margin(exec_lot, req_price, soft_pnl, instrument, time): if exec_lot == 0.0: return np.nan ccy1 = instrument.split('/')[0] ccy2 = instrument.split('/')[1] ccy2_pnl = soft_pnl/exec_lot if (ccy2 == 'USD') or (ccy2 == 'USD_TOM') or (ccy2 == 'USD_TOD'): return ccy2_pnl if (ccy1 == 'USD'): return ccy2_pnl*req_price ccy_pair = ('/' + ccy2 + 'USD') swaped_ccy_pair = ('/' + 'USD' + ccy2) timestamp_end = pd.to_datetime(time) str_timestamp_end = str(timestamp_end) timestamp_start = pd.to_datetime(time) - timedelta(days=1) str_timestamp_start = str(timestamp_start) query = "index>Timestamp('" + str_timestamp_start + "') & index<Timestamp('" + str_timestamp_end +"')" if ccy_pair in instruments: for_price = prices_store.select(ccy_pair, where=query).tail(1) try: convert_price = (for_price['Bid'] + for_price['Ask']).values[0]/2 except IndexError: return ccy2_pnl/req_price return ccy2_pnl/convert_price elif swaped_ccy_pair in instruments: for_price = prices_store.select(swaped_ccy_pair, where=query).tail(1) try: convert_price = (for_price['Bid'] + for_price['Ask']).values[0]/2 except IndexError: return ccy2_pnl*req_price return ccy2_pnl*convert_price else: print(time, instrument) return ccy2_pnl
def clean_data(): """Get the permits file from temp directory, clean it, and save it in Prod directory""" df = pd.read_csv(temp_permits) df.columns = [x.lower() for x in df.columns] df['approval_issue_dt'] = pd.to_datetime( df['approval_issue_dt'], errors='coerce') df['approval_close_dt'] = pd.to_datetime( df['approval_close_dt'], errors='coerce') df['proj_appl_date'] = pd.to_datetime( df['proj_appl_date'], errors='coerce') df['proj_deemed_cmpl_date'] = pd.to_datetime( df['proj_deemed_cmpl_date'], errors='coerce') df = df.sort_values(by='approval_issue_dt') logging.info('Writing all permits') general.pos_write_csv( df, prod_permits, date_format=conf['date_format_ymd_hms']) return 'Successfully cleaned data.'
def test_asfreq_actual():
    # pd.TimeSeries has been removed from pandas; a plain Series works here
    a = pd.Series({pd.to_datetime('2010-02-27'): 100,
                   pd.to_datetime('2010-03-25'): 200})
    actual = a.asfreq_actual(freq='M', method='ffill')
    assert len(actual) == 1
    assert '2010-02-27' in actual
def show_worst_drawdown_periods(returns, top=5):
    """
    Prints information about the worst drawdown periods.

    Prints peak dates, valley dates, recovery dates, and net drawdowns.

    Parameters
    ----------
    returns : pd.Series
        Daily returns of the strategy, non-cumulative.
    top : int, optional
        Number of top drawdown periods to plot (default 5).
    """
    print('\nWorst Drawdown Periods')
    drawdown_df = timeseries.gen_drawdown_table(returns, top=top)
    drawdown_df['peak date'] = pd.to_datetime(drawdown_df['peak date'], unit='D')
    drawdown_df['valley date'] = pd.to_datetime(drawdown_df['valley date'], unit='D')
    drawdown_df['recovery date'] = pd.to_datetime(drawdown_df['recovery date'], unit='D')
    drawdown_df['net drawdown in %'] = list(
        map(utils.round_two_dec_places, drawdown_df['net drawdown in %']))
    # DataFrame.sort() was removed from pandas; sort_values is the replacement
    print(drawdown_df.sort_values('net drawdown in %', ascending=False))
def get_time_delta(start_date, end_date, start_format, end_format):
    """
    Given strings representing times, returns a timedelta object
    representing the time difference between the two dates.
    """
    # format must be passed by keyword: the second positional argument of
    # pd.to_datetime is `errors`, not `format`
    time_delta = (pd.to_datetime(end_date, format=end_format)
                  - pd.to_datetime(start_date, format=start_format))
    return time_delta
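# Usage sketch (not from the original source); the date strings below are
# arbitrary examples chosen to show why the explicit format= keyword matters
# when the two inputs follow different conventions.
delta = get_time_delta('2020-01-01', '02/01/2020',
                       start_format='%Y-%m-%d', end_format='%d/%m/%Y')
print(delta)  # 1 days 00:00:00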
def get_quote_yahoojp(code, start=None, end=None, interval='d'): base = 'http://info.finance.yahoo.co.jp/history/?code={0}.T&{1}&{2}&tm={3}&p={4}' start, end = web._sanitize_dates(start, end) start = 'sy={0}&sm={1}&sd={2}'.format(start.year, start.month, start.day) end = 'ey={0}&em={1}&ed={2}'.format(end.year, end.month, end.day) p = 1 results = [] if interval not in ['d', 'w', 'm', 'v']: raise ValueError("Invalid interval: valid values are 'd', 'w', 'm' and 'v'") while True: url = base.format(code, start, end, interval, p) tables = pd.read_html(url, header=0) if len(tables) < 2 or len(tables[1]) == 0: break results.append(tables[1]) p += 1 result = pd.concat(results, ignore_index=True) result.columns = ['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Adj Close'] if interval == 'm': result['Date'] = pd.to_datetime(result['Date'], format='%Y年%m月') else: result['Date'] = pd.to_datetime(result['Date'], format='%Y年%m月%d日') result = result.set_index('Date') result = result.sort_index() return result
def make_url( symbol, start_date='2000-1-1', stop_date=dt.date.today(), freq='d'): # Create url to download raw CSV data from Yahoo! Finance. # start_date and stop_date can be any format recognized by pd.to_datetime(). # freq must be one of ['d','w','m'] meaning daily, weekly, monthly. symbol = symbol.upper() start_date = pd.to_datetime(start_date) stop_date = pd.to_datetime(stop_date) params = dict() params['s'] = symbol params['a'] = start_date.month - 1 params['b'] = start_date.day params['c'] = start_date.year params['d'] = stop_date.month - 1 params['e'] = stop_date.day params['f'] = stop_date.year params['g'] = freq params['y'] = str(0) params['z'] = str(30000) yurl = 'http://real-chart.finance.yahoo.com/x?' for key in sorted(params.keys()): yurl += '&' + key + '=' + str(params[key]) return yurl
def draw(tick): #import numpy as np mydata = Quandl.get("WIKI/" + tick) #markers_on = np.array(['2013-02-26','2015-01-26','2016-02-26', '2016-04-01'], dtype='datetime64') #df3 = pd.DataFrame(markers_on) #df4 = df3.set_index(0) #df5 = df4.join(mydata,how='left') #df6 = df5['Adj. Close'] #mynewdata = mydata.join(df6,how="left",lsuffix='_OG',rsuffix='_Mark') #get trading start def tradedetails(tradetype,tradevalue,minprice,maxprice,isofficer,ceo,cfo,isdir,is10,isother,stock): hf = pd.read_html("http://openinsider.com/screener?fd=0&fdr=&td=0&tdr=&s="+ stock + "&o=&t="+ tradetype + "&minprice=" + str(minprice) + "&maxprice=" + str(maxprice) + "&v="+ str(tradevalue) +"&isofficer=" + str(isofficer) + "&isceo=" + str(ceo) + "&iscfo=" + str(cfo) + "&isdirector=" + str(isdir) + "&istenpercent=" + str(is10) + "&isother=" + str(isother) + "&sicMin=&sicMax=&sortcol=1&maxresults=1000") return hf[5] def convertdate(x): return x[5:7] + "/" + x[8:10] + "/" + x[0:4] def converttime(x): return x[11:] def convertnumber(x): return x.replace("+","").replace("$","").replace(",","") def cleandataframe(df): df['Trade Date'] = df['Trade Date'].apply(convertdate) df['Filing Time'] = df['Filing Date'].apply(converttime) df['Filing Date'] = df['Filing Date'].apply(convertdate) #df['Shares Traded'] = df['Shares Traded'].apply(convertnumber) df['Value Traded'] = df['Value Traded'].apply(convertnumber) #df['Shares Owned'] = df['Shares Owned'].apply(convertnumber) return df def cleanerdataframe(df): df['Trade Date'] = df['Trade Date'].apply(convertdate) df['Filing Time'] = df['Filing Date'].apply(converttime) df['Filing Date'] = df['Filing Date'].apply(convertdate) df['Shares Traded'] = df['Shares Traded'].apply(convertnumber) df['Value Traded'] = df['Value Traded'].apply(convertnumber) #df['Shares Owned'] = df['Shares Owned'].apply(convertnumber) return df detail = tradedetails("p",25000,"","",0,1,1,0,0,0,tick) pd.to_datetime(detail['Trade Date']) detail = detail.set_index('Trade Date') newdetail = detail.join(mydata,how='left') df6 = newdetail['Adj. Close'] mynewdata = mydata.join(df6,how="left",lsuffix='_OG',rsuffix='_Mark') #get trading end plt.plot(mynewdata['Adj. Close_OG']) plt.plot(mynewdata['Adj. Close_Mark'],marker='o',color='r', markersize=11) plt.show()
def do_charts(slicer, pdfpages): print "\n***Generating Charts***" fig, ax = plt.subplots(figsize=(7, 6), dpi=80) start = pd.to_datetime('2010-12-13 13:54:10.5-05:00') end = pd.to_datetime('2010-12-13 13:54:11.5-05:00') window_sizes = [32, 64, 128] raw = slicer.series['raw'][start:end] raw.plot() for ws in window_sizes: slicer.extract_rolling_median(seriesname = 'raw', window_size = ws) rm = slicer.series['raw_rolling_median_' + str(ws)][start:end] rm.plot(xticks=[i for i in rm.index]) plt.legend(['512Hz EEG']+[ 'Rolling Median %d window size' % ws \ for ws in window_sizes] ,loc='best') plt.ylabel(r"Potential ($\mu$V)") plt.xlabel(r"Time ($\mu$Sec)") #plt.title('10 Hz rolling median, compared to 512Hz signal') ax.xaxis.set_major_formatter(matplotlib.dates.DateFormatter('%S.%f')) ax.set_ylim(ax.get_ylim()[::-1]) pdfpages.savefig()
def load( symbol_list, start_date='2000-1-1', stop_date=dt.date.today(), freq='d', verbose=True): # Get data for multiple symbols. Returns a dictionary of DataFrames. # Each DataFrame stores one variable (e.g 'TotalValue') for all symbols. # symbol_list should be a list of strings, e.g. ['SPY','AAPL','^GSPC'] # Set verbose = False to disable printing to screen. start_date = pd.to_datetime(start_date) stop_date = pd.to_datetime(stop_date) tables = dict() if verbose: print "Loading symbols", for symbol in symbol_list: if verbose: print symbol, tables[symbol] = get_table(symbol,start_date,stop_date,freq) if verbose: print "\nAll symbols loaded.\n" p = pd.Panel(tables) p = p.transpose(2,1,0) return dict(p)
def load_yahoo_stock(sids, start=None, end=None, dvds=True): if hasattr(sids, '__iter__') and not isinstance(sids, basestring): return Instruments([load_yahoo_stock(sid, start=start, end=end, dvds=dvds) for sid in sids]) else: sid = sids end = end and pd.to_datetime(end) or pd.datetime.now() start = start and pd.to_datetime(start) or end + pd.datetools.relativedelta(years=-1) data = get_data_yahoo(sid, start=start, end=end) data = data.rename(columns=lambda c: c.lower()) if dvds: d = get_dividends_yahoo(sid, start, end) d.columns = ['dvds'] if not d.empty: # sanity check - not expected currently # missing = d.index.difference(data.index) missing = d.index - data.index if len(missing) > 0: raise Exception('dividends occur on non-business day, not expecting this') # another sanity check to ensure yahoo rolls dividends up, in case a special occurs on same day if not d.index.is_unique: d = d.groupby(lambda x: x).sum() data = data.join(d) else: data['dvds'] = np.nan pxs = InstrumentPrices(data) return Instrument(sid, pxs, multiplier=1.)
def loadData_getColNames(data_columns): print "Here are the data columns of your file: " print data_columns # Find the column names for each of the 5 data streams colnames = ['EDA data','Temperature data','Acceleration X','Acceleration Y','Acceleration Z'] new_colnames = ['','','','',''] for i in range(len(new_colnames)): new_colnames[i] = raw_input("Column name that contains "+colnames[i]+": ") while (new_colnames[i] not in data_columns): print "Column not found. Please try again" print "Here are the data columns of your file: " print data_columns new_colnames[i] = raw_input("Column name that contains "+colnames[i]+": ") # Get user input on sample rate sampleRate = raw_input("Enter sample rate (must be an integer power of 2): ") while (sampleRate.isdigit()==False) or (np.log(int(sampleRate))/np.log(2) != np.floor(np.log(int(sampleRate))/np.log(2))): print "Not an integer power of two" sampleRate = raw_input("Enter sample rate (must be a integer power of 2): ") sampleRate = int(sampleRate) # Get user input on start time startTime = pd.to_datetime(raw_input("Enter a start time (format: YYYY-MM-DD HH:MM:SS): ")) while type(startTime)==str: print "Not a valid date/time" startTime = pd.to_datetime(raw_input("Enter a start time (format: YYYY-MM-DD HH:MM:SS): ")) return sampleRate, startTime, new_colnames
def read_correct_ch_dam_data(csv_file): """ Function to read, calibrate and convert time format (day1 24:00:00 to day 2 00:00:00) in check dam data :param csv_file: :return: calibrated and time corrected data """ water_level = pd.read_csv( csv_file, skiprows=9, sep=",", header=0, names=["scan no", "date", "time", "raw value", "calibrated value"] ) water_level["calibrated value"] = (water_level["raw value"] * coeff_cal[0]) + coeff_cal[1] # in cm water_level["calibrated value"] /= 100 # convert to metre water_level.columns.values[4] = "stage(m)" # create date time index format = "%d/%m/%Y %H:%M:%S" c_str = " 24:00:00" for index, row in water_level.iterrows(): x_str = row["time"] if x_str == c_str: # convert string to datetime object r_date = pd.to_datetime(row["date"], format="%d/%m/%Y ") # add 1 day c_date = r_date + timedelta(days=1) # convert datetime to string c_date = c_date.strftime("%d/%m/%Y ") c_time = " 00:00:00" water_level["date"][index] = c_date water_level["time"][index] = c_time water_level["date_time"] = pd.to_datetime(water_level["date"] + water_level["time"], format=format) water_level.set_index(water_level["date_time"], inplace=True) # # drop unnecessary columns before datetime aggregation water_level.drop(["scan no", "date", "time", "raw value", "date_time"], inplace=True, axis=1) return water_level
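# Minimal sketch (not part of the original) of the "24:00:00" rollover handled
# above: a logger that stamps midnight as hour 24 of the previous day is mapped
# to 00:00:00 of the following day before the datetime index is built.
import pandas as pd
from datetime import timedelta

row_date, row_time = "31/12/2014", " 24:00:00"
if row_time == " 24:00:00":
    rolled = pd.to_datetime(row_date, format="%d/%m/%Y") + timedelta(days=1)
    row_date, row_time = rolled.strftime("%d/%m/%Y"), " 00:00:00"
print(pd.to_datetime(row_date + row_time, format="%d/%m/%Y %H:%M:%S"))
# 2015-01-01 00:00:00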
def forecast(self , resampled_df ,data_freq = 52 , number_of_predctions = 5): # start and end date of the series start_date = pd.to_datetime(resampled_df.ix[0].name).date() end_date = pd.to_datetime(resampled_df.ix[-1].name).date() r_series = self.convert_to_r_series(resampled_df, start_date, data_freq) # fit the model log_r_series = self.base.log(r_series) holt_winter_fit = self.stats.HoltWinters(r_series) # forecast holt_winter_forecast = self.forecast_lib.forecast_HoltWinters(holt_winter_fit , \ h = number_of_predctions) # prepare and convert results to pandas dataframe reshaped_melted_results= self.reshape.melt(holt_winter_forecast) if data_freq == 52: forecast_duration = self.base.as_Date(end_date.strftime('%Y-%m-%d')).ro +\ (self.base.seq(1,number_of_predctions).ro * 7) myxts = self.xts.xts(reshaped_melted_results, forecast_duration) results_field = 'value.value.Point.Forecast' elif data_freq == 12: myxts = holt_winter_forecast results_field = 'value.Point.Forecast' results_pd_df = com.convert_robj(self.r.melt(myxts)) results_pd_ts = results_pd_df[results_field ] return (results_pd_ts ,holt_winter_forecast)
def clean(df):
    df.replace("-unknown-", np.nan, inplace=True)
    df.loc[df.age > 80, "age"] = np.nan
    df.loc[df.age < 18, "age"] = np.nan
    df["timestamp_first_active"] = pd.to_datetime(
        df.timestamp_first_active.astype(str), format="%Y%m%d%H%M%S")
    df["date_account_created"] = pd.to_datetime(df["date_account_created"])
def test_resample_across_dst(): # The test resamples a DatetimeIndex with values before and after a # DST change # Issue: 14682 # The DatetimeIndex we will start with # (note that DST happens at 03:00+02:00 -> 02:00+01:00) # 2016-10-30 02:23:00+02:00, 2016-10-30 02:23:00+01:00 df1 = DataFrame([1477786980, 1477790580], columns=['ts']) dti1 = DatetimeIndex(pd.to_datetime(df1.ts, unit='s') .dt.tz_localize('UTC') .dt.tz_convert('Europe/Madrid')) # The expected DatetimeIndex after resampling. # 2016-10-30 02:00:00+02:00, 2016-10-30 02:00:00+01:00 df2 = DataFrame([1477785600, 1477789200], columns=['ts']) dti2 = DatetimeIndex(pd.to_datetime(df2.ts, unit='s') .dt.tz_localize('UTC') .dt.tz_convert('Europe/Madrid')) df = DataFrame([5, 5], index=dti1) result = df.resample(rule='H').sum() expected = DataFrame([5, 5], index=dti2) assert_frame_equal(result, expected)
def get_dividends_yahoo(sid, start, end): # Taken from get_data_yahoo in Pandas library and adjust a single parameter to get dividends from pandas.compat import StringIO, bytes_to_str from pandas.io.common import urlopen start, end = pd.to_datetime(start), pd.to_datetime(end) url = ('http://ichart.finance.yahoo.com/table.csv?' + 's=%s' % sid + '&a=%s' % (start.month - 1) + '&b=%s' % start.day + '&c=%s' % start.year + '&d=%s' % (end.month - 1) + '&e=%s' % end.day + '&f=%s' % end.year + '&g=v' + # THE CHANGE '&ignore=.csv') with urlopen(url) as resp: lines = resp.read() rs = pd.read_csv(StringIO(bytes_to_str(lines)), index_col=0, parse_dates=True, na_values='-')[::-1] # Yahoo! Finance sometimes does this awesome thing where they # return 2 rows for the most recent business day if len(rs) > 2 and rs.index[-1] == rs.index[-2]: # pragma: no cover rs = rs[:-1] return rs
def read(cls, rootdir): path = cls.metadata_path(rootdir) with open(path) as fp: raw_data = json.load(fp) try: version = raw_data['minutes_per_day'] except KeyError: # Version was first written with version 1, assume 0, # if version does not match. version = 0 first_trading_day = pd.Timestamp( raw_data['first_trading_day'], tz='UTC') market_opens = pd.to_datetime(raw_data['market_opens'], unit='m', utc=True) market_closes = pd.to_datetime(raw_data['market_closes'], unit='m', utc=True) ohlc_ratio = raw_data['ohlc_ratio'] if version == 0: # version 0 always assumed US equities. minutes_per_day = US_EQUITIES_MINUTES_PER_DAY else: minutes_per_day = raw_data['minutes_per_day'] return cls( first_trading_day, market_opens, market_closes, ohlc_ratio, minutes_per_day, )
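# Illustrative sketch (not part of the original reader): market_opens and
# market_closes above are stored as integer minutes since the Unix epoch, so
# unit='m' together with utc=True recovers tz-aware timestamps.
import pandas as pd
print(pd.to_datetime([0, 1440], unit='m', utc=True))
# DatetimeIndex(['1970-01-01 00:00:00+00:00', '1970-01-02 00:00:00+00:00'],
#               dtype='datetime64[ns, UTC]', freq=None)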
def read_data(verbose=False): """ Read the files and return: Poids, Temps, Charge, Descr data+'Poids.csv' data+'Donnees_Ecometering_Temperature-par-site.csv' data+'charge.csv' data+'Donnees_Ecometering_Description_sites.csv' and a dataframe ready for Prediction """ print('Retrieve the weights') Poids = read_csv(POIDS_CSV, sep=';') Poids.Date = Poids.Date.apply(change_format) print('Retrieve the temperature') Temps = read_csv(TEMPS_CSV, sep=';') Temps.Jour = Temps.Jour.apply(change_format) Temps.index = to_datetime(Temps['Jour'].values, format='%d/%m/%Y %H:%M') Temps = Temps.drop('Jour', 1) Temps = Temps.resample('10min', fill_method='ffill') if verbose: print(echant(Temps, n=10, m=20)) print('Retrieve the charge') Charg = read_csv(CHARGE_CSV, sep=';') Charg.index = to_datetime(Charg['DATE_LOCAL'].values, format='%d/%m/%Y %H:%M') Charg = Charg.drop('DATE_LOCAL', 1) if verbose: print(echant(Charg, n=10, m = 20)) print('Retrieve the description') Descr = read_csv(DESCR_CSV, sep=';') return Poids, Temps, Charg, Descr
import pandas as pd

# load the dataset
df = pd.read_csv('https://dqlab-dataset.s3-ap-southeast-1.amazonaws.com/data_retail.csv', sep=';')

# print the first five rows
print(df.head())

# print dataset info
print(df.info())

# convert the transaction timestamp columns (milliseconds since epoch) to datetimes
df['First_Transaction'] = pd.to_datetime(df['First_Transaction']/1000, unit='s', origin='1970-01-01')
df['Last_Transaction'] = pd.to_datetime(df['Last_Transaction']/1000, unit='s', origin='1970-01-01')

# check the most recent transaction
print(max(df['Last_Transaction']))

# classify whether a customer has churned and store the flag in the is_churn column
df.loc[df['Last_Transaction'] <= '2018-08-01', 'is_churn'] = True
df.loc[df['Last_Transaction'] > '2018-08-01', 'is_churn'] = False

# drop columns that are not needed
del df['no']
del df['Row_Num']

import matplotlib.pyplot as plt

# transaction year
df['Year_First_Transaction'] = df['First_Transaction'].dt.year
df['Year_Last_Transaction'] = df['Last_Transaction'].dt.year
cols = ['ID', 'Vehicle', 'Group', 'Average Speed', 'Minimum Speed', 'Min Location', 'Maximum Speed', 'Max Location', 'Date', 'Estimated Start Time', 'Estimated End Time', 'Estimated Duration'] speeding_out = pd.DataFrame(columns=cols) n = 0 tlist = [] llist = [] tlist.append(speeding.iloc[0]['Speed']) llist.append(speeding.iloc[0]['Location']) for index, row in islice(speeding.iterrows(), 1, None): if row['Rover'] == speeding.iloc[index-1]['Rover'] and int(pd.to_timedelta(pd.to_datetime(row['Stamp']) - pd.to_datetime(speeding.iloc[index - 1]['Stamp'])) / pd.Timedelta('1 minute')) < 1: tlist.append(row['Speed']) llist.append(row['Location']) else: speeding_out.loc[n] = pd.Series({'ID': n, 'Vehicle': speeding.iloc[index - 1]['Rover'], 'Group': speeding.iloc[index - 1]['name'], 'Average Speed': np.mean(tlist), 'Minimum Speed': min(tlist), 'Min Location': llist[tlist.index(min(tlist))], 'Maximum Speed': max(tlist), 'Max Location': llist[tlist.index(min(tlist))], 'Date': pd.to_datetime(speeding.iloc[index - 1]['Stamp']).date(), 'Estimated Start Time': pd.to_datetime(pd.to_datetime(speeding.iloc[index - len(tlist)]['Stamp']) - datetime.timedelta(seconds=10)).time(), 'Estimated End Time': pd.to_datetime(speeding.iloc[index - 1]['Stamp']).time(), 'Estimated Duration': int(pd.to_timedelta(pd.to_datetime(speeding.iloc[index - 1]['Stamp']) - (pd.to_datetime(speeding.iloc[index - len(tlist)]['Stamp']) - datetime.timedelta(seconds=10))) / pd.to_timedelta('1 second'))}) n += 1 del tlist[:] del llist[:] tlist.append(row['Speed'])
def parse_records(urls, admissionIDs): ''' Input: a list of strings, and an array of integers Output: a pandas DataFrame Description: parse_records is a custom parser procedure will query mongoDB and extract the data fields for each admissionID that we want to be able to show in the visualization application. The final output is a pandas dataframe where each row is an admissionID where the columns are specified in the admissionCols, comorbidCols and patientCols arrays. ''' def load_data(dataLocation, collectionName, isURL=False): if isURL: response.get(dataLocation) docsAsJSON = json.loads(response[collectionName]) return pd.DataFrame(docsAsJSON) return pd.read_csv(dataLocation, delimiter='|') admissionsURL, comorbidsURL, patientsURL = urls[0], urls[1], urls[2] dfAdmissions = load_data(admissionsURL, None, isURL=False) dfComorbids = load_data(comorbidsURL, None, isURL=False).rename(index=str, columns={ 'DRG_MORTALITY': 'COMORBID_MORTALITY', 'DRG_SEVERITY': 'COMORBID_SEVERITY' }) dfPatients = load_data(patientsURL, None, isURL=False) subjectIDs = dfAdmissions['SUBJECT_ID'][dfAdmissions.HADM_ID.isin( admissionIDs)] admissionCols = [ 'HADM_ID', 'SUBJECT_ID', 'ADMISSION_TYPE', 'DIAGNOSIS', 'INSURANCE', 'ETHNICITY', 'LANGUAGE', 'MARITAL_STATUS', 'ADMITTIME', 'DISCHTIME' ] admissionInfo = dfAdmissions[admissionCols][dfAdmissions.HADM_ID.isin( admissionIDs)] patientCols = ['SUBJECT_ID', 'GENDER', 'DOB'] patientInfo = dfPatients[patientCols][dfPatients.SUBJECT_ID.isin( subjectIDs)] comorbidCols = ['HADM_ID', 'COMORBID_MORTALITY', 'COMORBID_SEVERITY'] grouped = dfComorbids[comorbidCols][dfComorbids.HADM_ID.isin( admissionIDs)].groupby('HADM_ID') intFrame = admissionInfo.join(grouped.mean(), how='left', on='HADM_ID') finalFrame = intFrame.merge(patientInfo, how='left', on='SUBJECT_ID') finalFrame['AGE'] = np.round((pd.to_datetime(finalFrame.ADMITTIME) - pd.to_datetime(finalFrame.DOB)) \ / np.timedelta64(365, 'D')) finalFrame['HADM_ID'] = finalFrame['HADM_ID'].astype(int) finalFrame['SUBJECT_ID'] = finalFrame['SUBJECT_ID'].astype(int) return finalFrame
def load_vax_data(download=False):
    if download:
        download_data(data, "vaccine_doses_statewise.csv")
    vax = pd.read_csv(data/"vaccine_doses_statewise.csv").set_index("State").T
    vax.columns = vax.columns.str.title()
    return vax.set_index(pd.to_datetime(vax.index, format="%d/%m/%Y"))
# for j in range(x): # df1['e_year'][j]=int(df1['end_date'][j][0:4]) # df1['e_month'][j]=int(df1['end_date'][j][5:7]) # df1['e_date'][j]=int(df1['end_date'][j][8:]) # j=j+1 for k in range(x): start_date=datetime.date(int(df1['start_date'][k][0:4]),int(df1['start_date'][k][5:7]),int(df1['start_date'][k][8:])) end_date=datetime.date(int(df1['end_date'][k][0:4]),int(df1['end_date'][k][5:7]),int(df1['end_date'][k][8:])) diff=end_date-start_date diff+= datetime.timedelta(days=(1)) df1['days'][k]=diff.days #df1['imp_days'][k]=int(df1['imp'][k])*int(df1['days'][k]) k=k+1 df1['start_date']=pd.to_datetime(df1['start_date']) df1['end_date']=pd.to_datetime(df1['end_date']) dff=df1.copy() mask=dff['user_id']==usrl global df_4 df_4= dff.loc[mask] print(df_4) #df1.info() l_cat=list(df1.goal_cat_id.unique()) l_cat.sort() cat={"carrer & education":1,"family":2,"finnaces":3,"friends & social life":4,"fun & recreation":5,"health & fitness":6,"love & relationships":7,"personal development":8} '''for i in test_dict : print(i, test_dict[i]) ''' list_cat = Listbox(root) list_cat.pack()
def main_fun(sector_name, hold_time, if_only_long, time_para_dict): root_path = '/mnt/mfs/DAT_EQT' if_save = True if_new_program = True begin_date = pd.to_datetime('20100101') cut_date = pd.to_datetime('20160401') end_date = pd.to_datetime('20180901') lag = 2 return_file = '' if_hedge = True # if_only_long = False if sector_name.startswith('market_top_300plus'): if_weight = 1 ic_weight = 0 elif sector_name.startswith('market_top_300to800plus'): if_weight = 0 ic_weight = 1 else: if_weight = 0.5 ic_weight = 0.5 main = FactorTestSector(root_path, if_save, if_new_program, begin_date, cut_date, end_date, time_para_dict, sector_name, hold_time, lag, return_file, if_hedge, if_only_long, if_weight, ic_weight) my_factor_list = ['lsgg_num_df_5', 'lsgg_num_df_20', 'lsgg_num_df_60', 'bulletin_num_df', 'news_num_df_5', 'news_num_df_20', 'news_num_df_60', 'staff_changes', 'funds', 'meeting_decide', 'restricted_shares', 'son_company', 'suspend', 'shares', 'bar_num_7_df', 'bar_num_12_df', 'sell_key_title_word', 'sell_summary_key_word', 'buy_key_title__word', 'buy_summary_key_word', ] ratio_list = ['R_DebtAssets_QTTM', 'R_EBITDA_IntDebt_QTTM', 'R_EBITDA_sales_TTM_First', 'R_BusinessCycle_First', 'R_DaysReceivable_First', 'R_DebtEqt_First', 'R_FairVal_TotProfit_TTM_First', 'R_LTDebt_WorkCap_QTTM', 'R_OPCF_TotDebt_QTTM', 'R_OPCF_TotDebt_QTTM', 'R_OPEX_sales_TTM_First', 'R_SalesGrossMGN_QTTM', 'R_CurrentAssetsTurnover_QTTM', 'R_TangAssets_TotLiab_QTTM', 'R_NetROA_TTM_First', 'R_ROE_s_First', 'R_EBIT_sales_QTTM', ] tech_list = [ 'ADX_40_20_10', 'ADX_100_20_10', 'ADX_200_20_10', 'AROON_40_80', 'AROON_200_80', 'CMO_40_0', 'CMO_200_0', 'MFI_40_70_30', 'MFI_140_70_30', 'ADOSC_20_60_0', 'ADOSC_60_120_0', 'ATR_40_0.2', 'ATR_140_0.2', 'RSI_40_30', 'RSI_140_30', 'CCI_p150d_limit_12', 'MACD_40_160', 'bias_turn_p60d', 'vol_p50d', 'vol_p100d', 'vol_p200d', 'evol_p30d', 'evol_p90d', 'moment_p30200d', 'moment_p50300d', 'turn_p30d_0.24', 'turn_p150d_0.18', 'TVOL_p30d_col_extre_0.2', 'TVOL_p90d_col_extre_0.2', 'TVOL_row_extre_0.2', 'aadj_r_p20d_col_extre_0.2', 'aadj_r_p345d_continue_ud_pct', 'aadj_r_p345d_continue_ud', 'volume_moment_p1040d', 'volume_moment_p20120d', 'return_p30d_0.2', 'return_p90d_0.2', 'BBANDS_20_1.5', 'BBANDS_40_1.5', 'BBANDS_100_1.5', 'BBANDS_200_1.5', 'MACD_12_26_9', 'MACD_20_60_18', 'MA_LINE_10_5', 'MA_LINE_60_20', 'MA_LINE_120_60', 'WILLR_10_30', 'WILLR_40_30', 'WILLR_100_20', ] pool_num = 25 main.test_index_3_(my_factor_list, ratio_list, tech_list, pool_num, suffix_name='13')
time_para_dict = OrderedDict()
time_para_dict['time_para_1'] = [pd.to_datetime('20100101'), pd.to_datetime('20150101'),
                                 pd.to_datetime('20150401'), pd.to_datetime('20150701'),
                                 pd.to_datetime('20151001'), pd.to_datetime('20160101')]
time_para_dict['time_para_2'] = [pd.to_datetime('20110101'), pd.to_datetime('20160101'),
                                 pd.to_datetime('20160401'), pd.to_datetime('20160701'),
                                 pd.to_datetime('20161001'), pd.to_datetime('20170101')]
time_para_dict['time_para_3'] = [pd.to_datetime('20120601'), pd.to_datetime('20170601'),
                                 pd.to_datetime('20170901'), pd.to_datetime('20171201'),
                                 pd.to_datetime('20180301'), pd.to_datetime('20180601')]
time_para_dict['time_para_4'] = [pd.to_datetime('20130801'), pd.to_datetime('20180801'),
                                 pd.to_datetime('20181101'), pd.to_datetime('20181101'),
                                 pd.to_datetime('20181101'), pd.to_datetime('20181101')]
def clean_data(df_csv_datas): ''' 数据清洗 ''' row_length_before = df_csv_datas.shape[0] print('\t清理--清除一些用不到的字段') df_csv_datas = df_csv_datas.drop(['emmcid','上报时间','异常进程名','进程版本名','进程版本号','异常进程包名', '软件系统类型','国家','异常类型','MBN版本信息','异常次数','日志路径'],axis=1) df_csv_datas = df_csv_datas.fillna('-1') print('\t地区码--只保留中国') df_csv_datas = df_csv_datas.loc[df_csv_datas['地区码'] == 'china'] df_csv_datas = df_csv_datas.drop(['地区码'],axis=1) print('\t运营商--移除测试的PLMN') fp = open(os.path.join(os.path.abspath('.'),'config','remove_test_plmn.txt'),'r') test_plmn_list = [plmn.strip() for plmn in fp.readlines()] df_csv_datas = df_csv_datas[-df_csv_datas['运营商'].isin(test_plmn_list)] fp.close() print('\tIMEI--移除测试的IMEI') fp = open(os.path.join(os.path.abspath('.'),'config','remove_test_imei.txt'),'r') test_imei_list = [imei.strip() for imei in fp.readlines()] df_csv_datas = df_csv_datas[-df_csv_datas['imei'].isin(test_imei_list)] fp.close() print('\t处理一些处理异常的字段') df_csv_datas['省直辖市'] = df_csv_datas['省/直辖市'] df_csv_datas = df_csv_datas.drop(['省/直辖市'],axis=1) df_csv_datas['县区'] = df_csv_datas['县/区'] df_csv_datas = df_csv_datas.drop(['县/区'],axis=1) print('\t字段转义') df_csv_datas['机型'] = df_csv_datas['外部机型'].str.cat(df_csv_datas['内部机型'],sep='/') df_csv_datas = df_csv_datas.drop(['外部机型','内部机型'],axis=1) print('\t发生时间--提取发生的小时') df_csv_datas['发生时间1'] = pd.to_datetime(df_csv_datas['发生时间'],infer_datetime_format=True) df_csv_datas['发生时间h'] = df_csv_datas['发生时间1'].apply(get_hour) df_csv_datas = df_csv_datas.drop(['发生时间','发生时间1'],axis=1) df_csv_datas['ImsRat'] = df_csv_datas['保留字段一'] df_csv_datas = df_csv_datas.drop(['保留字段一'],axis=1) df_csv_datas['ExtraCode_ErroMSG'] = df_csv_datas['保留字段二'] df_csv_datas['ExtraCode'] = df_csv_datas['保留字段二'].apply(get_ExtraCode) df_csv_datas['ErroMSG'] = df_csv_datas['保留字段二'].apply(get_ErroMSG) df_csv_datas = df_csv_datas.drop(['保留字段二'],axis=1) df_csv_datas['RSRP'] = df_csv_datas['log信息'].apply(get_rsrp) df_csv_datas['RSRQ'] = df_csv_datas['log信息'].apply(get_rsrq) df_csv_datas = df_csv_datas.drop(['log信息'],axis=1) df_csv_datas['运营商1'] = df_csv_datas['运营商'].apply(get_plmn1) df_csv_datas['运营商2'] = df_csv_datas['运营商'].apply(get_plmn2) df_csv_datas = df_csv_datas.drop(['运营商'],axis=1) print('\t合并一些共同分析的字段') df_csv_datas['开始基站位置'] = df_csv_datas['运营商2'].str.cat(df_csv_datas['起呼位置码'],sep='/').str.cat(df_csv_datas['起呼基站编号'],sep='/') df_csv_datas['结束基站位置'] = df_csv_datas['运营商2'].str.cat(df_csv_datas['结束位置码'],sep='/').str.cat(df_csv_datas['结束基站编号'],sep='/') df_csv_datas = df_csv_datas.drop(['起呼位置码','起呼基站编号'],axis=1) df_csv_datas = df_csv_datas.drop(['结束位置码','结束基站编号'],axis=1) df_csv_datas['电话网络']=df_csv_datas['起呼电话网络'].str.cat(df_csv_datas['结束电话网络'],sep='/') df_csv_datas['数据网络']=df_csv_datas['开始数据网络'].str.cat(df_csv_datas['结束数据网络'],sep='/') df_csv_datas['网络']=df_csv_datas['电话网络'].str.cat(df_csv_datas['数据网络'],sep='/') df_csv_datas = df_csv_datas.drop(['起呼电话网络','结束电话网络'],axis=1) df_csv_datas = df_csv_datas.drop(['开始数据网络','结束数据网络'],axis=1) df_csv_datas['运营商_电话网络'] = df_csv_datas['运营商2'].str.cat(df_csv_datas['电话网络'], sep='/') df_csv_datas['运营商_数据网络'] = df_csv_datas['运营商2'].str.cat(df_csv_datas['数据网络'], sep='/') df_csv_datas['运营商_网络'] = df_csv_datas['运营商2'].str.cat(df_csv_datas['网络'], sep='/') df_csv_datas['运营商_网络_原因'] = df_csv_datas['运营商_网络'].str.cat(df_csv_datas['ExtraCode'], sep='/') df_csv_datas['省市'] = df_csv_datas['省直辖市'].str.cat(df_csv_datas['市'],sep='/') df_csv_datas['省市县区'] = df_csv_datas['省市'].str.cat(df_csv_datas['县区'],sep='/') df_csv_datas = df_csv_datas.drop(['市'],axis=1) df_csv_datas = 
df_csv_datas.drop(['县区'],axis=1) row_length_after = df_csv_datas.shape[0] print('\t数据清洗前后的数量='+str(row_length_after)+'/'+str(row_length_before)+',数据清洗比率='+str(row_length_after*100/row_length_before)+'%') return df_csv_datas
def datechunk_to_date(date_chunk): return pd.to_datetime(date_chunk[0][2:])
def test_iter_readonly():
    # GH#28055 ints_to_pydatetime with readonly array
    arr = np.array([np.datetime64("2012-02-15T12:00:00.000000000")])
    arr.setflags(write=False)
    dti = pd.to_datetime(arr)
    list(dti)
def create_full_tearsheet(self, results, heatmap_2d=True): """ Create a full tear sheet of param scan results. Parameters ---------- results : DataFrame multi-index (Field, StrategyOrDate) DataFrame of param scan results, with param vals as (possibly multi-level) columns heatmap_2d : bool use heat maps for 2 paramscans; if False, use bar charts Returns ------- None """ returns = results.loc["AggReturn"] returns.index = pd.to_datetime(returns.index) returns.index.name = "Date" summary = OrderedDict() if results.columns.nlevels == 2: param1, param2 = results.columns.names summary["Parameter 1"] = param1 summary["Parameter 2"] = param2 params_title = " / ".join(results.columns.names) else: summary["Parameter"] = results.columns.name params_title = results.columns.name summary["Start Date"] = returns.index.min().date().isoformat() summary["End Date"] = returns.index.max().date().isoformat() with sns.axes_style("white", {'axes.linewidth': 0}): fig = plt.figure("Parameter Scan Summary") axis = fig.add_subplot(111) axis.get_xaxis().set_visible(False) axis.get_yaxis().set_visible(False) table = axis.table(cellText=[[v] for v in summary.values()], rowLabels=list(summary.keys()), loc="center") table.scale(1, 2) table.set_fontsize("large") fig.tight_layout() # Plot 1d bar charts or 2d heat maps if results.columns.nlevels == 2 and heatmap_2d: self._create_2d_heatmaps(results) else: self._create_1d_bar_charts(results) # Plot performance plots performance = DailyPerformance(returns) # cut height in half since only one chart per figure width, height = self.figsize figsize = width, height / 2 self._create_returns_plots(performance, subplot=111, extra_label=" (Aggregate)", figsize=figsize, legend_title=params_title) self._save_or_show()
# change the number to move the right bound left and right if needed
NumOfDaysToMoveBackFromToday = time() - api.DAY*0

# period of candlesticks to receive: 24, 4, 2, 0.5, 0.25, or 0.083
period = api.HOUR * 24

# api call
raw = api.returnChartData(currencyToGet, period=period,
                          start=time() - api.DAY*numOfDaysToGet,
                          end=NumOfDaysToMoveBackFromToday)

# load dataframe with info from api call
df = pd.DataFrame(raw)

# create date column and convert epoch time from api call to date
df['date'] = pd.to_datetime(df["date"], unit='s')
df['date'] = df['date'].map(dt.datetime.toordinal)

# calculate Hui-Heubel liquidity ratios
df['liquidity'] = ((df['high'] - df['low']) / df['low']) / (df['volume'] / (df['weightedAverage'] * df['quoteVolume']))

# Calculates a relative strength index with an exponential moving average,
# as EMA better shows price movements - Tortoise vs Hare example
close = df['close']
delta = close.diff()
delta = delta[1:]
up, down = delta.copy(), delta.copy()
up[up < 0] = 0
down[down > 0] = 0
# pd.stats.moments.ewma is the legacy API (removed in modern pandas;
# Series.ewm(...).mean() is the current equivalent)
roll_up1 = pd.stats.moments.ewma(up, windowLength)
roll_down1 = pd.stats.moments.ewma(down.abs(), windowLength)
pos = 0 pos_lst.append(pos) position['pos' + str(i)] = pos_lst position = position.reset_index(drop=True)[[ 'trade_date', 'pos' + str(i) ]] pos_df = pos_df.merge(position, on=['trade_date'], how='outer') pos_df = pos_df.fillna(value=0) pos_df = pos_df.sort_values(['trade_date']).set_index(['trade_date']) pos_df['position'] = pos_df.sum(axis=1) / len(para_lst) pos_df = pos_df.reset_index(drop=False).merge(index_hq, on=['trade_date']).sort_values(['trade_date']) \ .assign(close_1=lambda df: df.close.shift(1)).dropna() fig, ax = plt.subplots(1, 1, figsize=(9, 6)) ax1 = ax.twinx() pos_df.index = pd.to_datetime(pos_df['trade_date']) pos_df[['close']].plot(ax=ax1, figsize=(9, 6), kind='line', style=['k-']) pos_df[['position']].plot(kind='area', grid=True, ax=ax, figsize=(9, 7), rot=60, style=['y']) pos_df[['trade_date', 'position' ]].to_csv(fold_pos + 'pos_ymjh_' + index_code[:6] + '.csv', encoding='gbk', header=None)
def lookup_sources_for_observation(fits_files=None, filename=None, force_new=False, cursor=None, use_intersection=False, **kwargs): if force_new: print(f'Forcing a new source file') with suppress(FileNotFoundError): os.remove(filename) try: print(f'Using existing source file: {filename}') observation_sources = pd.read_csv(filename, parse_dates=True) observation_sources['obstime'] = pd.to_datetime( observation_sources.obstime) except FileNotFoundError: if not cursor: cursor = get_cursor(port=5433, db_name='v702', db_user='******') print(f'Looking up sources in {len(fits_files)} files') observation_sources = None # Lookup the point sources for all frames for fn in tqdm(fits_files): point_sources = lookup_point_sources(fn, force_new=force_new, cursor=cursor, **kwargs) header = fits_utils.getheader(fn) obstime = Time(pd.to_datetime(os.path.basename(fn).split('.')[0])) exptime = header['EXPTIME'] * u.second obstime += (exptime / 2) point_sources['obstime'] = obstime.datetime point_sources['exptime'] = exptime point_sources['airmass'] = header['AIRMASS'] point_sources['file'] = os.path.basename(fn) point_sources['picid'] = point_sources.index print(f'Combining sources with previous observations') if observation_sources is not None: if use_intersection: print(f'Getting intersection of sources') idx_intersection = observation_sources.index.intersection( point_sources.index) print( f'Num sources in intersection: {len(idx_intersection)}' ) observation_sources = pd.concat([ observation_sources.loc[idx_intersection], point_sources.loc[idx_intersection] ], join='inner') else: observation_sources = pd.concat( [observation_sources, point_sources]) else: observation_sources = point_sources print(f'Writing sources out to file') observation_sources.to_csv(filename) observation_sources.set_index(['obstime'], inplace=True) return observation_sources
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Dec 27 15:23:07 2019

@author: student
"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
from statsmodels.tsa.arima_model import ARIMA

plt.style.use('fivethirtyeight')

df = pd.read_csv("/home/student/Desktop/Python/dataset/movavg.csv", index_col='Date')
df.index = pd.to_datetime(df.index)

model = ARIMA(df.Price, order=(1, 2, 0))
model_fit = model.fit(disp=0)
print(model_fit.summary())
import argparse import requests import pandas as pd import matplotlib.pyplot as plt def getdata(): response = requests.get("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv") with open('data.csv', 'wb') as fp: fp.write(response.content) if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--logarithmic", action='store_true') args = parser.parse_args() getdata() df = pd.read_csv('data.csv') dfg = df.groupby(by='Country/Region').sum() dfg.sort_values(by=dfg.columns[-1], ascending=False, inplace=True) dfg.drop(labels=['Lat', 'Long'], axis=1, inplace=True) dfg.columns = pd.to_datetime(dfg.columns) dfplot = dfg.iloc[:10].T.plot(logy=args.logarithmic, title="Covid-19 deaths", grid=True) plt.minorticks_on() plt.show()
def test_list_of_datetime_time_roundtrip():
    # ARROW-4135
    times = pd.to_datetime(
        ['09:00', '09:30', '10:00', '10:30', '11:00', '11:30', '12:00'])
    df = pd.DataFrame({'time': [times.time]})
    _roundtrip_pandas_dataframe(df, write_kwargs={})
server = app.server style = { 'background': 'white', 'color': '#E50914', 'fontFamily': 'Montserrat', 'fontSize': '20px', 'text_color': '#564d4d', 'second_color': '#94A3BC', 'third_color': '#C1666B' } # assume you have a "long-form" data frame # see https://plotly.com/python/px-arguments/ for more options df = pd.read_csv('data/viewedHistory_George.csv', sep=';', error_bad_lines=False, engine='python') df['dateStr'] = pd.to_datetime(df['dateStr'], utc=True) df_2020 = df[df['dateStr']>'2019-12-31'] df_2019 = df[df['dateStr']<'2020-01-01'] hours_2020 = df_2020.duration.sum()/3600 days_2020 = str(round(hours_2020/24, 2)) df_2020['weekDay'] = df_2020['dateStr'].dt.day_name() df_2020['weekDayCount'] = df_2020['dateStr'].apply(lambda x: x.weekday()) df_2020['durationM'] = df_2020['duration'].apply(lambda x: x/60) df_2020['durationH'] = df_2020['duration'].apply(lambda x: x/3600) weekday = df_2020[['durationM', 'weekDay', 'weekDayCount']].groupby('weekDay').mean().reset_index().sort_values('weekDayCount') weekday['isWeekend'] = weekday['weekDay'].apply(lambda x: 'yes' if x =='Saturday' or x == 'Sunday' else 'no') weekday_plot = px.bar(weekday, x="weekDay", y="durationM", color="isWeekend", title="Average number of minutes spent per day of week", labels={'durationM':'Avg. Minutes'}, height=350, width=450, color_discrete_map={ "yes": style['color'], "no": style['second_color']
from pandas.tseries.offsets import BMonthEnd from pandas.tseries.holiday import get_calendar, HolidayCalendarFactory, GoodFriday from datetime import datetime, time, timedelta from py_vollib import black_scholes import zipfile from private import settings import numpy as np from scipy.interpolate import InterpolatedUnivariateSpline as interpol years = ([0.0, 1/360, 1/52, 1/12, 2/12, 3/12, 6/12, 12/12]) functions_dict = {} df_yields = pandas.read_csv(settings.path_to_libor_csv) cols = ['date', 'ON', 'w1', 'm1', 'm2', 'm3', 'm6', 'm12'] df_yields.columns = cols df_yields['date'] = pandas.to_datetime(df_yields['date']) df_yields.set_index('date',inplace=True) c = calendar.Calendar(firstweekday=calendar.SUNDAY) offset = BMonthEnd() entries = [] ratio = 100 lower_ul = 1 upper_ul = 1000000 dividend = 0 commissions = 1.25 interest = 0.0225 yeartradingdays = 252
from sklearn.metrics import log_loss
from sklearn import metrics
from sklearn.naive_bayes import BernoulliNB            # Naive Bayes (Bernoulli)
from sklearn.linear_model import LogisticRegression    # logistic regression
from sklearn.ensemble import RandomForestClassifier    # random forest

train = pd.read_csv('C:\\Users\\49210\Desktop\\22.csv')
test = pd.read_csv('C:\\Users\\49210\Desktop\\22.csv')
train.head()

le = preprocessing.LabelEncoder()
crime_type_encode = le.fit_transform(train['EVENT_TYPE'])
MONTH = pd.to_datetime(train['EVENT_DATE']).dt.month
MONTH = pd.get_dummies(MONTH)               # one-hot month features
number = pd.get_dummies(train['事件数'])
SHENFEN = pd.get_dummies(train['ADMIN1'])   # one-hot province features
train_set = pd.concat([MONTH, number, SHENFEN], axis=1)
train_set['crime_type'] = crime_type_encode

# one-hot encode the same features for the test set
MONTH_t = pd.to_datetime(test['EVENT_DATE']).dt.month
MONTH_t = pd.get_dummies(MONTH_t)
number_t = pd.get_dummies(test['事件数'])
SHENFEN_t = pd.get_dummies(test['ADMIN1'])
test_set = pd.concat([MONTH_t, number_t, SHENFEN_t], axis=1)

x = train_set.loc[:, train_set.columns != 'crime_type']
y = train_set['crime_type']
def start_db(db_dir: str = 'investments_database.db', start_year: int = 2005, target_funds: list = []): """Starts a SQLite database with 3 tables: daily_quotas (funds data), ibov_returns (ibovespa index data) and selic_rates (the base interest rate for the brazilian economy).\n <b>Parameters:</b>\n db_dir (str): The path of the dabatabse file to be created. Defaults to 'investments_database.db', creating the file in the current working directory.\n start_year (int): Opitional (Defaults to 2005). Starting year for the data collection. . Can be use to reduce the size of the database.\n target_funds (list): Opitional (Defaults to []). List of target funds CNPJs. Only funds with CNPJs contained in this list will be included in the database. Can be used to radically reduce the size of the database. If none is specified, all funds will be included.\n <b>Returns:</b>\n Theres no return from the function. """ ##STEP 1: #starts the new database print (f'creating SQLite database: {db_dir} \n') con = sqlite3.connect(db_dir) ##STEP 2: #downloads each report in the cvm website and pushes it to the sql database daily_quotas table print('downloading daily reports from the CVM website... \n') #for each year between 2017 and now for year in tqdm(range(start_year, datetime.date.today().year + 1), position = 0, leave=True): for mth in range(1, 13): #for each month #loop structure for years equal or after 2017 if year>=2017: informe = cvm_informes(str(year), mth) try: if target_funds: #if the target funds list is not empty, uses it to filter the result set informe = informe[informe.CNPJ_FUNDO.isin(target_funds)] #appends information to the sql database informe.to_sql('daily_quotas', con , if_exists = 'append', index=False) except AttributeError: pass elif year<2017: #loop structure to handle years before 2017 (they have a different file structure) #only executes the download function once every year to avoid duplicates (unique file for each year) if mth == 12: informe = cvm_informes(str(year), mth) try: if target_funds: #if the target funds list is not empty, uses it to filter the result set informe = informe[informe.CNPJ_FUNDO.isin(target_funds)] #appends information to the sql database informe.to_sql('daily_quotas', con , if_exists = 'append', index=False) except AttributeError: pass #pushes target funds to sql for use when updating the database if target_funds: target_df = pd.DataFrame({'targets':target_funds}) target_df.to_sql('target_funds', con , index=False) ##STEP 3: #creates index in the daily_quotas table to make future select queries faster. #tradeoff: The updating proceesses of the database will be slower. print('creating sql index on "CNPJ_FUNDO", "DT_COMPTC" ... 
\n') index = ''' CREATE INDEX "cnpj_date" ON "daily_quotas" ( "CNPJ_FUNDO" ASC, "DT_COMPTC" ASC )''' cursor = con.cursor() cursor.execute(index) con.commit() cursor.close() ##STEP 4: #downloads cadastral information from CVM of the fundos and pushes it to the database print('downloading cadastral information from cvm...\n') info_cad = pd.read_csv('http://dados.cvm.gov.br/dados/FI/CAD/DADOS/cad_fi.csv', sep = ';', encoding='latin1', dtype = {'RENTAB_FUNDO': object,'FUNDO_EXCLUSIVO': object, 'TRIB_LPRAZO': object, 'ENTID_INVEST': object, 'INF_TAXA_PERFM': object, 'INF_TAXA_ADM': object, 'DIRETOR': object, 'CNPJ_CONTROLADOR': object, 'CONTROLADOR': object} ) if target_funds: info_cad = info_cad[info_cad.CNPJ_FUNDO.isin(target_funds)] info_cad.to_sql('info_cadastral_funds', con, index=False) ##STEP 5: #downloads daily ibovespa prices from investing.com and pushes it to the database print('downloading ibovespa index prices from investing.com ...\n') today = (datetime.date.today() + datetime.timedelta(1)).strftime('%Y-%m-%d') ibov = pd.DataFrame(YahooFinancials('^BVSP').get_historical_price_data('1990-09-15', today, 'daily')['^BVSP']['prices']) ibov = ibov.drop(columns=['date', 'close']).rename(columns={'formatted_date':'date', 'adjclose':'close'}).iloc[:,[5,0,1,2,3,4]] ibov['date'] = pd.to_datetime(ibov['date']) ibov.columns = [i.capitalize() for i in ibov.columns] #capitalizes columns to keep consistency with previous format (investpy) ibov.to_sql('ibov_returns', con, index=False) ##STEP 6: #downloads daily selic returns (basic interest rate of the brazilian economy) #from the brazillian central bank and pushes it to the database print('downloading selic rates from the Brazilian Central Bank website...\n') selic = pd.read_json('http://api.bcb.gov.br/dados/serie/bcdata.sgs.{}/dados?formato=json'.format(11)) selic['data'] = pd.to_datetime(selic['data'], format = '%d/%m/%Y') selic['valor'] = selic['valor']/100 #calculates decimal rate from the percentual value #calculates asset "price" considering day 0 price as 1 selic.loc[0,'price'] = 1 * (1 + selic.loc[0,'valor']) for i in range(1, len(selic)): selic.loc[i, 'price'] = selic.loc[i-1, 'price'] * (1 + selic.loc[i,'valor']) selic.rename(columns = {'data':'date', 'valor':'rate'}, inplace = True) selic.to_sql('selic_rates', con , index=False) ##STEP 7: #creates a table with a log of the execution timestamps of the script print('creating the log table...\n') update_log = pd.DataFrame({'date':[datetime.datetime.now()], 'log':[1]}) update_log.to_sql('update_log', con, if_exists = 'append', index=False) ##STEP 8 #closes the connection with the database con.close() print('connection with the database closed! \n') print(f'Success: database created in {db_dir} !\n')
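# Hedged usage sketch, not part of start_db itself: once the database has been
# created, the daily_quotas table can be read back with pandas. The CNPJ below
# is a placeholder, not a real fund identifier.
import sqlite3
import pandas as pd

con = sqlite3.connect('investments_database.db')
quotas = pd.read_sql('SELECT * FROM daily_quotas WHERE CNPJ_FUNDO = ?', con,
                     params=('00.000.000/0001-00',), parse_dates=['DT_COMPTC'])
con.close()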
def mypreditions_to_database():
    with open("./utils/make_predicitions/X_train_o3", "rb") as X_scalar_file:
        X_train_scalar_o3 = pk.load(X_scalar_file)
    with open("./utils/make_predicitions/o3_api_model", "rb") as model_file:
        model_o3 = pk.load(model_file)
    with open("./utils/make_predicitions/X_train_pm25", "rb") as X_scalar_file:
        X_train_scalar_pm25 = pk.load(X_scalar_file)
    with open("./utils/make_predicitions/pm25_api_model", "rb") as model_file:
        model_pm25 = pk.load(model_file)

    try:
        predict_start_time = pd.to_datetime(input("-- Enter the forecast start time: "))
        predict_time_len = int(input("-- Enter the forecast horizon in hours: "))
        pollutant = input("-- Enter the pollutant to forecast (O3/PM25): ")
        # predict_start_time = pd.to_datetime("20201214 13:00:00")
        # predict_time_len = 24
        # pollutant = "O3"
    except:
        print("-- Please re-enter the data in the expected format.")
        return  #abort instead of continuing with undefined inputs

    time_delta = timedelta(hours=predict_time_len)
    predict_end_time = predict_start_time + time_delta
    print("\n")
    print("-- Forecast pollutant: {}".format(pollutant))
    print("-- Forecast window: {} to {}".format(predict_start_time, predict_end_time))

    df = pd.read_excel("./utils/make_predicitions/wuxi_meteos.xlsx", index_col="time_point")
    df = df[predict_start_time:predict_end_time]

    #store the predictions in the database
    mloutDB = MloutDB()
    if pollutant == "O3":
        features = ['precipitation', 'temperature', 'ws', 'wd', 'humidity', 'cloudrate',
                    'pressure', 'visibility', 'dswrf']
        X = df[df["position_name"] == "东亭"][features]  #"东亭" (Dongting) is the station name used in the data
        X_std = (X - X_train_scalar_o3.min(axis=0)) / (X_train_scalar_o3.max(axis=0) - X_train_scalar_o3.min(axis=0))
        predictions = pd.Series(model_o3.predict(X_std), index=X_std.index)
        list_value = []
        for time_point, value in predictions.items():
            tempList = [time_point, '东亭', 2, value]
            list_value.append(tempList)
        count = mloutDB.insertMany(list_value)
        print(count)
    elif pollutant == "PM25":
        features = ['precipitation', 'temperature', 'ws', 'wd', 'humidity', 'cloudrate',
                    'pressure', 'visibility', 'dswrf', 'pm25']
        X = df[df["position_name"] == "东亭"][features]
        X_std = (X - X_train_scalar_pm25.min(axis=0)) / (X_train_scalar_pm25.max(axis=0) - X_train_scalar_pm25.min(axis=0))
        list_value = []
        predictions = pd.Series(model_pm25.predict(X_std), index=X_std.index)
        for time_point, value in predictions.items():
            tempList = [time_point, '东亭', 2, value]
            list_value.append(tempList)
        count = mloutDB.insertMany(list_value)
        print(count)
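
#A minimal, hedged run sketch for the interactive forecast writer. It assumes the pickled
#scalers/models and wuxi_meteos.xlsx exist under ./utils/make_predicitions/ and that MloutDB
#is importable; the prompt answers shown in the comments are placeholders only.
if __name__ == "__main__":
    #example answers: start time "20201214 13:00:00", horizon 24, pollutant "O3"
    mypreditions_to_database()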
def min_transform(x):
    #min-max scale a single column (1-D array or Series) to the [0, 1] range
    scaler = MinMaxScaler()
    #note: the original passed [x], which treats every value as its own feature;
    #reshaping to one column scales the values against each other instead
    return scaler.fit_transform(pd.DataFrame(x)).ravel()


#--------------------reading the dataset-----------------------------------
#
df_mobility = pd.read_csv('Global_Mobility_Report.csv', encoding='iso-8859-1', low_memory=False)
df_mobility['date'] = pd.to_datetime(df_mobility['date'])
df_mobility = df_mobility[df_mobility['date'] == pd.to_datetime('2020-02-29')]

df_mobility_temp = pd.DataFrame()
df_mobility_temp['country_region_code'] = df_mobility['country_region_code']
df_mobility_temp['retail_and_recreation'] = df_mobility['retail_and_recreation_percent_change_from_baseline']
df_mobility_temp['grocery_and_pharmacy'] = df_mobility['grocery_and_pharmacy_percent_change_from_baseline']
df_mobility_temp['parks'] = df_mobility['parks_percent_change_from_baseline']
df_mobility_temp['transit_stations'] = df_mobility['transit_stations_percent_change_from_baseline']
df_mobility_temp['workplaces'] = df_mobility['workplaces_percent_change_from_baseline']
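
#A hedged sketch of the likely next step (not part of the original snippet): scaling the
#mobility columns of df_mobility_temp to [0, 1] with min_transform before further analysis.
#The fillna(0) is an illustrative assumption for handling missing baseline changes.
mobility_cols = ['retail_and_recreation', 'grocery_and_pharmacy', 'parks',
                 'transit_stations', 'workplaces']
for col in mobility_cols:
    df_mobility_temp[col] = min_transform(df_mobility_temp[col].fillna(0))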
def load_data():
    data = pd.read_csv(DATA_URL)
    data['tweet_created'] = pd.to_datetime(data['tweet_created'])
    return data
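
#A small, hedged usage sketch (DATA_URL is assumed to be defined elsewhere in the app):
#load the tweets and derive the posting hour from the parsed timestamps.
tweets = load_data()
tweets['hour'] = tweets['tweet_created'].dt.hour
print(tweets.groupby('hour').size().head())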
def deal_tradeinfo(tradeinfo, pricefname, n=10, intraday=False):
    """ Process the trade data based on the trade signals and the price data file.
        return data['high_profit', 'low_profit', 'exit_profit', 'period', 'return',
                    'entry_nbar_best', 'entry_nbar_worst', 'exit_nbar_best', 'exit_nbar_worst',
                    'islong', 'entry_n', 'exit_n']
    """
    PRICE = 'close'
    data = pd.DataFrame(tradeinfo.ix[:, 0:2])
    price_data = csv2frame(pricefname)
    high_profits = []
    low_profits = []
    exit_profits = []
    periods = []
    entry_nbar_bests = []
    entry_nbar_worsts = []
    exit_nbar_bests = []
    exit_nbar_worsts = []
    islongs = []
    returns = []
    entry_Nlist = []
    exit_Nlist = []
    for i in range(len(data)):
        startt = tradeinfo.index[i]
        startpos = price_data.index.searchsorted(startt)
        endt = tradeinfo.ix[i, ['exit_datetime']][0]
        endpos = price_data.index.searchsorted(endt)
        tradingdf = price_data.truncate(before=tradeinfo.index[i], after=endt)
        onetrade = tradeinfo.ix[i, :]
        # high/low
        if len(tradingdf) > 1:
            hp = tradingdf.ix[:-1, :][PRICE].max()
            lp = tradingdf.ix[:-1, :][PRICE].min()
            t = tradingdf.ix[:-1, :][PRICE].tolist()
            t.append(float(onetrade['exit_price']))
            returns.append(max_return(t, onetrade['islong']))
        else:
            hp = tradingdf.ix[:, :][PRICE].max()
            lp = tradingdf.ix[:, :][PRICE].min()
            if onetrade['islong']:
                returns.append(max(onetrade['entry_price'] - onetrade['exit_price'], 0))
            else:
                returns.append(max(onetrade['exit_price'] - onetrade['entry_price'], 0))
        hp = onetrade['exit_price'] if onetrade['exit_price'] > hp else hp
        hp = onetrade['entry_price'] if onetrade['entry_price'] > hp else hp
        lp = onetrade['exit_price'] if onetrade['exit_price'] < lp else lp
        lp = onetrade['entry_price'] if onetrade['entry_price'] < lp else lp
        hp = hp - onetrade['entry_price']
        lp = lp - onetrade['entry_price']
        high_profits.append(hp if onetrade['islong'] else 0 - hp)
        low_profits.append(lp if onetrade['islong'] else 0 - lp)
        # exit
        ep = onetrade['exit_price'] - onetrade['entry_price']
        exit_profits.append(ep if onetrade['islong'] else 0 - ep)
        # period
        periods.append(endpos - startpos + 1)
        # nbar todo
        entry_begin = startpos
        exit_begin = endpos + 1
        if intraday:
            day_entry_end = price_data.index.searchsorted(
                (pd.to_datetime(startt) + dt.timedelta(days=1)).strftime("%Y-%m-%d"))
            day_exit_end = price_data.index.searchsorted(
                (pd.to_datetime(endt) + dt.timedelta(days=1)).strftime("%Y-%m-%d"))
            entry_end = min(startpos + n + 1, day_entry_end)
            exit_end = min(endpos + 1 + n, day_exit_end)
        else:
            entry_end = startpos + n + 1
            exit_end = endpos + 1 + n
        entry_Nlist.append(entry_end - entry_begin)
        exit_Nlist.append(exit_end - exit_begin)
        islongs.append(onetrade['islong'])
        if onetrade['islong']:
            entry_nbar_bests.append(price_data.ix[entry_begin:entry_end, PRICE].max() - onetrade['entry_price'])
            entry_nbar_worsts.append(price_data.ix[entry_begin:entry_end, PRICE].min() - onetrade['entry_price'])
            exit_nbar_bests.append(price_data.ix[exit_begin:exit_end, PRICE].max() - onetrade['entry_price'])
            exit_nbar_worsts.append(price_data.ix[exit_begin:exit_end, PRICE].min() - onetrade['entry_price'])
        else:
            entry_nbar_bests.append(onetrade['entry_price'] - price_data.ix[entry_begin:entry_end, PRICE].min())
            entry_nbar_worsts.append(onetrade['entry_price'] - price_data.ix[entry_begin:entry_end, PRICE].max())
            exit_nbar_bests.append(onetrade['entry_price'] - price_data.ix[exit_begin:exit_end, PRICE].min())
            exit_nbar_worsts.append(onetrade['entry_price'] - price_data.ix[exit_begin:exit_end, PRICE].max())
    data['high_profit'] = high_profits
    data['low_profit'] = low_profits
    data['exit_profit'] = exit_profits
    data['period'] = periods
    data['return'] = returns
    data['entry_nbar_best'] = entry_nbar_bests
    data['entry_nbar_worst'] = entry_nbar_worsts
    data['exit_nbar_best'] = exit_nbar_bests
    data['exit_nbar_worst'] = exit_nbar_worsts
    data['islong'] = islongs
    data['entry_n'] = entry_Nlist
    data['exit_n'] = exit_Nlist
    print("Data Preprocessing Done!")
    #data.to_csv("d:\\rst.csv")
    return data
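
#A hedged usage sketch: deal_tradeinfo appears to expect a trade-signal DataFrame indexed by
#entry datetime with at least 'exit_datetime', 'entry_price', 'exit_price' and 'islong'
#columns, plus a price CSV readable by csv2frame (csv2frame and max_return are helpers
#defined elsewhere). The file name and values below are illustrative only.
trades = pd.DataFrame(
    {'exit_datetime': ['2013-01-04 10:30:00'],
     'entry_price': [2300.0],
     'exit_price': [2315.0],
     'islong': [True]},
    index=pd.to_datetime(['2013-01-04 09:45:00']))
stats = deal_tradeinfo(trades, 'IF1301_1min.csv', n=10, intraday=True)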
def update_db(db_dir: str = r'investments_database.db'):
    """Updates the database.\n
    <b>Parameters:</b>\n
    db_dir (str): The path of the database file to be updated. Defaults to 'investments_database.db'.\n
    <b>Returns:</b>\n
    There is no return from the function.
    """
    ##STEP 1
    #connects to the database
    print(f'connected with the database {db_dir}\n')
    con = sqlite3.connect(db_dir)

    ##STEP 2
    #calculates relevant date limits for the update process
    Cal = Brazil()  #initializes the brazilian calendar
    today = datetime.date.today()

    #queries the last update from the log table
    last_update = pd.to_datetime(pd.read_sql('select MAX(date) from update_log', con).iloc[0, 0])
    last_quota = Cal.sub_working_days(last_update, 2)  #date of the last published cvm report
    num_months = (today.year - last_quota.year) * 12 + (today.month - last_quota.month) + 1

    ##STEP 3
    #deletes information that will be updated from the database tables
    print('deleting redundant data from the database... \n')
    tables = {'daily_quotas': ['DT_COMPTC', last_quota.strftime("%Y-%m-01")],
              'ibov_returns': ['Date', last_update.strftime("%Y-%m-%d")]}

    cursor = con.cursor()

    #sql delete statements to the database
    cursor.execute('delete from daily_quotas where DT_COMPTC >= :date',
                   {'date': last_quota.strftime("%Y-%m-01")})
    cursor.execute('delete from ibov_returns where Date >= :date',
                   {'date': last_update.strftime("%Y-%m-%d")})
    con.commit()
    cursor.close()

    ##STEP 4
    #pulls new data from CVM, Yahoo Finance and the Brazilian Central Bank
    #and pushes it to the database
    try:  #tries to read target funds if they were specified when starting the database
        target_funds = pd.read_sql('select targets from target_funds', con).targets.to_list()
    except DatabaseError:
        target_funds = []

    print('downloading new daily reports from the CVM website...\n')
    #downloads the daily cvm report for each month between the last update and today
    for m in range(num_months + 1):
        data_alvo = last_quota + relativedelta(months=+m)
        informe = cvm_informes(data_alvo.year, data_alvo.month)
        if target_funds:
            informe = informe[informe.CNPJ_FUNDO.isin(target_funds)]
        try:
            informe.to_sql('daily_quotas', con, if_exists='append', index=False)
        except AttributeError:
            pass

    #downloads cadastral information of the funds from CVM and pushes it to the database
    print('downloading updated cadastral information from cvm...\n')
    info_cad = pd.read_csv('http://dados.cvm.gov.br/dados/FI/CAD/DADOS/cad_fi.csv', sep=';', encoding='latin1',
                           dtype={'RENTAB_FUNDO': object, 'FUNDO_EXCLUSIVO': object, 'TRIB_LPRAZO': object,
                                  'ENTID_INVEST': object, 'INF_TAXA_PERFM': object, 'INF_TAXA_ADM': object,
                                  'DIRETOR': object, 'CNPJ_CONTROLADOR': object, 'CONTROLADOR': object})
    if target_funds:  #filters target funds if they were specified when building the database
        info_cad = info_cad[info_cad.CNPJ_FUNDO.isin(target_funds)]
    info_cad.to_sql('info_cadastral_funds', con, if_exists='replace', index=False)

    #updates daily interest returns (selic)
    print('updating selic rates...\n')
    selic = pd.read_json('http://api.bcb.gov.br/dados/serie/bcdata.sgs.{}/dados?formato=json'.format(11))
    selic['data'] = pd.to_datetime(selic['data'], format='%d/%m/%Y')
    selic['valor'] = selic['valor'] / 100  #calculates decimal rate from the percentual value

    #calculates asset "price" considering day 0 price as 1
    selic.loc[0, 'price'] = 1 * (1 + selic.loc[0, 'valor'])
    for i in range(1, len(selic)):
        selic.loc[i, 'price'] = selic.loc[i - 1, 'price'] * (1 + selic.loc[i, 'valor'])

    selic.rename(columns={'data': 'date', 'valor': 'rate'}, inplace=True)

    #filters only new data
    selic = selic[selic.date >= (last_update + datetime.timedelta(-1))]
    selic.to_sql('selic_rates', con, if_exists='append', index=False)

    #updates ibovespa data
    print('updating ibovespa returns...\n')
    today = (datetime.date.today() + datetime.timedelta(1)).strftime('%Y-%m-%d')
    ibov = pd.DataFrame(YahooFinancials('^BVSP').get_historical_price_data(last_update.strftime('%Y-%m-%d'), today, 'daily')['^BVSP']['prices'])
    ibov = ibov.drop(columns=['date', 'close']).rename(columns={'formatted_date': 'date', 'adjclose': 'close'}).iloc[:, [5, 0, 1, 2, 3, 4]]
    ibov['date'] = pd.to_datetime(ibov['date'])
    ibov.columns = [i.capitalize() for i in ibov.columns]  #capitalizes columns to keep consistency with previous format (investpy)
    ibov.to_sql('ibov_returns', con, if_exists='append', index=False)

    ##STEP 5
    #updates the log in the database
    print('updating the log...\n')
    update_log = pd.DataFrame({'date': [datetime.datetime.now()], 'log': [1]})
    update_log.to_sql('update_log', con, if_exists='append', index=False)

    ##STEP 6
    #closes the connection with the database
    con.close()
    print('connection with the database closed!\n')
    print(f'database {db_dir} updated!\n')
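
#A hedged refresh sketch: update_db only re-downloads data newer than the last entry in the
#update_log table, so it can be scheduled (e.g. daily) once start_db has been run. The path
#below is a placeholder.
update_db(db_dir='investments_database.db')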
    #(inside an enclosing pagination loop, not shown in this excerpt)
    for x1 in posts_json['data']:
        commentlst.append(x1.get('message').encode('utf-8').strip())
        datelst.append(x1.get('created_time'))
    next_page = ""
    try:
        next_page = posts_json['paging']['next']
        url = next_page
    except:
        break
    if not next_page:
        break
    print("Count: %s, Next Page: %s" % (len(commentlst), url))

print("\nGenerating JSON File")
df = pd.DataFrame({'comment': commentlst, 'dates': datelst})
df['dates'] = pd.to_datetime(df['dates'])
df['day_of_week'] = df['dates'].dt.weekday_name
df['year'] = df['dates'].dt.year
df['month'] = df['dates'].dt.month
df['count'] = 1
df.to_json('comment_data.json')

#Generate Sentiment Results
import requests
import json
from google.cloud import language, exceptions

client = language.Client()
# export GOOGLE_APPLICATION_CREDENTIALS environment variable

with open('comment_data.json') as data_file:
def tick2period(code, period, start, end):
    """ get tick data from tushare and resample to certain period data
        selected by input: period
    """
    import tushare as ts
    import numpy as np
    import pandas as pd
    dfout = None
    #get valid trade dates
    valid_dates = ts.get_hist_data(code, start=start, end=end).index
    for date in valid_dates:
        #date=date.strftime('%Y-%m-%d')
        #set up the trade-time grid for the selected period
        rng = pd.date_range(date + ' 9:30:00', date + ' 15:00', closed='right', freq=period)
        sr = pd.Series(np.nan, index=rng)
        df = ts.get_tick_data(code, date=date)
        df.loc[df.time < '09:30:00', 'time'] = '09:30:01'  #process open call auction
        df.loc[df.time > '15:00:00', 'time'] = '14:59:59'  #process close call auction
        df['time'] = date + ' ' + df['time']
        df = df.rename(columns={'time': 'datetime'})
        df['datetime'] = pd.to_datetime(df['datetime'])
        df = df.set_index('datetime').sort()
        df2 = df['volume'].resample(period, how='sum', closed='right', label='right')
        df2, dummy = df2.align(sr, axis=0)  #align to standard time
        df3 = df2.truncate(before=date + ' 13:00:01', after=date + ' 15:00')
        df2 = df2.truncate(before=date + ' 9:30:01', after=date + ' 11:30')  #remove non-trade time
        df2 = df2.append(df3).fillna(0)  #fill with 0 for periods without valid deals
        df1 = df['price'].resample(period, how='ohlc', closed='right', label='right')
        df1, dummy = df1.align(sr, axis=0)  #align to standard time
        df3 = df1.truncate(before=date + ' 13:00:01', after=date + ' 15:00')
        df1 = df1.truncate(before=date + ' 9:30:01', after=date + ' 11:30')  #remove non-trade time
        df1 = df1.append(df3)
        if np.isnan(df1.ix[0, 'close']):
            #use last day's close as initial price if there is no deal after open
            from datetime import timedelta, datetime
            aDay = timedelta(days=-10)  #get enough days to ensure at least one trading day is involved
            pre = (pd.to_datetime(date) + aDay).strftime('%Y-%m-%d')
            df1.ix[0, 'close'] = ts.get_hist_data(code, start=pre, end=date).ix[-2, 'close']
        df1['close'].fillna(method='pad', inplace=True)  #use the previous price if there is no deal during the current period
        df1.fillna(method='bfill', inplace=True, axis=1)  #use close as open/high/low if there is no deal during the current period
        df1['volume'] = df2.values
        dfout = pd.concat([dfout, df1])
    return dfout
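
#A hedged usage sketch (the stock code and dates are placeholders; this relies on tushare
#and on an old pandas version that still supports .ix and resample(how=...)):
bars_5min = tick2period('600848', '5min', start='2017-01-03', end='2017-01-06')
print(bars_5min.head())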
def get_course_assignments(course_id):
    sql = f"""select assign.*, sub.avg_score from
              (select ifnull(assignment_id, 0) as assignment_id, name, assign_grp_name, grp_id, due_date,
                      points_possible, group_points, weight, drop_lowest, drop_highest
               from
                 (select a.id as assignment_id, a.assignment_group_id, a.local_date as due_date, a.name, a.points_possible
                  from assignment as a
                  where a.course_id = %(course_id)s) as app
               right join
                 (select id, name as assign_grp_name, id as grp_id, group_points, weight, drop_lowest, drop_highest
                  from assignment_groups
                  where course_id = %(course_id)s) as ag
               on ag.id = app.assignment_group_id) as assign
              left join
                (select distinct assignment_id, avg_score from submission where course_id = %(course_id)s) as sub
              on sub.assignment_id = assign.assignment_id
           """
    assignments_in_course = pd.read_sql(sql, conn, params={'course_id': course_id},
                                        parse_dates={'due_date': '%Y-%m-%d'})
    # No assignments found in the course
    if assignments_in_course.empty:
        logger.info('The course %s doesn\'t seem to have assignment data' % course_id)
        return assignments_in_course
    assignments_in_course['due_date'] = pd.to_datetime(assignments_in_course['due_date'], unit='ms')
    assignments_in_course[['points_possible', 'group_points']] = assignments_in_course[[
        'points_possible', 'group_points']].fillna(0)
    assignments_in_course[['points_possible', 'group_points', 'weight']] = assignments_in_course[[
        'points_possible', 'group_points', 'weight']].astype(float)
    consider_weight = is_weight_considered(course_id)
    df2 = assignments_in_course[['weight', 'group_points', 'grp_id']].drop_duplicates()
    hidden_assignments = are_weighted_assignments_hidden(course_id, df2)
    total_points = assignments_in_course['points_possible'].sum()
    # if the assignment group is weighted and no assignments were added yet, the assignment name
    # will be empty, so this handling is specific to that situation
    if hidden_assignments:
        assignments_in_course['name'] = assignments_in_course['name'].fillna(
            assignments_in_course['assign_grp_name'] + ' Group Unavailable Assignments')
    assignments_in_course['towards_final_grade'] = assignments_in_course.apply(
        lambda x: percent_calculation(consider_weight, total_points, hidden_assignments, x), axis=1)
    assignments_in_course['calender_week'] = assignments_in_course['due_date'].dt.week
    assignments_in_course['calender_week'] = assignments_in_course['calender_week'].fillna(0).astype(int)
    min_week = find_min_week(course_id)
    max_week = assignments_in_course['calender_week'].max()
    week_list = [x for x in range(min_week, max_week + 1)]
    assignments_in_course['week'] = assignments_in_course['calender_week'].apply(
        lambda x: 0 if x == 0 else week_list.index(x) + 1)
    assignments_in_course.sort_values(by='due_date', inplace=True)
    assignments_in_course['current_week'] = assignments_in_course['calender_week'].apply(
        lambda x: find_current_week(x))
    assignments_in_course['due_date_mod'] = assignments_in_course['due_date'].astype(str).apply(
        lambda x: x.split()[0])
    assignments_in_course['due_dates'] = pd.to_datetime(
        assignments_in_course['due_date_mod']).dt.strftime('%m/%d')
    assignments_in_course['due_dates'].replace('NaT', 'N/A', inplace=True)
    return assignments_in_course
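
#A hedged usage sketch (the course id is a placeholder; conn, logger and the helper
#functions referenced above are assumed to be defined elsewhere in the module):
assignments = get_course_assignments(12345)
print(assignments[['name', 'due_dates', 'towards_final_grade']].head())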
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense, LSTM
import matplotlib.pyplot as plt
import datetime

# Gather the data
file_path = "data/sales_708_3M.csv"
df = pd.read_csv(file_path)

# Parse date column, set index, drop redundant col
df['Date'] = pd.to_datetime(df['Date'], format="%Y-%m-%d")

# Making sure there are no duplicated data
# If there are some duplicates we average the data during those duplicated days
df = df.groupby('Date', as_index=False)['Sales'].mean()

# Sorting the values
df.sort_values('Date', inplace=True)

# Set Params
data = df                  # the data
Y_var = 'Sales'            # target variable
lag = 7                    # number of lags used for modelling
LSTM_layer_depth = 100     # number of neurons in the LSTM layer
batch_size = 72            # size of the data sample for gradient descent
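
# A hedged sketch (not necessarily the original pipeline) of how the parameters above could
# be used: build lagged input windows from the Sales series and fit a small Keras LSTM.
# The epochs value and the lack of a train/test split are illustrative assumptions.
values = data[Y_var].values.astype('float32')
X, y = [], []
for i in range(len(values) - lag):
    X.append(values[i:i + lag])      # window of `lag` past days
    y.append(values[i + lag])        # next-day sales as the target
X = np.array(X).reshape(-1, lag, 1)  # samples x timesteps x features
y = np.array(y)

model = Sequential()
model.add(LSTM(LSTM_layer_depth, activation='relu', input_shape=(lag, 1)))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mse')
model.fit(X, y, epochs=20, batch_size=batch_size, verbose=1)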