def log_stats(filename, msg, dist):
    """Append a message and a one-line statistical summary to a log file.

    The items of ``dist`` are sorted, each one is passed through
    ``getfirstorself``, and the resulting list is summarised as min, first
    quartile, median, third quartile, max, mean and standard deviation.

    ``mean``, ``median``, ``std`` and ``score`` are looked up in the
    enclosing module scope (presumably numpy's functions and
    ``scipy.stats.scoreatpercentile`` -- confirm against the file imports).

    Parameters
    ----------
    filename : str
        Path of the file to append to (opened in ``'a'`` mode).
    msg : str
        Header line written before the statistics.
    dist : iterable
        Sortable items to summarise.
    """
    values = [getfirstorself(entry) for entry in sorted(dist)]
    with open(filename, 'a') as outf:
        # The header is written first, so it lands in the file even if the
        # statistics below fail (e.g. for an empty distribution).
        print(msg, file=outf)
        summary = "min={} q1={} median={} q3={} max={} mean={} stddev={}".format(
            min(values),
            score(values, 25),
            median(values),
            score(values, 75),
            max(values),
            mean(values),
            std(values),
        )
        print(summary, file=outf)
def plot_mmm(self, ax, index, xscale=1.0, yscale=1.0, xlabel='', ylabel='', do_rate=False):
    """Plot min / quartiles / max across hosts for every time interval.

    For each key of ``self.ts.j.hosts`` a series is obtained from
    ``self.ts.assemble(index, host, 0)`` and reduced to one value per
    interval between consecutive entries of ``self.ts.t``: the finite
    difference rate when ``do_rate`` is true, otherwise the midpoint average.
    The across-host minimum, 25th, 50th and 75th percentiles and maximum are
    then drawn against the interval mid-times (min and max dashed).

    Parameters
    ----------
    ax : axes object
        Target axes; must provide ``plot`` and ``yaxis``.
    index : object
        Passed through to ``self.ts.assemble`` and ``self.setlabels``.
    xscale, yscale : float
        Divisors applied to the x and y data before plotting.
    xlabel, ylabel : str
        Passed through to ``self.setlabels``.
    do_rate : bool
        Plot rates (d value / d time) instead of midpoint values.
    """
    tmid = (self.ts.t[:-1] + self.ts.t[1:]) / 2.0

    def per_interval(series):
        # One value per time interval: a rate or a midpoint average.
        if do_rate:
            return numpy.divide(numpy.diff(series), numpy.diff(self.ts.t))
        return (series[:-1] + series[1:]) / 2.0

    a = numpy.array([per_interval(self.ts.assemble(index, host, 0))
                     for host in self.ts.j.hosts.keys()])

    # Column i of `a` holds every host's value for interval i.
    steps = range(len(self.ts.t) - 1)
    mn = numpy.array([min(a[:, i]) for i in steps])
    p25 = numpy.array([score(a[:, i], 25) for i in steps])
    p50 = numpy.array([score(a[:, i], 50) for i in steps])
    p75 = numpy.array([score(a[:, i], 75) for i in steps])
    mx = numpy.array([max(a[:, i]) for i in steps])

    ax.hold = True
    curves = (
        (mn, ('--',)),
        (p25, ()),
        (p50, ()),
        (p75, ()),
        (mx, ('--',)),
    )
    for values, fmt in curves:
        ax.plot(tmid / xscale, values / yscale, *fmt)

    self.setlabels(ax, index, xlabel, ylabel, yscale)
    ax.yaxis.set_major_locator(matplotlib.ticker.MaxNLocator(nbins=4))
    tspl_utils.adjust_yaxis_range(ax, 0.1)
def diffpdfs_param(df, title=None, fig=None, label=None, footer=True):
    """Plot a kernel density estimate of the model-minus-observation difference.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``CMAQ``, ``Obs``, ``Species`` and ``Units``.
    title : str, optional
        Figure title; only used when a new figure is created.
    fig : matplotlib figure, optional
        When given, the density is added to the first axes of ``fig`` and no
        limits, labels or footer are touched.  When ``None`` a new 10x7
        figure is created and fully decorated.
    label : str, optional
        Legend label.  For a new figure it defaults to ``'CMAQ - Obs'``.
    footer : bool
        Whether to call ``footer_text(df)`` on a newly created figure.

    Returns
    -------
    None
    """
    from scipy.stats import scoreatpercentile as score

    sns.set_style('ticks')

    # Compute the difference once; it is needed for the limits and the plot.
    diff = df.CMAQ.values - df.Obs.values
    maxval = score(diff, per=99.9)
    minval = score(diff, per=.1)

    if fig is None:
        plt.figure(figsize=(10, 7))
        # The original compared against the string 'None', so the default
        # label=None never received the fallback text.  The string form is
        # still accepted for callers that relied on it.
        if label is None or label == 'None':
            label = 'CMAQ - Obs'
        sns.kdeplot(diff, color='darkslategrey', label=label)
        sns.despine()
        # Clip the view to the 0.1-99.9 percentile range of the difference.
        plt.xlim([minval, maxval])
        plt.xlabel(df.Species.unique()[0] + ' Difference (' + df.Units.unique()[0] + ')')
        plt.title(title)
        plt.gca().axes.set_ylabel('P( Model - Obs )')
        if footer:
            footer_text(df)
        plt.tight_layout()
    else:
        ax = fig.get_axes()[0]
        sns.kdeplot(diff, ax=ax, label=label)
def get_second_baseline_oneTrace(trace, SD_window, SD_percentile):
    """Estimate the second baseline of a single trace from its quietest stretches.

    A rolling standard deviation is computed over ``trace``; the positions
    whose rounded SD equals the rounded ``SD_percentile``-th percentile of
    the rolling SD are taken as "quiet", and the baseline is the median of
    the unique trace values clipped to the most populated histogram bin.

    Parameters
    ----------
    trace : numpy.ndarray
        1-d trace, after subtraction of the first baseline.
    SD_window : int
        Width of the sliding window in frames; ``SD_window // 2`` samples
        are used on each side of the centre.
    SD_percentile : float
        Percentile (0-100) of the rolling-SD distribution to select.

    Returns
    -------
    second_baseline : float
        The estimated baseline value.
    idx : numpy.ndarray
        Indices into ``rolling_SD`` where the rounded SD equals the selected
        value (squeezed).
    rolling_SD : numpy.ndarray
        Rolling standard deviation, ``len(trace) - 2 * (SD_window // 2)`` long.
    """
    from scipy.stats import scoreatpercentile as score

    # Floor division: on Python 3 `SD_window / 2` is a float, which cannot be
    # used as a slice bound or range limit.
    win = int(SD_window // 2)
    rolling_SD = np.array([trace[s - win:s + win].std()
                           for s in range(win, trace.shape[0] - win)])

    # SD value at the requested percentile, rounded to the nearest integer.
    SD = np.round(score(rolling_SD, SD_percentile))

    # Positions where the (rounded) rolling SD equals that value.
    idx = np.argwhere(np.round(rolling_SD) == SD)

    # NOTE(review): `idx` indexes rolling_SD, which is offset by `win` from
    # `trace`.  The histogram below accounts for that (trace[win:-win][idx])
    # but the median uses trace[idx] directly; this mismatch is kept as in
    # the original -- confirm which one is intended.
    try:
        # np.histogram needs an integer bin count; roughly 10 samples per bin.
        bins = int(np.round(idx[:, 0].shape[0] / 10.0))
        counts, edges = np.histogram(np.round(trace[win:-win][idx]), bins=bins)
        # First most-populated bin.  The original located it with a boolean
        # mask one element shorter than `edges`, which current numpy rejects.
        bin_num = int(np.argmax(counts))
        left_edge = edges[bin_num]
        right_edge = edges[bin_num + 1]
        # Median of the unique values once clipped to that bin.
        second_baseline = score(
            np.unique(trace[idx].clip(left_edge, right_edge)), 50)
    except Exception:
        # Best-effort fallback (e.g. zero bins because too few points were
        # selected): use the median of the unique selected values.
        second_baseline = score(np.unique(trace[idx]), 50)

    return second_baseline, np.squeeze(idx), rolling_SD
def plot_mmm(self, ax, index, xscale=1.0, yscale=1.0, xlabel='', ylabel='',
             do_rate=False):
    """Draw the across-host min, quartiles and max for each time interval.

    Every key of ``self.ts.j.hosts`` contributes one series taken from
    ``self.ts.assemble(index, host, 0)``.  Each series is converted to one
    value per interval of ``self.ts.t`` (a rate if ``do_rate`` is true, a
    midpoint average otherwise), and the minimum, 25th/50th/75th percentiles
    and maximum over hosts are plotted against the interval mid-times, with
    min and max dashed.

    Parameters
    ----------
    ax : axes object
        Axes to draw on.
    index : object
        Forwarded to ``self.ts.assemble`` and ``self.setlabels``.
    xscale, yscale : float
        Values the x and y data are divided by before plotting.
    xlabel, ylabel : str
        Forwarded to ``self.setlabels``.
    do_rate : bool
        If true, plot finite-difference rates rather than midpoint values.
    """
    tmid = (self.ts.t[:-1] + self.ts.t[1:]) / 2.0

    rows = []
    for host in self.ts.j.hosts.keys():
        series = self.ts.assemble(index, host, 0)
        if do_rate:
            rows.append(numpy.divide(numpy.diff(series), numpy.diff(self.ts.t)))
        else:
            rows.append((series[:-1] + series[1:]) / 2.0)
    a = numpy.array(rows)

    # Each column of `a` is one time interval across all hosts.
    n_intervals = len(self.ts.t) - 1
    columns = [a[:, i] for i in range(n_intervals)]
    mn = numpy.array([min(col) for col in columns])
    p25 = numpy.array([score(col, 25) for col in columns])
    p50 = numpy.array([score(col, 50) for col in columns])
    p75 = numpy.array([score(col, 75) for col in columns])
    mx = numpy.array([max(col) for col in columns])

    ax.hold = True
    x = tmid / xscale
    ax.plot(x, mn / yscale, '--')
    for quartile in (p25, p50, p75):
        ax.plot(tmid / xscale, quartile / yscale)
    ax.plot(x, mx / yscale, '--')

    self.setlabels(ax, index, xlabel, ylabel, yscale)
    ax.yaxis.set_major_locator(matplotlib.ticker.MaxNLocator(nbins=4))
    tspl_utils.adjust_yaxis_range(ax, 0.1)
def sp_scatter_bias(df, col1=None, col2=None, ax=None, outline=False, tight=True,
                    global_map=True, map_kwargs=None, cbar_kwargs=None,
                    val_max=None, val_min=None, **kwargs):
    """Scatter the difference ``col2 - col1`` on a map, sized and coloured by bias.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``latitude``, ``longitude`` and the two value columns.
    col1, col2 : str
        Column names; the plotted quantity is ``df[col2] - df[col1]``.
    ax : axes, optional
        Axes to draw on.  When ``None`` one is created with
        ``draw_map(**map_kwargs)``.
    outline : bool
        When false (default) the map outline patch is made transparent.
    tight : bool
        Call ``plt.tight_layout(pad=0)`` after plotting.
    global_map : bool
        Set the x/y limits to [-180, 180] / [-90, 90].
    map_kwargs : dict, optional
        Keyword arguments for ``draw_map``; ``None`` means no arguments.
    cbar_kwargs : dict, optional
        Accepted for interface compatibility; not used by this function.
    val_max : float, optional
        Colour/size scale limit.  Defaults to the 95th percentile of the
        absolute difference.
    val_min : float, optional
        Accepted for interface compatibility; not used by this function.
    **kwargs
        Forwarded to ``DataFrame.plot.scatter``.

    Returns
    -------
    ax or None
        The axes drawn on, or ``None`` (after printing a message) when
        ``col1`` or ``col2`` is missing.
    """
    from scipy.stats import scoreatpercentile as score

    if ax is None:
        ax = draw_map(**(map_kwargs or {}))

    # The original raised ValueError here and caught it with a no-op
    # `except ValueError: exit`, which also hid every ValueError raised by the
    # plotting code below.  The observable result for missing columns
    # (message printed, None returned) is kept; other errors now propagate.
    if col1 is None or col2 is None:
        print('User must specify col1 and col2 in the dataframe')
        return None

    dfnew = df[['latitude', 'longitude', col1, col2]].dropna().copy(deep=True)
    dfnew['sp_diff'] = dfnew[col2] - dfnew[col1]
    if val_max is not None:
        top = val_max
    else:
        top = score(dfnew['sp_diff'].abs(), per=95)

    # Marker size grows with |bias| relative to `top`, capped at 300.
    dfnew['sp_diff_size'] = dfnew['sp_diff'].abs() / top * 100.
    dfnew.loc[dfnew['sp_diff_size'] > 300, 'sp_diff_size'] = 300.
    dfnew.plot.scatter(x='longitude', y='latitude', c=dfnew['sp_diff'],
                       s=dfnew['sp_diff_size'], vmin=-1 * top, vmax=top,
                       ax=ax, colorbar=True, **kwargs)

    # `~outline` on a bool is always truthy (~True == -2, ~False == -1), so
    # the outline was hidden regardless of the argument.
    if not outline:
        ax.outline_patch.set_alpha(0)
    if global_map:
        plt.xlim([-180, 180])
        plt.ylim([-90, 90])
    if tight:
        plt.tight_layout(pad=0)
    return ax
def spatial_bias_scatter(df, m, date, vmin=None, vmax=None, savename='', ncolors=15, fact=1.5, cmap='RdBu_r'):
    """Scatter the CMAQ-minus-Obs bias for one date on a map projection.

    The colour and size scale is the rounded 95th percentile of the absolute
    bias over the whole of ``df``; only rows whose ``datetime`` equals
    ``date`` are drawn.  Marker sizes are proportional to the absolute bias
    and capped at 300.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``CMAQ``, ``Obs``, ``datetime``, ``longitude`` and ``latitude``.
    m : callable
        Called as ``m(lon, lat)`` to obtain plot coordinates; also passed to
        ``colorbar_index`` as ``basemap``.
    date : object
        Value compared against ``df.datetime``; concatenated into the output
        file name when saving.
    vmin, vmax, fact : unused
        Accepted but not read by this function.
    savename : str
        When not empty, the figure is saved to ``savename + date + '.jpg'``
        and closed.
    ncolors : int
        Number of colours passed to ``colorbar_index``.
    cmap : str
        Colormap name passed to ``colorbar_index``.

    Returns
    -------
    tuple
        ``(figure, axes, colorbar)``.
    """
    from scipy.stats import scoreatpercentile as score
    from numpy import around

    figure, axes = plt.subplots(figsize=(11, 6), frameon=False)
    axes.set_facecolor('white')

    # Scale limit comes from the full data set, not just the selected date.
    limit = around(score((df.CMAQ - df.Obs).abs(), per=95))

    selected = df[df.datetime == date]
    xs, ys = m(selected.longitude.values, selected.latitude.values)

    cbar, cmap = colorbar_index(ncolors, cmap, minval=limit * -1, maxval=limit, basemap=m)
    cbar.ax.tick_params(labelsize=13)

    bias = selected.CMAQ - selected.Obs
    sizes = (selected.CMAQ - selected.Obs).abs() / limit * 100.
    sizes[sizes > 300] = 300.

    plt.scatter(xs, ys, c=bias, s=sizes, vmin=-1. * limit, vmax=limit, cmap=cmap,
                edgecolors='k', linewidths=.25, alpha=.7)

    if savename != '':
        plt.savefig(savename + date + '.jpg', dpi=75.)
        plt.close()
    return figure, axes, cbar
def normalize_oneTrace(trace, first_baseline, second_baseline, rolling_SD, idx, SD_window=20):
    """Normalise a baseline-subtracted trace and its quiet-period SD.

    Parameters
    ----------
    trace : numpy.ndarray
        1-d trace after subtraction of the first baseline.
    first_baseline : numpy.ndarray
        1-d first baseline, same length as ``trace``.
    second_baseline : float
        Scalar second baseline.
    rolling_SD : numpy.ndarray
        Rolling standard deviation; must have the same length as
        ``first_baseline[win:-win]`` with ``win = SD_window // 2``.
    idx : array of int
        Indices into ``rolling_SD`` at which the SD is sampled.
    SD_window : int
        Window width that was used to compute ``rolling_SD``.

    Returns
    -------
    normed_trace : numpy.ndarray
        ``(trace - second_baseline) / (first_baseline + second_baseline)``.
    normed_SD : float
        Median of the unique normalised SD values at ``idx``, after rounding
        them to 3 decimals.
    """
    from scipy.stats import scoreatpercentile as score

    # Floor division: `SD_window / 2` is a float on Python 3 and cannot be
    # used as a slice bound.
    win = int(SD_window // 2)

    total_baseline = first_baseline + second_baseline
    normed_trace = (trace - second_baseline) / total_baseline

    # rolling_SD is shorter than the trace by `win` samples at each end, so
    # the baseline is trimmed to match before dividing.
    normed_rolling_SD = rolling_SD / (first_baseline[win:-win] + second_baseline)

    sd_vals = np.unique(np.round(normed_rolling_SD[idx], 3))
    normed_SD = score(sd_vals, 50)

    return normed_trace, normed_SD
def get_first_baseline_oneTrace(trace, window, Fluor_percentile):
    """Compute a sliding-percentile baseline and subtract it from the trace.

    Parameters
    ----------
    trace : numpy.ndarray
        1-d array of shape ``[frames]``.
    window : int
        Width of the sliding window in frames; ``window // 2`` samples are
        used on each side of the centre.
    Fluor_percentile : float
        Percentile (0-100) of the values in each window used as the baseline.

    Returns
    -------
    baseline : numpy.ndarray
        Baseline padded to the length of ``trace``.
    baselined_trace : numpy.ndarray
        ``trace - baseline``.
    """
    from scipy.stats import scoreatpercentile as score

    # Floor division: `window / 2` is a float on Python 3, which range(),
    # slicing and np.zeros() all reject.
    win = int(window // 2)

    core = np.array([score(trace[s - win:s + win], Fluor_percentile)
                     for s in range(win, trace.shape[0] - win)])

    # Pad `win` samples on each end so the baseline is as long as the trace.
    # NOTE(review): the pads are filled from 5 samples inside the computed
    # baseline (pad[win + 5] and pad[-win - 5]) rather than from its very
    # first/last value; kept exactly as in the original -- confirm intent.
    edge = np.zeros(win)
    baseline = np.hstack((edge, core, edge))
    baseline[:win] = baseline[win + 5]
    baseline[-win:] = baseline[-win - 5]

    baselined_trace = trace - baseline
    return baseline, baselined_trace
# Some price returns come back as None; replace them with 0.0 so that every
# row can be ranked below.
for time_period in time_periods:
    change_col = f'{time_period} Price Return'
    for row in hqm_dataframe.index:
        # Identity test: `== None` is not the idiomatic None check and can
        # misbehave on array-like or NA cell values.
        if hqm_dataframe.loc[row, change_col] is None:
            hqm_dataframe.loc[row, change_col] = 0.0

# Percentile rank (as a 0-1 fraction) of each row's return within its column.
for time_period in time_periods:
    change_col = f'{time_period} Price Return'
    percentile_col = f'{time_period} Return Percentile'
    # The return column is not modified in this loop, so look it up once.
    returns = hqm_dataframe[change_col]
    for row in hqm_dataframe.index:
        hqm_dataframe.loc[row, percentile_col] = score(
            returns, hqm_dataframe.loc[row, change_col]) / 100

# HQM score: mean of the per-period return percentiles of each row.
for row in hqm_dataframe.index:
    momentum_percentiles = [
        hqm_dataframe.loc[row, f'{time_period} Return Percentile']
        for time_period in time_periods
    ]
    hqm_dataframe.loc[row, 'HQM Score'] = mean(momentum_percentiles)

# Keep the 50 highest-scoring rows and split the portfolio evenly among them.
hqm_dataframe.sort_values('HQM Score', ascending=False, inplace=True)
hqm_dataframe = hqm_dataframe[:50]
hqm_dataframe.reset_index(inplace=True, drop=True)
position_size = float(portfolio_size) / len(hqm_dataframe.index)
def scatter_param(df, title=None, fig=None, label=None, footer=True):
    """Scatter model (CMAQ) values against observations with a linear fit.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``Obs``, ``CMAQ``, ``Species`` and ``Units``.
    title : str, optional
        Figure title; only used when a new figure is created.
    fig : matplotlib figure, optional
        When ``None`` a new 10x7 figure is created with a 1:1 line, the
        regression line, limits, labels and an optional footer.  Otherwise
        the points and their regression line are added to the first axes of
        ``fig`` and the legend is refreshed.
    label : str, optional
        Legend label for the scatter points.
    footer : bool
        Whether to call ``footer_text(df)`` on a newly created figure.

    Returns
    -------
    None
    """
    from numpy import max as np_max, arange, linspace, isnan
    from scipy.stats import scoreatpercentile as score
    from scipy.stats import linregress

    sns.set_style('ticks')
    species, units = df.Species.unique()[0], df.Units.unique()[0]

    # Only pairs where both values are present enter the statistics.
    mask = ~isnan(df.Obs.values) & ~isnan(df.CMAQ.values)
    obs = df.Obs.values[mask]
    mod = df.CMAQ.values[mask]

    # Axis limit: the larger of the two 99.5th percentiles.
    maxval = np_max([score(mod, per=99.5), score(obs, per=99.5)])
    # Call form of print so the module also parses on Python 3.
    print(maxval)

    if fig is None:
        plt.figure(figsize=(10, 7))
        plt.scatter(df.Obs, df.CMAQ, c='cornflowerblue', marker='o',
                    edgecolors='w', alpha=.3, label=label)
        # Integer steps for large ranges, 25 evenly spaced points otherwise.
        x = arange(0, maxval + 1)
        if maxval <= 10.:
            x = linspace(0, maxval, 25)
        plt.plot(x, x, '--', color='slategrey')
        tt = linregress(obs, mod)
        plt.plot(x, tt[0] * x + tt[1], color='tomato')
        plt.xlim([0, maxval])
        plt.ylim([0, maxval])
        plt.xlabel('Obs ' + species + ' (' + units + ')')
        plt.title(title)
        plt.gca().axes.set_ylabel('Model ' + species + ' (' + units + ')')
        if footer:
            footer_text(df)
        plt.tight_layout()
        plt.grid(alpha=.5)
    else:
        ax = fig.get_axes()[0]
        # Axes.scatter returns a single PathCollection: it cannot be unpacked
        # with `l, = ...` and has no get_color(); use its face colour instead.
        points = ax.scatter(df.Obs, df.CMAQ, marker='o', edgecolors='w',
                            alpha=.3, label=label)
        # Fit on the NaN-free pairs, as the new-figure branch does.
        tt = linregress(obs, mod)
        xfit = df.Obs.unique()
        # Drop the alpha channel so the fit line is drawn opaque.
        line_color = tuple(points.get_facecolor()[0][:3])
        ax.plot(xfit, tt[0] * xfit + tt[1], color=line_color)
        # Legend locations are lower-case; 'Best' is not a valid value.
        plt.legend(loc='best')
def kdeplots_param(df, title=None, fig=None, label=None, footer=True, cumulative=False):
    """Plot kernel density estimates of observed and modelled values.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``Obs``, ``CMAQ``, ``Species`` and ``Units``.
    title : str, optional
        Figure title; only used when a new figure is created.
    fig : matplotlib figure, optional
        When ``None`` a new 13x8 figure is created showing both ``Obs`` and
        ``CMAQ``.  Otherwise only ``CMAQ`` is added to the first axes of
        ``fig``.
    label : str, optional
        Legend label for the ``CMAQ`` curve.
    footer : bool
        Whether to call ``footer_text(df)`` on a newly created figure.
    cumulative : bool
        Plot cumulative distributions instead of densities.

    Returns
    -------
    None
    """
    from scipy.stats import scoreatpercentile as score

    sns.set_style('ticks')

    if fig is not None:
        # Overlay mode: add the model curve to the existing axes and stop.
        target = fig.get_axes()[0]
        sns.kdeplot(df.CMAQ, ax=target, label=label, cumulative=cumulative)
        return

    if cumulative:
        plt.figure(figsize=(13, 8))
        sns.kdeplot(df.Obs, color='darkslategrey', cumulative=True, label='Obs')
        sns.kdeplot(df.CMAQ, color='dodgerblue', cumulative=True, label=label)
    else:
        # Upper x-limit: the larger of the two 99.5th percentiles.
        upper_model = score(df.CMAQ.values, per=99.5)
        upper_obs = score(df.Obs.values, per=99.5)
        upper = max([upper_model, upper_obs])
        plt.figure(figsize=(13, 8))
        sns.kdeplot(df.Obs, color='darkslategrey')
        sns.kdeplot(df.CMAQ, color='dodgerblue', label=label)

    sns.despine()
    if not cumulative:
        plt.xlim([0, upper])

    species_name = df.Species.unique()[0]
    plt.xlabel(species_name + ' (' + df.Units.unique()[0] + ')')
    plt.title(title)
    plt.gca().axes.set_ylabel('P(' + df.Species.unique()[0] + ')')
    if footer:
        footer_text(df)
    plt.tight_layout()
    plt.grid(alpha=.5)
def fetch_hqm():
    """Download price statistics and rank tickers by a momentum score.

    Tickers are read from ``sp_500_stocks.csv``, fetched from the IEX batch
    endpoint in 10 chunks, and each row receives a percentile rank (0-1) for
    its one-year, six-month, three-month and one-month price return.  The
    'HQM Score' is the mean of those four percentiles.

    Relies on the module-level names ``PORTFOLIO_SIZE``, ``TOP_XX_STOCKS``,
    ``TOKEN``, ``score`` and ``mean``.

    Returns
    -------
    pandas.DataFrame
        One row per successfully fetched ticker, sorted by 'HQM Score' in
        descending order (index not reset, no truncation).
    """
    hqm_columns = [
        'Ticker',
        'Company Name',
        'Price',
        'Shares to Buy',
        'HQM Score',
        'One-Year Price Return',
        'One-Year Return Percentile',
        'Six-Month Price Return',
        'Six-Month Return Percentile',
        'Three-Month Price Return',
        'Three-Month Return Percentile',
        'One-Month Price Return',
        'One-Month Return Percentile',
    ]
    placeholder = 'N/A'  # filled in by the percentile pass below

    stocks = pd.read_csv('sp_500_stocks.csv')
    smaller_chunks = np.array_split(stocks['Ticker'], 10)
    position_size = math.floor(PORTFOLIO_SIZE / TOP_XX_STOCKS)

    # Rows are collected in a list and turned into a DataFrame once:
    # DataFrame.append copied the whole frame on every call and no longer
    # exists in pandas 2.x.
    rows = []
    for stocks_chunk in smaller_chunks:
        stocks_list = ','.join(stocks_chunk)
        batch_api_url = f'https://sandbox.iexapis.com/stable/stock/market/batch?symbols={stocks_list}&types=price,stats&token={TOKEN}'
        try:
            req_result = requests.get(batch_api_url)
            print(req_result.status_code)
            data = req_result.json()
            print(data.keys())
            for symbol in stocks_chunk:
                stats = data[symbol]['stats']
                stock_price = data[symbol]['price']
                rows.append([
                    symbol,
                    stats['companyName'],
                    stock_price,
                    math.floor(position_size / stock_price),
                    placeholder,
                    stats['year1ChangePercent'],
                    placeholder,
                    stats['month6ChangePercent'],
                    placeholder,
                    stats['month3ChangePercent'],
                    placeholder,
                    stats['month1ChangePercent'],
                    placeholder,
                ])
        except Exception as exc:
            # Best effort: a failing chunk is reported and skipped (rows added
            # before the failure are kept).  Unlike a bare `except`, this lets
            # KeyboardInterrupt/SystemExit through.  Only the exception type
            # is printed because request errors can embed the URL and token.
            print("Houston, we have a problem")
            print(type(exc).__name__)

    hqm_dataframe = pd.DataFrame(rows, columns=hqm_columns)

    time_periods = [
        'One-Year',
        'Six-Month',
        'Three-Month',
        'One-Month',
    ]

    # NOTE(review): returns reported as None are not replaced here, so the
    # percentile ranking may fail or yield NaN for such data -- confirm how
    # missing returns should be treated.
    for row in hqm_dataframe.index:
        momentum_percentiles = []
        co_name = hqm_dataframe.loc[row, 'Ticker']
        print(co_name)
        for time_period in time_periods:
            col_price = f'{time_period} Price Return'
            col_percentile = f'{time_period} Return Percentile'
            hqm_dataframe.loc[row, col_percentile] = score(
                hqm_dataframe[col_price], hqm_dataframe.loc[row, col_price]) / 100
            momentum_percentiles.append(hqm_dataframe.loc[row, col_percentile])
        hqm_dataframe.loc[row, 'HQM Score'] = mean(momentum_percentiles)

    # Rank by momentum, best first.
    hqm_dataframe.sort_values('HQM Score', ascending=False, inplace=True)
    return hqm_dataframe
index=hqm_columns), ignore_index=True) ################### CALCULATE MOMENTUM PERCENTILES ######################### time_periods = ['One-Year', 'Six-Month', 'Three-Month', 'One-Month'] hqm_dataframe.fillna(value=0.0, inplace=True) for row in hqm_dataframe.index: for time_period in time_periods: change_col = f'{time_period} Price Return' percentile_col = f'{time_period} Return Percentile' a = hqm_dataframe[change_col] b = hqm_dataframe.loc[row, change_col] hqm_dataframe.loc[row, percentile_col] = score(a, b) ##################3 from statistics import mean for row in hqm_dataframe.index: momentum_percentiles = [] for time_period in time_periods: momentum_percentiles.append( hqm_dataframe.loc[row, f'{time_period} Return Percentile']) hqm_dataframe.loc[row, 'HQM Score'] = mean(momentum_percentiles) ############# select the 50 best momentum stocks hqm_dataframe.sort_values('HQM Score', ascending=False, inplace=True) hqm_dataframe = hqm_dataframe[:5]
def diffscatter_param(df, title=None, fig=None, label=None, footer=True):
    """Scatter the model-minus-observation difference against observations.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``Obs``, ``CMAQ``, ``Species`` and ``Units``.
        Rows containing missing values are dropped before plotting.
    title : str, optional
        Figure title; only used when a new figure is created.
    fig : matplotlib figure, optional
        When ``None`` a new 10x7 figure is created with a zero line, limits,
        labels and an optional footer.  Otherwise the points are added to
        the first axes of ``fig`` and the legend is refreshed.
    label : str, optional
        Legend label for the scatter points.
    footer : bool
        Whether to call ``footer_text(df)`` on a newly created figure.

    Returns
    -------
    None
    """
    from scipy.stats import scoreatpercentile as score
    from numpy import isnan

    sns.set_style('ticks')
    df = df.dropna()
    valid = ~isnan(df.Obs.values) & ~isnan(df.CMAQ.values)

    if fig is not None:
        # Overlay mode: draw on the existing axes and refresh the legend.
        ax = fig.get_axes()[0]
        valid = ~isnan(df.Obs.values) & ~isnan(df.CMAQ.values)
        ax.scatter(df.Obs.values[valid],
                   df.CMAQ.values[valid] - df.Obs.values[valid],
                   marker='o', edgecolors='w', alpha=.3, label=label)
        plt.legend(loc='best')
        return

    species, units = df.Species.unique()[0], df.Units.unique()[0]

    # View limits: 99.9th percentile of Obs on x, 0.1-99.9 of the bias on y.
    maxval = score(df.Obs.values[valid], per=99.9)
    minvaly = score(df.CMAQ.values[valid] - df.Obs.values[valid], per=.1)
    maxvaly = score(df.CMAQ.values[valid] - df.Obs.values[valid], per=99.9)

    plt.figure(figsize=(10, 7))
    plt.scatter(df.Obs.values[valid],
                df.CMAQ.values[valid] - df.Obs.values[valid],
                c='cornflowerblue', marker='o', edgecolors='w', alpha=.3,
                label=label)
    # Zero-bias reference line.
    plt.plot((0, maxval), (0, 0), '--', color='darkslategrey')
    plt.xlim([0, maxval])
    plt.ylim([minvaly, maxvaly])
    plt.xlabel('Obs ' + species + ' (' + units + ')')
    plt.title(title)
    plt.gca().axes.set_ylabel('Model - Obs ' + species + ' (' + units + ')')
    if footer:
        footer_text(df)
    plt.tight_layout()
rv_dataframe[column].fillna(rv_dataframe[column].mean(), inplace=True) rv_dataframe[rv_dataframe.isnull().any(axis=1)] from scipy.stats import percentileofscore as score metrics = { 'Price-to-Earnings Ratio': 'PE Percentaile', 'Price-to-Book Ratio': 'PB Percentile', 'Price-to-Sales Ratio': 'PS Percentile', 'EV/EBITDA': 'EV/EBITDA Percentile', 'EV/GP': 'EV/GP Percentile' } for metric in metrics.keys(): for row in rv_dataframe.index: rv_dataframe.loc[row, metrics[metric]] = score( rv_dataframe[metric], rv_dataframe.loc[row, metric]) / 100 #print(rv_dataframe) from statistics import mean for row in rv_dataframe.index: value_percentiles = [] for metric in metrics.keys(): value_percentiles.append(rv_dataframe.loc[row, metrics[metric]]) rv_dataframe.loc[row, 'RV Score'] = mean(value_percentiles) #print(rv_dataframe) rv_dataframe.sort_values('RV Score', ascending=True, inplace=True) rv_dataframe = rv_dataframe[:50]
# CALCULATING DAY CHANGE OF STOCKS dc = [] for i in sp500.columns: dc.append(sp500[i].pct_change().sum()) sp500_momentum = pd.DataFrame(columns = ['symbol', 'day_change']) sp500_momentum['symbol'] = sp500.columns sp500_momentum['day_change'] = dc # CALCULATING MOMENTUM sp500_momentum['momentum'] = 'N/A' for i in range(len(sp500_momentum)): sp500_momentum.loc[i, 'momentum'] = score(sp500_momentum.day_change, sp500_momentum.loc[i, 'day_change'])/100 sp500_momentum['momentum'] = sp500_momentum['momentum'].astype(float) print(sp500_momentum.head()) top_picks = sp500_momentum.nlargest(10, 'momentum')['symbol'].reset_index().drop('index', axis = 1) print(top_picks) # BACKTEST portfolio_val = 1000000 per_stock_val = portfolio_val/len(top_picks) day_close = [] for i in top_picks['symbol']: data = sp500[i]