def load_from_file(self, filename, append_data=False):
    loaded_data_frame = pandas.load(filename)
    if append_data:
        self.data_frame = pandas.concat(
            [self.data_frame, loaded_data_frame])
    else:
        # reuse the frame we already loaded instead of reading the file twice
        self.data_frame = loaded_data_frame
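# Note: pandas.load() and DataFrame.save(), used throughout these snippets,
# are the old built-in pickle helpers; they were deprecated around pandas 0.12
# and removed in later releases. A minimal sketch of the equivalent round-trip
# with the current API, using a hypothetical frame and file name:
import pandas as pd

frame = pd.DataFrame({'a': [1, 2, 3]})    # placeholder example data
frame.to_pickle('frame.pkl')              # replaces DataFrame.save(...)
same_frame = pd.read_pickle('frame.pkl')  # replaces pandas.load(...)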
def load_data(locking_event, after_trial_type, bins, meth):
    """Load the appropriate dataset given by locking_event and after_trial_type

    Returns: suffix, all_event_latencies, binneds
        all_event_latencies is a series with multi-index
        (block, gng, event, ulabel)
    """
    suffix = '_lock_%s_%s' % (locking_event, after_trial_type)
    dfoldeds = pandas.load('dfoldeds' + suffix)
    times_distr = pandas.load('times_distrs' + suffix)

    # Bin each individual sname from each folded
    binneds = dfoldeds['dfolded'].apply(
        lambda dfolded: kkpandas.Binned.from_dict_of_folded(
            dfolded, bins=bins, meth=meth))

    # Convert times_distr to a series keyed by (block, gng, event, ulabel)
    # instead of by (ulabel, group, event)
    all_event_latencies = times_distr.copy()
    all_event_latencies.index = pandas.MultiIndex.from_tuples(
        [(idx[1][:2], idx[1][-2:], idx[2], idx[0])
         for idx in all_event_latencies.index],
        names=['block', 'gng', 'event', 'ulabel'])

    return suffix, all_event_latencies, binneds
def create_data_df(ts, wts):
    ts = pd.load('../Data/ts_data')
    #wts = pd.load('../Data/wts_data')
    wts = pd.load('../Data/ts_humidex')

    prevday_ts = ts.tshift(1, 'D')
    prevday_avg = prevday_ts.resample('D', how='mean')
    prevday_avg = prevday_avg.asfreq('15Min', method='pad')
    prevweek_ts = ts.tshift(7, 'D')

    # change min_date of all timeseries to the prevweek_ts date
    min_date = max(min(ts.index), min(prevweek_ts.index),
                   min(prevday_ts.index), min(wts.index))
    max_date = max(prevday_ts.index)
    ts = ts[min_date:max_date]
    wts = wts[min_date:max_date]
    wts[wts < -20] = None
    wts = wts.interpolate()
    prevday_ts = prevday_ts[min_date:]
    prevday_avg = prevday_avg[min_date:]
    prevweek_ts = prevweek_ts[min_date:]

    all_covs = pd.DataFrame({'load': ts,
                             'prevday_load': prevday_ts,
                             'prevweek_ts': prevweek_ts,
                             'prevday_avg': prevday_avg,
                             'weather': wts},
                            index=prevday_ts.index)

    hr = pd.Series(ts.index.map(lambda x: x.hour + (float(x.minute) / 60)),
                   index=ts.index)
    dod = pd.Series(ts.index.map(lambda x: int('%d%03d' % (x.year, x.dayofyear))),
                    index=ts.index)

    mydf = pd.concat([ts, dod, hr], axis=1)
    mydf.columns = ['ts', 'dod', 'hr']
    mydf = mydf.pivot(columns='dod', index='hr', values='ts')

    mywdf = pd.concat([wts, dod, hr], axis=1)
    mywdf.columns = ['wts', 'dod', 'hr']
    mywdf = mywdf.drop_duplicates(cols=['hr', 'dod'], take_last=True)
    mywdf = mywdf.pivot(columns='dod', index='hr', values='wts')

    return mydf, mywdf
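# The tshift(), resample(..., how=...) and drop_duplicates(cols=..., take_last=...)
# calls in create_data_df come from the old pandas API and were later removed.
# A minimal sketch of the same resampling steps with the current API, using a
# hypothetical 15-minute series in place of the real load data:
import pandas as pd

ts = pd.Series(range(96 * 8),
               index=pd.date_range('2015-01-01', periods=96 * 8, freq='15min'))
prevday_ts = ts.shift(1, freq='D')                      # replaces ts.tshift(1, 'D')
prevday_avg = prevday_ts.resample('D').mean()           # replaces resample('D', how='mean')
prevday_avg = prevday_avg.asfreq('15min', method='pad')
# drop_duplicates(cols=[...], take_last=True) becomes:
# df.drop_duplicates(subset=['hr', 'dod'], keep='last')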
def practice_two():
    frame = pd.read_csv('ch06/ex1.csv')   # read
    frame.save('ch06/frame_pickle')       # write  !!! raises an error, cannot store
    pd.load('ch06/frame_pickle')          # read

    # Use the HDF5 format
    store = pd.HDFStore('mydata.h5')
    store['obj1'] = frame
    store['obj1_col'] = frame['a']

    # Read a Microsoft Excel file, using the xlrd and openpyxl packages
    xls_file = pd.ExcelFile('data.xls')   # pass in the file
    table = xls_file.parse('Sheet1')
    pass
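# A short companion sketch for the HDF5 part of practice_two: frames written to
# an HDFStore can be read back either through the store object or in one shot
# with pd.read_hdf (requires the PyTables package). File and key names follow
# the snippet above.
import pandas as pd

store = pd.HDFStore('mydata.h5')
obj1 = store['obj1']                           # same dict-style access used for writing
store.close()
obj1_again = pd.read_hdf('mydata.h5', 'obj1')  # read without keeping the store open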
def main():
    os.chdir("../../")

    # Load the DataFrame representing the MovieLens subset
    ratings = pd.load('input_data/ratings_train.pda')
    ratings = ratings.dropna()
    nratings = ratings.shape[0]

    try:
        itemFilter = CFilter_item(ratings)
        item_cf_ratings = itemFilter.get_cf_rating(ratings)

        # Compute average deviation from rating on those ratings
        avgDeviation = np.sum(np.abs(item_cf_ratings - ratings['rating'])) / len(ratings)
        print "Average deviation from rating: %.4f" % (avgDeviation)

        # Dump the ratings to disk for easy future access
        fp = open("proc_data/item_item_ratings_2.pda", "wb")
        pkl.dump(item_cf_ratings, fp)
        fp.close()
        print "This concludes the main method. Buh-bye!"
    except LogicalError as l:
        print "A logical error occurred: %s" % (l)
    except DatasetError as d:
        print "A dataset-related error occurred: %s" % (d)
    except Exception as e:
        print "An exception occurred: " + str(e)
def readPkl():
    currDir = os.getcwd()
    files = os.path.join(currDir, 'pkl', 'BasicFeatures', '*.pkl')
    for src in glob.glob(files):
        df = pd.load(src)
        print df.index[:10]
        print df.index[0].strftime('%Y%m%d')
def display_progress_sparse():
    progress = pd.load('results/progress_sparse.pkl')
    for algo in ['batch', 'bb', 'lbfgs']:
        for i in range(1):
            x = progress.loc[algo + '_x_dense_' + str(i)]['time']
            y = progress.loc[algo + '_x_dense_' + str(i)]['f-f_min']
            plt.plot(x, y, 'r', linewidth=2, label='x dense')

            x = progress.loc[algo + '_z_dense_' + str(i)]['time']
            y = progress.loc[algo + '_z_dense_' + str(i)]['f-f_min']
            plt.plot(x, y, 'g', linewidth=2, label='z dense')

            x = progress.loc[algo + '_x_sparse_' + str(i)]['time']
            y = progress.loc[algo + '_x_sparse_' + str(i)]['f-f_min']
            plt.plot(x, y, '--r', linewidth=2, label='x_sparse')

            x = progress.loc[algo + '_z_sparse_' + str(i)]['time']
            y = progress.loc[algo + '_z_sparse_' + str(i)]['f-f_min']
            plt.plot(x, y, '--g', linewidth=2, label='z_sparse')

            #plt.xscale('log')
            plt.yscale('log')
            plt.legend(loc=0)
            plt.ylabel('log10 f - f_min', fontsize=16)
            plt.xlabel('time in seconds', fontsize=16)
            #plt.title(algo+' experiment '+str(i), fontsize=16)
            plt.title(algo, fontsize=16)
            plt.show()
def display_progress():
    progress = pd.load('results/progress.pkl')
    for algo in ['batch', 'bb', 'lbfgs']:
        for i in range(1):
            data = progress.loc[algo + '_x_' + str(i)]
            x = np.array(data['time'])
            y = np.array(data['f-f_min'])
            x, log_y, alpha = clean_progress(x, y)
            plt.plot(x, log_y, 'r', label=algo + ' in x', linewidth=2)
            plt.plot(x, alpha * x + log_y[0], '--r', label='linear fit', linewidth=2)

            data = progress.loc[algo + '_z_' + str(i)]
            x = np.array(data['time'])
            y = np.array(data['f-f_min'])
            x, log_y, alpha = clean_progress(x, y)
            plt.plot(x, log_y, 'g', label=algo + ' in z', linewidth=2)
            plt.plot(x, alpha * x + log_y[0], '--g', label='linear fit', linewidth=2)

            plt.legend(loc=0)
            plt.xlabel('time (s)', fontsize=20)
            plt.ylabel('f-f_min', fontsize=20)
            plt.title(algo + ' experiment ' + str(i), fontsize=20)
            plt.show()
def __init__(self, table):
    """
    table : string
        Expected to be a csv file with the first column the name of the item
    """
    ext = os.path.splitext(table)[-1]
    if ext == '.csv':
        self.df = pd.read_csv(table, index_col=0)
    elif ext == '.pickle':
        self.df = pd.load(table)
    else:
        raise Exception('table extension not .csv or .pickle')
    self.cols = self.df.columns
    self.index = self.df.index
    self.df['name'] = self.index
    # this is the list of types that I've found so far that pandas.read_csv generates
    self.types = [
        dict(float64='Number', float='Number', int64='Number',
             object='Value', bool='bool')[str(t)]
        for t in self.df.dtypes
    ]
    print 'Loaded CSV file %s: found %d items, %d columns' % (
        table, len(self.df), len(self.cols))
def get_annual_total_net_assets(force_db_read=False, df_file='monthlytna.pandas'):
    """ gets average total net assets per fund per year
    NOTE: This is not used in the current iteration of research """
    # caching to make things faster most of the time
    if not force_db_read and os.path.isfile(df_file):
        print "Loading file from cache:", df_file
        return pd.load(df_file)

    # otherwise read from db
    try:
        print "Getting Total Net Assets from database"
        con = None
        con = lite.connect(config['db_path'])
        cur = con.cursor()
        sql = """select crsp_fundno, substr(caldt,1,4) as year, mtna
                 from MONTHLY_TNA where mtna <> '';"""
        cur.execute(sql)
        data = cur.fetchall()
        print "Done getting Total Net Assets"

        df = pd.DataFrame(list(data), columns=['FundNo', 'Year', 'mtna'],
                          dtype=np.float64)
        df = df.replace({'mtna': -99.0}, value=np.nan)
        # dropna returns a new frame, so keep the assignment
        df = df.dropna(subset=['mtna'])
        grouped = df.groupby(['FundNo', 'Year'])
        tna_means = grouped.mean()

        print "Saving monthly TNA dataframe for next time as", df_file
        tna_means.save(df_file)
        return tna_means
    except lite.Error, e:
        print "Error %s:" % e.args[0]
        sys.exit(1)
def main(args):
    df = pd.load(args.df)
    estimators = [('reduce_dim', KernelPCA(kernel='linear')),
                  ('knc', KNeighborsClassifier(warn_on_equidistant=False))]
    clf = Pipeline(estimators)
    params = dict(reduce_dim__n_components=[2, 3, 5, 8, 10, 25, 50, 100])
    grid_search = GridSearchCV(clf, param_grid=params, cv=3)
    best = 0.0
    while True:
        random_seed = int(time.time() * 1000)
        random.seed(random_seed)
        df_l, y_l, df_t = split_df(df, 0.33)
        try:
            grid_search.fit(df_l, y_l)
        except ValueError:
            # Most likely wrong number of classes in cross validation set
            continue
        y_t = integer_labels(df_t)
        c = grid_search.predict(df_t)
        score, c = compare_clusters(c, y_t)
        if score > best:
            best = score
            print('{} {}'.format(random_seed, best))
            with open(args.out, 'a') as fh:
                fh.write('{} {}\n'.format(random_seed, best))
def calc_all_fund_r2(fund_returns, asset_returns, force_calc=False,
                     df_file='r2_all_funds.pandas'):
    """ Calculates R2 across all funds and asset class benchmarks,
    returns dataframe of results """
    # caching to make things faster most of the time
    if not force_calc and os.path.isfile(df_file):
        print "Loading R2 file from cache:", df_file
        return pd.load(df_file)

    # otherwise re-calc
    asset_r2_results = {}
    fund_list = get_fund_list(fund_returns)
    asset_list = asset_classes

    # dataframe to hold our results, index by fund, columns will be asset classes
    r2_df = pd.DataFrame(index=fund_list)
    for asset_series in asset_returns:
        print '***** Calculating R2 for asset class', asset_series.name
        for fund in fund_list:
            # reindex as datetime so we can resample to EOM dates and join with the benchmark
            f = fund_returns.ix[fund]
            f.index = pd.DatetimeIndex(f.index)
            # just to normalize dates, shouldn't change data points
            f = f.resample('M', how='prod')
            r2 = calc_r2(f.join(asset_series, how='outer'))
            asset_r2_results[fund] = r2
            print 'R2(%s, fund %s) = %s' % (asset_series.name, fund, r2)
        r2_df[asset_series.name] = pd.Series(asset_r2_results)
        asset_r2_results = {}

    print 'Saving R2 results to', df_file
    r2_df.save(df_file)
    return r2_df
def data(self):
    """ Returns a pandas.DataFrame of data, or None if not available. """
    with transaction.commit_on_success():
        self.refresh()

    if not self.status == Job.COMPLETE:
        raise ValueError("Job not complete, no data available")

    self.reference("data()")
    e = None
    try:
        logger.debug("%s looking for data file: %s" %
                     (str(self), self.datafile()))
        if os.path.exists(self.datafile()):
            df = pandas.load(self.datafile())
            logger.debug("%s data loaded %d rows from file: %s" %
                         (str(self), len(df), self.datafile()))
        else:
            logger.debug("%s no data, missing data file: %s" %
                         (str(self), self.datafile()))
            df = None
    except Exception as e:
        logger.error("Error loading datafile %s for %s" %
                     (self.datafile(), str(self)))
        logger.error("Traceback:\n%s" % e)
    finally:
        self.dereference("data()")

    if e:
        raise e
    return df
def data(self):
    """ Returns a pandas.DataFrame of data, or None if not available. """
    with transaction.commit_on_success():
        self.refresh()

    if not self.status == Job.COMPLETE:
        raise ValueError("Job not complete, no data available")

    self.reference("data()")
    e = None
    try:
        logger.debug("%s looking for data file: %s" %
                     (str(self), self.datafile()))
        if os.path.exists(self.datafile()):
            df = pandas.load(self.datafile())
            logger.debug("%s data loaded %d rows from file: %s" %
                         (str(self), len(df), self.datafile()))
        else:
            logger.debug("%s no data, missing data file: %s" %
                         (str(self), self.datafile()))
            df = None
    except Exception as e:
        pass
    finally:
        self.dereference("data()")

    if e:
        raise e
    return df
def process_wave_height(awac_path):
    awac_path = os.path.normpath(awac_path)
    path = os.path.sep.join(awac_path.split(os.path.sep)[:-1])
    os.chdir(path)
    awac_file_name = awac_path.split(os.path.sep)[-1]
    wave_height_df = pd.load(awac_file_name)
    get_stats_from_df(wave_height_df, "wave_height_decibar", path)
def chewsmet_txt(path, title):
    import pandas
    metdf = pandas.load(path + title + '.df')  # open all files ending with .df
    #metdf = pandas.load('/Users/Eating/Documents/ATM_R/python program/20150615_7.df')
    #print metdf['Wdir'][-1]
    #print metdf.index

    metdfFile = open(path + title + '.txt', 'w')
    #metdfFile = open('/Users/Eating/Documents/ATM_R/python program/20150615_7.txt', 'w')
    metdfFile.write(
        'Date_Time, Wind Speed, Wind Dir, Temperature, WaterPress, Humidity\n ')
    for i in range(0, len(metdf.index)):
        #print metdf['Wdir'][i]
        #print metdf['Wspd'][i]
        metdfFile.write(
            str(metdf.index[i]) + ', ' + str(metdf['Wspd'][i]) + ', ' +
            str(metdf['Wdir'][i]) + ',' + str(metdf['temp'][i]) + ',' +
            str(metdf['H20Pres'][i]) + ',' + str(metdf['humidity'][i]) + '\n')
    metdfFile.close()
    print('done')
def load_quote(currency, path=NETFOUNDS_QUOTES_PATH):
    try:
        return pd.load(path + '/{}.dat'.format(currency))
    except FileNotFoundError:
        return None
    except Exception as e:
        raise e
def get_style_bucket_funds(fund_returns, force_bucket=False,
                           bucket_file='bucketed_style.pandas'):
    """ Creates a dataframe of funds mapped by style to each asset class """
    if not force_bucket and os.path.isfile(bucket_file):
        print "Loading bucketed style file from cache:", bucket_file
        return pd.load(bucket_file)

    # otherwise create the bucket file
    print "Bucketing by style"
    fund_styles = get_fund_styles()

    # apply mapping of our asset classes to styles
    # [[put in metamappings.py]]
    asset_style_results = {}
    fund_list = get_fund_list(fund_returns)
    asset_list = asset_classes

    # dataframe to hold our results, indexed by fund; columns will be asset classes
    style_df = pd.DataFrame(index=fund_list)
    for asset_class in asset_classes:
        print '***** Looking up styles for asset class', asset_class
        styles = crsp_style_mapping[asset_class]
        # loop through only the funds we have styles for
        for fund in list(set(fund_list).intersection(list(fund_styles.index))):
            if fund_styles.ix[fund]['StyleCode'] in styles:
                asset_style_results[fund] = 1
        style_df[asset_class] = pd.Series(asset_style_results)
        asset_style_results = {}

    # take only the matching styles, and drop any rows that match no asset class
    bucketed_df = style_df[style_df.apply(lambda x: x == 1, axis=1)].dropna(how='all')

    # save for next time
    bucketed_df.save(bucket_file)
    return bucketed_df
def main():
    os.chdir("../../")

    # Load the DataFrame representing the MovieLens subset
    ratings = pd.load('input_data/ratings_train.pda')
    ratings = ratings.dropna()
    nratings = ratings.shape[0]

    # Load the CFiltering_item object
    fp = open("proc_data/cfilter_object.pda", "rb")
    cf3 = pkl.load(fp)
    fp.close()

    try:
        # we need to make sure everything's ok with our code,
        # so we will do some tests on diminished MovieLens data
        item_cf_ratings = cf3.get_cf_rating(ratings.ix[:10000, :])

        # Compute squared loss on those ratings
        avgSquaredLoss = np.sum(np.square(
            item_cf_ratings - ratings.ix[:10000, :]['rating'])) / len(ratings)
        print avgSquaredLoss  # 1.59836284936 on the first 10000 items!
    except LogicalError as l:
        print "A logical error occurred: %s" % (l)
    except DatasetError as d:
        print "A dataset-related error occurred: %s" % (d)
    except Exception as e:
        print "An exception occurred: " + str(e)
def load_assoc(self, fromdf=None):
    if fromdf is not None:
        print 'loading associations from file %s' % fromdf
        self.df['altassoc'] = pd.load(fromdf)
    else:
        print 'using associations found in sourceinfo'
    associations = self.df.associations if fromdf is None else self.df.altassoc
    probfun = lambda x: x['prob'][0] if not pd.isnull(x) else 0
    self.df['aprob'] = np.array([probfun(assoc) for assoc in associations])
    self.df['acat'] = np.array([
        assoc['cat'][0] if not pd.isnull(assoc) else 'unid'
        for assoc in associations
    ])
    self.df['aname'] = np.array([
        assoc['name'][0] if not pd.isnull(assoc) else 'unid'
        for assoc in associations
    ])
    self.df['aang'] = np.array([
        assoc['ang'][0] if not pd.isnull(assoc) else np.nan
        for assoc in associations
    ])
    self.df['adeltats'] = np.array([
        assoc['deltats'][0] if not pd.isnull(assoc) else np.nan
        for assoc in associations
    ])
    self.df10 = self.df.ix[self.df.ts > 10]
    print 'associated: %d/%d' % (sum(self.df10.aprob > 0.8), len(self.df10))
def loadDf(filename):
    '''
    load '.df' files

    ret: DataFrame
    '''
    temp_pickle = pandas.load(filename)
    print temp_pickle
    return temp_pickle
def test_wap_dataframe(self):
    print("TestParseWap")
    try:
        parse_wap.load(awac_folder_path + 'test_data.wap')
    except WindowsError:
        print("Load wap Files failed")
    wap_dataframe = pd.load('test_data_wap_df')
    self.assertEqual(len(wap_dataframe), wad_records)
def load_quote(currency):
    try:
        return pd.load(DUKASCOPY_QUOTES_PATH + '/{}.dat'.format(currency))
    except FileNotFoundError as e:
        print(e)
        return None
    except Exception as e:
        raise e
def filter_cz_cities(input="geography-first-all-2.pd", output=None):
    data = pd.load(input)
    places = get_maps()["Czech Rep.-city"]
    filtered = data[data["item"].isin(places)]
    filtered.save(output)
    print filtered
def filter_europe(input="geography-first-all-2.pd", output=None):
    data = pd.load(input)
    places = get_maps()["Europe-country"]
    filtered = data[data["item"].isin(places)]
    filtered.save(output)
    print filtered
def get_all_fund_returns(force_db_read=False,
                         df_file='fund_returns_lite_nona_reindex.pandas'):
    """ Reads returns for all funds from the database
    (excludes some things we aren't interested in) """
    # caching to make things faster most of the time
    if not force_db_read and os.path.isfile(df_file):
        print "Loading file from cache:", df_file
        return pd.load(df_file)

    # otherwise read from db
    print "Reading fund returns from the database"
    try:
        con = lite.connect(config['db_path'])
        cur = con.cursor()

        # run pragmas (case sensitivity on)
        cur.execute(pragmas)
        sql = """select mr.crsp_fundno, caldt, mret
                 from FUND_HDR fhdr, MONTHLY_RETURNS mr
                 where fhdr.crsp_fundno=mr.crsp_fundno %s ;""" % common_excludes_data_load
        cur.execute(sql)
        data = cur.fetchall()

        print 'Parsing database results (those pesky date parses take a little while)'
        df = pd.DataFrame(list(data), columns=['FundNo', 'StrCalDate', 'Return'])

        # parse the date column (string slicing is faster than datetime.strptime)
        parsed_dates = df['StrCalDate'].map(
            lambda x: date(int(str(x)[0:4]), int(str(x)[4:6]), int(str(x)[6:8])))
        # add back to the dataframe
        df['CalDate'] = pd.Series(parsed_dates, index=df.index)
        # remove unneeded column
        del df['StrCalDate']

        # change -99 values to NaN, then drop them
        print 'Drop the -99.0 values'
        df = df.replace({'Return': -99.0}, value=np.nan)
        df = df.dropna(subset=['Return'])

        # add one to all the returns per our convention
        df['Return'] = pd.Series(df['Return'], index=df.index) + 1

        print 'Re-index by FundNo and Date'
        df.index = [df['FundNo'], pd.DatetimeIndex(df['CalDate'])]
        # now that these are indexes, no need to keep the column data
        del df['CalDate']
        del df['FundNo']

        # save the file for next time
        print "Saving returns dataframe for next time as", df_file
        df.save(df_file)
        return df
    except lite.Error, e:
        print "Error %s:" % e.args[0]
        sys.exit(1)
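# The date parsing in get_all_fund_returns maps a string-slicing lambda over
# every row; a vectorized sketch of the same conversion, assuming the
# 'StrCalDate' column holds YYYYMMDD values:
import pandas as pd

df = pd.DataFrame({'StrCalDate': [20050131, 20050228]})  # hypothetical sample
df['CalDate'] = pd.to_datetime(df['StrCalDate'].astype(str), format='%Y%m%d')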
def Improve_plot(filepath):
    import os
    import pandas as pan
    from matplotlib import pyplot as plt
    os.chdir(filepath)
    df = pan.load('IMPROVE_data_all.pickle')
    return df
def load_table():
    # Load the table with the cores
    results_dir = '/d/bip3/ezbc/multicloud/data/python_output/'
    filename = results_dir + 'tables/multicloud_model_params.pickle'
    df = pd.load(filename)
    return df
def load(self, name):
    '''
    Check for an existing cache file with this name and,
    if it exists, return the saved data.
    '''
    f = os.path.join(self.cache_dir, name)
    if os.access(f, os.F_OK):
        return pd.load(f)
    else:
        return None
def filter_states(input="geography-first-all-2.pd", output=None):
    data = pd.load(input)
    places = reduce(list.__add__, get_continents_country_maps().values(), [])
    filtered = data[data["item"].isin(places)]
    filtered.save(output)
    print filtered
def test_wad_dataframe(self):
    print("TestParseWad")
    try:
        path = awac_folder_path + 'test_data.wad'
        parse_wad.ParseWad(path)
    except WindowsError:
        print("Load Wad Files failed")
    wad_dataframe = pd.load('test_data_wad_df')
    self.assertEqual(len(wad_dataframe), number_of_records)
def get_r2_bucket_funds(threshold=0.9, force_bucket=False,
                        r2_file='r2_all_funds.pandas',
                        bucket_file='bucketed_r2.pandas'):
    """ Creates a dataframe of funds that are highly correlated (by threshold)
    to each asset class """
    if not force_bucket and os.path.isfile(bucket_file):
        print "Loading bucketed R2 file from cache:", bucket_file
        return pd.load(bucket_file)

    # otherwise create the bucket file
    if not os.path.isfile(r2_file):
        print "R2 file (%s) does not exist. Run calc_all_fund_r2() first" % r2_file
        return False
    else:
        print "Bucketing by R2"
        r2_df = pd.load(r2_file)
        # take only the ones over threshold, and drop any rows with no assets highly correlated
        bucketed_df = r2_df[r2_df.apply(lambda x: x > threshold, axis=1)].dropna(how='all')
        # save for next time
        bucketed_df.save(bucket_file)
        return bucketed_df
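# Several snippets above (get_annual_total_net_assets, calc_all_fund_r2,
# get_style_bucket_funds, get_r2_bucket_funds) repeat the same
# "return the cached pickle or rebuild and save it" pattern. A small generic
# helper along these lines could factor it out; cached_frame and the builder
# argument are illustrative names, not part of the original code.
import os
import pandas as pd


def cached_frame(cache_file, builder, force_rebuild=False):
    """Return a DataFrame from cache_file if present, else build and cache it."""
    if not force_rebuild and os.path.isfile(cache_file):
        return pd.read_pickle(cache_file)
    df = builder()
    df.to_pickle(cache_file)
    return df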
def display_progress():
    coherence_data = pd.load('results/coherences_dense.pkl')
    rate_data = pd.load('results/rates_dense.pkl')
    for distribution in distributions:
        for i, algorithm in enumerate(algorithms):
            mask = rate_data.loc[distribution].index.get_level_values(0)
            avg_rates = []
            for ratio in measurement_ratios:
                rate = rate_data.loc[distribution].loc[mask == ratio]
                rate.index = rate.index.droplevel(0)
                #print 'average rate for {}, '.format(ratio) + algorithm + ', ' + distribution
                avg_rates.append(np.mean(rate.loc[algorithm]))
            plt.plot(index, avg_rates, colors[i], linewidth=2, markersize=7,
                     label=algorithm)
        plt.legend(loc=0)
        plt.title(distribution, fontsize=16)
        plt.xticks(range(5), [0.01, 0.03, 0.1, 0.3, 1])
        plt.xlabel('measurements / dimension', fontsize=16)
        plt.ylabel('log10 rate', fontsize=16)
        plt.show()
def test_awac_stats(self):
    print("TestAwacStats")
    try:
        path = awac_folder_path + 'test_data.wad'
        parse_wad.ParseWad(path)
        awac_stats.process_wave_height(awac_folder_path + 'awac_wave_height_df')
    except WindowsError:
        print("Load wap Files failed")
    wave_height_dataframe = pd.load('awac_stats_30min')
    self.assertEqual(len(wave_height_dataframe), number_of_awac_stats)
def get_distance_json(origins=["Paris"], destinations=["Lyon"], key=API_key):
    url = "https://maps.googleapis.com/maps/api/distancematrix/json?"
    origins = "origins=" + "|".join(origins)
    destinations = "destinations=" + "|".join(destinations)
    key = "key=" + key
    response = requests.get(url + "&" + origins + "&" + destinations + "&" + key).text
    json_data = json.loads(response)
    df = pd.load(json_data)
    return df
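# get_distance_json above hands the parsed JSON dict to pd.load, which expects
# a pickle file path rather than a dict. A sketch of one way to turn the
# Distance Matrix response into a DataFrame, assuming the usual
# 'rows'/'elements' layout of the response and a placeholder API key:
import requests
import pandas as pd


def get_distance_df(origins=["Paris"], destinations=["Lyon"], key="YOUR_KEY"):
    url = "https://maps.googleapis.com/maps/api/distancematrix/json"
    params = {"origins": "|".join(origins),
              "destinations": "|".join(destinations),
              "key": key}
    json_data = requests.get(url, params=params).json()
    # flatten rows -> elements into one row per origin/destination pair
    return pd.json_normalize(json_data.get("rows", []), "elements")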
def __init__(self):
    """This reads two csv files into pandas:

    PD_RO: a csv with UIDs, Names, Prices and Modules. It is called RO as it
        is Read Only. It will never be altered.
        NOTE: The UIDs are imported to pandas as the index values.

    PD_CL: a csv with UIDs, Names, Contact Details, Status Variables, and
        Comments. It is called CL as it is a contact list. This csv will be
        directly modifiable by the program according to user input.
        Note: the UIDs are imported to pandas as index values."""
    self.PD_RO = pd.read_csv(
        r"C:\Users\IPAB\Dropbox\MarketingTeamFolder\DataEntry\ReadOnlyFiles\FactoryList\Wave1\FactoryList.csv",
        index_col='UID')
    self.PD_CL = pd.read_csv(
        r"C:\Users\IPAB\Dropbox\MarketingTeamFolder\DataEntry\ContactList\ContactList_Wave1.csv",
        index_col='UID')
    self.DF = pd.load(
        r"C:\Users\IPAB\Dropbox\MarketingTeamFolder\DataEntry\ReadOnlyFiles\DataFrames\Wave1\FollowUp1DataFrame")
def load(store_cache_file=False, cache_file=None):
    if not store_cache_file and cache_file:
        return pd.load(cache_file)
    else:
        # TODO load your data into a DataFrame
        result = pd.DataFrame()  # replace with your data
        if store_cache_file and cache_file:
            result.to_csv(cache_file)
        return result
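# In load() above the cache is written with to_csv but read back with pd.load
# (a pickle reader), so a cache written by one call cannot be read by the next.
# A sketch that keeps both sides on CSV; load_csv_cached is an illustrative
# name, and index/dtype handling would depend on the actual data being cached:
import os
import pandas as pd


def load_csv_cached(store_cache_file=False, cache_file=None):
    if not store_cache_file and cache_file and os.path.isfile(cache_file):
        return pd.read_csv(cache_file)
    result = pd.DataFrame()  # TODO: replace with your data
    if store_cache_file and cache_file:
        result.to_csv(cache_file, index=False)
    return result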
def load_data(path, smoothstr):
    return (
        pd.load(path + "/expmean" + smoothstr + ".df"),
        pd.load(path + "/ctrlmean" + smoothstr + ".df"),
        pd.load(path + "/expstd" + smoothstr + ".df"),
        pd.load(path + "/ctrlstd" + smoothstr + ".df"),
        pd.load(path + "/expn" + smoothstr + ".df"),
        pd.load(path + "/ctrln" + smoothstr + ".df"),
        pd.load(path + "/df2" + smoothstr + ".df"),
    )
def main(outfile, infile='sources.pickle',
         cuts='(sources.ts>10)*(sources.a<0.25)'):
    assert os.path.exists(infile), 'Input file "%s" not found' % infile
    sources = pd.load(infile)
    print 'Loaded DataTable file %s' % infile
    selected = sources[eval(cuts)]
    print 'applied cuts %s: %d -> %d sources' % (cuts, len(sources), len(selected))
    t = MakeCat(selected)
    if outfile is None:
        # for example, 'P202_uw10.fits'
        outfile = '_'.join(os.path.abspath('.').split('/')[-2:]) + '.fits'
    t(outfile)
def load_cold_cores():
    # summary of cold clump data
    # http://wiki.cosmos.esa.int/planckpla2015/index.php/Catalogues#Individual_catalogues
    table_dir = '/d/bip3/ezbc/multicloud/data/cold_clumps/'
    df_dir = '/d/bip3/ezbc/multicloud/data/python_output/tables/'
    filename = table_dir + 'HFI_PCCS_GCC_R2.02.fits'

    if 0:
        print('\nAnalyzing table...')
        cc_hdu = fits.open(filename)
        cc_data = cc_hdu[1].data

        # get the region vertices
        regions = load_regions()

        df = dict()
        df['Glon'] = []
        df['Glat'] = []
        df['ra'] = []
        df['dec'] = []
        df['Region'] = []
        df['SNR'] = []
        for i in xrange(len(cc_data)):
            #if myg.point_in_polygon(ra, region_vertices):
            ra = cc_data[i][3]
            dec = cc_data[i][4]
            #if ra < 80 and ra > 40 and dec < 45 and dec > 15:
            region_check = check_region((ra, dec), regions)
            if region_check is not None:
                df['Glon'].append(cc_data.field('GLON')[i])
                df['Glat'].append(cc_data.field('GLAT')[i])
                df['ra'].append(cc_data.field('RA')[i])
                df['dec'].append(cc_data.field('DEC')[i])
                df['SNR'].append(cc_data.field('SNR')[i])
                df['Region'].append(region_check)

        df = pd.DataFrame(df)
        df.save(df_dir + 'multicloud_cold_clumps.pickle')
    else:
        df = pd.load(df_dir + 'multicloud_cold_clumps.pickle')

    print('\nFinished loading...')
    return df
def main(args):
    df = pd.load(args.df)
    svc_seed = int(time.time() * 1000)
    estimators = [('reduce_dim', KernelPCA(kernel='linear')),
                  ('svm', LinearSVC(random_state=svc_seed))]
    clf = Pipeline(estimators)
    equiv_sets = None
    params = dict(reduce_dim__n_components=[2, 3, 5, 8, 10, 25, 50, 100],
                  svm__C=[0.25, 0.5, 1, 2, 4, 8, 16])
    if args.twoclass:
        params = dict(reduce_dim__n_components=[
            2, 3, 5, 8, 10, 25, 40, 45, 50, 55, 60, 75, 100
        ])
        equiv_sets = [['external plexiform', 'glomerular cell layer'],
                      ['internal plexiform', 'mitral cell layer']]
    grid_search = GridSearchCV(clf, param_grid=params, cv=3)
    best = 0.0
    while True:
        random_seed = int(time.time() * 1000)
        random.seed(random_seed)
        df_l, _, df_t = split_df(df, 0.33)
        y_l = integer_labels(df_l, equiv_sets)
        try:
            grid_search.fit(df_l, y_l)
        except ValueError:
            # Most likely wrong number of classes in cross validation set
            continue
        y_t = integer_labels(df_t, equiv_sets)
        c = grid_search.predict(df_t)
        score, c = compare_clusters(c, y_t)
        if score > best:
            best = score
            print('{} {} {}'.format(svc_seed, random_seed, best))
            with open(args.out, 'a') as fh:
                fh.write('{} {} {}\n'.format(svc_seed, random_seed, best))
def setup(self, othermodel='../P202_5years/uw700', **kw):
    super(UWsourceComparison, self).setup()
    self.plotfolder = 'comparison_%s' % othermodel.split('/')[-1]
    otherfilename = '../%s/sources.pickle' % othermodel
    self.othermodel = othermodel
    assert os.path.exists(otherfilename), 'File %s not found' % otherfilename
    print 'loading %s' % otherfilename
    odf = pd.load(otherfilename)
    self.odf = odf[odf.ts > 10]
    self.df = self.df[self.df.ts > 10]
    self.df['pindex_old'] = self.odf.pindex
    self.df['ts_old'] = self.odf.ts
    self.df['eflux_old'] = self.odf.eflux
    self.df['a_old'] = self.odf.a
    self.df['skydir_old'] = self.odf.skydir
    self.df['sedrec_old'] = self.odf.sedrec
def main(args):
    df = pd.load(args.df)
    y = integer_labels(df)

    pca = KernelPCA(None, kernel=args.kernel)
    pca.fit(df)
    X = pca.transform(df)
    nonzero_components = X.shape[1]

    seed = int(time.time() * 1000)
    gmm = GMM(4, n_init=10, random_state=seed)
    gmm.fit(X)
    c = gmm.predict(X)
    score, _ = compare_clusters(c, y)
    best = score
    with open(args.out, 'w') as fh:
        fh.write('{} {} {} {}\n'.format(args.kernel, nonzero_components, seed, best))

    n_comps = range(2, 16) + [int(i) for i in np.linspace(16, nonzero_components, 20)]
    for n in n_comps:
        pca = KernelPCA(n, kernel=args.kernel)
        pca.fit(df)
        X = pca.transform(df)
        for i in range(128):
            seed = int(time.time() * 1000)
            gmm = GMM(4, random_state=seed)
            # cluster on the reduced representation, as in the full-rank case above
            gmm.fit(X)
            c = gmm.predict(X)
            score, _ = compare_clusters(c, y)
            if score > best:
                best = score
                with open(args.out, 'a') as fh:
                    fh.write('{} {} {} {}\n'.format(args.kernel, n, seed, best))
def traceproc(aerofilt_dir):
    import hysplit_tools as tools
    import os, sys
    import numpy as np
    import pandas as pan

    startdir = os.getcwd()
    topdir = aerofilt_dir
    os.chdir(topdir)
    data_files = os.listdir(os.getcwd())

    d_mean = []
    d_std = []
    t_mean = []
    t_std = []
    endpos_mean = []
    endpos_std = []
    start_time = []
    station = []

    # run through all location folders
    for f in data_files:
        if os.path.isdir(f):
            os.chdir(f)
            tracefile = f + 'traceback'

            # open traceback file
            trace_df = pan.load(tracefile)

            # create a separate dict of lists for each day and put those into a
            # list called dictlist
            dates = trace_df.index      # index and columns are attributes, not methods
            keys = trace_df.columns

            by = lambda x: lambda y: getattr(y, x)
            trace_mean = trace_df.groupby([by('month'), by('day')]).mean()
            trace_std = trace_df.groupby([by('month'), by('day')]).std()

            pan.save(df_out, 'Hyproc.pickle')
            os.chdir(startdir)