def read_ltdb(sample, fullcount):
    """
    Read data from Brown's Longitudinal Tract Database (LTDB) and store it for later use.

    Parameters
    ----------
    sample : str
        file path of the zip file containing the standard Sample CSV files
        downloaded from
        https://s4.ad.brown.edu/projects/diversity/Researcher/LTBDDload/Default.aspx
    fullcount: str
        file path of the zip file containing the standard Fullcount CSV files
        downloaded from
        https://s4.ad.brown.edu/projects/diversity/Researcher/LTBDDload/Default.aspx

    Returns
    -------
    DataFrame
    """
    sample_zip = zipfile.ZipFile(sample)
    fullcount_zip = zipfile.ZipFile(fullcount)

    def _ltdb_reader(path, file, year, dropcols=None):
        df = pd.read_csv(
            path.open(file),
            na_values=["", " ", 99999, -999],
            converters={0: str, "placefp10": str},
            low_memory=False,
            encoding="latin1",
        )
        if dropcols:
            df.drop(dropcols, axis=1, inplace=True)
        df.columns = df.columns.str.lower()
        names = df.columns.values.tolist()
        names[0] = "geoid"
        newlist = []
        # ignoring the first 4 columns, remove year suffix from column names
        for name in names[4:]:
            newlist.append(name[:-2])
        colnames = names[:4] + newlist
        df.columns = colnames
        # prepend a 0 when FIPS is too short
        df["geoid"] = df["geoid"].str.rjust(11, "0")
        df.set_index("geoid", inplace=True)
        df["year"] = year
        inflate_cols = ["mhmval", "mrent", "hinc"]
        df = _adjust_inflation(df, inflate_cols, year)
        return df

    # read in Brown's LTDB data, both the sample and fullcount files for each
    # year. population, housing units & occupied housing units appear in both
    # "sample" and "fullcount" files -- currently drop sample and keep fullcount
    sample70 = _ltdb_reader(
        sample_zip,
        "ltdb_std_1970_sample.csv",
        dropcols=["POP70SP1", "HU70SP", "OHU70SP"],
        year=1970,
    )
    fullcount70 = _ltdb_reader(fullcount_zip, "LTDB_Std_1970_fullcount.csv", year=1970)
    sample80 = _ltdb_reader(
        sample_zip,
        "ltdb_std_1980_sample.csv",
        dropcols=["pop80sf3", "pop80sf4", "hu80sp", "ohu80sp"],
        year=1980,
    )
    fullcount80 = _ltdb_reader(fullcount_zip, "LTDB_Std_1980_fullcount.csv", year=1980)
    sample90 = _ltdb_reader(
        sample_zip,
        "ltdb_std_1990_sample.csv",
        dropcols=["POP90SF3", "POP90SF4", "HU90SP", "OHU90SP"],
        year=1990,
    )
    fullcount90 = _ltdb_reader(fullcount_zip, "LTDB_Std_1990_fullcount.csv", year=1990)
    sample00 = _ltdb_reader(
        sample_zip,
        "ltdb_std_2000_sample.csv",
        dropcols=["POP00SF3", "HU00SP", "OHU00SP"],
        year=2000,
    )
    fullcount00 = _ltdb_reader(fullcount_zip, "LTDB_Std_2000_fullcount.csv", year=2000)
    sample10 = _ltdb_reader(sample_zip, "ltdb_std_2010_sample.csv", year=2010)

    # join the sample and fullcount variables into a single df for the year
    ltdb_1970 = sample70.join(fullcount70.iloc[:, 7:], how="left")
    ltdb_1980 = sample80.join(fullcount80.iloc[:, 7:], how="left")
    ltdb_1990 = sample90.join(fullcount90.iloc[:, 7:], how="left")
    ltdb_2000 = sample00.join(fullcount00.iloc[:, 7:], how="left")
    ltdb_2010 = sample10

    # the 2010 file doesn't have CBSA info, so grab it from the 2000 df
    ltdb_2010["cbsa"] = np.nan
    ltdb_2010.update(other=ltdb_2000["cbsa"], overwrite=True)

    df = pd.concat([ltdb_1970, ltdb_1980, ltdb_1990, ltdb_2000, ltdb_2010], sort=True)
    # "geoid" is already the index of each yearly frame, so no further
    # set_index call is needed here

    store = pd.HDFStore(os.path.join(package_directory, "data.h5"), "w")
    store["ltdb"] = df
    store.close()

    return df
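# A minimal sketch of reading the stored LTDB table back out of the HDF5 file
# written above; `package_directory` is assumed to be the same module-level
# variable used by read_ltdb, and "ltdb" is the key it writes.
import os
import pandas as pd

with pd.HDFStore(os.path.join(package_directory, "data.h5"), "r") as store:
    ltdb = store["ltdb"]          # combined 1970-2010 tract data
    print(ltdb["year"].unique())  # expect the five census years written above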
def RunDSPVec(t1_file,
              vec_process=None,
              digitizer_list=None,
              out_prefix="t2",
              verbose=False,
              output_dir=None,
              multiprocess=False):
    """ vector version of tier 1 processor """
    if vec_process is None:
        vec_process = VectorProcess(default_list=True)

    print("Starting pygama Tier 1 (vector) processing ...")
    print("   Input file: {}".format(t1_file))

    statinfo = os.stat(t1_file)
    print("   File size: {}".format(sizeof_fmt(statinfo.st_size)))

    start = time.time()
    directory = os.path.dirname(t1_file)
    output_dir = os.getcwd() if output_dir is None else output_dir

    # snag the run number (assuming t1_file ends in _run<number>.<filetype>)
    run_str = re.findall(r'run\d+', t1_file)[-1]
    run = int(''.join(filter(str.isdigit, run_str)))

    # get pygama's available digitizers
    if digitizer_list is None:
        digitizer_list = get_digitizers()

    # get digitizers in the file
    f = h5py.File(t1_file, 'r')
    digitizer_list = [d for d in digitizer_list if d.decoder_name in f.keys()]

    print("   Found digitizers:")
    for d in digitizer_list:
        print("   -- {}".format(d.decoder_name))

    for d in digitizer_list:
        print("Processing data from: " + d.decoder_name)

        object_info = pd.read_hdf(t1_file, key=d.class_name)
        d.load_object_info(object_info)

        # single-thread processing has been abandoned in favor of
        # multiprocessing over hdf5 chunks, so the whole file is never held
        # in memory, even if only one thread is available
        h5key = d.class_name
        chunksize = 3000  # num wf rows. optimal for my mac, at least
        n_cpu = mp.cpu_count()

        with pd.HDFStore(t1_file, 'r') as store:
            nrows = store.get_storer(h5key).shape[0]  # fixed format only
            chunk_idxs = list(range(nrows // chunksize + 1))

        keywords = {"t1_file": t1_file, "chunksize": chunksize, "h5key": h5key}

        with mp.Pool(n_cpu) as p:
            result_list = p.map(partial(process_chunk, **keywords), chunk_idxs)

        t2_df = pd.concat(result_list)

    t2_file = os.path.join(output_dir, "{}_run{}.h5".format(out_prefix, run))

    if verbose:
        print("Writing Tier 2 File:\n   {}".format(t2_file))
        print("   Entries: {}".format(len(t2_df)))
        print("   Data columns:")
        for col in t2_df.columns:
            print("   -- " + col)

    t2_df.to_hdf(t2_file,
                 key="data",
                 format='table',
                 mode='w',
                 data_columns=t2_df.columns.tolist())

    if verbose:
        statinfo = os.stat(t2_file)
        print("File size: {}".format(sizeof_fmt(statinfo.st_size)))
        elapsed = time.time() - start
        proc_rate = elapsed / len(t2_df)
        print("Time elapsed: {:.2f} sec ({:.5f} sec/wf)".format(elapsed, proc_rate))
        print("Done.")
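# `process_chunk` is referenced above but not shown.  A minimal sketch of what
# such a worker could look like, assuming each chunk is a contiguous block of
# rows in the Tier 1 table and that a VectorProcess can be built inside the
# worker (hypothetical signature, not pygama's actual implementation):
def process_chunk(chunk_idx, t1_file=None, chunksize=None, h5key=None):
    """Read one chunk of Tier 1 rows and run the vector processor on it."""
    start = chunk_idx * chunksize
    stop = start + chunksize
    t1_df = pd.read_hdf(t1_file, key=h5key, start=start, stop=stop)
    proc = VectorProcess(default_list=True)  # assumed safe to build per-worker
    return proc.Process(t1_df)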
def example_VSTOXX_index(self):
    V0 = 17.6639
    r = 0.01

    import pandas as pd
    h5 = pd.HDFStore("./vstoxx_data_31032014.h5", "r")
    futures_data = h5["futures_data"]  # VSTOXX futures data
    options_data = h5["options_data"]  # VSTOXX call option data
    h5.close()

    print(futures_data)
    options_data.info()
    options_data[["DATE", "MATURITY", "TTM", "STRIKE", "PRICE"]].head()

    options_data["IMP_VOL"] = 0.0  # new column for implied volatilities

    # from bsm_functions import *
    tol = 0.5  # tolerance level for moneyness
    for option in options_data.index:
        # iterating over all option quotes
        forward = futures_data[
            futures_data["MATURITY"] == options_data.loc[option]["MATURITY"]
        ]["PRICE"].values[0]  # picking the right futures value
        if (forward * (1 - tol) < options_data.loc[option]["STRIKE"]
                < forward * (1 + tol)):
            # only for options with moneyness within tolerance
            imp_vol = self.bsm_call_imp_vol(
                V0,  # VSTOXX value
                options_data.loc[option]["STRIKE"],
                options_data.loc[option]["TTM"],
                r,   # short rate
                options_data.loc[option]["PRICE"],
                sigma_est=2.,  # estimate for implied volatility
                it=100)
            # assign via a single .loc call to avoid chained-assignment issues
            options_data.loc[option, "IMP_VOL"] = imp_vol

    futures_data["MATURITY"]
    options_data.loc[46170]
    options_data.loc[46170]["STRIKE"]

    plot_data = options_data[options_data["IMP_VOL"] > 0]
    maturities = sorted(set(options_data["MATURITY"]))
    print(maturities)

    plt.figure(figsize=(8, 6))
    for maturity in maturities:
        # select data for this maturity
        data = plot_data[options_data.MATURITY == maturity]
        plt.plot(data["STRIKE"], data["IMP_VOL"],
                 label=maturity.timestamp(), lw=1.5)
        plt.plot(data["STRIKE"], data["IMP_VOL"], "r.")
    plt.grid(True)
    plt.xlabel("strike")
    plt.ylabel("implied volatility of volatility")
    plt.legend()
    plt.show()

    keep = ["PRICE", "IMP_VOL"]
    group_data = plot_data.groupby(["MATURITY", "STRIKE"])[keep]
    print(group_data)
    group_data = group_data.sum()
    group_data.head()
    group_data.index.levels
    pass
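# `self.bsm_call_imp_vol` is used above but not defined in this snippet (the
# commented-out `from bsm_functions import *` hints at where it comes from).
# A minimal self-contained sketch of such a helper -- Newton iteration on the
# Black-Scholes-Merton call price -- shown here as module-level functions; the
# argument order (S0, K, T, r, C0, sigma_est, it) follows the call above and
# is otherwise an assumption:
from math import exp, log, sqrt

from scipy import stats


def bsm_call_value(S0, K, T, r, sigma):
    """BSM European call value."""
    d1 = (log(S0 / K) + (r + 0.5 * sigma ** 2) * T) / (sigma * sqrt(T))
    d2 = d1 - sigma * sqrt(T)
    return S0 * stats.norm.cdf(d1) - K * exp(-r * T) * stats.norm.cdf(d2)


def bsm_vega(S0, K, T, r, sigma):
    """Partial derivative of the BSM call value with respect to sigma."""
    d1 = (log(S0 / K) + (r + 0.5 * sigma ** 2) * T) / (sigma * sqrt(T))
    return S0 * stats.norm.pdf(d1) * sqrt(T)


def bsm_call_imp_vol(S0, K, T, r, C0, sigma_est, it=100):
    """Back out the implied volatility by Newton's method."""
    for _ in range(it):
        sigma_est -= ((bsm_call_value(S0, K, T, r, sigma_est) - C0)
                      / bsm_vega(S0, K, T, r, sigma_est))
    return sigma_est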
def __init__(self, path, **kwargs): self.ds = pd.HDFStore(path, mode='w', **kwargs)
def CalBarraGrowth(dates):
    Growth = ['EGRLF', 'EGRSF', 'EGRO', 'SGRO']
    statemap = {'EGRLF': ['对未来三年预期净利润', 'net_profit0'],  # data missing
                'EGRSF': ['对未来一年预期净利润', 'net_profit0'],  # data missing
                'EGRO': ['net_profit0'],  # annual EPS = current net profit / common shares outstanding
                'SGRO': ['operate_profit', 'operate_expense']}

    if DB_CONN == 1:
        # connect to the databases when DB_CONN is enabled
        conn_params = urllib.parse.quote_plus("""DRIVER={SQL Server Native Client 10.0};
                SERVER=quant;DATABASE=tbas;UID=quant;PWD=quant007""")
        conn = sqlalchemy.create_engine("mssql+pyodbc:///?odbc_connect=%s" % conn_params)
        conn_params = urllib.parse.quote_plus("""DRIVER={SQL Server Native Client 10.0};
                SERVER=10.130.14.41;DATABASE=fcdb;UID=ch_data;PWD=68880980""")
        conn2 = sqlalchemy.create_engine("mssql+pyodbc:///?odbc_connect=%s" % conn_params)

    for factor in Growth:
        fcl = ['date'] + statemap[factor]

        # read in market and financial-statement data
        st = pd.HDFStore(inFilename)
        state = st.select('sheet', "columns=" + str(fcl))
        if factor in ['EGRO', 'SGRO']:
            mkt = st.select('mkt', "columns=['total_share']")
        st.close()

        # factor calculation: align the financial-statement data first
        # keep quarter-end records and, for each (date, sec_code), only the
        # latest published (pdate) record
        state = state.unstack()
        state = state[(state.index.month.isin([3, 6, 9, 12]))].stack()
        nf = state.reset_index()
        tnf = nf['pdate'].groupby([nf['date'], nf['sec_code']]).max()
        nf = nf.set_index(['date', 'sec_code', 'pdate'])
        tnf = tnf.reset_index()
        tnf = tnf.set_index(['date', 'sec_code', 'pdate'])
        nf = nf[nf.index.isin(tnf.index)]
        nf = nf.reset_index('pdate')
        nf = nf.drop(['pdate'], axis=1)

        # align to the daily date index
        nf = nf.unstack()
        if factor in ['EGRO', 'SGRO']:
            nf = nf.reindex(nf.index.union(dates))
            mkt = mkt.unstack()
            mkt = mkt.reindex(mkt.index.union(dates)).ffill()
            mkt = mkt.reindex(dates)
        else:
            nf = nf.reindex(nf.index.union(dates)).ffill()
            nf = nf.reindex(dates)

        if factor in ['EGRLF', 'EGRSF']:
            # np.abs, not math.abs (the math module has no abs function)
            factorvalue = nf[statemap[factor][0]] / (np.abs(nf[statemap[factor][1]]) - 1)
        elif factor in ['EGRO', 'SGRO']:
            # temp5 is the per-share series to regress
            if factor == 'EGRO':
                temp5 = nf['net_profit0'] / mkt['total_share']
            else:
                temp5 = (nf['operate_profit'] + nf['operate_expense']) / mkt['total_share']
            factorvalue = abs(temp5.copy(deep=True) * 0)
            for i in range(len(temp5.iloc[0, :])):
                stock_val = temp5.iloc[:, i]
                stock_val = stock_val.dropna(how='all')
                stock_fval = abs(stock_val.copy(deep=True) * 0)
                for j in range(len(stock_val) - 1, 19, -1):
                    # regress over the past 5 years, i.e. 20 quarters
                    temp_y = stock_val.iloc[j - 20:j].fillna(0)
                    temp_x = np.arange(20)
                    x = np.asmatrix(temp_x).transpose()
                    y = np.asmatrix(temp_y).transpose()
                    B = linear_regression_coef(x, y)
                    mean = y.mean()
                    if mean == 0:
                        stock_fval.iloc[j] = B
                    else:
                        stock_fval.iloc[j] = B / mean
                stock_fval = stock_fval.reindex(stock_fval.index.union(dates)).ffill()
                factorvalue.iloc[:, i] = stock_fval

        st = pd.HDFStore(outFilename)
        if factor in [x[1:] for x in st.keys()]:
            existday = st.select_column(factor, 'index')
            st.append(factor, factorvalue.loc[factorvalue.index.difference(existday)], format='t')
        else:
            st.append(factor, factorvalue, format='t')
        st.close()
        print(factor)
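# `linear_regression_coef` is called above but not defined in this snippet.
# A minimal sketch, assuming it returns the ordinary-least-squares slope of y
# regressed on x (both passed as column matrices above):
import numpy as np


def linear_regression_coef(x, y):
    """OLS slope of y on x, with an intercept term."""
    x = np.asarray(x, dtype=float).reshape(-1, 1)
    y = np.asarray(y, dtype=float).reshape(-1)
    X = np.hstack([np.ones_like(x), x])           # add intercept column
    beta, *_ = np.linalg.lstsq(X, y, rcond=None)  # beta = [intercept, slope]
    return beta[1]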
# grab every article entry on the page
# `lists` is iterable
# 4. parse the data
lists = soup.find_all('div', class_="list-group-item list-group-item-action p-06")
title = []
url = []
author = []
time = []
for i in lists:
    # check the page source to see exactly what to extract here; split() is the key tool
    # print(i.text.split('\n')[0])
    j = i.find('div', class_="topic_title")
    if j:
        title.append(j.text.split('\n')[0])
        url.append(i.a['href'])
        author.append(i.strong.a.text)
        time.append(i.span['title'])

# 5. write the data
df = pd.DataFrame({'标题': title, '作者': author, '时间': time, '链接': url})
data = pd.HDFStore('data.h5', 'w')
df.to_hdf(data, key='yq')  # write through the already-open store handle
data.close()

# read the data back
store = pd.HDFStore('data.h5', mode='r')
temp = pd.read_hdf(store, 'yq')
print(temp)
store.close()
# load coordinates
# so height[i] is the geopotential height at a given time
height = fileobj.variables['z'][:]
# convert geopotential to geopotential height (divide by g exactly once)
g_inv = 1 / 9.81
height = height * g_inv

# get processed dataframe
# Create storage object with filename `processed_data`
name = 'processed_' + str(f[wantfile][-10:-5]) + '.h5'
# Access data store
data_store = pd.HDFStore(name)
# Retrieve data using key
geopot_df = data_store['preprocessed_geopot']
data_store.close()


# get fft coeffs
def geopot_fft(geopotential):
    y = fft(geopotential)
    ck = y
    return ck


fft_zonal_result = [geopot_fft(height[k]) for k in range(number_entries)]
speed[speed > 2.5].index.values[1:]).drop_long_intervals( 26000).merge_close_intervals(50000) wake_ep = wake_ep.intersect(speed_ep).drop_short_intervals(3000000) n_channel, fs, shank_to_channel = loadXML(data_directory + session + "/" + session.split("/")[1] + '.xml') rip_ep, rip_tsd = loadRipples(data_directory + session) hd_info = scipy.io.loadmat(data_directory + session + '/Analysis/HDCells.mat')['hdCellStats'][:, -1] hd_info_neuron = np.array([hd_info[n] for n in spikes.keys()]) all_neurons = np.array(list(spikes.keys())) mod_neurons = np.array([ int(n.split("_")[1]) for n in neurons_index if session.split("/")[1] in n ]) if len(sleep_ep) > 1: store = pd.HDFStore("/mnt/DataGuillaume/population_activity_25ms/" + session.split("/")[1] + ".h5") # all_pop = store['allwake'] pre_pop = store['presleep'] pos_pop = store['postsleep'] store.close() store = pd.HDFStore("/mnt/DataGuillaume/population_activity_100ms/" + session.split("/")[1] + ".h5") all_pop = store['allwake'] # pre_pop = store['presleep'] # pos_pop = store['postsleep'] store.close() def compute_eigen(popwak): popwak = popwak - popwak.mean(0) popwak = popwak / (popwak.std(0) + 1e-8)
import numpy as np
import pandas as pd
from scipy import signal
import matplotlib.pyplot as plt
import kagglegym

# In[ ]:

# This part is going to be for exploring the dataset ...
# so we want the entire dataset ..
with pd.HDFStore("../input/train.h5", "r") as train:
    df = train.get("train")

# In[ ]:

list(set([c.split('_')[0] for c in df.columns]))

# So there are three types of main cells.
#
#  - `timestamp`: current timestamp
#  - `y`: This is what we want to predict
#  - [`fundamental`, `derived`, `technical`]: these are our predictors
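# A small follow-on sketch: split the columns into the groups described above
# by their prefix (column names such as "fundamental_7" or "technical_20" are
# assumed to follow the prefix_number pattern used in this dataset).
col_groups = {}
for c in df.columns:
    prefix = c.split('_')[0]
    col_groups.setdefault(prefix, []).append(c)

for prefix, cols in col_groups.items():
    print("{:<12} {} columns".format(prefix, len(cols)))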
#chrs = all_sites['chr'].unique() #logger.info('chromosomes of all WGBS sites are: '+str(chrs)) #cols=['chr', 'coordinate','strand'] #tss = pd.read_csv(home+'data/commons/tss.txt',sep='\s+',header=None,names=cols,skiprows=1) #tss = get_winid.convert_chr_to_num(tss,chrs) all_wgbs_sites = (args.all == 'True') reset_tracker = (args.reset_tracker == 'True') if all_wgbs_sites: logger.info('Using all WGBS sites') selected_wgbs_tss = all_sites[['winid', 'chr', 'coordinate']] elif not os.path.exists(home + 'data/' + dataset + '/all_selected_wgbs_sites'): logger.info('Selecting WGBS sites within 100k of tss sites') selected_wgbs_tss = wgbs_sites_selection(tss, all_sites) with pd.HDFStore(home + 'data/' + dataset + '/all_selected_wgbs_sites', 'w') as h5s: h5s['all_wgbs'] = selected_wgbs_tss else: logger.info('Using selected wgbs sites') with pd.HDFStore(home + 'data/' + dataset + '/all_selected_wgbs_sites', 'r') as h5s: selected_wgbs_tss = h5s['all_wgbs'] start_pos = 0 end_pos = len(selected_wgbs_tss) - 1 logger.info('total selected wgbs sites number: ' + str(end_pos + 1)) subprocess.call([ 'sed', '-i', 's/tss_end =.*/tss_end = ' + str(end_pos) + '/', home + 'code/prediction/prediction_commons.py' ]) ranges = np.arange(start_pos, end_pos, 2000000)
                                 effects=effects,
                                 pop_size=pop_size)

    # copy the clean results and add amplification artifacts for the
    # jackpot noise case
    counts.loc[:, idx['jackpot', :, :]] = \
        counts.loc[:, idx['clean', :, :]].values
    add_amplification_artifacts(counts=counts,
                                condition='jackpot',
                                pct_high=artifacts_pct_high,
                                mult_high=artifacts_mult_high,
                                pct_low=artifacts_pct_low,
                                mult_low=artifacts_mult_low)

    expected = calc_expected(effects)

    for d in depths:
        sequencing = generate_sequencing_counts(counts=counts, depth=d)
        name = "{}_simulation_depth_{}".format(assay, d)
        output_simulation(sequencing, name, cfg['outdir'])

        # output to HDF5
        store = pd.HDFStore(
            os.path.join(cfg['outdir'], name, "{}.h5".format(assay)))
        store.put(key='popcounts', value=counts)
        store.put(key='seqcounts', value=sequencing)
        store.put(key='effects', value=effects)
        store.put(key='expected', value=expected)
        store.close()
    # print('id=%d, first_occur_idx=%d' % (id_, first_occur_idx))
    df3.loc[first_occur_idx - 1, 'y1'] = None
    return df3


# export pre-processed dataframe into a csv file
def export_df_to_csv(df, csv_filename="preprocessed_data.csv"):
    print('exporting dataframe to', csv_filename)
    df.to_csv(csv_filename, sep=',', na_rep='', float_format="%.8f", index=False)


def import_df_from_csv(csv_filename="preprocessed_data.csv"):
    print('importing dataframe from', csv_filename)
    df = pd.read_csv(csv_filename, sep=',', index_col=None)
    return df


if __name__ == '__main__':
    print('Hello')
    with pd.HDFStore("train.h5", "r") as train:
        df = train.get("train")
    explore(df)
    work_df = build_working_df(df)
    export_df_to_csv(work_df)
    work_df_no_na = df_fill_na(work_df)
    export_df_to_csv(work_df_no_na, "preprocessed_data_no_na.csv")
log_loss, ) from boruta import BorutaPy # finance packages import trademl as tml # import vectorbt as vbt ### DON'T SHOW GRAPH OPTION matplotlib.use("Agg") ### GLOBALS DATA_PATH = 'D:/market_data/usa/ohlcv_features/' ### IMPORT DATA contract = ['SPY'] with pd.HDFStore(DATA_PATH + contract[0] + '.h5') as store: data = store.get(contract[0]) data.sort_index(inplace=True) ### CHOOSE/REMOVE VARIABLES remove_ohl = [ 'open', 'low', 'high', 'average', 'barCount', # 'vixFirst', 'vixHigh', 'vixLow', 'vixClose', 'vixVolume', 'open_orig', 'high_orig', 'low_orig' ]
#!/opt/anaconda/bin/python
'''max_widths.py - figure out max width for various text fields'''

import pandas as pd

store = pd.HDFStore('GDELT-compressed.h5')

max_max = pd.Series({
    'Actor1Code': 0,
    'Actor2Code': 0,
    'EventCode': 0,
    'QuadCategory': 0
})

for df in store.select(
        'reduced',
        columns=['Actor1Code', 'Actor2Code', 'EventCode', 'QuadCategory'],
        iterator=True):
    curr_max = df.applymap(len).max()
    max_max = pd.concat([curr_max, max_max], axis=1).max(axis=1)

print(max_max)
def process_multiple_days( dates_to_process, directory_structure="new", loc_processed_h5=os.path.join( d_drive, "Data", "Processed", "tube_data.h5" ), raw_base_dir=os.path.join( d_drive, "Data", "Raw" ), multiprocess=True, overwrite=False ): """ Process multiple days worth of tube data Parameters ---------- dates_to_process : List[Str] or str List of dates to post process as YYYY-MM-DD directory_structure : str How raw data is stored on disk. Can be "old" or "new", but probably should be "new" loc_processed_h5 : str Location of processed data HDF5 raw_base_dir : str Base directory where raw data directories are located. You shouldn't need to change this. multiprocess : bool Whether to use multiprocessing for each day to speed things up overwrite : bool Whether or not to overwrite existing processed data in the processed data HDF5 database """ if hasattr(dates_to_process, "lower"): # it's a string. make it a list. dates_to_process = [dates_to_process] with pd.HDFStore( os.path.join( _DIR, "data", "tube_data_template.h5" ), "r" ) as store: df_out = store["data"] for day in dates_to_process: if directory_structure == "old": df_day, day_schlieren = process_old_data( raw_base_dir, day, multiprocess=multiprocess ) df_day.drop( "sensors", axis=1, inplace=True ) # these were not included and I'm not using the old structure # anymore so it's not important enough to fix df_day["dil_mf"] = 0. df_day["dil_mf_nom"] = 0. df_day["diluent"] = "None" df_day["fuel"] = "C3H8" df_day["oxidizer"] = "air" df_day["p_0_nom"] = 101325. df_day["p_diluent"] = df_day["p_fuel"] df_day["phi_nom"] = 1. df_day["u_dil_mf"] = 0. df_day["u_p_diluent"] = df_day["u_p_fuel"] else: df_day, day_schlieren = process_new_data( raw_base_dir, day, multiprocess=multiprocess ) # force df_day to match desired structure df_day = pd.concat((df_out, df_day), sort=False, ignore_index=True) day = "d" + day.replace("-", "_") with pd.HDFStore(loc_processed_h5, "a") as store: existing_tests = get_existing_tests(store) for _, row in df_day.iterrows(): store_processed_test( row, existing_tests, overwrite, store ) for img_key in day_schlieren.keys(): for i, img in enumerate(day_schlieren[img_key]): store_processed_schlieren( day, img, img_key, i, overwrite, store )
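# A hypothetical usage sketch of process_multiple_days; the dates below are
# made-up examples, not values from the source, and the default HDF5 and raw
# data locations defined in the signature are left untouched.
if __name__ == "__main__":
    process_multiple_days(
        dates_to_process=["2020-01-15", "2020-01-16"],  # example dates (YYYY-MM-DD)
        directory_structure="new",
        multiprocess=True,
        overwrite=False,
    )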
# Copyright 2016 Telenor ASA, Author: Axel Tidemann

import os
import argparse

import pandas as pd
import numpy as np

parser = argparse.ArgumentParser(description='''
Reads an HDF5 file with mappings of ad IDs to images, then calculates and plots
bar charts showing how many images are in each.
''', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument(
    'mapping',
    help='HDF5 file with ad ids and images')
parser.add_argument(
    '--out_file',
    help='Filename of the HDF5 file',
    default='bar.h5')
args = parser.parse_args()

with pd.HDFStore(args.mapping, mode='r') as store:
    for category in store.keys():
        data = store[category]
        entries = data.count(axis=1)
        print('{}: {} entries. number of images: mean {}, median {}, std {}'.format(
            category, len(data), np.mean(entries), np.median(entries), np.std(entries)))
def __init__(self, path): self.ds = pd.HDFStore(path, mode='r') self.index = {}
def run(self, name=None, export_buildings_to_urbancanvas=False, base_year=2010, forecast_year=None, fixed_seed=True, random_seed=1, indicator_configuration=None, core_components_to_run=None, household_transition=None,household_relocation=None,employment_transition=None, elcm_configuration=None, developer_configuration=None, table_swapping=None, travel_model_configuration1=None, travel_model_configuration2=None, travel_model_configuration3=None, travel_model_configuration4=None, travel_model_configuration5=None, travel_model_configuration6=None): """Runs an UrbanSim2 scenario """ logger.log_status('Starting UrbanSim2 run.') dset = dataset.DRCOGDataset(os.path.join(misc.data_dir(),'drcog.h5')) seconds_start = time.time() if fixed_seed: logger.log_status('Running with fixed random seed.') np.random.seed(random_seed) #Load estimated coefficients coeff_store = pd.HDFStore(os.path.join(misc.data_dir(),'coeffs.h5')) dset.coeffs = coeff_store.coeffs.copy() coeff_store.close() coeff_store = pd.HDFStore(os.path.join(misc.data_dir(),'coeffs_res.h5')) dset.coeffs_res = coeff_store.coeffs_res.copy() coeff_store.close() #Keep track of unplaced agents by year unplaced_hh = [] unplaced_emp = [] #UrbanCanvas scenario id, replaced by db-retrieved value during export step urbancanvas_scenario_id = 0 #####Residential Buildings##### new_refiner.add_res_buildings(dset) #####Non-Residential Buildings##### new_refiner.add_non_res_buildings(dset) for sim_year in range(base_year,forecast_year+1): print 'Simulating year ' + str(sim_year) logger.log_status(sim_year) ##Variable Library calculations variable_library.calculate_variables(dset) #Record pre-demand model zone-level household/job totals hh_zone1 = dset.fetch('households').groupby('zone_id').size() emp_zone1 = dset.fetch('establishments').groupby('zone_id').employees.sum() ############ ELCM SIMULATION if core_components_to_run['ELCM']: logger.log_status('ELCM simulation.') alternatives = dset.buildings[(dset.buildings.non_residential_sqft>0)] new_elcm_model.simulate(dset, year=sim_year,depvar = 'building_id',alternatives=alternatives,simulation_table = 'establishments',output_names = ("drcog-coeff-elcm-%s.csv","DRCOG EMPLOYMENT LOCATION CHOICE MODELS (%s)","emp_location_%s","establishment_building_ids"), agents_groupby= ['sector_id_retail_agg',],transition_config = {'Enabled':True,'control_totals_table':'annual_employment_control_totals','scaling_factor':1.0}) ################# HLCM SIMULATION if core_components_to_run['HLCM']: logger.log_status('HLCM simulation.') alternatives = dset.buildings[(dset.buildings.residential_units>0)] new_hlcm_simulation.simulate(dset, year=sim_year,depvar = 'building_id',alternatives=alternatives,simulation_table = 'households',output_names = ("drcog-coeff-hlcm-%s.csv","DRCOG HOUSEHOLD LOCATION CHOICE MODELS (%s)","hh_location_%s","household_building_ids"), agents_groupby= ['income_3_tenure',],transition_config = {'Enabled':True,'control_totals_table':'annual_household_control_totals','scaling_factor':1.0}, relocation_config = {'Enabled':True,'relocation_rates_table':'annual_household_relocation_rates','scaling_factor':1.0},) ############ DEMAND-SIDE REFINEMENT #refiner.run(dset, sim_year) # refiner_fnc = "refiner.run(dset, sim_year)" #cProfile.runctx(refiner_fnc, locals={'dset':dset, 'sim_year':sim_year}, globals={'refiner': refiner}, filename='c:/users/jmartinez/documents/refiner_time') ############ REPM SIMULATION if core_components_to_run['Price']: logger.log_status('REPM simulation.') #Residential 
census_model_simulation.simulate_residential(dset, 'unit_price_res_sqft', 'school_district_id', 10, sim_year) #Non-residential regression_model_simulation.simulate(dset, year=sim_year,output_varname='unit_price_non_residential', simulation_table='buildings', output_names = ["drcog-coeff-nrhedonic-%s.csv","DRCOG NRHEDONIC MODEL (%s)","nrprice_%s"], agents_groupby = 'building_type_id', segment_ids = [5,8,11,16,17,18,21,23,9,22]) ############ DEVELOPER SIMULATION if core_components_to_run['Developer']: logger.log_status('Proforma simulation.') buildings, newbuildings = proforma_developer_model.run(dset,hh_zone1,emp_zone1,developer_configuration,sim_year) #import pdb; pdb.set_trace() dset.d['buildings'] = pd.concat([buildings,newbuildings]) dset.buildings.index.name = 'building_id' ############ INDICATORS if indicator_configuration['export_indicators']: unplaced_hh.append((dset.households.building_id==-1).sum()) unplaced_emp.append(dset.establishments[dset.establishments.building_id==-1].employees.sum()) if sim_year in indicator_configuration['years_to_run']: logger.log_status('Exporting indicators') indicators.run(dset, indicator_configuration['indicator_output_directory'], sim_year) logger.log_status('unplaced hh') logger.log_status(unplaced_hh) logger.log_status('unplaced emp') logger.log_status(unplaced_emp) ############ TRAVEL MODEL export_zonal_file.export_zonal_file_to_tm(dset,sim_year,logger,tm_config=[travel_model_configuration1,travel_model_configuration2,travel_model_configuration3,travel_model_configuration4,travel_model_configuration5,travel_model_configuration6]) ############ SWAPPER if sim_year == table_swapping['year']: if table_swapping['swap_skims']: logger.log_status('Swapping skims') td2 = pd.read_csv(table_swapping['new_skim_file'], index_col=['from_zone_id','to_zone_id']) dset.d['travel_data'] = td2 if table_swapping['swap_dist_rail']: logger.log_status('Swapping parcel distance to rail') p2 = pd.read_csv(table_swapping['new_dist_rail_file'], index_col=['parcel_id']) dset.d['parcels']['dist_rail'] = p2.dist_rail ############ URBANCANVAS if export_buildings_to_urbancanvas: logger.log_status('Exporting %s buildings to Urbancanvas database for project %s and year %s.' % (newbuildings.index.size,urbancanvas_scenario_id,sim_year)) urbancanvas_scenario_id = urbancanvas_export.export_to_urbancanvas(newbuildings, sim_year, urbancanvas_scenario_id) elapsed = time.time() - seconds_start print "TOTAL elapsed time: " + str(elapsed) + " seconds."
def test_to_hdf_multiple_nodes(): pytest.importorskip("tables") df = pd.DataFrame({ "x": ["a", "b", "c", "d"], "y": [1, 2, 3, 4] }, index=[1.0, 2.0, 3.0, 4.0]) a = dd.from_pandas(df, 2) df16 = pd.DataFrame( { "x": [ "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", ], "y": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], }, index=[ 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, ], ) b = dd.from_pandas(df16, 16) # saving to multiple nodes with tmpfile("h5") as fn: a.to_hdf(fn, "/data*") out = dd.read_hdf(fn, "/data*") assert_eq(df, out) # saving to multiple nodes making sure order is kept with tmpfile("h5") as fn: b.to_hdf(fn, "/data*") out = dd.read_hdf(fn, "/data*") assert_eq(df16, out) # saving to multiple datasets with custom name_function with tmpfile("h5") as fn: a.to_hdf(fn, "/data_*", name_function=lambda i: "a" * (i + 1)) out = dd.read_hdf(fn, "/data_*") assert_eq(df, out) out = pd.read_hdf(fn, "/data_a") tm.assert_frame_equal(out, df.iloc[:2]) out = pd.read_hdf(fn, "/data_aa") tm.assert_frame_equal(out, df.iloc[2:]) # test multiple nodes with hdf object with tmpfile("h5") as fn: with pd.HDFStore(fn) as hdf: b.to_hdf(hdf, "/data*") out = dd.read_hdf(fn, "/data*") assert_eq(df16, out)
def generate_metadata(project_directory, metadata_file): """ project_directory = directory containing the input metadata.csv file and all the featuresN.hdf5 files""" results_files = glob.glob(str( Path(project_directory) / '**/*metadata_featuresN.hdf5'), recursive=True) try: input_metadata_fname = Path(metadata_file) except Exception: input_metadata_fname = Path( glob.glob(str(Path(project_directory) / '**/metadata.csv'), recursive=True)[0]) metadata_in = pd.read_csv(input_metadata_fname, index_col=False) metadata_in.drop(columns='filename', inplace=True) metadata_in.index = pd.MultiIndex.from_arrays([ metadata_in.date_yyyymmdd, metadata_in.run_number, metadata_in.well_number, metadata_in.instrument_name ], names=meta_index) dates_to_analyse = list(metadata_in.date_yyyymmdd.unique()) #extract from the results filename the setnumber, date, camera number and then from file extract well names metadata_extract = pd.DataFrame() for r in results_files: #and extract other metadata from the filename if 'bluelight' in r: continue else: _date = int(re.findall(date, r)[0]) if _date in dates_to_analyse: _set = re.findall(set_no, r, re.IGNORECASE)[0] _camera = re.findall(camera, r)[0] _rig = HYDRA2CAM_DF.columns[(HYDRA2CAM_DF == _camera).any( axis=0)][0] #extra wells from featuresM with pd.HDFStore(r, 'r') as fid: wells = list(fid['/fov_wells'].well_name.unique()) metadata_extract = metadata_extract.append( pd.DataFrame({ 'run_number': int(_set), 'date_yyyymmdd': _date, 'camera_no': _camera, 'well_number': wells, 'filename': r, 'instrument_name': _rig })) metadata_extract.reset_index(drop=True, inplace=True) metadata_extract.index = pd.MultiIndex.from_arrays([ metadata_extract.date_yyyymmdd, metadata_extract.run_number, metadata_extract.well_number, metadata_extract.instrument_name ], names=meta_index) #concatenate together so that can merge metadata_concat = pd.concat([metadata_extract, metadata_in], axis=1, join='inner', sort=True) metadata_concat = metadata_concat.drop(columns=[ 'date_yyyymmdd', 'run_number', 'well_number', 'instrument_name' ]) metadata_concat.reset_index(drop=False, inplace=True) #save to csv metadata_concat.to_csv(input_metadata_fname.parent / 'updated_metadata.csv', index=False) # add in extracting out the temperature and humidity data extra_jsons = glob.glob(str( Path(project_directory) / '**/*extra_data.json'), recursive=True) if len(extra_jsons) > 0: json_metadata = pd.DataFrame() for e in extra_jsons: _date = int(re.findall(date, e)[0]) if _date in dates_to_analyse: _set = re.findall(set_no, e, re.IGNORECASE)[0] _camera = re.findall(camera, e)[0] _rig = HYDRA2CAM_DF.columns[(HYDRA2CAM_DF == _camera).any( axis=0)][0] with open(e) as fid: extras = json.load(fid) for t in extras: json_metadata = json_metadata.append(pd.concat([ pd.DataFrame.from_records([{ 'run_number': int(_set), 'date_yyyymmdd': _date, 'camera_no': _camera, 'filename': e, 'filedir': Path(e).parent, 'instrument_name': _rig }]), pd.DataFrame(pd.Series(t)).transpose() ], axis=1), ignore_index=True, sort=True) # summarise json metadata by json_metadata.to_csv(input_metadata_fname.parent / 'extra_data.csv') if 'json_metadata' in locals(): return metadata_concat, json_metadata else: return metadata_concat
def __init__(self, path): self.ds = pd.HDFStore(path)
for dim, set in product([9, 7, 4], ['test', 'training']): print(dim, set) store = pd.HDFStore(set + '_' + 'gen2_' + str(dim) + 'D_nions0_flat.h5') store['/megarun1/flattened'] = data.loc[idx[dim] & idx[set]] store['/megarun1/input'] = inputs[dim].loc[idx[set]] store['/megarun1/constants'] = consts[dim] store.close() if __name__ == '__main__': dim = 9 store_name = ''.join(['gen2_', str(dim), 'D_nions0_flat']) store = pd.HDFStore('../' + store_name + '.h5', 'r') input = store['/megarun1/input'] data = store['/megarun1/flattened'] startlen = len(data) data = sanity_filter(data, 50, 1.5, 1.5, 1e-4, startlen=startlen) data = regime_filter(data, 0, 100) gc.collect() input = input.loc[data.index] print('After filter {!s:<13} {:.2f}% left'.format( 'regime', 100 * len(data) / startlen)) filter_num = 7 sane_store = pd.HDFStore('../sane_' + store_name + '_filter' + str(filter_num) + '.h5') sane_store['/megarun1/input'] = input
'eblob2_bary': eblob2_bary, 'blob1_bary_x': blob1_bary_x, 'blob1_bary_y': blob1_bary_y, 'blob1_bary_z': blob1_bary_z, 'blob2_bary_x': blob2_bary_x, 'blob2_bary_y': blob2_bary_y, 'blob2_bary_z': blob2_bary_z, }) df_vxls = pd.DataFrame({ 'event': event_vxls, 'track_ID': track_ID_vxls, 'voxel_x': voxel_x, 'voxel_y': voxel_y, 'voxel_z': voxel_z, 'voxel_e': voxel_e }) df_run_info = pd.DataFrame({ 'events_in': loop_events, 'blob_radius': blob_radius }) out_name = '/home/paolafer/analysis/tracking_trueinfo_TlMC_run4_vxl{0}mm_R{1}mm_{2}_{3}.hdf5'.format( int(size), int(blob_radius[0]), start, numb) store = pd.HDFStore(out_name, "w", complib=str("zlib"), complevel=4) store.put('tracks', df, format='table', data_columns=True) store.put('voxels', df_vxls, format='table', data_columns=True) store.put('run_info', df_run_info, format='table', data_columns=True) store.close()
opts = argparse.ArgumentParser() opts.add_argument('--pairStore', dest='pairStore') opts.add_argument('--armStore', dest='armStore') opts.add_argument('--table', dest='table') opts.add_argument('--readLenFwd', type=int, default=75, dest='readLenFwd') opts.add_argument('--readLenRev', type=int, default=75, dest='readLenRev') opts.add_argument('--bedOut', dest='bedOut') o = opts.parse_args() pairStore = pd.HDFStore(o.pairStore, 'r') assert o.table in pairStore, 'cannot find table %s in %s' % (o.table, o.pairStore) tblPairs = pairStore[o.table] (tblnameExtArm, tblnameLigArm) = pairStore.get_storer( o.table).attrs['mippipe__ext_and_lig_arm_tables'] armStore = pd.HDFStore(o.armStore, 'r') tblArmsExt = armStore[tblnameExtArm] tblArmsLig = armStore[tblnameLigArm] btReadable = []
import sys import os sys.path.append(os.path.join(os.path.dirname(__file__),'..')) import numpy as np import pandas as pd from Conf.loadconf import * store = pd.HDFStore(os.path.join(CSV_DATA_PATH, H5FILENAME)) test = store['test'] train = store['train'] test_desc = test.describe() train_desc = train.describe() test_desc.to_csv(os.path.join(CSV_DATA_PATH,'desc-test.csv')) train_desc.to_csv(os.path.join(CSV_DATA_PATH,'desc-training.csv')) # save to h5 store['test_desc'] = test_desc store['train_desc'] = train_desc # save to mongo # a = np.log(data.RevolvingUtilizationOfUnsecuredLines+1)
#
#
import os
import re
from glob import glob
import gzip

import numpy as np
import pandas as pd

files = glob("/home/alex/data/stackexchange/overflow/posts/*.txt.gz")
hdfpath = "/home/alex/data/stackexchange/overflow/caches/posts_all.hdf5"

print("Number of posts in directory:", len(files))

# rani = 1798777
rani = np.random.randint(len(files))

with gzip.open(files[rani], "rt") as gf:
    print(">>>")
    print(gf.read())
    print("<<<")

store = pd.HDFStore(hdfpath, "r", complib="blosc", complevel=9)
cols = None

smask = store.select_as_coordinates(
    "posts",
    "Id == %i" % int(re.findall(r"\d+", os.path.split(files[rani])[-1])[0]))
posts = store.select("posts", where=smask)

print("Information about this post:\n")
print(posts.iloc[0])
data.append(el_data)
perf = pd.DataFrame(data)
perf

from io import StringIO
tag = '<a href="http://www.google.com">Google</a>'
root = objectify.parse(StringIO(tag)).getroot()
root

# binary data formats
frame = pd.read_csv('pydata-book/examples/ex1.csv')
frame.to_pickle('frame_pickle')  # save
pd.read_pickle('frame_pickle')   # load

# HDF5 format
store = pd.HDFStore('mydata.h5')
store['obj1'] = frame
store['obj1_col'] = frame['a']
store

# Excel files
import xlrd
import openpyxl
xls_file = pd.ExcelFile('mydata.xlsx')
table = xls_file.parse('Sheet1')
table

# using HTML and Web APIs
import requests
# Set up the destination and secrets directory
dataDir = 'DATA_DIRECTORY'
secretsDir = 'SECRETS_DIRECTORY'

apiDic = pd.read_csv('~\\ML-energy-use\\' + secretsDir + '\\apiKeyDictionary.csv')
ids = apiDic['id']
type = apiDic['type']

# Counting processing time
start = time.time()

# Creating a file to save the results after processing
store = pd.HDFStore(
    'C:\\Users\\Gonxo\\ML-energy-use\\DATA_DIRECTORY\\15min_noNaNs_201703081045.h5'
)

# Empty data frame to store previous feeds
feeds = pd.DataFrame()

# Looping over all the different feeds individually
for i in range(len(apiDic)):
    print(str(type[i]) + '_' + str(ids[i]))

    # Obtaining the feed from the hdf5 file
    feeds = pd.read_hdf(
        'C:\\Users\\Gonxo\\ML-energy-use\\DATA_DIRECTORY\\home_feeds.h5',
        str(type[i]) + '_' + str(ids[i]))['watts_hour']

    # Deleting NaNs at the beginning and end of the series.
import pandas as pd def add_csv_to_store(csv_file, name, store): df = pd.read_csv(csv_file) store.put(name, df, format='table', data_columns=True) with pd.HDFStore(path='example_results_package.h5', mode='a') as hdf: add_csv_to_store( './oasis_output/output/gul_S1_aalcalc.csv', '/output/groundup_loss/all/aal', hdf) add_csv_to_store( './oasis_output/output/gul_S1_eltcalc.csv', 'results_package_example/output/groundup_loss/all/elt', hdf) add_csv_to_store( './oasis_output/output/gul_S1_leccalc_full_uncertainty_aep.csv', 'results_package_example/output/groundup_loss/all/aep_full_uncertainty', hdf) add_csv_to_store( './oasis_output/output/gul_S1_leccalc_full_uncertainty_oep.csv', 'results_package_example/output/groundup_loss/all/oep_full_uncertainty', hdf) add_csv_to_store( './oasis_output/output/gul_S1_pltcalc.csv', 'results_package_example/output/groundup_loss/all/plt', hdf) add_csv_to_store( './oasis_output/output/gul_S1_summary-info.csv', 'results_package_example/output/groundup_loss/all/summary_info', hdf) add_csv_to_store( './oasis_output/output/gul_S2_aalcalc.csv', 'results_package_example/output/groundup_loss/by_geography/aal', hdf) add_csv_to_store( './oasis_output/output/gul_S2_eltcalc.csv', 'results_package_example/output/groundup_loss/by_geography/elt', hdf) add_csv_to_store( './oasis_output/output/gul_S2_leccalc_full_uncertainty_aep.csv', 'results_package_example/output/groundup_loss/by_geography/aep_full_uncertainty', hdf) add_csv_to_store( './oasis_output/output/gul_S2_leccalc_full_uncertainty_oep.csv', 'results_package_example/output/groundup_loss/by_geography/oep_full_uncertainty', hdf) add_csv_to_store( './oasis_output/output/gul_S2_pltcalc.csv', 'results_package_example/output/groundup_loss/by_geography/plt', hdf)
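# A minimal sketch of reading one of the tables back out of the results
# package written above; the key matches one used in the add_csv_to_store
# calls, and the file path is the same one opened above.
import pandas as pd

elt = pd.read_hdf('example_results_package.h5',
                  'results_package_example/output/groundup_loss/all/elt')
print(elt.head())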
def read_ncdb(filepath): """ Read data from Geolytics's Neighborhood Change Database (NCDB) and store it for later use. Parameters ---------- input_dir : str location of the input CSV file extracted from your Geolytics DVD Returns ------- DataFrame """ ncdb_vars = variables["ncdb"].dropna()[1:].values df = pd.read_csv( filepath, low_memory=False, na_values=["", " ", 99999, -999], converters={ "GEO2010": str, "COUNTY": str, "COUSUB": str, "DIVISION": str, "REGION": str, "STATE": str, }, ) cols = df.columns fixed = [] for col in cols: if col.endswith("D"): fixed.append("D" + col[:-1]) elif col.endswith("N"): fixed.append("N" + col[:-1]) elif col.endswith("1A"): fixed.append(col[:-2] + "2") orig = [] for col in cols: if col.endswith("D"): orig.append(col) elif col.endswith("N"): orig.append(col) elif col.endswith("1A"): orig.append(col) df.rename(dict(zip(orig, fixed)), axis="columns", inplace=True) df = pd.wide_to_long(df, stubnames=ncdb_vars, i="GEO2010", j="year", suffix="(7|8|9|0|1|2)").reset_index() df["year"] = df["year"].replace({ 7: 1970, 8: 1980, 9: 1990, 0: 2000, 1: 2010, 2: 2010 }) df = df.groupby(["GEO2010", "year"]).first() mapper = dict(zip(variables.ncdb, variables.ltdb)) df.reset_index(inplace=True) df = df.rename(mapper, axis="columns") df = df.set_index("geoid") store = pd.HDFStore(os.path.join(package_directory, "data.h5"), "w") store["ncdb"] = df store.close() return df