Example No. 1
def read_ltdb(sample, fullcount):
    """
    Read data from Brown's Longitudinal Tract Database (LTDB) and store it for later use.

    Parameters
    ----------
    sample : str
        file path of the zip file containing the standard Sample CSV files downloaded from
        https://s4.ad.brown.edu/projects/diversity/Researcher/LTBDDload/Default.aspx

    fullcount: str
        file path of the zip file containing the standard Fullcount CSV files downloaded from
        https://s4.ad.brown.edu/projects/diversity/Researcher/LTBDDload/Default.aspx

    Returns
    -------
    DataFrame

    """
    sample_zip = zipfile.ZipFile(sample)
    fullcount_zip = zipfile.ZipFile(fullcount)

    def _ltdb_reader(path, file, year, dropcols=None):

        df = pd.read_csv(
            path.open(file),
            na_values=["", " ", 99999, -999],
            converters={
                0: str,
                "placefp10": str
            },
            low_memory=False,
            encoding="latin1",
        )

        if dropcols:
            df.drop(dropcols, axis=1, inplace=True)
        df.columns = df.columns.str.lower()
        names = df.columns.values.tolist()
        names[0] = "geoid"
        newlist = []

        # ignoring the first 4 columns, remove year suffix from column names
        for name in names[4:]:
            newlist.append(name[:-2])
        colnames = names[:4] + newlist
        df.columns = colnames

        # prepend a 0 when FIPS is too short
        df["geoid"] = df["geoid"].str.rjust(11, "0")
        df.set_index("geoid", inplace=True)

        df["year"] = year

        inflate_cols = ["mhmval", "mrent", "hinc"]
        df = _adjust_inflation(df, inflate_cols, year)

        return df

    # Read in Brown's LTDB data, both the sample and fullcount files, for each
    # year. Population, housing units & occupied housing units appear in both
    # "sample" and "fullcount" files -- currently drop sample and keep fullcount

    sample70 = _ltdb_reader(
        sample_zip,
        "ltdb_std_1970_sample.csv",
        dropcols=["POP70SP1", "HU70SP", "OHU70SP"],
        year=1970,
    )

    fullcount70 = _ltdb_reader(fullcount_zip,
                               "LTDB_Std_1970_fullcount.csv",
                               year=1970)

    sample80 = _ltdb_reader(
        sample_zip,
        "ltdb_std_1980_sample.csv",
        dropcols=["pop80sf3", "pop80sf4", "hu80sp", "ohu80sp"],
        year=1980,
    )

    fullcount80 = _ltdb_reader(fullcount_zip,
                               "LTDB_Std_1980_fullcount.csv",
                               year=1980)

    sample90 = _ltdb_reader(
        sample_zip,
        "ltdb_std_1990_sample.csv",
        dropcols=["POP90SF3", "POP90SF4", "HU90SP", "OHU90SP"],
        year=1990,
    )

    fullcount90 = _ltdb_reader(fullcount_zip,
                               "LTDB_Std_1990_fullcount.csv",
                               year=1990)

    sample00 = _ltdb_reader(
        sample_zip,
        "ltdb_std_2000_sample.csv",
        dropcols=["POP00SF3", "HU00SP", "OHU00SP"],
        year=2000,
    )

    fullcount00 = _ltdb_reader(fullcount_zip,
                               "LTDB_Std_2000_fullcount.csv",
                               year=2000)

    sample10 = _ltdb_reader(sample_zip, "ltdb_std_2010_sample.csv", year=2010)

    # join the sample and fullcount variables into a single df for the year
    ltdb_1970 = sample70.join(fullcount70.iloc[:, 7:], how="left")
    ltdb_1980 = sample80.join(fullcount80.iloc[:, 7:], how="left")
    ltdb_1990 = sample90.join(fullcount90.iloc[:, 7:], how="left")
    ltdb_2000 = sample00.join(fullcount00.iloc[:, 7:], how="left")
    ltdb_2010 = sample10

    # the 2010 file doesn't have CBSA info, so grab it from the 2000 df
    ltdb_2010["cbsa"] = np.nan
    ltdb_2010.update(other=ltdb_2000["cbsa"], overwrite=True)

    df = pd.concat([ltdb_1970, ltdb_1980, ltdb_1990, ltdb_2000, ltdb_2010],
                   sort=True)

    # "geoid" is already the index (set inside _ltdb_reader), so just keep its name
    df.index.name = "geoid"

    store = pd.HDFStore(os.path.join(package_directory, "data.h5"), "w")
    store["ltdb"] = df

    store.close()

    return df
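
# A minimal usage sketch for the function above. The zip paths are hypothetical;
# the archives must be downloaded manually from the LTDB site referenced in the docstring.
sample_path = "ltdb_sample.zip"          # hypothetical path to the Sample zip
fullcount_path = "ltdb_fullcount.zip"    # hypothetical path to the Fullcount zip
ltdb = read_ltdb(sample=sample_path, fullcount=fullcount_path)
print(ltdb.groupby("year").size())       # tract counts per census year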
Example No. 2
def RunDSPVec(t1_file,
              vec_process=None,
              digitizer_list=None,
              out_prefix="t2",
              verbose=False,
              output_dir=None,
              multiprocess=False):
    """ vector version of tier 1 processor """

    if vec_process is None:
        vec_process = VectorProcess(default_list=True)

    print("Starting pygama Tier 1 (vector) processing ...")
    print("   Input file: {}".format(t1_file))
    statinfo = os.stat(t1_file)
    print("   File size: {}".format(sizeof_fmt(statinfo.st_size)))

    start = time.time()
    directory = os.path.dirname(t1_file)
    output_dir = os.getcwd() if output_dir is None else output_dir

    # snag the run number (assuming t1_file ends in _run<number>.<filetype>)
    run_str = re.findall(r'run\d+', t1_file)[-1]
    run = int(''.join(filter(str.isdigit, run_str)))

    # get pygama's available digitizers
    if digitizer_list is None:
        digitizer_list = get_digitizers()

    # get digitizers in the file
    f = h5py.File(t1_file, 'r')
    digitizer_list = [d for d in digitizer_list if d.decoder_name in f.keys()]

    print("   Found digitizers:")
    for d in digitizer_list:
        print("   -- {}".format(d.decoder_name))

    for d in digitizer_list:
        print("Processing data from: " + d.decoder_name)

        object_info = pd.read_hdf(t1_file, key=d.class_name)
        d.load_object_info(object_info)

        # single thread process -- let's ABANDON THIS
        # t1_df = pd.read_hdf(t1_file, key=d.decoder_name)
        # t2_df = vec_process.Process(t1_df)

        # multi process -- i want to ALWAYS do this, using hdf5 chunking
        # even if i only have one thread available.
        # try to write each chunk to the file so you never hold the whole
        # file in memory.
        h5key = d.class_name
        chunksize = 3000  # num wf rows.  optimal for my mac, at least
        n_cpu = mp.cpu_count()

        with pd.HDFStore(t1_file, 'r') as store:
            nrows = store.get_storer(h5key).shape[0]  # fixed only
            chunk_idxs = list(range(nrows // chunksize + 1))

        keywords = {"t1_file": t1_file, "chunksize": chunksize, "h5key": h5key}

        with mp.Pool(n_cpu) as p:
            result_list = p.map(partial(process_chunk, **keywords), chunk_idxs)

        t2_df = pd.concat(result_list)

        # print("Elapsed: {:.2f} sec".format(time.time()-t_start))

    t2_file = os.path.join(output_dir, "{}_run{}.h5".format(out_prefix, run))

    if verbose:
        print("Writing Tier 2 File:\n   {}".format(t2_file))
        print("   Entries: {}".format(len(t2_df)))
        print("   Data columns:")
        for col in t2_df.columns:
            print("   -- " + col)

    t2_df.to_hdf(t2_file,
                 key="data",
                 format='table',
                 mode='w',
                 data_columns=t2_df.columns.tolist())

    if verbose:
        statinfo = os.stat(t2_file)
        print("File size: {}".format(sizeof_fmt(statinfo.st_size)))
        elapsed = time.time() - start
        proc_rate = elapsed / len(t2_df)
        print("Time elapsed: {:.2f} sec  ({:.5f} sec/wf)".format(
            elapsed, proc_rate))
        print("Done.")
Example No. 3
 def example_VSTOXX_index(self):
     V0 = 17.6639
     r = 0.01
     import pandas as pd
     h5 = pd.HDFStore("./vstoxx_data_31032014.h5", "r")
     futures_data = h5["futures_data"] # VSTOXX futures data
     options_data = h5["options_data"] # VSTOXX call option data
     h5.close()
     
     print(futures_data)
     options_data.info()
     options_data[["DATE", "MATURITY", "TTM", "STRIKE", "PRICE"]].head()
     
     options_data["IMP_VOL"] = 0.0
     # new column for implied volatilities
     # from bsm_functions import *
     tol = 0.5 # tolerance level for moneyness
     for option in options_data.index:
         # iterating over all option quotes
         forward = futures_data[ futures_data["MATURITY"] == options_data.loc[option]["MATURITY"] ] ["PRICE"].values[0]
         # picking the right futures value
         if (forward * (1 - tol) < options_data.loc[option]["STRIKE"] < forward * (1 + tol)):
         # only for options with moneyness within tolerance
             imp_vol = self.bsm_call_imp_vol(
                 V0, # VSTOXX value
                 options_data.loc[option]["STRIKE"],
                 options_data.loc[option]["TTM"],
                 r, # short rate
                 options_data.loc[option]["PRICE"],
                 sigma_est=2., # estimate for implied volatility
                 it=100)
             options_data["IMP_VOL"].loc[option] = imp_vol
     
     futures_data["MATURITY"]    
     options_data.loc[46170]
     options_data.loc[46170]["STRIKE"]
     plot_data = options_data[options_data["IMP_VOL"] > 0]
     maturities = sorted(set(options_data["MATURITY"]))
     print(maturities)
     
     plt.figure(figsize=(8, 6))
     for maturity in maturities:
         data = plot_data[plot_data.MATURITY == maturity]
         # select data for this maturity
         plt.plot(data["STRIKE"], data["IMP_VOL"], label=maturity.timestamp(), lw=1.5)
         plt.plot(data["STRIKE"], data["IMP_VOL"], "r.")
     plt.grid(True)
     plt.xlabel("strike")
     plt.ylabel("implied volatility of volatility")
     plt.legend()
     plt.show()
     
     
     keep = ["PRICE", "IMP_VOL"]
     group_data = plot_data.groupby(["MATURITY", "STRIKE"])[keep]
     print(group_data)
     group_data = group_data.sum()
     group_data.head()
     group_data.index.levels
     
     pass
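
# bsm_call_imp_vol is imported elsewhere (see the commented-out bsm_functions import above).
# A self-contained Newton-iteration sketch of such a function, assuming a plain
# Black-Scholes-Merton call pricer, is shown below.
from math import exp, log, sqrt
from scipy import stats

def bsm_call_value(S0, K, T, r, sigma):
    # Black-Scholes-Merton value of a European call
    d1 = (log(S0 / K) + (r + 0.5 * sigma ** 2) * T) / (sigma * sqrt(T))
    d2 = d1 - sigma * sqrt(T)
    return S0 * stats.norm.cdf(d1) - K * exp(-r * T) * stats.norm.cdf(d2)

def bsm_vega(S0, K, T, r, sigma):
    # Sensitivity of the call value to sigma
    d1 = (log(S0 / K) + (r + 0.5 * sigma ** 2) * T) / (sigma * sqrt(T))
    return S0 * stats.norm.pdf(d1) * sqrt(T)

def bsm_call_imp_vol(S0, K, T, r, C0, sigma_est, it=100):
    # Newton iteration on sigma until the model price matches the observed quote
    for _ in range(it):
        sigma_est -= (bsm_call_value(S0, K, T, r, sigma_est) - C0) / bsm_vega(S0, K, T, r, sigma_est)
    return sigma_est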
Example No. 4
File: io.py Project: spark706/PyPSA
 def __init__(self, path, **kwargs):
     self.ds = pd.HDFStore(path, mode='w', **kwargs)
Example No. 5
def CalBarraGrowth(dates):
    Growth = ['EGRLF','EGRSF','EGRO','SGRO']
    statemap = {'EGRLF':['对未来三年预期净利润','net_profit0'], ## data missing
        'EGRSF':['对未来一年预期净利润','net_profit0'], ## data missing
        'EGRO':['net_profit0'], ## annual EPS = current net profit / common shares outstanding
        'SGRO':['operate_profit','operate_expense']}

    if DB_CONN == 1:
        # establish the database connections
        conn_params = urllib.parse.quote_plus("""DRIVER={SQL Server Native Client 10.0};
                                        SERVER=quant;DATABASE=tbas;UID=quant;PWD=quant007""")
        conn = sqlalchemy.create_engine("mssql+pyodbc:///?odbc_connect=%s" % conn_params)
        conn_params=urllib.parse.quote_plus("""DRIVER={SQL Server Native Client 10.0};
                                        SERVER=10.130.14.41;DATABASE=fcdb;UID=ch_data;PWD=68880980""")
        conn2 = sqlalchemy.create_engine("mssql+pyodbc:///?odbc_connect=%s" % conn_params)

    for factor in Growth:
        fcl = ['date'] + statemap[factor]

        # read in data: market data and financial statements
        st = pd.HDFStore(inFilename)
        state = st.select('sheet',"columns="+str(fcl))
        if factor in ['EGRO','SGRO']:
            mkt = st.select('mkt', "columns=['total_share']")
        st.close()

        # factor calculation: align the financial-statement data
        ## align financial-statement reporting dates
        state = state.unstack()
        state = state[(state.index.month.isin([3,6,9,12]))].stack()

        nf = state.reset_index()
        tnf = nf['pdate'].groupby([nf['date'],nf['sec_code']]).max()
        nf = nf.set_index(['date', 'sec_code', 'pdate'])
        tnf = tnf.reset_index()
        tnf = tnf.set_index(['date', 'sec_code', 'pdate'])
        nf = nf[nf.index.isin(tnf.index)]
        nf = nf.reset_index('pdate')

        nf = nf.drop(['pdate'], axis=1)

        ## align to daily dates
        nf = nf.unstack()
        if factor in ['EGRO','SGRO']:
            nf = nf.reindex(nf.index.union(dates))
            mkt = mkt.unstack()
            mkt = mkt.reindex(mkt.index.union(dates)).ffill()
            mkt = mkt.reindex(dates)
        else:
            nf = nf.reindex(nf.index.union(dates)).ffill()
        nf = nf.reindex(dates)

        if factor in ['EGRLF','EGRSF']:
            factorvalue = nf[statemap[factor][0]] / (np.abs(nf[statemap[factor][1]]) - 1)

        elif factor in ['EGRO','SGRO']:
            ## temp5 is the series to run the regression on
            if factor == 'EGRO':
                temp5 = nf['net_profit0'] / mkt['total_share']
            else:
                temp5 = (nf['operate_profit'] + nf['operate_expense']) / mkt['total_share']

            factorvalue = abs(temp5.copy(deep=True) * 0)
            for i in range(len(temp5.iloc[0, :])):
                stock_val = temp5.iloc[:, i]
                stock_val = stock_val.dropna(how='all')
                stock_fval = abs(stock_val.copy(deep=True) * 0)
                for j in range(len(stock_val)-1, 19, -1):    ## regress over the past 5 years, i.e. 20 quarters
                    temp_y = stock_val.iloc[j-20:j].fillna(0)
                    temp_x = np.arange(20)
                    x = np.asmatrix(temp_x).transpose()
                    y = np.asmatrix(temp_y).transpose()
                    B = linear_regression_coef(x, y)
                    mean = y.mean()
                    if mean == 0:
                        stock_fval.iloc[j] = B
                    else:
                        stock_fval.iloc[j] = B / mean
                stock_fval = stock_fval.reindex(stock_fval.index.union(dates)).ffill()
                factorvalue.iloc[:,i] = stock_fval

        st = pd.HDFStore(outFilename)
        if factor in [x[1:] for x in st.keys()]:
            existday = st.select_column(factor, 'index')
            st.append(factor, factorvalue.loc[factorvalue.index.difference(existday)], format='t')
        else:
            st.append(factor, factorvalue, format='t')
        st.close()
    
        print(factor)
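
# linear_regression_coef is not shown in this excerpt. A hypothetical helper consistent
# with how it is called above (column vectors in, slope out) could look like this:
def linear_regression_coef(x, y):
    # OLS slope of y on x via a degree-1 polynomial fit
    x = np.asarray(x).ravel()
    y = np.asarray(y).ravel()
    return np.polyfit(x, y, 1)[0]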
Example No. 6
    # Get all of the posts on the page
    # lists is iterable
    # 4. Parse the data
    lists = soup.find_all('div',
                          class_="list-group-item list-group-item-action p-06")
    title = []
    url = []
    author = []
    time = []
    for i in lists:
        # Check the page source to decide exactly what to extract here; split does the heavy lifting
        # print(i.text.split('\n')[0])
        j = i.find('div', class_="topic_title")
        if j:
            title.append(j.text.split('\n')[0])
            url.append(i.a['href'])
            author.append(i.strong.a.text)
            time.append(i.span['title'])

    # 5. Write the data
    df = pd.DataFrame({'标题': title, '作者': author, '时间': time, '链接': url})
    data = pd.HDFStore('data.h5', 'w')
    data.put('yq', df)
    data.close()

    # Read the data back
    store = pd.HDFStore('data.h5', mode='r')
    temp = store['yq']
    print(temp)

    store.close()
Example No. 7
    # load coordinates
    # height[i] is the geopotential at a given time; dividing by g once converts
    # it to geopotential height in metres
    height = fileobj.variables['z'][:]
    g_inv = 1 / 9.81
    height = height * g_inv



    # get processed dataframe
    # Create storage object with filename `processed_data`
    name = 'processed_' + str(f[wantfile][-10:-5]) + '.h5'

    # Access data store
    data_store = pd.HDFStore(name)
    
    # Retrieve data using key
    geopot_df = data_store['preprocessed_geopot']
    data_store.close()



    #get fft coeffs
    def geopot_fft(geopotential):
        y = fft(geopotential)
        ck = y
        return(ck)

    fft_zonal_result = [geopot_fft(height[k]) for k in range(number_entries)]
Example No. 8
        speed[speed > 2.5].index.values[1:]).drop_long_intervals(
            26000).merge_close_intervals(50000)
    wake_ep = wake_ep.intersect(speed_ep).drop_short_intervals(3000000)
    n_channel, fs, shank_to_channel = loadXML(data_directory + session + "/" +
                                              session.split("/")[1] + '.xml')
    rip_ep, rip_tsd = loadRipples(data_directory + session)
    hd_info = scipy.io.loadmat(data_directory + session +
                               '/Analysis/HDCells.mat')['hdCellStats'][:, -1]
    hd_info_neuron = np.array([hd_info[n] for n in spikes.keys()])
    all_neurons = np.array(list(spikes.keys()))
    mod_neurons = np.array([
        int(n.split("_")[1]) for n in neurons_index
        if session.split("/")[1] in n
    ])
    if len(sleep_ep) > 1:
        store = pd.HDFStore("/mnt/DataGuillaume/population_activity_25ms/" +
                            session.split("/")[1] + ".h5")
        # all_pop 		= store['allwake']
        pre_pop = store['presleep']
        pos_pop = store['postsleep']
        store.close()

        store = pd.HDFStore("/mnt/DataGuillaume/population_activity_100ms/" +
                            session.split("/")[1] + ".h5")
        all_pop = store['allwake']
        # pre_pop		= store['presleep']
        # pos_pop		= store['postsleep']
        store.close()

        def compute_eigen(popwak):
            popwak = popwak - popwak.mean(0)
            popwak = popwak / (popwak.std(0) + 1e-8)
Example No. 9

import numpy as np 
import pandas as pd 
from scipy import signal
import matplotlib.pyplot as plt

import kagglegym


# In[ ]:


# This part is going to be for exploring the dataset ...
# so we want the entire dataset ..
with pd.HDFStore("../input/train.h5", "r") as train:
    df = train.get("train")


# In[ ]:


list(set([c.split('_')[0] for c in df.columns]))


# So there are three main types of columns. 
# 
#  - `timestamp`: current timestamp
#  - `y`: This is what we want to predict
#  - [`fundamental`, `derived`, `technical`]: these are our predictors
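
# A small follow-up sketch (not part of the original kernel): count how many columns
# fall into each prefix group.
from collections import Counter
print(Counter(c.split('_')[0] for c in df.columns))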
Example No. 10
#chrs = all_sites['chr'].unique()
#logger.info('chromosomes of all WGBS sites are: '+str(chrs))
#cols=['chr', 'coordinate','strand']
#tss =  pd.read_csv(home+'data/commons/tss.txt',sep='\s+',header=None,names=cols,skiprows=1)
#tss = get_winid.convert_chr_to_num(tss,chrs)

all_wgbs_sites = (args.all == 'True')
reset_tracker = (args.reset_tracker == 'True')

if all_wgbs_sites:
    logger.info('Using all WGBS sites')
    selected_wgbs_tss = all_sites[['winid', 'chr', 'coordinate']]
elif not os.path.exists(home + 'data/' + dataset + '/all_selected_wgbs_sites'):
    logger.info('Selecting WGBS sites within 100k of tss sites')
    selected_wgbs_tss = wgbs_sites_selection(tss, all_sites)
    with pd.HDFStore(home + 'data/' + dataset + '/all_selected_wgbs_sites',
                     'w') as h5s:
        h5s['all_wgbs'] = selected_wgbs_tss
else:
    logger.info('Using selected wgbs sites')
    with pd.HDFStore(home + 'data/' + dataset + '/all_selected_wgbs_sites',
                     'r') as h5s:
        selected_wgbs_tss = h5s['all_wgbs']

start_pos = 0
end_pos = len(selected_wgbs_tss) - 1
logger.info('total selected wgbs sites number: ' + str(end_pos + 1))
subprocess.call([
    'sed', '-i', 's/tss_end =.*/tss_end = ' + str(end_pos) + '/',
    home + 'code/prediction/prediction_commons.py'
])
ranges = np.arange(start_pos, end_pos, 2000000)
Example No. 11
                     effects=effects,
                     pop_size=pop_size)

        # copy the clean results and add amplification artifacts for the
        # jackpot noise case
        counts.loc[:, idx['jackpot', :, :]] = \
            counts.loc[:, idx['clean', :, :]].values
        add_amplification_artifacts(counts=counts,
                                    condition='jackpot',
                                    pct_high=artifacts_pct_high,
                                    mult_high=artifacts_mult_high,
                                    pct_low=artifacts_pct_low,
                                    mult_low=artifacts_pct_low)

        expected = calc_expected(effects)

        for d in depths:
            sequencing = generate_sequencing_counts(counts=counts, depth=d)

            name = "{}_simulation_depth_{}".format(assay, d)
            output_simulation(sequencing, name, cfg['outdir'])

            # output to HDF5
            store = pd.HDFStore(
                os.path.join(cfg['outdir'], name, "{}.h5".format(assay)))
            store.put(key='popcounts', value=counts)
            store.put(key='seqcounts', value=sequencing)
            store.put(key='effects', value=effects)
            store.put(key='expected', value=expected)
            store.close()
Example No. 12
        #print 'id=%d,first_occur_idx=%d'%(id_, first_occur_idx)
        df3.ix[first_occur_idx - 1, 'y1'] = None
    return df3


# export pre-processed dataframe into a csv file
def export_df_to_csv(df, csv_filename="preprocessed_data.csv"):
    print('exporting dataframe to', csv_filename)
    df.to_csv(csv_filename,
              sep=',',
              na_rep='',
              float_format="%.8f",
              index=False)


def import_df_from_csv(csv_filename="preprocessed_data.csv"):
    print('importing dataframe from', csv_filename)
    df = pd.read_csv(csv_filename, sep=',', index_col=None)
    return df


if __name__ == '__main__':
    print('Hello')
    with pd.HDFStore("train.h5", "r") as train:
        df = train.get("train")
        explore(df)
        work_df = build_working_df(df)
        export_df_to_csv(work_df)
        work_df_no_na = df_fill_na(work_df)
        export_df_to_csv(work_df_no_na, "preprocessed_data_no_na.csv")
Example No. 13
    log_loss,
)
from boruta import BorutaPy
# finance packages
import trademl as tml
# import vectorbt as vbt

### DON'T SHOW GRAPH OPTION
matplotlib.use("Agg")

### GLOBALS
DATA_PATH = 'D:/market_data/usa/ohlcv_features/'

### IMPORT DATA
contract = ['SPY']
with pd.HDFStore(DATA_PATH + contract[0] + '.h5') as store:
    data = store.get(contract[0])
data.sort_index(inplace=True)

### CHOOSE/REMOVE VARIABLES
remove_ohl = [
    'open',
    'low',
    'high',
    'average',
    'barCount',
    # 'vixFirst', 'vixHigh', 'vixLow', 'vixClose', 'vixVolume',
    'open_orig',
    'high_orig',
    'low_orig'
]
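
# The excerpt ends before remove_ohl is applied. A one-line sketch of how such a list is
# typically used (the original continuation is not shown here):
data = data.drop(columns=[col for col in remove_ohl if col in data.columns])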
Example No. 14
#!/opt/anaconda/bin/python
'''max_widths.py - figure out max width for various text fields'''

import pandas as pd

store = pd.HDFStore('GDELT-compressed.h5')

max_max = pd.Series({
    'Actor1Code': 0,
    'Actor2Code': 0,
    'EventCode': 0,
    'QuadCategory': 0
})

for df in store.select(
        'reduced',
        columns=['Actor1Code', 'Actor2Code', 'EventCode', 'QuadCategory'],
        iterator=True):
    curr_max = df.applymap(len).max()
    max_max = pd.concat([curr_max, max_max], axis=1).max(axis=1)
    print(max_max)
Example No. 15
def process_multiple_days(
        dates_to_process,
        directory_structure="new",
        loc_processed_h5=os.path.join(
            d_drive,
            "Data",
            "Processed",
            "tube_data.h5"
        ),
        raw_base_dir=os.path.join(
            d_drive,
            "Data",
            "Raw"
        ),
        multiprocess=True,
        overwrite=False
):
    """
    Process multiple days' worth of tube data
    
    Parameters
    ----------
    dates_to_process : List[str] or str
        List of dates to post process as YYYY-MM-DD
    directory_structure : str
        How raw data is stored on disk. Can be "old" or "new", but probably
         should be "new"
    loc_processed_h5 : str
        Location of processed data HDF5
    raw_base_dir : str
        Base directory where raw data directories are located. You shouldn't
        need to change this.
    multiprocess : bool
        Whether to use multiprocessing for each day to speed things up
    overwrite : bool
        Whether or not to overwrite existing processed data in the processed
        data HDF5 database
    """
    if hasattr(dates_to_process, "lower"):
        # it's a string. make it a list.
        dates_to_process = [dates_to_process]

    with pd.HDFStore(
        os.path.join(
            _DIR,
            "data",
            "tube_data_template.h5"
        ),
        "r"
    ) as store:
        df_out = store["data"]

    for day in dates_to_process:
        if directory_structure == "old":
            df_day, day_schlieren = process_old_data(
                raw_base_dir,
                day,
                multiprocess=multiprocess
            )
            df_day.drop(
                "sensors",
                axis=1,
                inplace=True
            )

            # these were not included and I'm not using the old structure
            # anymore so it's not important enough to fix
            df_day["dil_mf"] = 0.
            df_day["dil_mf_nom"] = 0.
            df_day["diluent"] = "None"
            df_day["fuel"] = "C3H8"
            df_day["oxidizer"] = "air"
            df_day["p_0_nom"] = 101325.
            df_day["p_diluent"] = df_day["p_fuel"]
            df_day["phi_nom"] = 1.
            df_day["u_dil_mf"] = 0.
            df_day["u_p_diluent"] = df_day["u_p_fuel"]

        else:
            df_day, day_schlieren = process_new_data(
                raw_base_dir,
                day,
                multiprocess=multiprocess
            )

        # force df_day to match desired structure
        df_day = pd.concat((df_out, df_day), sort=False, ignore_index=True)

        day = "d" + day.replace("-", "_")
        with pd.HDFStore(loc_processed_h5, "a") as store:
            existing_tests = get_existing_tests(store)

            for _, row in df_day.iterrows():
                store_processed_test(
                    row,
                    existing_tests,
                    overwrite,
                    store
                )

            for img_key in day_schlieren.keys():
                for i, img in enumerate(day_schlieren[img_key]):
                    store_processed_schlieren(
                        day,
                        img,
                        img_key,
                        i,
                        overwrite,
                        store
                    )
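
# A minimal usage sketch for the function above; the date is hypothetical and the
# HDF5/raw-data paths fall back to the module defaults.
process_multiple_days(
    "2020-01-15",
    directory_structure="new",
    multiprocess=False,
    overwrite=False,
)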
Example No. 16
# Copyright 2016 Telenor ASA, Author: Axel Tidemann

import os
import argparse

import pandas as pd
import numpy as np

parser = argparse.ArgumentParser(description='''
Reads an HDF5 file with mappings of ad IDs to images, calculates
plots bar charts showing how many images are in each.
''', formatter_class=argparse.ArgumentDefaultsHelpFormatter)

parser.add_argument(
    'mapping',
    help='HDF5 file with ad ids and images')
parser.add_argument(
    '--out_file',
    help='Filename of the HDF5 file', 
    default='bar.h5')
args = parser.parse_args()

with pd.HDFStore(args.mapping, mode='r') as store:
    for category in store.keys():
        data = store[category]
        entries = data.count(axis=1)
        print('{}: {} entries. number of images: mean {}, median {}, std {}'.format(category, len(data), np.mean(entries), np.median(entries), np.std(entries)))
Example No. 17
 def __init__(self, path):
     self.ds = pd.HDFStore(path, mode='r')
     self.index = {}
Example No. 18
    def run(self, name=None, export_buildings_to_urbancanvas=False, base_year=2010, forecast_year=None, fixed_seed=True, random_seed=1, indicator_configuration=None, core_components_to_run=None, household_transition=None,household_relocation=None,employment_transition=None, elcm_configuration=None, developer_configuration=None, table_swapping=None, travel_model_configuration1=None, travel_model_configuration2=None, travel_model_configuration3=None, travel_model_configuration4=None, travel_model_configuration5=None, travel_model_configuration6=None):
        """Runs an UrbanSim2 scenario 
        """
        logger.log_status('Starting UrbanSim2 run.')
        dset = dataset.DRCOGDataset(os.path.join(misc.data_dir(),'drcog.h5'))
        seconds_start = time.time()
        if fixed_seed:
            logger.log_status('Running with fixed random seed.')
            np.random.seed(random_seed)
            
        #Load estimated coefficients
        coeff_store = pd.HDFStore(os.path.join(misc.data_dir(),'coeffs.h5'))
        dset.coeffs = coeff_store.coeffs.copy()
        coeff_store.close()

        coeff_store = pd.HDFStore(os.path.join(misc.data_dir(),'coeffs_res.h5'))
        dset.coeffs_res = coeff_store.coeffs_res.copy()
        coeff_store.close()

        #Keep track of unplaced agents by year
        unplaced_hh = []
        unplaced_emp = []
        
        #UrbanCanvas scenario id, replaced by db-retrieved value during export step
        urbancanvas_scenario_id = 0

        #####Residential Buildings#####
        new_refiner.add_res_buildings(dset)

        #####Non-Residential Buildings#####
        new_refiner.add_non_res_buildings(dset)
        
        for sim_year in range(base_year,forecast_year+1):
            print('Simulating year ' + str(sim_year))
            logger.log_status(sim_year)

            ##Variable Library calculations
            variable_library.calculate_variables(dset)
            
            #Record pre-demand model zone-level household/job totals
            hh_zone1 = dset.fetch('households').groupby('zone_id').size()
            emp_zone1 = dset.fetch('establishments').groupby('zone_id').employees.sum()
            
            ############     ELCM SIMULATION
            if core_components_to_run['ELCM']:
                logger.log_status('ELCM simulation.')
                alternatives = dset.buildings[(dset.buildings.non_residential_sqft>0)]
                new_elcm_model.simulate(dset, year=sim_year,depvar = 'building_id',alternatives=alternatives,simulation_table = 'establishments',output_names = ("drcog-coeff-elcm-%s.csv","DRCOG EMPLOYMENT LOCATION CHOICE MODELS (%s)","emp_location_%s","establishment_building_ids"),
                                         agents_groupby= ['sector_id_retail_agg',],transition_config = {'Enabled':True,'control_totals_table':'annual_employment_control_totals','scaling_factor':1.0})

            #################     HLCM SIMULATION
            if core_components_to_run['HLCM']:
                logger.log_status('HLCM simulation.')
                alternatives = dset.buildings[(dset.buildings.residential_units>0)]
                new_hlcm_simulation.simulate(dset, year=sim_year,depvar = 'building_id',alternatives=alternatives,simulation_table = 'households',output_names = ("drcog-coeff-hlcm-%s.csv","DRCOG HOUSEHOLD LOCATION CHOICE MODELS (%s)","hh_location_%s","household_building_ids"),
                                         agents_groupby= ['income_3_tenure',],transition_config = {'Enabled':True,'control_totals_table':'annual_household_control_totals','scaling_factor':1.0},
                                         relocation_config = {'Enabled':True,'relocation_rates_table':'annual_household_relocation_rates','scaling_factor':1.0},)
                                         
            ############     DEMAND-SIDE REFINEMENT
            #refiner.run(dset, sim_year)
            # refiner_fnc = "refiner.run(dset, sim_year)"
            #cProfile.runctx(refiner_fnc, locals={'dset':dset, 'sim_year':sim_year}, globals={'refiner': refiner}, filename='c:/users/jmartinez/documents/refiner_time')

            ############     REPM SIMULATION
            if core_components_to_run['Price']:
                logger.log_status('REPM simulation.')
                #Residential
                census_model_simulation.simulate_residential(dset, 'unit_price_res_sqft', 'school_district_id', 10, sim_year)

                #Non-residential                                    
                regression_model_simulation.simulate(dset, year=sim_year,output_varname='unit_price_non_residential', simulation_table='buildings', output_names = ["drcog-coeff-nrhedonic-%s.csv","DRCOG NRHEDONIC MODEL (%s)","nrprice_%s"],
                                                     agents_groupby = 'building_type_id', segment_ids = [5,8,11,16,17,18,21,23,9,22])
            
            ############     DEVELOPER SIMULATION
            if core_components_to_run['Developer']:
                logger.log_status('Proforma simulation.')
                buildings, newbuildings = proforma_developer_model.run(dset,hh_zone1,emp_zone1,developer_configuration,sim_year)
                #import pdb; pdb.set_trace()
                dset.d['buildings'] = pd.concat([buildings,newbuildings])
                dset.buildings.index.name = 'building_id'
            
            ############   INDICATORS
            if indicator_configuration['export_indicators']:
                unplaced_hh.append((dset.households.building_id==-1).sum())
                unplaced_emp.append(dset.establishments[dset.establishments.building_id==-1].employees.sum())
                if sim_year in indicator_configuration['years_to_run']:
                    logger.log_status('Exporting indicators')
                    indicators.run(dset, indicator_configuration['indicator_output_directory'], sim_year)
                    logger.log_status('unplaced hh')
                    logger.log_status(unplaced_hh)
                    logger.log_status('unplaced emp')
                    logger.log_status(unplaced_emp)
                    
            ############     TRAVEL MODEL
            export_zonal_file.export_zonal_file_to_tm(dset,sim_year,logger,tm_config=[travel_model_configuration1,travel_model_configuration2,travel_model_configuration3,travel_model_configuration4,travel_model_configuration5,travel_model_configuration6])
                    
            ############     SWAPPER
            if sim_year == table_swapping['year']:
                if table_swapping['swap_skims']:
                    logger.log_status('Swapping skims')
                    td2 = pd.read_csv(table_swapping['new_skim_file'], index_col=['from_zone_id','to_zone_id'])
                    dset.d['travel_data'] = td2
                if table_swapping['swap_dist_rail']:
                    logger.log_status('Swapping parcel distance to rail')
                    p2 = pd.read_csv(table_swapping['new_dist_rail_file'], index_col=['parcel_id'])
                    dset.d['parcels']['dist_rail'] = p2.dist_rail
            
            ############      URBANCANVAS
            if export_buildings_to_urbancanvas:
                logger.log_status('Exporting %s buildings to Urbancanvas database for project %s and year %s.' % (newbuildings.index.size,urbancanvas_scenario_id,sim_year))
                urbancanvas_scenario_id = urbancanvas_export.export_to_urbancanvas(newbuildings, sim_year, urbancanvas_scenario_id)
                
        elapsed = time.time() - seconds_start
        print "TOTAL elapsed time: " + str(elapsed) + " seconds."
Example No. 19
def test_to_hdf_multiple_nodes():
    pytest.importorskip("tables")
    df = pd.DataFrame({
        "x": ["a", "b", "c", "d"],
        "y": [1, 2, 3, 4]
    },
                      index=[1.0, 2.0, 3.0, 4.0])
    a = dd.from_pandas(df, 2)
    df16 = pd.DataFrame(
        {
            "x": [
                "a",
                "b",
                "c",
                "d",
                "e",
                "f",
                "g",
                "h",
                "i",
                "j",
                "k",
                "l",
                "m",
                "n",
                "o",
                "p",
            ],
            "y": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
        },
        index=[
            1.0,
            2.0,
            3.0,
            4.0,
            5.0,
            6.0,
            7.0,
            8.0,
            9.0,
            10.0,
            11.0,
            12.0,
            13.0,
            14.0,
            15.0,
            16.0,
        ],
    )
    b = dd.from_pandas(df16, 16)

    # saving to multiple nodes
    with tmpfile("h5") as fn:
        a.to_hdf(fn, "/data*")
        out = dd.read_hdf(fn, "/data*")
        assert_eq(df, out)

    # saving to multiple nodes making sure order is kept
    with tmpfile("h5") as fn:
        b.to_hdf(fn, "/data*")
        out = dd.read_hdf(fn, "/data*")
        assert_eq(df16, out)

    # saving to multiple datasets with custom name_function
    with tmpfile("h5") as fn:
        a.to_hdf(fn, "/data_*", name_function=lambda i: "a" * (i + 1))
        out = dd.read_hdf(fn, "/data_*")
        assert_eq(df, out)

        out = pd.read_hdf(fn, "/data_a")
        tm.assert_frame_equal(out, df.iloc[:2])
        out = pd.read_hdf(fn, "/data_aa")
        tm.assert_frame_equal(out, df.iloc[2:])

    # test multiple nodes with hdf object
    with tmpfile("h5") as fn:
        with pd.HDFStore(fn) as hdf:
            b.to_hdf(hdf, "/data*")
            out = dd.read_hdf(fn, "/data*")
            assert_eq(df16, out)
Example No. 20
def generate_metadata(project_directory, metadata_file):
    """ project_directory = directory containing the input metadata.csv file and
    all the featuresN.hdf5 files"""

    results_files = glob.glob(str(
        Path(project_directory) / '**/*metadata_featuresN.hdf5'),
                              recursive=True)
    try:
        input_metadata_fname = Path(metadata_file)
    except Exception:
        input_metadata_fname = Path(
            glob.glob(str(Path(project_directory) / '**/metadata.csv'),
                      recursive=True)[0])

    metadata_in = pd.read_csv(input_metadata_fname, index_col=False)
    metadata_in.drop(columns='filename', inplace=True)
    metadata_in.index = pd.MultiIndex.from_arrays([
        metadata_in.date_yyyymmdd, metadata_in.run_number,
        metadata_in.well_number, metadata_in.instrument_name
    ],
                                                  names=meta_index)

    dates_to_analyse = list(metadata_in.date_yyyymmdd.unique())

    #extract from the results filename the setnumber, date, camera number and then from file extract well names
    metadata_extract = pd.DataFrame()
    for r in results_files:
        #and extract other metadata from the filename
        if 'bluelight' in r:
            continue
        else:
            _date = int(re.findall(date, r)[0])
            if _date in dates_to_analyse:
                _set = re.findall(set_no, r, re.IGNORECASE)[0]
                _camera = re.findall(camera, r)[0]
                _rig = HYDRA2CAM_DF.columns[(HYDRA2CAM_DF == _camera).any(
                    axis=0)][0]
                #extra wells from featuresM
                with pd.HDFStore(r, 'r') as fid:
                    wells = list(fid['/fov_wells'].well_name.unique())

                metadata_extract = metadata_extract.append(
                    pd.DataFrame({
                        'run_number': int(_set),
                        'date_yyyymmdd': _date,
                        'camera_no': _camera,
                        'well_number': wells,
                        'filename': r,
                        'instrument_name': _rig
                    }))

    metadata_extract.reset_index(drop=True, inplace=True)
    metadata_extract.index = pd.MultiIndex.from_arrays([
        metadata_extract.date_yyyymmdd, metadata_extract.run_number,
        metadata_extract.well_number, metadata_extract.instrument_name
    ],
                                                       names=meta_index)

    #concatenate together so that can merge
    metadata_concat = pd.concat([metadata_extract, metadata_in],
                                axis=1,
                                join='inner',
                                sort=True)
    metadata_concat = metadata_concat.drop(columns=[
        'date_yyyymmdd', 'run_number', 'well_number', 'instrument_name'
    ])
    metadata_concat.reset_index(drop=False, inplace=True)

    #save to csv
    metadata_concat.to_csv(input_metadata_fname.parent /
                           'updated_metadata.csv',
                           index=False)

    # add in extracting out the temperature and humidity data
    extra_jsons = glob.glob(str(
        Path(project_directory) / '**/*extra_data.json'),
                            recursive=True)

    if len(extra_jsons) > 0:
        json_metadata = pd.DataFrame()
        for e in extra_jsons:
            _date = int(re.findall(date, e)[0])
            if _date in dates_to_analyse:
                _set = re.findall(set_no, e, re.IGNORECASE)[0]
                _camera = re.findall(camera, e)[0]
                _rig = HYDRA2CAM_DF.columns[(HYDRA2CAM_DF == _camera).any(
                    axis=0)][0]
                with open(e) as fid:
                    extras = json.load(fid)
                    for t in extras:
                        json_metadata = json_metadata.append(pd.concat([
                            pd.DataFrame.from_records([{
                                'run_number': int(_set),
                                'date_yyyymmdd': _date,
                                'camera_no': _camera,
                                'filename': e,
                                'filedir': Path(e).parent,
                                'instrument_name': _rig
                            }]),
                            pd.DataFrame(pd.Series(t)).transpose()
                        ],
                                                                       axis=1),
                                                             ignore_index=True,
                                                             sort=True)
        # summarise json metadata by
        json_metadata.to_csv(input_metadata_fname.parent / 'extra_data.csv')

    if 'json_metadata' in locals():
        return metadata_concat, json_metadata
    else:
        return metadata_concat
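
# A minimal usage sketch with hypothetical paths; when extra_data JSON files are present
# the function returns (metadata_concat, json_metadata) instead of a single frame.
metadata = generate_metadata("/path/to/project_dir", "/path/to/project_dir/metadata.csv")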
Example No. 21
File: io.py Project: spark706/PyPSA
 def __init__(self, path):
     self.ds = pd.HDFStore(path)
Example No. 22
    for dim, set in product([9, 7, 4], ['test', 'training']):
        print(dim, set)
        store = pd.HDFStore(set + '_' + 'gen2_' + str(dim) +
                            'D_nions0_flat.h5')
        store['/megarun1/flattened'] = data.loc[idx[dim] & idx[set]]
        store['/megarun1/input'] = inputs[dim].loc[idx[set]]
        store['/megarun1/constants'] = consts[dim]
        store.close()


if __name__ == '__main__':
    dim = 9

    store_name = ''.join(['gen2_', str(dim), 'D_nions0_flat'])
    store = pd.HDFStore('../' + store_name + '.h5', 'r')

    input = store['/megarun1/input']
    data = store['/megarun1/flattened']

    startlen = len(data)
    data = sanity_filter(data, 50, 1.5, 1.5, 1e-4, startlen=startlen)
    data = regime_filter(data, 0, 100)
    gc.collect()
    input = input.loc[data.index]
    print('After filter {!s:<13} {:.2f}% left'.format(
        'regime', 100 * len(data) / startlen))
    filter_num = 7
    sane_store = pd.HDFStore('../sane_' + store_name + '_filter' +
                             str(filter_num) + '.h5')
    sane_store['/megarun1/input'] = input
Example No. 23
    'eblob2_bary': eblob2_bary,
    'blob1_bary_x': blob1_bary_x,
    'blob1_bary_y': blob1_bary_y,
    'blob1_bary_z': blob1_bary_z,
    'blob2_bary_x': blob2_bary_x,
    'blob2_bary_y': blob2_bary_y,
    'blob2_bary_z': blob2_bary_z,
})
df_vxls = pd.DataFrame({
    'event': event_vxls,
    'track_ID': track_ID_vxls,
    'voxel_x': voxel_x,
    'voxel_y': voxel_y,
    'voxel_z': voxel_z,
    'voxel_e': voxel_e
})

df_run_info = pd.DataFrame({
    'events_in': loop_events,
    'blob_radius': blob_radius
})

out_name = '/home/paolafer/analysis/tracking_trueinfo_TlMC_run4_vxl{0}mm_R{1}mm_{2}_{3}.hdf5'.format(
    int(size), int(blob_radius[0]), start, numb)

store = pd.HDFStore(out_name, "w", complib=str("zlib"), complevel=4)
store.put('tracks', df, format='table', data_columns=True)
store.put('voxels', df_vxls, format='table', data_columns=True)
store.put('run_info', df_run_info, format='table', data_columns=True)
store.close()
Example No. 24
    opts = argparse.ArgumentParser()

    opts.add_argument('--pairStore', dest='pairStore')
    opts.add_argument('--armStore', dest='armStore')

    opts.add_argument('--table', dest='table')

    opts.add_argument('--readLenFwd', type=int, default=75, dest='readLenFwd')
    opts.add_argument('--readLenRev', type=int, default=75, dest='readLenRev')

    opts.add_argument('--bedOut', dest='bedOut')

    o = opts.parse_args()

    pairStore = pd.HDFStore(o.pairStore, 'r')

    assert o.table in pairStore, 'cannot find table %s in %s' % (o.table,
                                                                 o.pairStore)
    tblPairs = pairStore[o.table]

    (tblnameExtArm, tblnameLigArm) = pairStore.get_storer(
        o.table).attrs['mippipe__ext_and_lig_arm_tables']

    armStore = pd.HDFStore(o.armStore, 'r')

    tblArmsExt = armStore[tblnameExtArm]
    tblArmsLig = armStore[tblnameLigArm]

    btReadable = []
Example No. 25
import sys
import os
sys.path.append(os.path.join(os.path.dirname(__file__),'..'))
import numpy as np
import pandas as pd
from Conf.loadconf import *

store = pd.HDFStore(os.path.join(CSV_DATA_PATH, H5FILENAME))
test = store['test']
train = store['train']

test_desc = test.describe()
train_desc = train.describe()

test_desc.to_csv(os.path.join(CSV_DATA_PATH,'desc-test.csv'))
train_desc.to_csv(os.path.join(CSV_DATA_PATH,'desc-training.csv'))

# save to h5
store['test_desc'] = test_desc
store['train_desc'] = train_desc

# save to mongo


# a = np.log(data.RevolvingUtilizationOfUnsecuredLines+1)

#
#
Example No. 26
import os
import re
from glob import glob
import numpy as np
import pandas as pd
import gzip

files = glob("/home/alex/data/stackexchange/overflow/posts/*.txt.gz")
hdfpath = "/home/alex/data/stackexchange/overflow/caches/posts_all.hdf5"

print "Number of posts in directory:", len(files)

# rani = 1798777
rani = np.random.randint(len(files))

with gzip.open(files[rani], "rt") as gf:
    print(">>>")
    print(gf.read())
    print("<<<")

store = pd.HDFStore(hdfpath, "r", complib="blosc", complevel=9)

cols = None
smask = store.select_as_coordinates("posts", "Id == %i" % int(re.findall(r"\d+", os.path.split(files[rani])[-1])[0]))
posts = store.select("posts", where=smask)

print "Information about this post:\n"
print posts.iloc[0]
Example No. 27
    data.append(el_data)
perf = pd.DataFrame(data)
perf
from io import StringIO

tag = '<a href="http://www.google.com">Google</a>'
root = objectify.parse(StringIO(tag)).getroot()
root

# Binary data formats
frame = pd.read_csv('pydata-book/examples/ex1.csv')
frame.to_pickle('frame_pickle')  # save
pd.read_pickle('frame_pickle')  # load

# HDF5 format
store = pd.HDFStore('mydata.h5')
store['obj1'] = frame
store['obj1_col'] = frame['a']
store

# Excel files
import xlrd
import openpyxl

xls_file = pd.ExcelFile('mydata.xlsx')
table = xls_file.parse('Sheet1')
table

# Using HTML and Web APIs
import requests
Example No. 28

# Set up the destination  and secrects directory
dataDir = 'DATA_DIRECTORY'
secretsDir = 'SECRETS_DIRECTORY'
apiDic = pd.read_csv('~\\ML-energy-use\\' + secretsDir +
                     '\\apiKeyDictionary.csv')
ids = apiDic['id']
type = apiDic['type']

# Counting processing time
start = time.time()

# Creating a saving file for after processing
store = pd.HDFStore(
    'C:\\Users\\Gonxo\\ML-energy-use\\DATA_DIRECTORY\\15min_noNaNs_201703081045.h5'
)

# Empty data frame to store previous feeds
feeds = pd.DataFrame()

# Looping for all the different feeds individually
for i in range(len(apiDic)):
    print(str(type[i]) + '_' + str(ids[i]))

    # Obtaining the feed from the hdf5 file
    feeds = pd.read_hdf(
        'C:\\Users\\Gonxo\\ML-energy-use\\DATA_DIRECTORY\\home_feeds.h5',
        str(type[i]) + '_' + str(ids[i]))['watts_hour']

    # Deleting NaNs at the beginning and end of the series.
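    # (Sketch, not part of the original excerpt) one common way to do that trimming:
    feeds = feeds.loc[feeds.first_valid_index():feeds.last_valid_index()]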
Example No. 29
import pandas as pd

def add_csv_to_store(csv_file, name, store):
    df = pd.read_csv(csv_file)
    store.put(name, df, format='table', data_columns=True)

with pd.HDFStore(path='example_results_package.h5', mode='a') as hdf:

    add_csv_to_store(
        './oasis_output/output/gul_S1_aalcalc.csv', '/output/groundup_loss/all/aal', hdf)
    add_csv_to_store(
        './oasis_output/output/gul_S1_eltcalc.csv', 'results_package_example/output/groundup_loss/all/elt', hdf)
    add_csv_to_store(
        './oasis_output/output/gul_S1_leccalc_full_uncertainty_aep.csv', 'results_package_example/output/groundup_loss/all/aep_full_uncertainty', hdf)
    add_csv_to_store(
        './oasis_output/output/gul_S1_leccalc_full_uncertainty_oep.csv', 'results_package_example/output/groundup_loss/all/oep_full_uncertainty', hdf)
    add_csv_to_store(
        './oasis_output/output/gul_S1_pltcalc.csv', 'results_package_example/output/groundup_loss/all/plt', hdf)
    add_csv_to_store(
        './oasis_output/output/gul_S1_summary-info.csv', 'results_package_example/output/groundup_loss/all/summary_info', hdf)

    add_csv_to_store(
        './oasis_output/output/gul_S2_aalcalc.csv', 'results_package_example/output/groundup_loss/by_geography/aal', hdf)
    add_csv_to_store(
        './oasis_output/output/gul_S2_eltcalc.csv', 'results_package_example/output/groundup_loss/by_geography/elt', hdf)
    add_csv_to_store(
        './oasis_output/output/gul_S2_leccalc_full_uncertainty_aep.csv', 'results_package_example/output/groundup_loss/by_geography/aep_full_uncertainty', hdf)
    add_csv_to_store(
        './oasis_output/output/gul_S2_leccalc_full_uncertainty_oep.csv', 'results_package_example/output/groundup_loss/by_geography/oep_full_uncertainty', hdf)
    add_csv_to_store(
        './oasis_output/output/gul_S2_pltcalc.csv', 'results_package_example/output/groundup_loss/by_geography/plt', hdf)
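
    # A short read-back sketch (not part of the original script): list what was stored
    # and pull one of the tables back out of the open store.
    print(hdf.keys())
    print(hdf['/output/groundup_loss/all/aal'].head())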
Example No. 30
def read_ncdb(filepath):
    """
    Read data from Geolytics's Neighborhood Change Database (NCDB) and store it for later use.

    Parameters
    ----------
    filepath : str
        location of the input CSV file extracted from your Geolytics DVD

    Returns
    -------
    DataFrame

    """

    ncdb_vars = variables["ncdb"].dropna()[1:].values

    df = pd.read_csv(
        filepath,
        low_memory=False,
        na_values=["", " ", 99999, -999],
        converters={
            "GEO2010": str,
            "COUNTY": str,
            "COUSUB": str,
            "DIVISION": str,
            "REGION": str,
            "STATE": str,
        },
    )

    cols = df.columns
    fixed = []
    for col in cols:
        if col.endswith("D"):
            fixed.append("D" + col[:-1])
        elif col.endswith("N"):
            fixed.append("N" + col[:-1])
        elif col.endswith("1A"):
            fixed.append(col[:-2] + "2")

    orig = []
    for col in cols:
        if col.endswith("D"):
            orig.append(col)
        elif col.endswith("N"):
            orig.append(col)
        elif col.endswith("1A"):
            orig.append(col)

    df.rename(dict(zip(orig, fixed)), axis="columns", inplace=True)

    df = pd.wide_to_long(df,
                         stubnames=ncdb_vars,
                         i="GEO2010",
                         j="year",
                         suffix="(7|8|9|0|1|2)").reset_index()

    df["year"] = df["year"].replace({
        7: 1970,
        8: 1980,
        9: 1990,
        0: 2000,
        1: 2010,
        2: 2010
    })
    df = df.groupby(["GEO2010", "year"]).first()

    mapper = dict(zip(variables.ncdb, variables.ltdb))

    df.reset_index(inplace=True)

    df = df.rename(mapper, axis="columns")

    df = df.set_index("geoid")

    store = pd.HDFStore(os.path.join(package_directory, "data.h5"), "w")
    store["ncdb"] = df

    store.close()

    return df
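
# A minimal usage sketch for the function above; the CSV path is hypothetical.
ncdb = read_ncdb("geolytics_ncdb_export.csv")
print(ncdb.head())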