def AddSeqComp(mypath):
    """ Loads TestLogsAll.h5 from the specified path, then calls
    MeasurementGroupTools.AddSeqComp to recalculate sequence components using FFT

    Input: Directory of the measurement campaign, e.g.: "aLabView2"
    Output: Results1.h5, Results1.pdf in the data subdirs.
    """
    from pandas import HDFStore, ExcelWriter
    import MeasurementGroupTools as mgt

    h5logs = HDFStore(mypath + "\\" + 'TestLogsAll.h5')
    TestLog = h5logs['TestLogsAll']

    dirs = TestLog[u'DirName'].unique()
    for dname in dirs:
        mysubdirpath = mypath + "\\" + dname
        print "Processing: " + dname
        mgt.AddSeqComp(mysubdirpath, TestLog, dname)

    h5logs.put('TestLogsAll', TestLog)
    h5logs.close()

    writer = ExcelWriter(mypath + "\\" + 'TestLogsAll.xlsx')
    TestLog.to_excel(writer, 'TestLogsAll')  # the second argument defines sheet name
    writer.save()
    return
def init_h5_database(database_name, meta_data, overwrite=False):
    """Initialize a h5 file for storing EEMs using a pandas DataFrame containing EEM meta data

    Args:
        database_name (str): filename and relative path for h5 database
        meta_data (pandas DataFrame): DataFrame containing eem meta data from
            `pyeem.load_eem_meta_data` function or created manually - see
            pyeem.load_eem_meta_data for required columns.
            NOTE: do not use spaces or decimals in column names as this causes
            a warning when saving to H5 file format

    Returns:
        no return - data is saved as h5 and may be loaded using `pyeem.load_eem_data`
    """
    import os  # assumed available at module level in the original source
    from pandas import HDFStore

    # check if h5 file exists and overwrite or warn
    if os.path.isfile(database_name):
        if overwrite is True:
            print('overwriting ' + database_name)
            os.remove(database_name)
        else:
            raise ValueError("h5 file " + database_name +
                             " exists. Choose new database name or set overwrite=True")

    # create a h5 file to store EEM meta data
    hdf = HDFStore(database_name)
    hdf.put('meta', meta_data, format='table', data_columns=True)
    hdf.close()
    return
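# A minimal usage sketch for init_h5_database, not taken from the original
# source: the file name and meta data columns below are illustrative only
# (see pyeem.load_eem_meta_data for the columns the library actually expects).
import pandas as pd

example_meta = pd.DataFrame({
    'sample_name': ['blank', 'sample1'],
    'file_name': ['blank.csv', 'sample1.csv'],
})
init_h5_database('eem_database.h5', example_meta, overwrite=True)

# the stored table can be read back directly with pandas
meta_check = pd.read_hdf('eem_database.h5', 'meta')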
class engine(Engine): """Engine instance for writing data to a HDF5 file.""" name = "HDF5" abbreviation = "hdf5" insert_limit = 1000 required_opts = [ ("file", "Enter the filename of your HDF5 file", "hdf5.h5"), ("table_name", "Format of table name", "{db}_{table}"), ("data_dir", "Install directory", DATA_DIR), ] def create_db(self): """Override create_db since an SQLite dataset needs to be created first followed by the creation of an empty HDFStore file. """ file_path = os.path.join(self.opts["data_dir"], self.opts["file"]) self.file = HDFStore(file_path) def create_table(self): """Don't create table for HDF5 HDF5 doesn't create tables. Each database is a file which has been created. This overloads`create_table` to do nothing in this case. """ return None def insert_data_from_file(self, filename): """Fill the table by fetching the dataframe from the SQLite engine and putting it into the HDFStore file. """ table_name = self.table_name() df = self.fetch_table(table_name) self.file.put(table_name, df, data_columns=True) def fetch_table(self, table_name): """Return a table from sqlite dataset as pandas dataframe.""" connection = self.get_sqlite_connection() sql_query = "SELECT * FROM {};".format(table_name) return pd.read_sql_query(sql_query, connection) def get_sqlite_connection(self): # self.get_input() file = self.opts["file"] file = (file.split("."))[0] + ".db" db_file = self.opts["data_dir"] full_path = os.path.join(db_file, file) return dbapi.connect(os.path.normpath(full_path)) def get_connection(self): """Gets the db connection.""" self.get_input() return DummyConnection() def disconnect(self): """Close the file after being written""" self.file.close() file = self.opts["file"] file = (file.split("."))[0] + ".db" os.remove(file)
def create_store(sub):
    hdf = HDFStore('all.h5')
    columns = [
        'SUB', 'SEED', 'SEED ROI', 'TARGET ROI', 'HEMISPHERE',
        'DISTANCE', 'STRENGTH', 'CAT1', 'CAT2', 'CAT3'
    ]
    d = DataFrame(columns=columns)
    for i in range(1, 181):
        LSfname = '../' + sub + '/out/L' + str(i) + '/matrix_seeds_to_all_targets'
        LDfname = '../' + sub + '/out/L' + str(i) + '/matrix_seeds_to_all_targets_lengths'
        RSfname = '../' + sub + '/out/R' + str(i) + '/matrix_seeds_to_all_targets'
        RDfname = '../' + sub + '/out/R' + str(i) + '/matrix_seeds_to_all_targets_lengths'

        ls = readS2R(LSfname)
        rs = readS2R(RSfname)
        ld = readS2R_L(LDfname)
        rd = readS2R_L(RDfname)

        numSeeds, numROIs = ls.shape
        for j in tqdm(range(numSeeds), total=numSeeds):
            for q in range(numROIs):
                # give the Series the frame's column labels so append lines up columns
                tmp = Series([sub, j + 1, i + 1, q + 1, 'L',
                              ld[j, q], ls[j, q], '', '', ''], index=columns)
                d = d.append(tmp, ignore_index=True)
        # numSeeds, numROIs = rs.shape
        # for j in range(numSeeds):
        #     for q in range(numROIs):
        #         tmp = Series([sub, j+1, i+1, q+1, 'R', rd[j,q], rs[j,q], '', '', ''], index=columns)
        #         d = d.append(tmp, ignore_index=True)
        if i == 1:
            break
    hdf.put(sub, d)
def compute_and_save_hist_as_pd(values     : np.ndarray,
                                out_file   : pd.HDFStore,
                                hist_name  : str,
                                n_bins     : int,
                                range_hist : Tuple[float, float],
                                norm       : bool = False) -> None:
    """
    Computes 1d-histogram and saves it in a file.
    The name of the table inside the file must be provided.

    Parameters
    ----------
    values : np.ndarray
        Array with values to be plotted.
    out_file: pd.HDFStore
        File where histogram will be saved.
    hist_name: string
        Name of the pd.DataFrame to contain the histogram.
    n_bins: int
        Number of bins to make the histogram.
    range_hist: length-2 tuple
        Range of the histogram.
    norm: bool
        If True, histogram will be normalized.
    """
    n, b = np.histogram(values, bins=n_bins, range=range_hist, density=norm)
    table = pd.DataFrame({'entries': n, 'magnitude': shift_to_bin_centers(b)})
    out_file.put(hist_name, table, format='table', data_columns=True)
    return
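# Hedged usage sketch for compute_and_save_hist_as_pd. shift_to_bin_centers is
# assumed to be the module helper returning bin centres; a stand-in with the
# usual definition is included only to keep the example self-contained.
import numpy as np
import pandas as pd

def shift_to_bin_centers(edges: np.ndarray) -> np.ndarray:
    # midpoint of each pair of consecutive bin edges
    return (edges[:-1] + edges[1:]) / 2

values = np.random.normal(loc=0.0, scale=1.0, size=10_000)
with pd.HDFStore('histograms.h5', mode='w') as out_file:
    compute_and_save_hist_as_pd(values, out_file, 'hist_energy',
                                n_bins=50, range_hist=(-5.0, 5.0), norm=True)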
class PandasHDFHandler(FileHandler): r""" Handler for HDF5 files using Pandas. """ def _open_for_read(self): self.handle = HDFStore(self.fname, mode='r') def _open_for_write(self): self.handle = HDFStore(self.fname) def list_items(self): keys = [key.strip('/') for key in self.handle.keys()] items = [(key, _get_type_from_attrs(self.handle.get_storer(key).attrs)) for key in keys if '/' not in key] # ---- for backward compatibility (LArray < 0.33) ---- # axes items += [(key.split('/')[-1], 'Axis_Backward_Comp') for key in keys if '__axes__' in key] # groups items += [(key.split('/')[-1], 'Group_Backward_Comp') for key in keys if '__groups__' in key] return items def _read_item(self, key, typename, *args, **kwargs): if typename in _supported_typenames: hdf_key = '/' + key # ---- for backward compatibility (LArray < 0.33) ---- elif typename == 'Axis_Backward_Comp': hdf_key = '__axes__/' + key elif typename == 'Group_Backward_Comp': hdf_key = '__groups__/' + key else: raise TypeError() return read_hdf(self.handle, hdf_key, *args, **kwargs) def _dump_item(self, key, value, *args, **kwargs): hdf_key = '/' + key if isinstance(value, (Array, Axis)): value.to_hdf(self.handle, hdf_key, *args, **kwargs) elif isinstance(value, Group): hdf_axis_key = '/' + value.axis.name value.to_hdf(self.handle, hdf_key, hdf_axis_key, *args, **kwargs) elif isinstance(value, _supported_scalars_types): s = pd.Series(data=value) self.handle.put(hdf_key, s) self.handle.get_storer(hdf_key).attrs.type = type(value).__name__ else: raise TypeError() def _read_metadata(self): metadata = Metadata.from_hdf(self.handle) if metadata is None: metadata = Metadata() return metadata def _dump_metadata(self, metadata): metadata.to_hdf(self.handle) def close(self): self.handle.close()
def storeHdf5(data, tag, path):
    hdf = HDFStore(path, 'a')
    # HDFStore.keys() returns paths with a leading '/', so test membership on
    # the store itself; write in table format so later calls can append.
    if tag in hdf:
        hdf.append(tag, data)
    else:
        hdf.put(tag, data, format='table')
    hdf.close()
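# Hedged usage sketch for storeHdf5: the first call with a given tag creates
# the node, later calls append to it, and the result reads back with pandas.
# File, tag and column names here are illustrative only.
import pandas as pd

batch1 = pd.DataFrame({'x': [1, 2], 'y': [3.0, 4.0]})
batch2 = pd.DataFrame({'x': [5, 6], 'y': [7.0, 8.0]})

storeHdf5(batch1, 'measurements', 'archive.h5')  # creates /measurements
storeHdf5(batch2, 'measurements', 'archive.h5')  # appends two more rows

all_rows = pd.read_hdf('archive.h5', 'measurements')  # 4 rows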
def build_from_openfisca( directory = None): df_age_final = None for yr in range(2006,2010): simulation = SurveySimulation() simulation.set_config(year = yr) simulation.set_param() simulation.set_survey() df_age = get_age_structure(simulation) df_age[yr] = df_age['wprm'] del df_age['wprm'] if df_age_final is None: df_age_final = df_age else: df_age_final = df_age_final.merge(df_age) if directory is None: directory = os.path.dirname(__file__) fname = os.path.join(directory, H5_FILENAME) store = HDFStore(fname) print df_age_final.dtypes store.put("openfisca", df_age_final) store.close()
def save(self, store: pandas.HDFStore) -> None: """ Save a model to an open HDFStore. Notes: Performs an IO operation. Args: store (pandas.HDFStore) Returns: None """ # save the config as an attribute config = self.get_config() store.put('model', pandas.DataFrame()) store.get_storer('model').attrs.config = config # save the parameters for i in range(self.num_weights): key = os.path.join('weights', 'weights' + str(i)) self.weights[i].save_params(store, key) for i in range(self.num_layers): key = os.path.join('layers', 'layers' + str(i)) self.layers[i].save_params(store, key)
def convert_to_3_tables(year=2006, survey_file=None, output_file=None): if survey_file is None: raise Exception( 'You need a .h5 file with the survey to extract the variables from' ) if output_file is None: output_file = survey_file raise Warning( 'the survey file will be used to store the created tables') store = HDFStore(survey_file) output = HDFStore(output_file) print output simulation = SurveySimulation() simulation.set_config(year=year) table1 = store['survey_' + str(year)] for entity in ['ind', 'foy', 'men', 'fam']: key = 'survey_' + str(year) + '/' + str(entity) vars_matching_entity = vars_matching_entity_from_table( table1, simulation, entity) print entity, vars_matching_entity_from_table print 'table1 enum' if entity == 'ind': print 'INDIVIDUALS' print table1['noindiv'] table_entity = table1.loc[:, vars_matching_entity] # we take care have all ident and selecting qui==0 else: # print ' entity :', entity # print table1['noindiv'].head() position = 'qui' + entity # print table1[position] table_entity = table1.ix[table1[position] == 0, [ 'noi', 'idmen', 'idfoy', 'idfam', 'quifoy', 'quimen', 'quifam' ] + vars_matching_entity] # print table_entity.noi.head() table_entity = table_entity.rename_axis(table_entity['id' + entity], axis=1) # print ' APRES' # print table_entity.noi.head() print key output.put(key, table_entity) del table1 import gc gc.collect() store.close() output.close()
def write_file(format):
    outfile = '../inst/exampledata/pytables_' + format + '.h5'
    if os.path.isfile(outfile):
        os.remove(outfile)
    hdf = HDFStore(outfile)
    hdf.put('mydata', df, format=format, data_columns=True, encoding="utf-8")
    hdf.close()
class HdfStore(DataStore): complevel = 9 complib = "blosc:zstd" def __init__(self, path: str, table: str, compute: Optional[Callable] = None) -> None: self.table = table if compute: self.store = PandasHDFStore(path, complevel=self.complevel, complib=self.complib) dataframe = compute() dataframe.sort_values(by="where", axis=0, inplace=True) self._mangle_where(dataframe) self.store.put( self.table, dataframe, append=False, format="table", expectedrows=len(dataframe), data_columns=[ "where_", "where_type", "who", "who_type", "when", "when_type" ], ) else: self.store = PandasHDFStore(path, complevel=self.complevel, complib=self.complib, mode="r") def query(self, query: str) -> DataFrame: query = self._mangle_where_in_query(query) df = self.store.select(self.table, where=query) self._unmangle_where(df) return df def _mangle_where(self, df: DataFrame) -> None: # See: https://github.com/PyTables/PyTables/issues/638 df.rename(columns={"where": "where_"}, inplace=True) def _unmangle_where(self, df: DataFrame) -> None: # See: https://github.com/PyTables/PyTables/issues/638 df.rename(columns={"where_": "where"}, inplace=True) def _mangle_where_in_query( self, query: Union[str, List[str]]) -> Union[str, List[str]]: # See: https://github.com/PyTables/PyTables/issues/638 if isinstance(query, str): return re.sub("where([^_])", "where_\\1", query) else: return [ self._mangle_where_in_query(subquery) for subquery in query ]
def save(self, store: pandas.HDFStore) -> None: config = self.get_config() store.put('model', pandas.DataFrame()) store.get_storer('model').attrs.config = config for i in range(self.num_layers): key = os.path.join('layers', 'layers_'+str(i)) self.layers[i].save_params(store, key) for i in range(self.num_connections): key = os.path.join('connections', 'weights_'+str(i)) self.connections[i].weights.save_params(store, key)
def to_frame_hdf(self, store_path, store_key, df_cb=None, max_msg=None, usecols=None, chunk_cnt=CHUNK_CNT, show_prog=True): store = HDFStore(store_path, 'w') df = self._to_frame(usecols, chunk_cnt, show_prog) df['msg'] = df['msg'].apply(lambda m: m.encode('utf8')) if df_cb is not None: df_cb(df) min_itemsize = {'kind': 20, 'msg': 255} if max_msg is not None: min_itemsize['msg'] = max_msg store.put(store_key, df, format='table', min_itemsize=min_itemsize) store.flush() store.close()
def save_all_logs(self, force=False): if os.path.exists(self.store_path): final_store = HDFStore(self.store_path) print 'Keys: %s' % final_store final_store.close() return if not force: assert not os.path.exists(self.history_path), ''' %s exists but %s does not. There appears to be a conversion in progress. -f forces conversion to complete. ''' % (self.history_path, self.store_path) self.directory.make_dir_if_necessary(self.progress_store_path) self.progress_store = HDFStore(self.progress_store_path) for path in self.log_list: self.save_log(path) self.check() print '--------' print 'All tables in %s' % self.progress_store_path print self.progress_store.keys() print '--------' def get_log(path): try: return self.progress_store.get(LogSaver.normalize(path)) except Exception as e: print print path raise e df_list = [get_log(path) for path in self.log_list] self.progress_store.close() print 'Closed %s' % self.progress_store_path df_all = pd.concat(df_list) print 'Final list has %d entries' % len(df_all) final_store = HDFStore(self.store_path) final_store.put('logs', df_all) print 'Keys: %s' % final_store final_store.close() print 'Closed %s' % self.store_path # Save the history in a corresponding file self.directory.save('history', self.history) print 'Saved history' self.saved = True
def save_xarray_to_HDF5(dataArray, filename, complib=None): """Save the xarray DataArray to HDF file using pandas HDFStore attrs will be saved as metadata via pickle requries pytables complib : {'zlib', 'bzip2', 'lzo', 'blosc', None}, default None""" from pandas import HDFStore f = HDFStore(filename, mode='w', complib=complib) f.put('data', dataArray.to_pandas()) if len(dataArray.attrs) > 0: f.get_storer('data').attrs.metadata = dataArray.attrs f.close()
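def load_xarray_from_HDF5(filename):
    """Hedged counterpart to save_xarray_to_HDF5, not part of the original
    source: rebuild a DataArray from the 'data' node and reattach any attrs
    that were pickled into the storer's metadata."""
    import xarray as xr
    from pandas import HDFStore
    with HDFStore(filename, mode='r') as f:
        dataArray = xr.DataArray(f['data'])
        storer = f.get_storer('data')
        if hasattr(storer.attrs, 'metadata'):
            dataArray.attrs = storer.attrs.metadata
    return dataArray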
def make_df_file(): path = '../data/' remove_list = ['CAMERA', 'END', 'DISSOLVE', 'CUT'] files = [f for f in listdir(path) if isfile(join(path, f))] for f in files: script = list(open(path+f, 'r', encoding='utf-8')) act_list = get_main_actors(script) speaker, sentence = get_flow(script, act_list) # Evaluate Sentimet value = calculateSent(sentence) value = moving_average(value) hdf = HDFStore('../processed_data/'+f[0:-4]+'.h5') hdf.put('d1', pd.DataFrame({'speaker':speaker, 'value':value}), format='table', data_columns=True) hdf.close() print("Store ", f)
def save_data(data=pd.DataFrame, hdfs=True, dir=""):
    import datetime
    file_name = "raw_data_" + datetime.datetime.now().strftime("%y%m%d")
    data.to_csv(dir + "/" + file_name + ".csv", encoding="utf-8")
    print("Data saved as csv already")
    if hdfs:
        hdf = HDFStore(file_name + ".h5")
        # store the DataFrame that was passed in (the original referenced an
        # undefined name `result` here)
        hdf.put(file_name, data, format="table", data_columns=True, encoding="utf-8")
        hdf.close()
        print("Data saved as HDF already")
    else:
        pass
def pf2pandas(wd, files, vars=None, npwd=None, rmvars=None, \
        debug=False):
    """ Read in GEOS-Chem planeflight output and convert to HDF format
    - Converts date and time columns to datetime format indexes
    - the resultant HDF is in 2D list form
    ( aka further processing required to 3D /2D output )

    Note:
    - This function is limited by the csv read speed. For large csv output expect
    significant processing times, or set to automatically run post run.
    - Original files are not removed, so this function will double space usage for
    output unless the original files are deleted.
    """
    # Ensure working directory string has a trailing forward slash
    if wd[-1] != '/':
        wd += '/'

    # pfdate =( re.findall('\d+', file ) )[-1]
    if not isinstance(vars, list):
        vars, sites = get_pf_headers(files[0], debug=debug)
    if not isinstance(npwd, str):
        npwd = get_dir('npwd')
    # note: the last split component is unused by the two-field format string
    hdf = HDFStore(npwd + 'pf_{}_{}.h5'.format(wd.split('/')[-3], \
        wd.split('/')[-2], wd.split('/')[-1]))
    if debug:
        print hdf

    for file in files:
        print file  # , pfdate

        # convert planeflight.log to DataFrame
        df = pf_csv2pandas(file, vars)

        if file == files[0]:
            hdf.put('d1', df, format='table', data_columns=True)
        else:
            hdf.append('d1', df, format='table', data_columns=True)

        if debug:
            print hdf['d1'].shape, hdf['d1'].index
        del df
    hdf.close()
def save_hdf_r_readable(data_frame, config_files_directory = default_config_files_directory, file_name = None, file_path = None): if file_path is None: parser = SafeConfigParser() config_ini = os.path.join(config_files_directory, 'config.ini') parser.read(config_ini) tmp_directory = parser.get('data', 'tmp_directory') if file_name is not None: if not file_name.endswith('.h5'): file_name = "{}.h5".format(file_name) file_path = os.path.join(tmp_directory, file_name) else: file_path = os.path.join(tmp_directory, 'temp.h5') store = HDFStore(file_path, "w", complib = str("zlib"), complevel = 5) store.put("dataframe", data_frame, data_columns = data_frame.columns) store.close()
class Serialization(object): def __init__(self, filename, mode='r', compress=True): self._filename = filename self._compress = compress self._mode = mode def __enter__(self): if self._compress: self._store = HDFStore(self._filename, complib='blosc:lz4', complevel=9, mode=self._mode) else: # pragma: no cover self._store = HDFStore(self._filename, mode=self._mode) return self def __exit__(self, exc_type, exc_val, exc_tb): self._store.close() @property def keys(self): return self._store.keys() def store_pandas_object(self, path, obj, **metadata): self._store.put(path, obj, format='fixed') self._store.get_storer(path).attrs.metadata = metadata def retrieve_pandas_object(self, path): # Get the metadata metadata = self._store.get_storer(path).attrs.metadata # Get the object obj = self._store.get(path) return obj, metadata
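# Hedged usage sketch for the Serialization context manager above; the file
# name, node path and metadata keys are illustrative.
import pandas as pd

frame = pd.DataFrame({'a': [1, 2, 3]})

with Serialization('results.h5', mode='w') as s:
    s.store_pandas_object('fits/run1', frame, source='unit_test', version=1)

with Serialization('results.h5', mode='r') as s:
    obj, metadata = s.retrieve_pandas_object('fits/run1')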
def build_from_insee( directory = None, verbose=False): if directory is None: directory = os.path.dirname(__file__) fname = os.path.join(directory, H5_FILENAME) store = HDFStore(fname) xls = ExcelFile(os.path.join(model.DATA_SOURCES_DIR, "sd2010_t6_fm.xls")) df_age_final = None for year in range(2006,2010): sheet_name = str(year) df = xls.parse(sheet_name, header=0, index_col=0, skiprows=8, parse_cols=[1,2], na_values=['NA']) df.index.name = u"âge" df.rename(columns = {"Unnamed: 1" : year}, inplace = True) # Dealing with te 90 et plus and 105 et plus df = df.reset_index() df = df.dropna(axis=0) df.set_value(106,u"âge", 105) df = df.set_index(u"âge") df.drop(df.index[90], axis=0, inplace=True) df.index.name = u"âge" df = df.reset_index() if verbose: print "year : " + str(year) print df.to_string() if df_age_final is None: df_age_final = df else: df_age_final = df_age_final.merge(df) if verbose: print df_age_final.to_string() print df_age_final.dtypes from numpy import dtype df_age_final[u"âge"] = df_age_final[u"âge"].astype(dtype("int64")) store.put("insee", df_age_final)
def main(): knowledge_dir = '/home/tor/xprmnt/knowledge-construction' # read relloc_knowledge = kr.read(knowledge_dir+'/relative-location-knowledge/relloc-pickle') # write relloc_knowledge_hdf5 = HDFStore(knowledge_dir+'/relative-location-knowledge/relloc-hdf5/relloc.h5') for key, local in relloc_knowledge.iteritems(): for key2 in local: df = DataFrame(local[key2]) df_id = key+'/'+key2 print 'writing:', df_id relloc_knowledge_hdf5.put(df_id, df) print 'writing: obj_class' obj_class = Series(relloc_knowledge.keys()) relloc_knowledge_hdf5.put('obj_class', obj_class)
def save_hdf_r_readable(data_frame, config_files_directory=default_config_files_directory, file_name=None, file_path=None): if file_path is None: parser = ConfigParser() config_ini = os.path.join(config_files_directory, 'config.ini') parser.read(config_ini) tmp_directory = parser.get('data', 'tmp_directory') if file_name is not None: if not file_name.endswith('.h5'): file_name = "{}.h5".format(file_name) file_path = os.path.join(tmp_directory, file_name) else: file_path = os.path.join(tmp_directory, 'temp.h5') store = HDFStore(file_path, "w", complib=str("zlib"), complevel=5) store.put("dataframe", data_frame, data_columns=data_frame.columns) store.close()
def aggregate(hdf_store_loc, file_pattern, headerfile=None, remove_part_files=False):
    df = None
    store = HDFStore(hdf_store_loc)
    store_keys = [w.replace('/', '') for w in store.keys()]
    print(f'Aggregating part files in {hdf_store_loc} for {file_pattern} into single file')
    for key in store_keys:
        if re.match(file_pattern.replace('*', '.+'), key):
            print(f'********************* Key : {key} matches pattern : {file_pattern.replace("*",".+")}')
            # thisdf = pd.read_hdf(store_loc, key)
            thisdf = store.select(key)
            if df is None:
                df = thisdf
            else:
                # for gz files that do not have headers, assign headers
                try:
                    df = df.append(thisdf, ignore_index=True, sort=True)
                except Exception as e:
                    print(f'Error while joining data {e}')
            if remove_part_files:
                store.remove(key)
    try:
        # df.to_hdf(store_loc, key=file_pattern.replace('*',''))
        store.put(key=file_pattern.replace('*', ''), value=df)
    except Exception as e:
        print(f'Exception while combining file for {file_pattern} exception {e}')
    store.close()
def write_to_hdf5_on_disk(self, file_name, writing_dataframe, episode=None): if episode == 0: # create HDFStore container print(file_name) hdf_container = HDFStore(file_name + '.h5') # print('--------------- Container type: {}'.format(type(hdf_container))) hdf_container.put(str(episode), writing_dataframe, format='table', data_columns=True) hdf_container.close() else: with HDFStore(file_name + '.h5', mode='a') as store: store.append(str(episode), writing_dataframe, append=True, format='table', data_columns=True)
class ProbStore:
    def __init__(self, path=PREDICT_PROBS_PATH):
        self._hdf = HDFStore(path)
        self._path = path
        self._length = 0

    def saveProbs(self, data):
        """
        data -- a DataFrame for one image, index -- index of an image in test
        """
        assert isinstance(data, DataFrame)
        # self._hdf['d' + str(self._length)] = data
        self._hdf.put('d' + str(self._length), data, format='table')
        self._length += 1

    def __len__(self):
        return self._length

    def __getitem__(self, index):
        return self._hdf['d' + str(index)]
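# Hedged usage sketch for ProbStore; the path and the shape of the per-image
# probability frame are assumptions, not taken from the source.
from pandas import DataFrame

probs = ProbStore('predict_probs.h5')
per_image = DataFrame({'class_a': [0.1, 0.7], 'class_b': [0.9, 0.3]})
probs.saveProbs(per_image)
first = probs[0]      # DataFrame stored under key 'd0'
n_saved = len(probs)  # 1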
def PopulateMasterTestLogTable(mypath): """ Calls MergeTestLogs to iterate over all subdirs of mypath and read in TestLogSnn Excel files and build a dataframe with all recorded test conditions. Then, calls ProcessResults to extract scalar properties of islanding test results and save them into placeholders in the master table. Finally, saves the master table as h5 and Excel files Input: Directory with the test result directories, e.g.: "aLabView2" Output: TestLogsAll.h5, TestLogsAll.xlsx Note: ProcessResults generate Results.h5 and Results.pdf files in the data subdirs. """ from os import listdir from os.path import isdir, join from pandas import HDFStore, ExcelWriter import MeasurementGroupTools as mgt TestLog = MergeTestLogs(mypath) mydirs = [d for d in listdir(mypath) if isdir(join(mypath,d)) ] # print mydirs + mydirs[1:2] for dname in mydirs: mysubdirpath = mypath + "\\" + dname print "Processing: " + dname mgt.ProcessResults(mysubdirpath, TestLog) h5store = HDFStore(mypath + "\\" + 'TestLogsAll.h5') h5store.put('TestLogsAll',TestLog) h5store.close() writer = ExcelWriter(mypath + "\\" + 'TestLogsAll.xlsx') TestLog.to_excel(writer,'TestLogsAll') # the second argument defines sheet name writer.save() return
def save(self, store: pandas.HDFStore, num_components_save: int = None) -> None: """ Save the PCA transform in an HDFStore. Allows to save only the first num_components_save. Notes: Performs an IO operation. Args: store (pandas.HDFStore) num_components_save (int): the number of principal components to save. If None, all are saved. Returns: None """ n = num_components_save if num_components_save is not None \ else self.num_components assert n <= self.num_components # the config config = {'num_components': n, 'stepsize': self.stepsize} store.put('pca', pandas.DataFrame()) store.get_storer('pca').attrs.config = config # the parameters store.put('pca/W', pandas.DataFrame(be.to_numpy_array(self.W[:, :n]))) store.put('pca/var', pandas.DataFrame(be.to_numpy_array(self.var[:n]))) # check if the mean exists before saving if self.mean is not None: store.put('pca/mean', pandas.DataFrame(be.to_numpy_array(self.mean))) var_calc_df = self.var_calc.to_dataframe() # if fit from SVD, there is no calculator used if var_calc_df is not None: store.put('pca/var_calc', var_calc_df.iloc[:n])
def csv2hdf5(csv_name, h5_name, dfname, option='frame'): """ Convert a csv file to a dataframe in a hdf5 Parameters: csv_name: string csv file name h5_name : string hdf5 file name dfname : string dataframe name option : string, 'frame' or 'table', default to 'frame' stoing type in the pytable """ table = read_csv(csv_name) store = HDFStore(h5_name) if option == 'frame': store.put(dfname, table) elif option == 'table': # for frame_table à la pytables object_cols = table.dtypes[table.dtypes == 'object'] print object_cols.index try: store.append(dfname, table) except: print table.get_dtype_counts() object_cols = table.dtypes[table.dtypes == 'object'] for col in object_cols.index: print 'removing object column :', col del table[col] store.append(dfname, table) print store store.close()
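# Hedged usage sketch for csv2hdf5: write one CSV as a fixed-format node and
# another as an appendable table node, then read one back. File and dataframe
# names are illustrative.
from pandas import read_hdf

csv2hdf5('survey_2006.csv', 'survey.h5', 'survey_2006')
csv2hdf5('survey_2007.csv', 'survey.h5', 'survey_2007', option='table')

df_2006 = read_hdf('survey.h5', 'survey_2006')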
def pull(start, end): hdf = HDFStore('/home/slin/alldata/leaps/liq.h5', mode='a') dt = start while dt <= end: print dt try: fit_func = partial(fit_full, dt=dt) cores = 48 vpool = Pool(cores) ids = pd.read_hdf('/home/slin/alldata/leaps/secids.h5', 'secids') # ids = ids.iloc[:10] grouped = ids.groupby(['ticker'], as_index=False, sort=False) keys = grouped.groups.keys() split = [list(arr) for arr in np.array_split(keys, cores)] groups = [map(lambda x: grouped.get_group(x), s) for s in split] results = vpool.map(fit_list, groups) frame = pd.concat(results).reset_index(drop=True) print frame try: hdf.put('{0}'.format(dt.strftime('%Y%m%d')), frame, format='table', data_columns=True) except Exception as err: print err except Exception as err: print err finally: dt = (dt + us_bd).date()
class Serialization(object): def __init__(self, filename): self._filename = filename def __enter__(self): self._store = HDFStore(self._filename, complib='blosc', complevel=9) return self def __exit__(self, exc_type, exc_val, exc_tb): self._store.close() @property def keys(self): return self._store.keys() def store_pandas_object(self, name, object, **metadata): self._store.put(name, object) self._store.get_storer(name).attrs.metadata = metadata def retrieve_pandas_object(self, name): # Get the metadata metadata = self._store.get_storer(name).attrs.metadata # Get the object obj = self._store[name] return obj, metadata
def save_content(self, name, filename): """ Saves content from the simulation in an HDF store. We save output_table, input_table, and the default output_table dataframes, along with the other attributes using pickle. TODO : we don't save attributes P, P_default for simulation neither _param, _default_param for datatables. WARNING : Be careful when committing, you may have created a .pk data file. Parameters ---------- name : the base name of the content inside the store. filename : the name of the .h5 file where the table is stored. Created if not existant. """ sys.setrecursionlimit(32000) # Store the tables if self.verbose: print 'Saving content for simulation under name %s' % name ERF_HDF5_DATA_DIR = os.path.join(model.DATA_DIR, 'erf') store = HDFStore(os.path.join(os.path.dirname(ERF_HDF5_DATA_DIR), filename + '.h5')) if self.verbose: print 'Putting output_table in...' store.put(name + '_output_table', self.output_table.table) if self.verbose: print 'Putting input_table in...' store.put(name + '_input_table', self.input_table.table) if self.verbose: print 'Putting output_table_default in...' store.put(name + '_output_table_default', self.output_table_default.table) store.close() # Store all attributes from simulation with open(filename + '.pk', 'wb') as output: if self.verbose: print 'Storing attributes for simulation (including sub-attributes)' pickle.dump(self, output)
for k in diff1: pd.set_printoptions(max_columns=30) listind = table['ind'][table['ind'][ident]==k] print listind for indiv in np.unique(listind['id']): print table['ind'].ix[table['ind']['id']==indiv,['id','period','sexe','idmen','quimen','idfoy','quifoy','conj','mere','pere']] pdb.set_trace() for year in years: goal.remove('survey_'+str(year)) for ent in ('ind','men','foy','fam'): tab = table[ent].ix[table[ent]['period']==year] key = 'survey_'+str(year) + '/'+ent goal.put(key, tab) # if year == 2010: # pdb.set_trace() # tab = table[ent].ix[table[ent]['period']==year] # tab[:5] # len(tab['idfam']) # len(np.unique(tab['idfam'])) # list_qui = tab['idfam'] # double = list_qui.value_counts()[list_qui.value_counts()>1] # tabind = table['ind'].ix[table['ind']['period']==year] store.close() goal.close() # on fais maintenant tourner le modèle OF
class HDFStoreDataFrame(BaseIO): def setup(self): N = 25000 index = tm.makeStringIndex(N) self.df = DataFrame({'float1': np.random.randn(N), 'float2': np.random.randn(N)}, index=index) self.df_mixed = DataFrame({'float1': np.random.randn(N), 'float2': np.random.randn(N), 'string1': ['foo'] * N, 'bool1': [True] * N, 'int1': np.random.randint(0, N, size=N)}, index=index) self.df_wide = DataFrame(np.random.randn(N, 100)) self.start_wide = self.df_wide.index[10000] self.stop_wide = self.df_wide.index[15000] self.df2 = DataFrame({'float1': np.random.randn(N), 'float2': np.random.randn(N)}, index=date_range('1/1/2000', periods=N)) self.start = self.df2.index[10000] self.stop = self.df2.index[15000] self.df_wide2 = DataFrame(np.random.randn(N, 100), index=date_range('1/1/2000', periods=N)) self.df_dc = DataFrame(np.random.randn(N, 10), columns=['C%03d' % i for i in range(10)]) self.fname = '__test__.h5' self.store = HDFStore(self.fname) self.store.put('fixed', self.df) self.store.put('fixed_mixed', self.df_mixed) self.store.append('table', self.df2) self.store.append('table_mixed', self.df_mixed) self.store.append('table_wide', self.df_wide) self.store.append('table_wide2', self.df_wide2) def teardown(self): self.store.close() self.remove(self.fname) def time_read_store(self): self.store.get('fixed') def time_read_store_mixed(self): self.store.get('fixed_mixed') def time_write_store(self): self.store.put('fixed_write', self.df) def time_write_store_mixed(self): self.store.put('fixed_mixed_write', self.df_mixed) def time_read_store_table_mixed(self): self.store.select('table_mixed') def time_write_store_table_mixed(self): self.store.append('table_mixed_write', self.df_mixed) def time_read_store_table(self): self.store.select('table') def time_write_store_table(self): self.store.append('table_write', self.df) def time_read_store_table_wide(self): self.store.select('table_wide') def time_write_store_table_wide(self): self.store.append('table_wide_write', self.df_wide) def time_write_store_table_dc(self): self.store.append('table_dc_write', self.df_dc, data_columns=True) def time_query_store_table_wide(self): self.store.select('table_wide', where="index > self.start_wide and " "index < self.stop_wide") def time_query_store_table(self): self.store.select('table', where="index > self.start and " "index < self.stop") def time_store_repr(self): repr(self.store) def time_store_str(self): str(self.store) def time_store_info(self): self.store.info()
import get_trn_test as gtt import save_dist_m import traintest as tt import plot_results df = gdb.get_current_data() df.to_csv("per_day_from_pandas.csv") # df = pd.read_csv("per_day_from_pandas.csv") preprocess.normalize_by_event_count(df) # hdf5 doesn't like unicode df["country"] = df["country"].apply(lambda x: x.encode("ascii", "ignore")) countrydict = preprocess.get_country_lookup(df) hdf = HDFStore("project_data.h5") hdf.put("per_day_preprocessed", df, format="table", data_columns=True) hdf.get_storer("per_day_preprocessed").attrs.country_lookup = countrydict ##END PREPROCESSING train_years = 5 test_years = 1 hdf = HDFStore("project_data.h5") df = hdf["per_day_preprocessed"] basename_out = "last_6_years" train_start = 20091030 trainxy, testxy, countrylist = gtt.get_train_test(df, train_start, train_years, test_years) train_x = trainxy[0] train_y = trainxy[1] test_x = testxy[0] test_y = testxy[1] np.savez(
8020084, 8001993, 8004222, 8027386, 9039721, 9047848, 9016763]) ] print len(table_in_one) for entity in ['ind','foy','men','fam']: key = 'survey_'+str(year) + '/'+str(entity) vars_entity = from_one_to_three(table_in_one,entity) print entity, vars_entity if entity == 'ind': table_entity = table_in_one[vars_entity] # we take care have all ident and selecting qui==0 else: enum = 'qui'+entity table_entity = table_in_one.ix[table_in_one[enum] ==0 ,['noi','idmen','idfoy','idfam'] + vars_entity] table_entity= table_entity.rename_axis(table_entity['id'+entity],axis=1) print key output.put(key, table_entity) del table_in_one gc.collect() store.close() output.close() # test pour voir si les "lignes" sont nulles #enum = 'qui'+entity #table_in_one[enum] == 0 # #voir = np.array(table_in_one.ix[table_in_one[enum] == 0,vars_entity]) #voir[:,:-1] != 0 #len(np.where( voir[:,:-1] != 0 )[0]) #np.unique(np.where( voir != 0 )[1])
def main(simulation, period=None, output=".h5"): temps = time.clock() output_tab = path_til + "/output/to_run_leg.h5" name_convertion = {'person':'ind','declar':'foy','menage':'men', 'fam':'fam'} # on travaille d'abord sur l'ensemble des tables puis on selectionne chaque annee # on étudie d'abord la table individu pour pouvoir séléctionner les identifiants # step 1 table = {} entities = simulation.entities for entity in entities: nom = entity.name if nom == 'person': ent = name_convertion[nom] # convert from PyTables to Pandas table[ent] = pd.DataFrame(entity.array.columns) # rename variables to make them OF ones table['ind'] = table['ind'].rename(columns={ 'men': 'idmen', 'foy': 'idfoy', 'id': 'noi', 'statmarit': 'civilstate'}) # get years years = np.unique(table['ind']['period'].values/100) ent = 'ind' # création de variable # useless since agem is in simu # table[ent]['agem'] = 12 * table[ent]['age'] table[ent]['ageq'] = table[ent]['age']/5 - 4 table[ent]['ageq'] = table[ent]['ageq']*(table[ent]['ageq'] > 0) table[ent]['ageq'] = 12 + (table[ent]['ageq']-12)*(table[ent]['ageq'] < 12) #TODO: modifier pour les jeunes veufs # create fam entity try: table[ent][['idfam','quifam']] = table[ent].loc[:,['idmen','quimen']] except: pdb.set_trace() # save information on qui == 0 foy0 = table[ent].ix[table[ent]['quifoy']==0,['noi','idfoy','idmen','idfam','period']] men0 = table[ent].ix[table[ent]['quimen']==0,['noi','idfoy','idmen','idfam','period']] # # Travail sur les qui quand on ne controle pas dans la simulation que tout le monde n'est pas qui==2 ## inutile car fait maintenant dans la simulation mais peut-être mieux à refaire ici un jour ## parce que ça prend du temps dans la simulation # time_qui = time.clock() # for ent in ('men','foy'): # 'fam' un jour... 
# print "Deal with qui for ", ent # qui= 'qui'+ent # ident = 'id'+ent # trav = table['ind'].ix[table['ind'][qui]==2, [ident,qui,'period']] # for name, groupfor nom in ('menage','declar','fam'):for nom in ('menage','declar','fam'): in trav.groupby([ident,'period']): # to_add = range(len(group)) # group[qui] = group[qui]+to_add # table['ind'].ix[group[qui].index, qui] = group[qui] # print "les qui pour ", ent," sont réglés" # time_qui = time.clock() - time_qui # print "le temps passé à s'occuper des qui a été",time_qui for entity in entities: nom = entity.name if nom in name_convertion: if nom != 'person': pd.DataFrame(entity.array.columns) ent = name_convertion[nom] # convert from PyTables to Pandas table[ent] = pd.DataFrame(entity.array.columns) ident = 'id'+ent table[ent] = table[ent].rename(columns={'id': ident}) table[ent] = merge(table[ent], eval(ent +'0'), how='left', left_on=[ident,'period'], right_on=[ident,'period']) # traduction de variable en OF pour ces entités if ent=='men': # nbinde est limité à 6 personnes et donc valeur = 5 en python table[ent]['nbinde'] = (table[ent]['nb_persons']-1) * (table[ent]['nb_persons']-1 <=5) +5*(table[ent]['nb_persons']-1 >5) table['fam'] = men0 if period is not None: years=[period] print years # a comnmenter quand on est sur du nodele pour gagner un peu de temps # test = {} # for year in years: # for nom in ('menage','declar'): # ent = name_convertion[nom] ## print ent, base, ident # test[ent] = pd.DataFrame(entity.array.columns).rename(columns={'id': ident}) # test[ent] = test[ent].ix[test[ent]['period']==year,:] # # test0 = eval(ent +'0')[eval(ent +'0')['period']==year] # # tab = table[ent].ix[table[ent]['period']==year,['noi','id'+ent,'idfam']] # ind = table['ind'].ix[table['ind']['period']==year,['qui'+ent]] # try: # list_ind = ind[ind==0] # except: # pdb.set_trace() # lidmen = test[ent][ident] # lidmenU = np.unique(lidmen) # diff1 = set(test0[ident]).symmetric_difference(lidmenU) # print year, ent, diff1 # for k in diff1: # # pd.set_printoptions(max_columns=30) # listind = table['ind'][table['ind'][ident]==k] # print listind # for indiv in np.unique(listind['noi']): # print table['ind'].ix[table['ind']['noi']==indiv,['noi','period','sexe','idmen','quimen','idfoy','quifoy','conj','mere','pere']] # pdb.set_trace() #available_years = sorted([int(x[-4:]) for x in store.keys()]) for year in years: if output=='.h5': try: os.remove(output_tab) except: print("Attention, la table intermediaire n'a pas ete supprimee") goal = HDFStore(output_tab) goal.remove('survey_'+str(year)) for ent in ('ind','men','foy','fam'): tab = table[ent].ix[table[ent]['period']/100==year] key = 'survey_'+str(year) + '/'+ent goal.put(key, tab) goal.close() else: for ent in ('ind','men','foy','fam'): table[ent] = table[ent].ix[table[ent]['period']/100==year] return table
class HDFStoreDataFrame(BaseIO): goal_time = 0.2 def setup(self): N = 25000 index = tm.makeStringIndex(N) self.df = DataFrame( { 'float1': np.random.randn(N), 'float2': np.random.randn(N) }, index=index) self.df_mixed = DataFrame( { 'float1': np.random.randn(N), 'float2': np.random.randn(N), 'string1': ['foo'] * N, 'bool1': [True] * N, 'int1': np.random.randint(0, N, size=N) }, index=index) self.df_wide = DataFrame(np.random.randn(N, 100)) self.start_wide = self.df_wide.index[10000] self.stop_wide = self.df_wide.index[15000] self.df2 = DataFrame( { 'float1': np.random.randn(N), 'float2': np.random.randn(N) }, index=date_range('1/1/2000', periods=N)) self.start = self.df2.index[10000] self.stop = self.df2.index[15000] self.df_wide2 = DataFrame(np.random.randn(N, 100), index=date_range('1/1/2000', periods=N)) self.df_dc = DataFrame(np.random.randn(N, 10), columns=['C%03d' % i for i in range(10)]) self.fname = '__test__.h5' self.store = HDFStore(self.fname) self.store.put('fixed', self.df) self.store.put('fixed_mixed', self.df_mixed) self.store.append('table', self.df2) self.store.append('table_mixed', self.df_mixed) self.store.append('table_wide', self.df_wide) self.store.append('table_wide2', self.df_wide2) def teardown(self): self.store.close() self.remove(self.fname) def time_read_store(self): self.store.get('fixed') def time_read_store_mixed(self): self.store.get('fixed_mixed') def time_write_store(self): self.store.put('fixed_write', self.df) def time_write_store_mixed(self): self.store.put('fixed_mixed_write', self.df_mixed) def time_read_store_table_mixed(self): self.store.select('table_mixed') def time_write_store_table_mixed(self): self.store.append('table_mixed_write', self.df_mixed) def time_read_store_table(self): self.store.select('table') def time_write_store_table(self): self.store.append('table_write', self.df) def time_read_store_table_wide(self): self.store.select('table_wide') def time_write_store_table_wide(self): self.store.append('table_wide_write', self.df_wide) def time_write_store_table_dc(self): self.store.append('table_dc_write', self.df_dc, data_columns=True) def time_query_store_table_wide(self): self.store.select('table_wide', where="index > self.start_wide and " "index < self.stop_wide") def time_query_store_table(self): self.store.select('table', where="index > self.start and " "index < self.stop") def time_store_repr(self): repr(self.store) def time_store_str(self): str(self.store) def time_store_info(self): self.store.info()
def put_to_hdf(df):
    # the original concatenated the path with a bare backslash and a ".5"
    # extension; os.path.join and ".h5" are assumed to be the intent
    hdf = HDFStore(os.path.join(dirname, "_InstrumentData.h5"))
    hdf.put('InstrumentData', df, format='table', data_columns=True)
    hdf.close()  # closes the file
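# Hedged counterpart to put_to_hdf, not from the original source: reopen the
# store read-only and pull the table back. select() works because the data
# was written with format='table'.
import os
from pandas import HDFStore

def read_from_hdf():
    with HDFStore(os.path.join(dirname, "_InstrumentData.h5"), mode='r') as hdf:
        return hdf.select('InstrumentData')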
def extract_relevant_data( case_list = [], exceptions = [], y_delta_locs = [], x_2h_locs = [] , plot = False): """ This will extract the wall normal data at the spanwise location TE at a certain y density """ from os import listdir from os.path import join,split from pandas import DataFrame, HDFStore, read_pickle from boundary_layer_routines import return_bl_parameters from raw_data_processing_routines import decript_case_name from progressbar import ProgressBar,Percentage from progressbar import Bar,ETA,SimpleProgress from numpy import array, round, linspace from data_cleaning_routines import show_surface_from_df x_2h_locs = round( array( x_2h_locs ), 2 ) y_delta_locs = round( array( y_delta_locs ), 2 ) # Get the available HDF5 files ############################################# hdf5_root = '/media/carlos/6E34D2CD34D29783/' +\ '2015-02_SerrationPIV/TR_Data_Location_Calibrated_Article3' if not len(case_list): hdf5_files = [f for f in listdir( hdf5_root ) \ if f.endswith('.hdf5') \ and not f in exceptions ] else: hdf5_files = [f for f in listdir( hdf5_root ) \ if f.endswith('.hdf5') \ and f in case_list ] # ########################################################################## for hf in [join( hdf5_root, f ) for f in hdf5_files]: f = split( hf )[1].replace('_AirfoilNormal','')\ .replace('_Aligned.hdf5','') print " Extracting data from {0}".format(f) print " at the normalized streamwise locations:" print " {0}".format( x_2h_locs ) hdf_t = HDFStore( hf, 'r' ) # Get the available coordinates ######################################## hf_coords = hdf_t.select('data', where = [ 't = 0' ], columns = [ 'x', 'y' ] ) # ###################################################################### # Turn the non-dim requested locations into physical coords ############ requested_locations = [] requested_normalized_locations = [] #for x,x_norm in zip(x_2h_locs * tooth_length, x_2h_locs): # for y_d in y_delta_locs: # bl_params = return_bl_parameters( f , [x] ) # d_99 = bl_params.delta_99.values[0] # #if "STE" in f: # # d_99 = 9.4 # y = y_d * d_99 # requested_locations.append( (x,y) ) # requested_normalized_locations.append( ( x_norm, y_d ) ) # Get the normalization locations depending on the case ################ if 'z00' in f and not 'STE' in f: x_bl_loc = 40 elif 'z05' in f: x_bl_loc = 20 elif 'z10' in f or 'STE' in f: x_bl_loc = 0 bl_params = return_bl_parameters( f , [x_bl_loc] ) d_99 = bl_params.delta_99.values[0] for x,x_norm in zip(x_2h_locs * tooth_length, x_2h_locs): for y_d in y_delta_locs: y = y_d * d_99 requested_locations.append( (x,y) ) requested_normalized_locations.append( ( x_norm, y_d ) ) print " Normalizing to a BL thickness of {0:.2f} mm".\ format(d_99) # ###################################################################### available_xy_locs = hf_coords[ ( hf_coords.x > min( x_2h_locs ) * 40. ) & \ ( hf_coords.x < max( x_2h_locs ) * 40. 
) & \ ( hf_coords.y > min( y_delta_locs ) * d_99 ) & \ ( hf_coords.y < max( y_delta_locs ) * d_99 ) ][ ['x','y'] ] available_xy_locs = [tuple(x) for x in available_xy_locs.values] if plot: trailing_edge,phi,alpha,U,z = decript_case_name( f ) if trailing_edge == 'serrated': device = 'Sr20R21' elif trailing_edge == 'straight': device = 'STE' elif trailing_edge == 'slitted': device = 'Slit20R21' case_name = "{0}_phi{1}_alpha{2}_U{3}_loc{4}_tr.dat".format( device, phi, alpha, U, z ) df_av = read_pickle( 'averaged_data/' + case_name + '.p' ) show_surface_from_df( df_av , points = available_xy_locs , plot_name = 'ReservedData/' + f + '.png' ) query = '' cnt_all = 0 cnt = 0 time_series_hdf = HDFStore( 'ReservedData/' + f + '.hdf5' , 'w' ) vertical_split_blocks = 10 progress = ProgressBar( widgets=[ Bar(),' ', Percentage(),' ', ETA(), ' (query bunch ', SimpleProgress(),')'], maxval = vertical_split_blocks ).start() # Don't try to get it all at once; split the vertical in 4 pieces y_ranges = linspace( min( y_delta_locs ), max( y_delta_locs ), vertical_split_blocks ) * d_99 xmin = min(x_2h_locs) * 40. xmax = max(x_2h_locs) * 40. for ymin, ymax in zip( y_ranges[:-1], y_ranges[1:] ): query = " x>={0} & x<{1} & y>={2} & y<{3} ".\ format( xmin, xmax, ymin, ymax ) df_t = hdf_t.select( key = 'data', where = [ query ], ) df_t['near_x_2h'] = round( df_t.x / 40., 4 ) df_t['near_y_delta'] = round( df_t.y / d_99, 4 ) if not cnt: time_series_hdf.put( 'data', df_t , data_columns = [ 'near_x_2h', 'near_y_delta', 't' ], format = 't') else: time_series_hdf.append( 'data', df_t , data_columns = [ 'near_x_2h', 'near_y_delta', 't' ], format = 't') cnt_all += 1 cnt += 1 progress.update(cnt_all) df_t = DataFrame() progress.finish() hdf_t.close() time_series_hdf.close()
def read_raw_tecplot_case_and_write_pandas_hdf5( case_folder, root = 0, output_file = 0, serration_angle = 0, angle_correction = 0, height_correction = 0, streamwise_correction = 0, overwrite = False, time_step_limit = 0, airfoil_normal = False, ): from os.path import isfile,join,splitext from os import listdir from progressbar import ProgressBar,Percentage,Bar,ETA,SimpleProgress from pandas import HDFStore # File related things ###################################################### if not output_file: output_file = case_folder+".hdf5" if airfoil_normal: output_file = output_file+"_AirfoilNormal" if not output_file.endswith('.hdf5'): output_file = output_file.replace(".hdf5","")+".hdf5" if isfile(output_file) and not overwrite: print " Exiting; file exists:\n{0}".format(output_file) return 0 else: print " Writing\n{0}".format(output_file) # ########################################################################## hdf = HDFStore(output_file) time_step_files = sorted([f for f in listdir(join(root,case_folder)) \ if splitext(f)[1] == '.dat']) if time_step_limit: time_step_files = time_step_files[:time_step_limit] progress = ProgressBar( widgets=[ Bar(),' ', Percentage(),' ', ETA(), ' (file ', SimpleProgress(),')'], maxval=len(time_step_files) ).start() cnt = 0 for f,t in zip(time_step_files,range(len(time_step_files))): df_t = read_tecplot_file_and_correct_for_location_rotation( tecplot_file = join(root,case_folder,f), serration_angle = serration_angle, angle_correction = angle_correction, height_correction = height_correction, streamwise_correction = streamwise_correction, time_step = t, airfoil_normal = airfoil_normal, ) df_t = get_vorticity(df_t) if cnt == 0: df = df_t.copy() else: df = df.append( df_t, drop_index = True) #df = df.drop_duplicates() try: x_cnt = df.x.value_counts().max() except AttributeError: print df raise if not x_cnt.max() == x_cnt.min(): print " There's something wrong, counted {0} instances of x"\ .format(x_cnt.max()) return 0 if t == 30: hdf.put(case_folder, df.convert_objects(), format='table', data_columns=True ) elif cnt == 30 and not t == cnt: hdf.append(case_folder, df.convert_objects(), format='table', data_columns=True ) cnt = 0 cnt += 1 progress.update(t) progress.finish() hdf.close() return 1
def table_for_of(simulation, period=None, check_validity=False, save_tables=False): temps = time.clock() output_tab = os.path.join(path_til[0], "output", "to_run_leg.h5" ) # on travaille d'abord sur l'ensemble des tables puis on selectionne chaque annee # on étudie d'abord la table individu pour pouvoir séléctionner les identifiants # step 1 table = {} entities = simulation.entities entities_name = map( lambda e: e.name, simulation.entities) def _get_entity(name): position = entities_name.index(name) return simulation.entities[position] ind = _get_entity('person') table['ind'] = DataFrame(ind.array.columns) table['ind'] = table['ind'].rename(columns={'men': 'idmen', 'foy': 'idfoy', 'id': 'noi', 'statmarit': 'civilstate'}) # création de variable table['ind']['ageq'] = table['ind']['age']/5 - 4 table['ind']['ageq'] = table['ind']['ageq']*(table['ind']['ageq'] > 0) table['ind']['ageq'] = 12 + (table['ind']['ageq']-12)*(table['ind']['ageq'] < 12) #TODO: modifier pour les jeunes veufs # create fam entity try: table['ind'][['idfam','quifam']] = table['ind'].loc[:,['idmen','quimen']] except: pdb.set_trace() # # Travail sur les qui quand on ne controle pas dans la simulation que tout le monde n'est pas qui==2 ## inutile car fait maintenant dans la simulation mais peut-être mieux à refaire ici un jour ## parce que ça prend du temps dans la simulation # time_qui = time.clock() # for ent in ('men','foy'): # 'fam' un jour... # print "Deal with qui for ", ent # qui= 'qui'+ent # ident = 'id'+ent # trav = table['ind'].ix[table['ind'][qui]==2, [ident,qui,'period']] # for name, groupfor nom in ('menage','declar','fam'):for nom in ('menage','declar','fam'): in trav.groupby([ident,'period']): # to_add = range(len(group)) # group[qui] = group[qui]+to_add # table['ind'].ix[group[qui].index, qui] = group[qui] # print "les qui pour ", ent," sont réglés" # time_qui = time.clock() - time_qui # print "le temps passé à s'occuper des qui a été",time_qui ind = table['ind'] for ent in ['men','foy']: entity = _get_entity(of_name_to_til[ent]) table[ent] = DataFrame(entity.array.columns) id = 'id' + ent qui = 'qui' + ent table[ent] = table[ent].rename(columns={'id': id}) # travail sur les qui nb_qui = ind.loc[ind[qui]>1, ['noi',id,qui]].groupby(id, sort=True).size() if len(nb_qui)>0: new_qui = concatenated_ranges(nb_qui) + 2 table['ind'] = table['ind'].sort(id) #note the sort col_qui = table['ind'][qui] col_qui[col_qui>1] = new_qui table['ind'][qui] = col_qui # informations on qui == 0 qui0 = table['ind'].loc[table['ind']['qui' + ent]==0,['noi','idfoy','idmen','idfam','period']] table[ent] = merge(table[ent], qui0, how='left', left_on=[id,'period'], right_on=[id,'period']) if ent=='men': # nbinde est limité à 6 personnes et donc valeur = 5 en python table[ent]['nbinde'] = (table[ent]['nb_persons']-1) * (table[ent]['nb_persons']-1 <=5) +5*(table[ent]['nb_persons']-1 >5) table['fam'] = qui0 # remove non-ordinary household cond = (table['ind']['idmen'] >= 10) & (table['ind']['idfoy'] >= 10) table['ind'] = table['ind'][cond] table['men'] = table['men'][table['men']['idmen']>=10] table['foy'] = table['foy'][table['foy']['idfoy']>=10] table['fam'] = table['fam'][table['fam']['idfam']>=10] # get years years = np.unique(table['ind']['period'].values/100) if period is not None: years=[period] print years if check_validity: for year in years: ind = table['ind'] for ent in ['men','foy']: #fam id = 'id' + ent qui = 'qui' + ent tab = table[ent] try: assert ind.groupby([id,qui]).size().max() == 1 except: print ent pb = 
ind.groupby([id,qui]).size() > 1 print(ind.groupby([id,qui]).size()[pb]) pdb.set_trace() print(ind[ind[id]==43][['noi',id,qui]]) qui0 = ind[ind[qui]==0] try: assert qui0[id].isin(tab[id]).all() except: cond = tab[id].isin(qui0[id]) print(tab[~cond]) pdb.set_trace() try: assert tab[id].isin(qui0[id]).all() except: cond = tab[id].isin(qui0[id]) print(tab[~cond]) pdb.set_trace() for year in years: if save_tables: try: os.remove(output_tab) except: print("Attention, la table intermediaire n'a pas ete supprimee") goal = HDFStore(output_tab) goal.remove('survey_'+str(year)) for ent in ('ind','men','foy','fam'): tab = table[ent].loc[table[ent]['period']/100==year] key = 'survey_'+str(year) + '/'+ent goal.put(key, tab) goal.close() else: for ent in ('ind','men','foy','fam'): table[ent] = table[ent].loc[table[ent]['period']/100==year] return table
from pandas import HDFStore, merge # DataFrame import numpy as np import pdb import time import pandas as pd output = "C:/openfisca/output/liam/" get_years = HDFStore("C:/openfisca/src/countries/france/data/surveyLiam.h5") input_h5 = HDFStore(output + "LiamLeg.h5") output_h5 = HDFStore(output + "LiamLeg2.h5") years = [x[-4:] for x in dir(get_years.root) if x[0] != "_"] nb_year = len(years) get_years.close() ent = "ind" list_tab = ["survey_" + x + "/" + ent for x in years] output_h5["ind"] = pd.DataFrame() for ent in ("ind", "men", "foy", "fam"): output_h5[ent] = pd.DataFrame() for year in years: name_tab = "survey_" + year + "/" + ent tab = input_h5[name_tab] tab["period"] = pd.Series(np.ones(len(tab)) * int(year)) output_h5.put(ent, output_h5[ent].append(tab)) # par entité, lire les tables pour chaque année, ajouter périod, et ajouter tout ça
def save_df(path, df):
    """Save DataFrame df in the HDF5 store at path, in the 'logs' table

    FIXME: Use a more neutral name for the table"""
    store = HDFStore(path)
    store.put('logs', df)
    store.close()
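def load_df(path):
    """Hedged counterpart to save_df (not in the original source): read the
    DataFrame back from the 'logs' table of the HDF5 store at path."""
    from pandas import HDFStore
    store = HDFStore(path)
    df = store['logs']
    store.close()
    return df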
def main(args):
    """ main function """
    if args.num_store and args.num_store > _NUM_STORE_TOTAL:
        print("[FAIL] Data should only have %d stores!" % (_NUM_STORE_TOTAL))
        sys.exit(1)
    if args.nrows and args.nrows < _NUM_STORE_TOTAL:
        print("[FAIL] Should read more rows than the number of stores!")
        sys.exit(1)

    train_data = None
    test_data = None
    store_data = None
    additional_info = pd.DataFrame()

    print("Loading data files")
    if args.type == _ARG_TRAIN:
        train_data = load_data_file(args.file[0], _DTYPE_TRAIN, nrows=args.nrows)
    elif args.type == _ARG_TEST:
        test_data = load_data_file(args.file[0], _DTYPE_TEST, nrows=args.nrows)
    elif args.type == _ARG_STORE:
        store_data = load_data_file(args.file[0], _DTYPE_STORE, nrows=args.nrows,
                                    parse_date=False)
    elif args.type == "all":
        if len(args.file) < 3:
            print("all option requires three input files")
            sys.exit(1)
        train_data = load_data_file(args.file[0], _DTYPE_TRAIN, nrows=args.nrows)
        test_data = load_data_file(args.file[1], _DTYPE_TEST, nrows=args.nrows)
        store_data = load_data_file(args.file[2], _DTYPE_STORE, nrows=args.nrows,
                                    parse_date=False)
    else:
        print("Invalid database type")
        sys.exit(1)

    # Reverse dates
    if train_data is not None:
        train_data = train_data.iloc[::-1]
    if test_data is not None:
        test_data = test_data.iloc[::-1]

    if args.num_store:
        print("Selecting %d random stores" % (args.num_store))
        store_list = []
        random.seed()
        # randint is inclusive on both bounds, so draw ids from 1.._NUM_STORE_TOTAL
        for _ in range(args.num_store):
            store_list.append(random.randint(1, _NUM_STORE_TOTAL))
        if train_data is not None:
            train_data = train_data.loc[train_data['Store'].isin(store_list)]
        if test_data is not None:
            test_data = test_data.loc[test_data['Store'].isin(store_list)]
        if store_data is not None:
            store_data = store_data.loc[store_data['Store'].isin(store_list)]

    print("Perform additional processing...")
    train_data, test_data, store_data, additional_info \
        = process_data(train_data, test_data, store_data, additional_info)

    print("Writing to HDF5 type file %s" % (args.file_out.name))
    hdf = HDFStore(args.file_out.name, mode='w')
    if train_data is not None:
        hdf.put(_ARG_TRAIN + "_data", train_data, format='table', data_columns=True)
    if test_data is not None:
        hdf.put(_ARG_TEST + "_data", test_data, format='table', data_columns=True)
    if store_data is not None:
        hdf.put(_ARG_STORE + "_data", store_data, format='table', data_columns=True)
    #if not additional_info.empty:
    hdf.put("common_info", additional_info, format='table', data_columns=True)
    hdf.close()
    return
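# Because the tables above are written with format='table' and data_columns=True,
# they can be filtered on columns at read time.  A sketch of reading one back;
# the file name is an example and the key assumes _ARG_TRAIN == "train".
import pandas as pd

train = pd.read_hdf('rossmann.h5', 'train_data', where='Store == 1')
print(len(train))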
meanCompetitionDistance = np.mean(data_store.CompetitionDistance)
data_store['CompetitionDistance'] = data_store['CompetitionDistance'].fillna(meanCompetitionDistance)
data_store['CompetitionDistance'] = data_store['CompetitionDistance'].astype(np.float32)

# binary missing values
# the usual encoding would be -1 for negatives, 0 for missing and 1 for positives,
# but the Open column has only a very small number of missing values, so they are set to 1
data_test.loc[data_test.Open.isnull(), 'Open'] = 1
data_test['Open'] = data_test['Open'].astype(np.int8)

print('Normalize data set ...')
min_max_scaler = preprocessing.MinMaxScaler()
# MinMaxScaler expects 2-D input, so pass a one-column frame and flatten the result
data_store['CompetitionDistance'] = min_max_scaler.fit_transform(data_store[['CompetitionDistance']]).ravel()

print('Create ultimate data')
# concatenate the datasets, including the store information and the mean features
data_ut_store = pd.merge(data_mean, data_store, on='Store')
data_ut_train = pd.merge(data_train, data_ut_store, on='Store')
data_ut_test = pd.merge(data_test, data_ut_store, on='Store')
assert(len(data_ut_train) == len(data_train))
assert(len(data_ut_test) == len(data_test))

print('Storing data ...')
hdf = HDFStore(data_dir + 'data.h5')
hdf.put('data_train', data_ut_train, format='table', data_columns=True)
hdf.put('data_test', data_ut_test, format='table', data_columns=True)
hdf.put('data_store', data_ut_store, format='table', data_columns=True)
hdf.close()
print('Done ...')
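# The same min-max normalisation can also be done directly in pandas, which
# avoids shaping issues with scikit-learn (newer versions require 2-D input);
# this sketch reuses the data_store frame from the snippet above.
cd = data_store['CompetitionDistance']
data_store['CompetitionDistance'] = (cd - cd.min()) / (cd.max() - cd.min())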
def read_raw_tecplot_folder_and_write_pandas_hdf5(
    case_folder,
    root        = 0,
    output_file = 0,
    output_root = 0,
    overwrite   = False,
):
    from os.path import isfile, join, splitext
    from os import listdir
    from progressbar import ProgressBar, Percentage, Bar
    from progressbar import ETA, SimpleProgress
    from pandas import DataFrame, HDFStore

    # File related things ######################################################
    if not output_file:
        output_file = case_folder + "_Aligned.hdf5"
    if not output_root:
        output_root = '/media/carlos/6E34D2CD34D29783/' +\
            '2015-02_SerrationPIV/TR_Data_Location_Calibrated_Article3'
    if not output_file.endswith('_Aligned.hdf5'):
        output_file = output_file.replace("_Aligned.hdf5", "") + "_Aligned.hdf5"
    if 'STE' in case_folder or 'z10' in case_folder:
        output_file = output_file.replace('.hdf5', '_AirfoilNormal.hdf5')

    if isfile(join(output_root, output_file)) and not overwrite:
        print " Exiting; file exists:\n {0}".format(output_file)
        return 0
    else:
        print " Writing\n {0}".format(output_file)
    # ##########################################################################

    time_step_files = sorted(
        [join(root, case_folder, f) for f in listdir(join(root, case_folder)) \
         if splitext(f)[1] == '.dat']
    )

    progress = ProgressBar(
        widgets=[Bar(), ' ', Percentage(), ' ', ETA(), ' (file ', SimpleProgress(), ')'],
        maxval=len(time_step_files)
    ).start()

    cnt = 0
    hdf_store = HDFStore(join(output_root, output_file))
    for f, t in zip(time_step_files, range(len(time_step_files))):
        df_t = read_tecplot_file(
            tecplot_folder         = join(root, case_folder),
            tecplot_time_step_file = f,
            time_step              = t,
        )
        if cnt == 0:
            df = df_t.copy()
        else:
            df = df.append(df_t, ignore_index=True)
        if cnt == 50:
            df = correct_df_translation_rotation(df)\
                [['x', 'y', 't', 'u', 'v', 'w']]
            df = df.sort_values(by=['x', 'y', 't'])
            #df.set_index(['x','y'], inplace=True)
            if t == 0:
                hdf_store.put('data', df, data_columns=['x', 'y', 't'], format='t')
            else:
                hdf_store.append('data', df, data_columns=['x', 'y', 't'], format='t')
            cnt = 0
            df = DataFrame()
        cnt += 1
        progress.update(t)

    # flush any time steps still buffered after the last full chunk of 50,
    # so that trailing files are not silently dropped
    if cnt > 0 and len(df):
        df = correct_df_translation_rotation(df)\
            [['x', 'y', 't', 'u', 'v', 'w']]
        df = df.sort_values(by=['x', 'y', 't'])
        hdf_store.append('data', df, data_columns=['x', 'y', 't'], format='t')

    progress.finish()
    hdf_store.close()
    return 1
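# Because 'data' is stored as an appendable table with data_columns ['x', 'y', 't'],
# slices can be read back without loading the whole file; the path below is an example.
from pandas import HDFStore

store = HDFStore('/path/to/case_Aligned.hdf5')
first_steps = store.select('data', where='t < 10')  # only the first time steps
store.close()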
class LogSaver: """ self.directory : Directory structure for temp and saved files self.log_list : List of server.log files to process self.extra : True if log messages and thread ids are to be saved too self.history_path : History of server.log conversions saved here self.progress_store_path : HDF5 file that holds one DataFrame for each server.log file self.store_path : Final DataFrame of all server.log entries saved here self.history : History of server.log conversions """ FINAL = 'logs' PROGRESS = 'progress' HISTORY = 'history' @staticmethod def normalize(name): return re.sub(r'[^a-zA-Z0-9]', '_', name) @staticmethod def make_name(base_name, extra): if extra: return base_name + '.extra' else: return base_name #@staticmethod #def temp_name(log_list, extra): # hsh = hash(log_list) # sgn = 'n' if hsh < 0 else 'p' # temp = 'temp_%s%08X' % (sgn, abs(hsh)) # return LogSaver.make_name(temp, extra) def __init__(self, store_path, log_list, extra): self.directory = ObjectDirectory(store_path) self.log_list = tuple(sorted(log_list)) self.extra = extra self.history_path = self.directory.get_path(LogSaver.HISTORY, temp=True) self.progress_store_path = self.directory.get_path(LogSaver.PROGRESS, temp=True, is_df=True) self.store_path = self.directory.get_path(LogSaver.make_name(LogSaver.FINAL, extra), is_df=True) self.history = ObjectDirectory.load_object(self.history_path, {}) self.saved = False def __repr__(self): return '\n'.join('%s: %s' % (k,v) for k,v in self.__dict__.items()) def __str__(self): return '\n'.join([repr(self), '%d log files' % len(self.log_list)]) def save_all_logs(self, force=False): if os.path.exists(self.store_path): final_store = HDFStore(self.store_path) print 'Keys: %s' % final_store final_store.close() return if not force: assert not os.path.exists(self.history_path), ''' %s exists but %s does not. There appears to be a conversion in progress. -f forces conversion to complete. 
            ''' % (self.history_path, self.store_path)

        self.directory.make_dir_if_necessary(self.progress_store_path)
        self.progress_store = HDFStore(self.progress_store_path)
        for path in self.log_list:
            self.save_log(path)
        self.check()

        print '--------'
        print 'All tables in %s' % self.progress_store_path
        print self.progress_store.keys()
        print '--------'

        def get_log(path):
            try:
                return self.progress_store.get(LogSaver.normalize(path))
            except Exception as e:
                print
                print path
                raise e

        df_list = [get_log(path) for path in self.log_list]
        self.progress_store.close()
        print 'Closed %s' % self.progress_store_path

        df_all = pd.concat(df_list)
        print 'Final list has %d entries' % len(df_all)

        final_store = HDFStore(self.store_path)
        final_store.put('logs', df_all)
        print 'Keys: %s' % final_store
        final_store.close()
        print 'Closed %s' % self.store_path

        # Save the history in a corresponding file
        self.directory.save('history', self.history)
        print 'Saved history'
        self.saved = True

    def test_store(self):
        final_store = HDFStore(self.store_path)
        print '----'
        print final_store.keys()
        print '-' * 80
        logs = final_store['/logs']
        print type(logs)
        print len(logs)
        print logs.columns
        final_store.close()

    def cleanup(self):
        os.remove(self.progress_store_path)
        os.remove(self.history_path)

    def delete(self):
        os.remove(self.store_path)

    def save_log(self, path):
        """Load all the valid log entry lines in the log file at `path` into a
           pandas DataFrame and put it in the progress store.
           The index of the DataFrame is the uniquified timestamps of the log entries
        """
        if path in self.history:
            return
        print 'Processing %s' % path,
        start = time.time()
        header, df = load_log(path, extra=self.extra)
        if df is None:
            print 'Could not process %s' % path
            return
        self.progress_store.put(LogSaver.normalize(path), df)
        load_time = time.time() - start
        self.history[path] = {
            'start': df.index[0],
            'end': df.index[-1],
            'load_time': int(load_time),
            'num': len(df),
            'header': header
        }
        ObjectDirectory.save_object(self.history_path, self.history)
        del df
        print {k: v for k, v in self.history[path].items() if k != 'header'},
        print '%d of %d' % (len(self.history), len(self.log_list))

    def check(self):
        history = ObjectDirectory.load_object(self.history_path, {})
        sorted_keys = history.keys()
        sorted_keys.sort(key=lambda k: history[k]['start'])
        print '-' * 80
        print 'Time range by log file'
        for i, path in enumerate(sorted_keys):
            hist = history[path]
            print '%2d: %s --- %s : %s' % (i, hist['start'], hist['end'], path)

        path0 = sorted_keys[0]
        for path1 in sorted_keys[1:]:
            hist0, hist1 = history[path0], history[path1]
            assert hist0['end'] < hist1['start'], '''
            -----------
            %s
            %s
            start: %s
            end  : %s
            -----------
            %s
            %s
            start: %s
            end  : %s
            ''' % (
                path0, hist0, hist0['start'], hist0['end'],
                path1, hist1, hist1['start'], hist1['end'])
            # advance the window so consecutive files are compared pairwise
            path0 = path1
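# A minimal sketch of the combine step LogSaver performs: per-file frames are
# concatenated and the result should be monotonically increasing in time,
# mirroring the ordering that check() enforces.  The data and file name are made up.
import pandas as pd

df_a = pd.DataFrame({'msg': ['a1', 'a2']},
                    index=pd.to_datetime(['2020-01-01 10:00', '2020-01-01 11:00']))
df_b = pd.DataFrame({'msg': ['b1']},
                    index=pd.to_datetime(['2020-01-02 09:00']))
df_all = pd.concat([df_a, df_b])
assert df_all.index.is_monotonic_increasing

store = pd.HDFStore('logs_demo.h5')
store.put('logs', df_all)
store.close()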