Code Example #1
File: age_structure.py Project: Iliato/openfisca-qt
def test():

    directory = os.path.dirname(__file__)
    fname = os.path.join(directory, H5_FILENAME)
    store = HDFStore(fname)
    print store
    print store.keys()
Code Example #2
File: pca.py Project: shevisjohnson/paysage
    def from_saved(cls, store: pandas.HDFStore) -> "PCA":
        """
        Create the PCA from its saved parameters.

        Notes:
            Performs an IO operation.

        Args:
            store (pandas.HDFStore)

        Returns:
            PCA

        """
        config = store.get_storer('pca').attrs.config
        pca = cls(config['num_components'], config['stepsize'])
        pca.W = be.float_tensor(store.get('pca/W').values)
        pca.var = be.float_tensor(store.get('pca/var').values[:, 0])
        # check the mean is present
        if 'pca/mean' in store.keys():
            pca.mean = be.float_tensor(store.get('pca/mean').values[:, 0])
        # if the saved PCA was fit from SVD, there is not calculator defined
        if 'pca/var_calc' in store.keys():
            pca.var_calc = math_utils.MeanVarianceArrayCalculator.from_dataframe(
                store.get('pca/var_calc'))
        else:
            pca.var_calc = math_utils.MeanVarianceArrayCalculator()
        return pca
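
A minimal usage sketch for the example above. The store file name is a placeholder, and it assumes from_saved is a classmethod on a PCA class whose save counterpart wrote the 'pca' group shown in the code:

# Hedged usage sketch: reopen a store written by the matching save routine
# and rebuild the model. "pca_model.h5" is a placeholder file name.
import pandas

with pandas.HDFStore("pca_model.h5", mode="r") as store:
    pca = PCA.from_saved(store)   # assumes from_saved is decorated as a @classmethod of PCA
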
Code Example #3
File: common.py Project: jasonshih/log_analysis
def load_df(path, default=None):
    """Load DataFrame for HDF5 store path '\logs' table"""
    try:
        store = HDFStore(path)
        print store.keys()
        df = store.get('logs')
        store.close()
        return df
    except:
        return default
Code Example #4
File: load_logs.py Project: jasonshih/log_analysis
 def test_store(self):    
     final_store = HDFStore(self.store_path)
     print '----'
     print final_store.keys()
     print '-' * 80
     logs = final_store['/logs']
     print type(logs)
     print len(logs)
     print logs.columns
     final_store.close()
Code Example #5
File: kraken.py Project: portfolioscout/tf
def storeHdf5(data, tag, path):
    hdf = HDFStore(path,'a')
    if tag in hdf.keys():
        hdf.append(tag,data)
    else:
        hdf.put(tag,data)
    hdf.close()          
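
A brief usage sketch for storeHdf5 above; the DataFrame, tag, and path are placeholders. HDFStore.keys() returns node paths with a leading '/', so passing the tag with the slash is what lets the append branch match an existing node:

import pandas as pd

df = pd.DataFrame({"open": [1.0, 2.0], "close": [1.5, 2.5]})
# Takes the put branch the first time; later calls with the same tag take the
# append branch (which requires the node to have been written in table format).
storeHdf5(df, "/XXBTZUSD_1440", "ohlc.h5")
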
Code Example #6
def save_temp(dataframe, name = None, year = None, config_files_directory = default_config_files_directory):
    """
    Save a temporary table

    Parameters
    ----------
    dataframe : pandas DataFrame
                the dataframe to save
    name : string, default None

    year : integer, default None
           year of the data
    """
    if year is None:
        raise Exception("year is needed")
    if name is None:
        raise Exception("name is needed")
    hdf_file_path = get_tmp_file_path(config_files_directory = config_files_directory)
    store = HDFStore(hdf_file_path)
    log.info("{}".format(store))
    store_path = "{}/{}".format(year, name)

    if store_path in store.keys():
        del store["{}/{}".format(year, name)]

    dataframe.to_hdf(hdf_file_path, store_path)

    store.close()
    return True
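
A short usage sketch for save_temp above; the frame contents, name, and year are placeholders, and the call relies on the surrounding module's get_tmp_file_path, log, and default_config_files_directory:

import pandas as pd

df = pd.DataFrame({"idfoy": [1, 2], "sali": [18000, 32000]})   # placeholder columns
save_temp(df, name="indivim", year=2006)   # written to the temp HDF5 file under the "2006/indivim" path
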
Code Example #7
class PandasHDFHandler(FileHandler):
    r"""
    Handler for HDF5 files using Pandas.
    """
    def _open_for_read(self):
        self.handle = HDFStore(self.fname, mode='r')

    def _open_for_write(self):
        self.handle = HDFStore(self.fname)

    def list_items(self):
        keys = [key.strip('/') for key in self.handle.keys()]
        items = [(key, _get_type_from_attrs(self.handle.get_storer(key).attrs))
                 for key in keys if '/' not in key]
        # ---- for backward compatibility (LArray < 0.33) ----
        # axes
        items += [(key.split('/')[-1], 'Axis_Backward_Comp') for key in keys
                  if '__axes__' in key]
        # groups
        items += [(key.split('/')[-1], 'Group_Backward_Comp') for key in keys
                  if '__groups__' in key]
        return items

    def _read_item(self, key, typename, *args, **kwargs):
        if typename in _supported_typenames:
            hdf_key = '/' + key
        # ---- for backward compatibility (LArray < 0.33) ----
        elif typename == 'Axis_Backward_Comp':
            hdf_key = '__axes__/' + key
        elif typename == 'Group_Backward_Comp':
            hdf_key = '__groups__/' + key
        else:
            raise TypeError()
        return read_hdf(self.handle, hdf_key, *args, **kwargs)

    def _dump_item(self, key, value, *args, **kwargs):
        hdf_key = '/' + key
        if isinstance(value, (Array, Axis)):
            value.to_hdf(self.handle, hdf_key, *args, **kwargs)
        elif isinstance(value, Group):
            hdf_axis_key = '/' + value.axis.name
            value.to_hdf(self.handle, hdf_key, hdf_axis_key, *args, **kwargs)
        elif isinstance(value, _supported_scalars_types):
            s = pd.Series(data=value)
            self.handle.put(hdf_key, s)
            self.handle.get_storer(hdf_key).attrs.type = type(value).__name__
        else:
            raise TypeError()

    def _read_metadata(self):
        metadata = Metadata.from_hdf(self.handle)
        if metadata is None:
            metadata = Metadata()
        return metadata

    def _dump_metadata(self, metadata):
        metadata.to_hdf(self.handle)

    def close(self):
        self.handle.close()
Code Example #8
def storeHdf5(data, tag, path):
    hdf = HDFStore(path, 'a')
    if tag in hdf.keys():
        hdf.append(tag, data)
    else:
        hdf.put(tag, data)
    hdf.close()
Code Example #9
def convert_fiducial(filename, output_type="csv"):
    '''
    Converts the fiducial comparison HDF5 files into a CSV file.

    Parameters
    ----------
    filename : str
        HDF5 file.
    output_type : str, optional
           Type of file to output.
    '''

    store = HDFStore(filename)
    data_columns = dict()
    for key in store.keys():
        data = store[key].sort(axis=1)
        mean_data = data.mean(axis=1)
        data_columns[key[1:]] = mean_data
    store.close()

    df = DataFrame(data_columns)

    output_name = "".join(filename.split(".")[:-1]) + "." + output_type

    df.to_csv(output_name)
Code Example #10
def save_temp(dataframe,
              name=None,
              year=None,
              config_files_directory=default_config_files_directory):
    """
    Save a temporary table

    Parameters
    ----------
    dataframe : pandas DataFrame
                the dataframe to save
    name : string, default None

    year : integer, default None
           year of the data
    """
    if year is None:
        raise Exception("year is needed")
    if name is None:
        raise Exception("name is needed")
    hdf_file_path = get_tmp_file_path(
        config_files_directory=config_files_directory)
    store = HDFStore(hdf_file_path)
    log.info("{}".format(store))
    store_path = "{}/{}".format(year, name)

    if store_path in store.keys():
        del store["{}/{}".format(year, name)]

    dataframe.to_hdf(hdf_file_path, store_path)

    store.close()
    return True
Code Example #11
File: hdf.py Project: liam2/larray
class PandasHDFHandler(FileHandler):
    """
    Handler for HDF5 files using Pandas.
    """
    def _open_for_read(self):
        self.handle = HDFStore(self.fname, mode='r')

    def _open_for_write(self):
        self.handle = HDFStore(self.fname)

    def list_items(self):
        keys = [key.strip('/') for key in self.handle.keys()]
        # axes
        items = [(key.split('/')[-1], 'Axis') for key in keys if '__axes__' in key]
        # groups
        items += [(key.split('/')[-1], 'Group') for key in keys if '__groups__' in key]
        # arrays
        items += [(key, 'Array') for key in keys if '/' not in key]
        return items

    def _read_item(self, key, type, *args, **kwargs):
        if type == 'Array':
            hdf_key = '/' + key
        elif type == 'Axis':
            hdf_key = '__axes__/' + key
            kwargs['name'] = key
        elif type == 'Group':
            hdf_key = '__groups__/' + key
            kwargs['name'] = key
        else:
            raise TypeError()
        return key, read_hdf(self.handle, hdf_key, *args, **kwargs)

    def _dump_item(self, key, value, *args, **kwargs):
        if isinstance(value, LArray):
            hdf_key = '/' + key
            value.to_hdf(self.handle, hdf_key, *args, **kwargs)
        elif isinstance(value, Axis):
            hdf_key = '__axes__/' + key
            value.to_hdf(self.handle, hdf_key, *args, **kwargs)
        elif isinstance(value, Group):
            hdf_key = '__groups__/' + key
            hdf_axis_key = '__axes__/' + value.axis.name
            value.to_hdf(self.handle, hdf_key, hdf_axis_key, *args, **kwargs)
        else:
            raise TypeError()

    def _read_metadata(self):
        metadata = Metadata.from_hdf(self.handle)
        if metadata is None:
            metadata = Metadata()
        return metadata

    def _dump_metadata(self, metadata):
        metadata.to_hdf(self.handle)

    def close(self):
        self.handle.close()
Code Example #12
 def load(self,dataFile):
     """load data from HDF"""
     if os.path.exists(dataFile):
         store = HDFStore(dataFile)
         symbols = [str(s).strip('/') for s in list(store.keys()) ]   
         data = dict(list(zip(symbols,[store[symbol] for symbol in symbols])))
         self.wp = Panel(data)
         store.close()
     else:
         raise IOError('Data file does not exist')
Code Example #13
File: yahooFinance.py Project: mikimaus78/myIbPy
 def load(self, dataFile):
     """load data from HDF"""
     if os.path.exists(dataFile):
         store = HDFStore(dataFile)
         symbols = [str(s).strip('/') for s in store.keys()]
         data = dict(zip(symbols, [store[symbol] for symbol in symbols]))
         self.wp = WidePanel(data)
         store.close()
     else:
         raise IOError('Data file does not exist')
Code Example #14
 def load(self,dataFile):
     """load data from HDF"""
     if os.path.exists(dataFile):
         store = HDFStore(dataFile)
         symbols = store.keys()    
         data = dict(zip(symbols,[store[symbol] for symbol in symbols]))
         self.wp = WidePanel(data)
         store.close()
     else:
         raise IOError('Data file does not exist')
Code Example #15
def convert_fiducial(filename, output_type="csv", decimal_places=8,
                     append_comp=True, num_fids=5, return_name=True,
                     mode='mean', **kwargs):
    '''
    Converts the fiducial comparison HDF5 files into a CSV file.

    Parameters
    ----------
    filename : str
        HDF5 file.
    output_type : str, optional
           Type of file to output.
    decimal_places : int, optional
        Specify the number of decimal places to keep.
    append_comp : bool, optional
        Append on columns with fiducial numbers copy
    num_fids : int, optional
        Number of fiducials compared.
    '''

    store = HDFStore(filename)
    data_columns = dict()
    for key in store.keys():
        data = store[key].sort(axis=1)
        mean_data = timestep_choose(data, mode=mode, **kwargs)
        data_columns[key[1:]] = trunc_float(mean_data, decimal_places)
        comp_fids = store[key].index
    store.close()

    df = DataFrame(data_columns)

    if append_comp:
        fids = []
        for fid, num in zip(np.arange(0, num_fids - 1),
                            np.arange(num_fids - 1, 0, -1)):
            for _ in range(num):
                fids.append(fid)

        df["Fiducial 1"] = Series(np.asarray(fids).T, index=df.index)
        df["Fiducial 2"] = Series(comp_fids.T, index=df.index)

    for comp in all_comparisons:
        if comp in filename:
            break
    else:
        raise StandardError("Could not find a face comparison match for " +
                            filename)

    output_name = "fiducials" + comp[:-1] + "." + output_type

    df.to_csv(output_name)

    if return_name:
        return output_name
Code Example #16
class Serialization(object):
    def __init__(self, filename, mode='r', compress=True):

        self._filename = filename
        self._compress = compress
        self._mode = mode

    def __enter__(self):

        if self._compress:

            self._store = HDFStore(self._filename,
                                   complib='blosc:lz4',
                                   complevel=9,
                                   mode=self._mode)

        else:  # pragma: no cover

            self._store = HDFStore(self._filename, mode=self._mode)

        return self

    def __exit__(self, exc_type, exc_val, exc_tb):

        self._store.close()

    @property
    def keys(self):

        return self._store.keys()

    def store_pandas_object(self, path, obj, **metadata):

        self._store.put(path, obj, format='fixed')

        self._store.get_storer(path).attrs.metadata = metadata

    def retrieve_pandas_object(self, path):

        # Get the metadata
        metadata = self._store.get_storer(path).attrs.metadata

        # Get the object
        obj = self._store.get(path)

        return obj, metadata
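
A hedged usage sketch for the Serialization context manager above; the file name, key, and metadata values are placeholders:

import pandas as pd

frame = pd.DataFrame({"value": [1.0, 2.0, 3.0]})

# Write the object together with arbitrary keyword metadata.
with Serialization("analysis.h5", mode="w") as serializer:
    serializer.store_pandas_object("results", frame, source="simulation", version=1)

# Read it back: keys lists the stored paths, retrieve returns (object, metadata).
with Serialization("analysis.h5", mode="r") as serializer:
    print(serializer.keys)
    obj, metadata = serializer.retrieve_pandas_object("results")
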
Code Example #17
File: geolife_data.py Project: yrevar/inverse_rl
    def load(hdf_file_name,
             dataset_dir,
             hdf5_data_name="/geolife_trajectories_labelled",
             process_labels=True):
        """Parse geolife data grouped by user.

        Store the dataframe in the HDF store to speed up subsequent retrievals.
        """
        store = HDFStore(hdf_file_name)
        if hdf5_data_name in store.keys():
            data = store[hdf5_data_name]
        else:
            dirs_with_labels = GeoLifeData.find_dirs_with_labels(dataset_dir)
            data = GeoLifeData.get_dataframe_grouped_by_user(
                dirs_with_labels, process_labels)
            store[hdf5_data_name] = data
        store.close()
        return data
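
The load above follows a cache-or-compute pattern: build the dataframe once, keep it in the HDF store, and read it back on later calls. A generic sketch of that pattern, with a hypothetical build() callable and placeholder paths:

from pandas import HDFStore

def load_cached(hdf_path, key, build):
    """Return the cached frame under key, computing and storing it on a miss."""
    store = HDFStore(hdf_path)
    try:
        if key in store.keys():        # keys() returns paths with a leading '/'
            return store[key]
        data = build()                 # expensive computation, hypothetical callable
        store[key] = data
        return data
    finally:
        store.close()
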
Code Example #18
    def aggregate(hdf_store_loc,
                  file_pattern,
                  headerfile=None,
                  remove_part_files=False):
        df = None

        store = HDFStore(hdf_store_loc)
        store_keys = [w.replace('/', '') for w in store.keys()]

        print(
            f'Aggregating part files in {hdf_store_loc} for {file_pattern} into single file'
        )

        for key in store_keys:
            if re.match(file_pattern.replace('*', '.+'), key):
                print(
                    f'********************* Key : {key} matches pattern : {file_pattern.replace("*",".+")}'
                )
                #thisdf = pd.read_hdf(store_loc, key)
                thisdf = store.select(key)

                if df is None:
                    df = thisdf
                else:
                    # for gz files that do not have headers, assign headers.
                    try:
                        df = df.append(thisdf, ignore_index=True, sort=True)
                    except Exception as e:
                        print(f'Error while joining data: {e}')

                if remove_part_files:
                    store.remove(key)

        try:
            #df.to_hdf(store_loc, key=file_pattern.replace('*',''))
            store.put(key=file_pattern.replace('*', ''), value=df)
        except Exception as e:
            print(
                f'Exception while combining flile for {file_pattern} exception {e}'
            )

        store.close()
Code Example #19
File: Parameters.py Project: benjello/ga
    def init_parameters(self):
        '''
        Initialize the parameters of the simulation 
        '''        
        try:
            population_file = CONF.get('paths', 'population_file')         
            store_pop = HDFStore(population_file,'r')                
            self.population_choices = store_pop.keys()
            store_pop.close()

            profiles_file = CONF.get('paths', 'profiles_file')         
            store_prof = HDFStore(profiles_file,'r')
            profiles = store_prof['profiles']
            
            self.set_population_prolong()
            self.set_taxes_proj()
            
        except Exception, e:
            self.population_loaded = False
            QMessageBox.warning(self, u"Impossible de lire les données de population", 
                                u"GA n'a pas réussi à lire les données de population. L'erreur suivante a été renvoyée:\n%s\n\nVous pouvez configuer le chemin vers le fichier de données  Fichier>Paramètres>Chemins>Fichier données population"%e)
            return False
Code Example #20
File: __init__.py Project: Iliato/openfisca-france
def save_temp(dataframe, name=None, year=None):
    """
    Save a temporary table

    Parameters
    ----------
    dataframe : pandas DataFrame
                the dataframe to save
    name : string, default None

    year : integer, default None
           year of the data
    """
    if year is None:
        raise Exception("year is needed")
    if name is None:
        raise Exception("year is needed")
    store = HDFStore(os.path.join(ERF_HDF5_DATA_DIR,'temp.h5'))
    if str(year)+"/"+name in store.keys():
        del store[str(year)+"/"+name]
    store[str(year)+"/"+name] = dataframe
    store.close()
    return True
Code Example #21
def save_temp(dataframe, name=None, year=None):
    """
    Save a temporary table

    Parameters
    ----------
    dataframe : pandas DataFrame
                the dataframe to save
    name : string, default None

    year : integer, default None
           year of the data
    """
    if year is None:
        raise Exception("year is needed")
    if name is None:
        raise Exception("year is needed")
    store = HDFStore(os.path.join(ERF_HDF5_DATA_DIR, 'temp.h5'))
    if str(year) + "/" + name in store.keys():
        del store[str(year) + "/" + name]
    store[str(year) + "/" + name] = dataframe
    store.close()
    return True
Code Example #22
File: session.py Project: smritigambhir/larray
class PandasHDFHandler(FileHandler):
    """
    Handler for HDF5 files using Pandas.
    """
    def _open_for_read(self):
        self.handle = HDFStore(self.fname, mode='r')

    def _open_for_write(self):
        self.handle = HDFStore(self.fname)

    def list(self):
        return [key.strip('/') for key in self.handle.keys()]

    def _to_hdf_key(self, key):
        return '/' + key

    def _read_array(self, key, *args, **kwargs):
        return read_hdf(self.handle, self._to_hdf_key(key), *args, **kwargs)

    def _dump(self, key, value, *args, **kwargs):
        value.to_hdf(self.handle, self._to_hdf_key(key), *args, **kwargs)

    def close(self):
        self.handle.close()
Code Example #23
    def init_parameters(self):
        '''
        Initialize the parameters of the simulation 
        '''
        try:
            population_file = CONF.get('paths', 'population_file')
            store_pop = HDFStore(population_file, 'r')
            self.population_choices = store_pop.keys()
            store_pop.close()

            profiles_file = CONF.get('paths', 'profiles_file')
            store_prof = HDFStore(profiles_file, 'r')
            profiles = store_prof['profiles']

            self.set_population_prolong()
            self.set_taxes_proj()

        except Exception, e:
            self.population_loaded = False
            QMessageBox.warning(
                self, u"Impossible de lire les données de population",
                u"GA n'a pas réussi à lire les données de population. L'erreur suivante a été renvoyée:\n%s\n\nVous pouvez configuer le chemin vers le fichier de données  Fichier>Paramètres>Chemins>Fichier données population"
                % e)
            return False
Code Example #24
class Serialization(object):

    def __init__(self, filename):

        self._filename = filename

    def __enter__(self):

        self._store = HDFStore(self._filename, complib='blosc', complevel=9)

        return self

    def __exit__(self, exc_type, exc_val, exc_tb):

        self._store.close()

    @property
    def keys(self):

        return self._store.keys()

    def store_pandas_object(self, name, object, **metadata):

        self._store.put(name, object)

        self._store.get_storer(name).attrs.metadata = metadata

    def retrieve_pandas_object(self, name):

        # Get the metadata
        metadata = self._store.get_storer(name).attrs.metadata

        # Get the object
        obj = self._store[name]

        return obj, metadata
Code Example #25
def convert_format(path, face1, face2=None, design=None, mode='mean',
                   output_type="csv", parameters=None, decimal_places=8,
                   append_comp=True, keep_index=True, **kwargs):
    '''
    Takes all HDF5 files in given path comparing face1 to face2 and combines
    them into a single file.

    Parameters
    ----------
    path : str
        Path where files are located.
    face1 : int
        Face of the cube.
    face2: int, optional
        Face of the cube compared to. Disabled for observational comparison.
    design : str or pandas.DataFrame, optional
        If str, assumes a 'csv' file. Disabled for observational
        comparison.
    output_type : str, optional
        Type of file to output.
    parameters : list, optional
        Contains column names of design that are the parameters
        varied in the set. If None, all columns are appended to
        the output file.
    decimal_places : int, optional
        Specify the number of decimal places to keep.
    append_comp : bool, optional
        Append on columns with fiducial numbers copy
    '''

    if face2 is not None:
        files = [os.path.join(path, f) for f in os.listdir(path)
                 if os.path.isfile(os.path.join(path, f)) and
                 "_" + str(face1) + "_" + str(face2) + "_" in f and
                 "fid_comp" not in f]
    else:
        # Observational comparisons explicitly have 'face' in filename
        files = [os.path.join(path, f) for f in os.listdir(path)
                 if os.path.isfile(os.path.join(path, f)) and
                 "face_" + str(face1) in f and
                 "fid_comp" not in f]

    files.sort()
    print("Files used: %s" % (files))

    if len(files) == 0:
        raise StandardError("No files found for " + str(face1) + " and " +
                            str(face2))

    if design is not None:
        if isinstance(design, str):
            design = read_csv(design)

        if isinstance(parameters, list):
            design_df = {}
            for param in parameters:
                design_df[param] = Series(design[param])
            design_df = DataFrame(design_df)
        else:
            design_df = design

    for i, f in enumerate(files):
        store = HDFStore(f)
        data_columns = {}
        # Get data from HDF5
        for key in store.keys():
            data = store[key].sort(axis=0).sort(axis=1)
            index = data.index
            mean_data = timestep_choose(data, mode=mode, **kwargs)
            data_columns[key[1:]] = trunc_float(mean_data, decimal_places)
        store.close()

        # Add on design matrix
        if design is not None:
            for key in design_df:
                # can get nans if the file was made in excel
                design_df = design_df.dropna()
                design_df.index = index
                data_columns[key] = design_df[key]

        if keep_index:
            data_columns = DataFrame(data_columns, index=index)
        else:
            data_columns = DataFrame(data_columns)

        if append_comp:
            data_columns["Fiducial"] = \
                Series(np.asarray([i] * len(index)).T, index=index)
            data_columns["Designs"] = Series(index.T, index=index)

        if i == 0:  # Create dataframe
            df = data_columns
        else:  # Add on to dataframe
            df = concat([df, data_columns])

    if face2 is not None:
        filename = "distances_" + str(face1) + "_" + str(face2)
    else:
        filename = "complete_distances_face_" + str(face1)

    if "Name" in df.keys():
        del df["Name"]

    if output_type == "csv":
        df.to_csv(os.path.join(path, filename + ".csv"))
Code Example #26
def concat_convert_HDF5(path, face=None, combine_axis=0, mode='mean',
                        average_axis=None, interweave=True, statistics=None,
                        extension="h5", return_df=False, output_type="csv",
                        **kwargs):
    '''
    A more general function for combining sets of results. The output format
    defaults to a csv file and should be compatible with the plotting routines
    included in this module.

    Parameters
    ----------
    path : str
        Path to folder with the HDF5 files.
    face : int, optional
        If using a specific face to compare to, specify it here. This will
        look for files in the provided path that contain, for example,
        "face_0".
    combine_axis : int, optional
        The axis along which the data should be concatenated together.
        Defaults to the first axis (ie. 0).
    average_axis : int, optional
        If specified, the data is averaged along this axis.
    interweave : bool, optional
        Instead of appending directly together, this orders the indices by
        grouping like labels.
    statistics : list, optional
        Which statistics to be extracted from the HDF5 files. If the statistic
        is not contained in all of the files, an error will be raised. By
        default, all statistics contained in all of the files will be returned.
    extension : str, optional
        The extension used for the HDF5 files. Defaults to ".h5". Several
        extensions are permitted and this is in place to allow whichever has
        been used.
    '''

    # Grab the files in the path
    if face is None:
        hdf5_files = glob.glob(os.path.join(path, "*" + extension))
    else:
        if not isinstance(face, int):
            raise TypeError("face must be an integer.")

        hdf5_files = \
            glob.glob(os.path.join(path, "*face_" +
                                   str(face) + "*" + extension))

        if len(hdf5_files) == 0:
            raise Warning(
                "Did not find any HDF5 files in the path %s" % (path))

    if statistics is None:

        for i, hdf5 in enumerate(hdf5_files):
            store = HDFStore(hdf5)

            individ_stats = store.keys()

            store.close()

            if i == 0:
                statistics = individ_stats
            else:
                statistics = list(set(statistics) & set(individ_stats))

        if len(statistics) == 0:
            raise Warning(
                "There are no statistics that are contained in every file.")

        statistics = [stat[1:] for stat in statistics]

    for j, stat in enumerate(statistics):

        # Loop through the files and extract the statistic's table
        dfs = []
        for hdf5 in hdf5_files:

            store = HDFStore(hdf5)
            dfs.append(DataFrame(store[stat]))
            store.close()

        if average_axis is not None:
            for i in range(len(dfs)):
                dfs[i] = DataFrame(dfs[i].mean(average_axis))
                # dfs[i] = \
                #     DataFrame(timestep_choose(dfs[i],
                #                               avg_axis=average_axis, **kwargs))

        for i in range(len(dfs)):
            num = dfs[i].shape[0]

            dfs[i]['Names'] = dfs[i].index

            dfs[i]['Order'] = Series([i] * num, index=dfs[i].index)

            dfs[i].index = Index(range(num))

        stats_df = concat(dfs, axis=combine_axis)

        if interweave:
            stats_df = stats_df.sort_index()

            num = len(hdf5_files)

            num_splits = stats_df.shape[0] / num
            split_dfs = []
            for i in range(num_splits):

                split_df = stats_df[i * num:(i + 1) * num].copy()
                split_df = split_df.sort(columns=['Order'])

                split_dfs.append(split_df)

            stats_df = concat(split_dfs, axis=0)

        if j == 0:
            master_df = stats_df.copy()
            del master_df[0]
        master_df[stat] = DataFrame(stats_df[0], index=master_df.index)

    if return_df:
        return master_df
    else:
        if face is not None:
            master_df.to_csv(os.path.join(
                path, "distances_" + str(face) + ".csv"))
        else:
            master_df.to_csv(os.path.join(path, "combined_distances.csv"))
Code Example #27
import os
import sys
from glob import glob

from pandas import HDFStore

folder_path = sys.argv[1]

faces = ["_0_0_", "_0_2_", "_2_0_", "_2_2_"]

for face in faces:
    old_comp = glob(os.path.join(folder_path,
                                 "*_comparisons_*" + face + "*.h5"))
    new_comp = glob(os.path.join(folder_path,
                                 "*8_fiducialfid_comp" + face + "*.h5"))

    print(old_comp)
    print(new_comp)
    assert len(old_comp) == 1
    assert len(new_comp) == 1

    old_result = HDFStore(old_comp[0])
    new_result = HDFStore(new_comp[0])

    for key in old_result.keys():
        if key in new_result.keys():
            continue
        new_result[key] = old_result[key].copy()

    print("New file keys: " + str(new_result.keys()))

    old_result.close()
    new_result.close()
Code Example #28
File: kraken.py Project: portfolioscout/tf
        hdf.append(tag,data)
    else:
        hdf.put(tag,data)
    hdf.close()          

def getKrakenData(interval=1440,since=0):
    directory = krakenutl.SRCDIR
    if not os.path.exists(directory):
        os.makedirs(directory)
    for p in krakenutl.PAIRS:
        logger.debug('download data for: '+p+' interval: '+str(interval)+' since:'+str(krakenutl.localTimeFromEpoch(since)))
        pdata = getOhlc(p, interval,since)
        storeHdf5(pdata,krakenutl.getTagFromPair(p,interval),krakenutl.getH5source())


if __name__ == '__main__':   
    getKrakenData(krakenutl.DAY,STARTDATE) 
    getKrakenData(krakenutl.WEEK,STARTDATE)            
    getKrakenData(krakenutl.H3,STARTDATE) 
    getKrakenData(krakenutl.H1,STARTDATE)
    getKrakenData(krakenutl.M30,STARTDATE) 
    getKrakenData(krakenutl.M15,STARTDATE)
    getKrakenData(krakenutl.M5,STARTDATE) 
    #df=getOhlc("XXBTZUSD",5,1441148619) 
    #print(df) 
    hdf = HDFStore(krakenutl.getH5source())
    for k in hdf.keys():
        print(k,len(hdf[k]))
    hdf.close()              
    
Code Example #29
import os
import sys
from glob import glob

from pandas import HDFStore

folder_path = sys.argv[1]

# All HDF5 files in the path
all_files = glob(os.path.join(folder_path, "*.h5"))

# Remove the PDF only ones (relevant results are in the PDF_KS and
# PDF_Hellinger keywords)
remove_keys = ["PDF", "PDF_AD"]

# Rename keys
rename_keys = {"VCS_Density": "VCS_Small_Scale",
               "VCS_Velocity": "VCS_Large_Scale"}

for f in all_files:
    store = HDFStore(f)

    # Removals
    for key in remove_keys:
        if "/" + key in store.keys():
            del store[key]

    # Rename
    for old_key in rename_keys:
        if "/" + old_key in store.keys():
            store[rename_keys[old_key]] = store[old_key].copy()
            del store[old_key]

    print("Final keys: " + str(store.keys()))

    store.close()
Code Example #30
File: dump.py Project: wiso/grid_space_display
    monitor = Monitor(len(datelist))
    from pandas import HDFStore
    store = HDFStore('store.h5', complevel=9)
    fmap = wrap_monitor(wrap_write(partial(fetch_safe, rse=args.rse), store, overwrite=args.overwrite),
                        monitor)
    p.map(fmap, datelist)
    monitor.close()
    logging.info("closing file")
    store.close()

    logging.info("trying to open output")
    store = HDFStore('store.h5')
    data = []

    for k in store.keys():
        try:
            d = store.get(k)
            d['timestamp'] = pd.to_datetime(k.split("_")[1], format='%d%m%Y')
            data.append(d)
        except Exception as e:
            print "Problem reading", k
            print e
    store.close()
    data = pd.concat(data)
    data = data.set_index(['timestamp', 'owner'])

    data_to_plot = data['size'].unstack().fillna(0)
    dataplot = data_to_plot.iplot(kind='area', fill=True, asFigure=True)
    for d in dataplot['data']:
        d['hoverinfo'] = 'text+x+name'
Code Example #31
File: load_logs.py Project: jasonshih/log_analysis
class LogSaver:
    """
        self.directory : Directory structure for temp and saved files
        self.log_list : List of server.log files to process
        self.extra : True if log messages and thread ids are to be saved too
        self.history_path : History of server.log conversions saved here
        self.progress_store_path : HDF5 file that holds one DataFrame for each server.log file 
        self.store_path : Final DataFrame of all server.log entries saved here
        self.history : History of server.log conversions
    """

    FINAL = 'logs'
    PROGRESS = 'progress'
    HISTORY = 'history'

    @staticmethod
    def normalize(name):
        return re.sub(r'[^a-zA-Z0-9]', '_', name)
     
    @staticmethod
    def make_name(base_name, extra):
        if extra:
            return base_name + '.extra'
        else:
            return base_name
     
    #@staticmethod
    #def temp_name(log_list, extra):
    #    hsh = hash(log_list)
    #    sgn = 'n' if hsh < 0 else 'p'
    #    temp = 'temp_%s%08X' % (sgn, abs(hsh))
    #    return LogSaver.make_name(temp, extra)    

    def __init__(self, store_path, log_list, extra):
        self.directory = ObjectDirectory(store_path)
        self.log_list = tuple(sorted(log_list))
        self.extra = extra

        self.history_path = self.directory.get_path(LogSaver.HISTORY, temp=True)
        self.progress_store_path = self.directory.get_path(LogSaver.PROGRESS, temp=True, is_df=True)
        self.store_path = self.directory.get_path(LogSaver.make_name(LogSaver.FINAL, extra), 
                            is_df=True)
        self.history = ObjectDirectory.load_object(self.history_path, {})
        self.saved = False
        
    def __repr__(self):
        return '\n'.join('%s: %s' % (k,v) for k,v in self.__dict__.items())
        
    def __str__(self):
        return '\n'.join([repr(self), '%d log files' % len(self.log_list)])    

    def save_all_logs(self, force=False):
         
        if os.path.exists(self.store_path):
            final_store = HDFStore(self.store_path)
            print 'Keys: %s' % final_store
            final_store.close()
            return
        if not force:
            assert not os.path.exists(self.history_path), '''
                %s exists but %s does not.
                There appears to be a conversion in progress.
                -f forces conversion to complete.
            ''' % (self.history_path, self.store_path)
        
        self.directory.make_dir_if_necessary(self.progress_store_path)
        self.progress_store = HDFStore(self.progress_store_path)
        for path in self.log_list:
            self.save_log(path)
        
        self.check()    
        print '--------'
        print 'All tables in %s' % self.progress_store_path
        print self.progress_store.keys()
        print '--------'
        
        def get_log(path):
            try:
                return self.progress_store.get(LogSaver.normalize(path))
            except Exception as e:
                print
                print path
                raise e
               
        
        df_list = [get_log(path) for path in self.log_list]     
        self.progress_store.close()
        print 'Closed %s' % self.progress_store_path
        
        df_all = pd.concat(df_list)
        print 'Final list has %d entries' % len(df_all)
        final_store = HDFStore(self.store_path)
        final_store.put('logs', df_all)
        print 'Keys: %s' % final_store
        final_store.close()
        print 'Closed %s' % self.store_path
        
        # Save the history in a corresponding file
        self.directory.save('history', self.history)
        print 'Saved history'
        
        self.saved = True
        

    def test_store(self):    
        final_store = HDFStore(self.store_path)
        print '----'
        print final_store.keys()
        print '-' * 80
        logs = final_store['/logs']
        print type(logs)
        print len(logs)
        print logs.columns
        final_store.close()

    def cleanup(self): 
        os.remove(self.progress_store_path)
        os.remove(self.history_path)
        
    def delete(self):
        os.remove(self.store_path)

    def save_log(self, path):
        """Return a pandas DataFrame for all the valid log entry lines in log_file
            The index of the DataFrame is made of the uniquified timestamps of the log entries
        """
        if path in self.history:
            return
        
        print 'Processing %s' % path,
        start = time.time()
        header, df = load_log(path, extra=self.extra)
        if df is None:
            print 'Could not process %s' % path
            return
        self.progress_store.put(LogSaver.normalize(path), df)
        load_time = time.time() - start
        
        self.history[path] = {
            'start': df.index[0],
            'end': df.index[-1],
            'load_time': int(load_time),
            'num': len(df),
            'header': header
        }
        ObjectDirectory.save_object(self.history_path, self.history)
        del df
        print { k:v for k,v in self.history[path].items() if k != 'header' },
        print '%d of %d' % (len(self.history), len(self.log_list))

    def check(self):
        history = ObjectDirectory.load_object(self.history_path, {})
        sorted_keys = history.keys()
        sorted_keys.sort(key=lambda k: history[k]['start'])
        print '-' * 80
        print 'Time range by log file'
        for i, path in enumerate(sorted_keys):
            hist = history[path]
            print '%2d: %s  ---  %s : %s' % (i, hist['start'], hist['end'], path)
        
        path0 = sorted_keys[0]
        for path1 in sorted_keys[1:]:
            hist0,hist1 = history[path0],history[path1] 
            assert hist0['end'] < hist1['start'], '''
            -----------
            %s %s
            start: %s
            end  : %s
            -----------
            %s %s
            hist1['start']
            start: %s
            end  : %s
            ''' % (
                path0, hist0, hist0['start'],  hist0['end'],
                path1, hist1, hist1['start'],  hist1['end'])    
Code Example #32
File: ml_data.py Project: enthought/pygotham
class WeatherStore(object):
    """ WeatherStore serves as a datasource for weather data

    """
    def __init__(self, filename):
        """
        Parameters
        ----------
        filename : filename pointing to an existing HDFStore with
            valid data in it.

        """
        self._store = HDFStore(filename)

    def dframe(self, city):
        """ Get weather data for specified city

        Parameters
        ----------
        city : string
            City for which to fetch data

        Returns
        -------
        result : pandas DataFrame

        """
        val = self._store[city]
        if isinstance(val, Panel):
            key = val.items[0]
            val = val[key]
        return val

    def field_numpy(self, city, field):
        """ Get weather field for specified city

        Parameters
        ----------
        city : string
            City for which data is being requested

        field : string
            Weather field being requested

        Returns
        -------
        result : numpy ndarray
            Value of requested weather field for city

        """
        df = self.dframe(city)
        y = np.empty((df.shape[0], ), dtype=np.float64)
        y[:] = df[field]
        return y

    def time_indices(self, df):
        """ Get time indices out of Pandas DataFrame

        Parameters
        ----------
        df : Pandas DataFrame

        Returns
        -------
        result : numpy ndarray
            Time index for given DataFrame

        """
        X = np.empty((df.shape[0], 3), dtype=np.float64)
        X[:, 0] = df.index.year
        X[:, 1] = df.index.month
        X[:, 2] = df.index.day
        return X

    def learning_data(self, city, field):
        """ Get input parameters and output values so that
        it can be shipped to a learning method.

        Returns
        -------
        X : numpy array of shape (n,2).
            Columns are month and day
        y : numpy array of shape (n,).
            value of field being requested

        """
        df = self.dframe(city)
        X = self.time_indices(df)[:, 1:]
        y = self.field_numpy(city, field)
        return X, y

    def dataseries(self, city, field):
        """ Get dataseries containing field data for city

        Parameters
        ----------
        city : string
            City for which data is being requested

        field : string
            Weather field being requested

        Returns
        -------
        result : DataSeries
            get the specified field for city as a DataSeries

        """
        df = self.dframe(city)
        indices = self.time_indices(df)
        data = self.field_numpy(city, field)
        return DataSeries(city, data, indices)

    def cities(self):
        """ Get cities contained in this WeatherStore

        Returns
        -------
        result : List of strings
            Names of cities for which this Store has some
            weather data.

        """
        return self._store.keys()
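
A short usage sketch for WeatherStore above; the file name and field name are placeholders, and cities() returns the raw HDFStore keys (with a leading '/'), which can be passed straight back to the accessors:

ws = WeatherStore("weather.h5")        # placeholder file, assumed to be populated beforehand
print(ws.cities())                     # e.g. ['/austin', '/boston']
X, y = ws.learning_data(ws.cities()[0], "temperature")   # month/day features and field values
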
Code Example #33
File: ml_data.py Project: JuergenNeubauer/pygotham
class WeatherStore(object):
    """ WeatherStore serves as a datasource for weather data

    """
    def __init__(self, filename):
        """
        Parameters
        ----------
        filename : filename pointing to an existing HDFStore with
            valid data in it.

        """
        self._store = HDFStore(filename)

    def dframe(self, city):
        """ Get weather data for specified city

        Parameters
        ----------
        city : string
            City for which to fetch data

        Returns
        -------
        result : pandas DataFrame

        """
        val = self._store[city]
        if isinstance(val, Panel):
            key = val.items[0]
            val = val[key]
        return val

    def field_numpy(self, city, field):
        """ Get weather field for specified city

        Parameters
        ----------
        city : string
            City for which data is being requested

        field : string
            Weather field being requested

        Returns
        -------
        result : numpy ndarray
            Value of requested weather field for city

        """
        df = self.dframe(city)
        y = np.empty((df.shape[0], ), dtype=np.float64)
        y[:] = df[field]
        return y

    def time_indices(self, df):
        """ Get time indices out of Pandas DataFrame

        Parameters
        ----------
        df : Pandas DataFrame

        Returns
        -------
        result : numpy ndarray
            Time index for given DataFrame

        """
        X = np.empty((df.shape[0], 3), dtype=np.float64)
        X[:, 0] = df.index.year
        X[:, 1] = df.index.month
        X[:, 2] = df.index.day
        return X

    def learning_data(self, city, field):
        """ Get input parameters and output values so that
        it can be shipped to a learning method.

        Returns
        -------
        X : numpy array of shape (n,2).
            Columns are month and day
        y : numpy array of shape (n,).
            value of field being requested

        """
        df = self.dframe(city)
        X = self.time_indices(df)[:, 1:]
        y = self.field_numpy(city, field)
        return X, y

    def dataseries(self, city, field):
        """ Get dataseries containing field data for city

        Parameters
        ----------
        city : string
            City for which data is being requested

        field : string
            Weather field being requested

        Returns
        -------
        result : DataSeries
            get the specified field for city as a DataSeries

        """
        df = self.dframe(city)
        indices = self.time_indices(df)
        data = self.field_numpy(city, field)
        return DataSeries(city, data, indices)

    def cities(self):
        """ Get cities contained in this WeatherStore

        Returns
        -------
        result : List of strings
            Names of cities for which this Store has some
            weather data.

        """
        return self._store.keys()
Code Example #34
File: cutils.py Project: wayne1800/earth
def append_store_mod( module, path_store, n_days_refresh=None, b_ptrk=False ):
    """ append all new rows in module.field to store. Resize store as appropriate. """ 
    store = HDFStore( path_store )
    for field in module.__dict__.keys():
        if ( type( getattr( module, field ) ) is DataFrame or type( getattr( module, field ) ) is Panel ) and "/{}".format( field ) in store.keys():
            if "tdate" in field:
                getattr( module, field ).to_hdf( path_store, field, mode='a', format='fixed' )
            else:
                solbasic.logger.info( "Working on {}...".format( field ) )
                df = store[ field ].copy()
                df_new = getattr( module, field ).copy()
                if n_days_refresh == None:
                    l_index = sorted( list( set( df_new.index ) - set( df.index ) ) )
                else:
                    l_index = sorted( list( df_new.index[ -n_days_refresh: ] ) )
                l_columns = sorted( list( set( df_new.columns ) - set( df.columns ) ) )
                l_columns_rev = sorted( list( set( df.columns ) - set( df_new.columns ) ) )
                if l_columns:
                    solbasic.logger.info( "Adding {} instruments: {}".format( len( l_columns ), l_columns ) )
                    for col in l_columns:
                        df[ col ] = np.nan
                if l_columns_rev:
                    for col in l_columns_rev:
                        df_new[ col ] = df[ col ]
                if l_index:
                    solbasic.logger.info( "Refreshing {} dates: {}".format( len( l_index ), l_index ) )
                    for ind in l_index:
                        df.ix[ ind ] = df_new.ix[ ind ]
                    df.to_hdf( path_store, field, mode='a', format='fixed' )
    store.close()
    if b_ptrk:
        ptrk_store( path_store )
Code Example #35
def test_hdf5(h5_name):
    store = HDFStore(h5_name)
    for key in store.keys():
        print key

    store.close()
Code Example #36
        "lying", "sitting", "standing", "walking", "running", "cycling"
    ]] = totals_row[[
        "lying", "sitting", "standing", "walking", "running", "cycling"
    ]].divide(12, axis="index")
    return totals_row


host = 'http://localhost'
port = 10200
hf = HDFStore('/Volumes/LaCie/dataset/timestamped_predictions.hdf')
limit = 27007

subjectlist = pd.read_csv(
    api.subject_names(host, port, limit=limit, successful_only=True))
subjectlist = subjectlist['name'].tolist()

frames = []
for subject in subjectlist:
    print(subject)
    data = pd.read_csv(api.timestamped_predictions(host, port, subject),
                       names=["timestamp", "label", "probability"])
    hf.put('s' + subject.__str__(), data, format='table', data_columns=True)
    #summary_df = create_summary(subject,data)
    #frames.append(summary_df)

#appended_data = pd.concat(frames, axis=0)
#appended_data.to_csv("../output/AAAAAA-summary-all-classes.csv", index=False)

print(hf.keys())

hf.close()
Code Example #37
def preprocess(directory, n_entries):

    hdf_path = directory.get_path("logs.h5", temp=False)
    print "hdf_path: %s" % hdf_path

    store = HDFStore(hdf_path)
    print "Keys: %s" % store.keys()
    print store
    store.close()
    df = pd.read_hdf(hdf_path, "logs")

    # df = directory.load('logs.h5')
    print "df: %s" % df

    if n_entries >= 0:
        df = df[:n_entries]

    secs = (df.index.max() - df.index.min()).total_seconds()
    hours = secs / 3600
    levels = df.level.unique()

    print "%.1f hours of logs" % hours

    print "%d log entries/hour" % int(len(df) / hours)
    print "%.1f thousand log entries/hour" % (int(len(df) / hours) / 1000.0)
    print df.shape, df.columns
    for level in levels:
        print "%-5s : %5d" % (level, len(df[df.level == level]))
    print "df : %s" % str(df.shape)

    if False:

        def get_peak(counts):
            """Retun the peak value in Series counts"""
            if len(counts) == 0:
                return None
            return counts.idxmax()
            # return counts.index[counts.argmax()]

    start_time, end_time = df.index.min(), df.index.max()
    print "orginal: start_time, end_time = %s, %s" % (start_time, end_time)

    # Start time and end time trunctated to whole minutes
    start_time = truncate_to_minutes(start_time + timedelta(minutes=2))
    end_time = truncate_to_minutes(end_time - timedelta(minutes=2))
    print "cleaned: start_time, end_time = %s, %s" % (start_time, end_time)

    details = get_details(df)
    directory.save("details", details)

    # The counts for each 1 minute bin
    minute_counts = get_minute_counts(df, start_time, end_time)
    print "minute_counts: %s\n%s" % (type(minute_counts), minute_counts.describe())
    print "total entries: %s" % minute_counts.sum()

    level_counts = {level: get_minute_counts(df[df.level == level], start_time, end_time) for level in levels}

    # level_peaks = {level: get_peak(level_counts[level])  for level in levels}
    # print 'level_peaks: %s' % level_peaks

    if False:
        unique_files = df.file.unique()
        print "%d source files" % len(unique_files)
        for i, fl in enumerate(sorted(unique_files)[:5]):
            print "%3d: %s" % (i, fl)

        directory.save("unique_files", unique_files)

    #
    # Get all the unique log messages
    #
    level_file_line = df.groupby(["level", "file", "line"])
    lfl_size = level_file_line.size()
    lfl_sorted = lfl_size.order(ascending=False)
    print "lfl_sorted: %s" % str(lfl_sorted.shape)

    # directory.save('level_file_line', tuple(level_file_line))
    directory.save("lfl_sorted", lfl_sorted)

    # file:line uniquely identifies each level,file,line
    # Construct mappings in both directions
    lfl_to_string = OrderedDict(((lvl, fl, ln), "%s:%d" % (fl, ln)) for lvl, fl, ln in lfl_sorted.index)
    string_to_lfl = OrderedDict(("%s:%d" % (fl, ln), (lvl, fl, ln)) for lvl, fl, ln in lfl_sorted.index)
    print "string_to_lfl: %s" % len(string_to_lfl)

    # [((level,file,line),count)] sorted by count in descending order
    entry_types_list = zip(lfl_sorted.index, lfl_sorted)

    # {(level,file,line) : count}
    entry_types = OrderedDict(entry_types_list)
    directory.save("entry_types", entry_types)
    print "entry_types: %s" % len(entry_types)

    #
    # Build the correlation table
    #
    threshold = min(100, len(df) // 1000)
    lfl_freq_dict = {
        s: get_minute_counts(df[(df.file == fl) & (df.line == ln)], start_time, end_time)
        for s, (lvl, fl, ln) in string_to_lfl.items()
        if len(df[(df.file == fl) & (df.line == ln)]) >= threshold
    }
    print "++++"
    lfl_freq = DataFrame(lfl_freq_dict, columns=string_to_lfl.keys())
    directory.save("lfl_freq", lfl_freq)

    lfl_freq_corr = lfl_freq.corr()
    directory.save("lfl_freq_corr", lfl_freq_corr)
    print "lfl_freq_corr: %s" % str(lfl_freq_corr.shape)
Code Example #38
File: datatables.py Project: Iliato/openfisca-core
    def populate_from_survey_data(self, fname, year = None):
        '''
        Populates a DataTable from survey data
        '''
        list_entities = self.list_entities

        if isinstance(fname, str) or isinstance(fname, unicode):
            if fname[-4:] == '.csv':
                # TODO: implement it for _num_table==3 (or remove)
                if self.num_table == 1 :
                    with open(fname) as survey_data_file:
                        self.table = read_csv(survey_data_file)
                else :
                    raise Exception('For now, use three csv table is not allowed'
                                    'although there is no major difficulty. Please,'
                                    'feel free to code it')

            elif fname[-3:] == '.h5':
                store = HDFStore(fname)
                if self.num_table == 1 :
                    available_years = sorted([int(x[-4:]) for x in  store.keys()])
                elif self.num_table == 3 :
                    available_years = (sorted([int(x[-8:-4]) for x in  store.keys()]))
                # note+ we have a repetition here in available_years but it doesn't matter

                if year is None:
                    if self.datesim is not None:
                        year_ds = self.datesim.year
                    else:
                        raise Exception('self.datesim or year should be defined')
                else:
                    year_ds = year

                yr = year_ds + 0  # to avoid pointers problem
                while yr not in available_years and yr > available_years[0]:
                    yr = yr - 1
                base_name = 'survey_' + str(yr)
                if year_ds != yr:
                    print 'Survey data for year %s not found. Using year %s' % (str(year_ds), str(yr))
                else:
                    print 'Survey data for year %s found' % str(year_ds)

                if yr in available_years:
                    self.survey_year = yr


                if self.num_table == 1 :
                    self.table = _survey_subset(store[str(base_name)], self.subset)

                elif self.num_table == 3 :
                    for entity in self.list_entities:
                        self.table3[entity] = _survey_subset(store[str(base_name) + '/' + entity], self.subset)
                store.close()

        else:
            if self.num_table == 1:
                if not isinstance(fname, DataFrame):
                    raise Exception("When num_table=1, the object given as survey data must be a pandas DataFrame")
                else:
                    self.table = _survey_subset(fname, self.subset)
            elif self.num_table == 3:
                try:
                    for entity in list_entities:
                        assert isinstance(fname[entity], DataFrame)
                        self.table3[entity] = _survey_subset(fname[entity], self.subset)
                except:
                    log.error("When num_table=3, the object given as survey data"
                        " must be a dictionary of pandas DataFrame with each entity in keys")
                    raise

        missing_col = []
        var_entity = {}
        if self.num_table == 1 :
            self._nrows = self.table.shape[0]
            # Intialize to default value the missing variables
            for col in self.column_by_name.itervalues():
                if col.name not in self.table:
                    missing_col.append(col.name)
                    self.table[col.name] = col._default
                try:
                    if self.table[col.name].isnull().any():
                        self.table[col.name].fillna(col._default, inplace = True)
                    self.table[col.name] = self.table[col.name].astype(col._dtype)
                except:
                    log.error("Impossible de lire la variable suivante issue des données d'enquête :\n%s\n" % col.name)
                    raise
            # Keeping only valid input variables
            drop_variables = list(set(self.table.columns) - set(self.column_by_name.keys()))
            self.table.drop(drop_variables, inplace = True, axis = 1)

        elif self.num_table == 3 :
            self._nrows = self.table3['ind'].shape[0]
            for ent in list_entities:
                var_entity[ent] = [x for x in self.column_by_name.itervalues() if x.entity == ent]
                for col in var_entity[ent]:
                    if not col.name in self.table3[ent]:
                        missing_col.append(col.name)
                        self.table3[ent][col.name] = col._default
                    if self.table3[ent][col.name].isnull().any():
                        self.table3[ent][col.name].fillna(col._default, inplace = True)
                    self.table3[ent][col.name] = self.table3[ent][col.name].astype(col._dtype)
                if ent == 'foy':
                    self.table3[ent] = self.table3[ent].to_sparse(fill_value = 0)

        if missing_col:
            message = "%i input variables missing\n" % len(missing_col)
            messagef = ""
            messageb = ""
            missing_col.sort()
            for var in missing_col:
                if var[0] == 'f':
                    messagef += '  - ' + var + '\n'
                elif var[0] == 'b':
                    messageb += '  - ' + var + '\n'
                else:
                    message += '  - ' + var + '\n'
            if self.print_missing:
                print Warning(message + messagef + messageb)

        for var in model.ENTITIES_INDEX:
            if ('id' + var) in missing_col:
                raise Exception('Survey data needs variable %s' % ('id' + var))

            if ('qui' + var) in missing_col:
                raise Exception('Survey data needs variable %s' % ('qui' + var))

        self.gen_index(model.ENTITIES_INDEX)
コード例 #39
 def get_group_names(self):
     s = HDFStore(self.path)
     names = s.keys()
     s.close()
     return names
コード例 #40
 def get_population_choices(self, filename):
     store_pop = HDFStore(filename,'r')
     choices = store_pop.keys()
     store_pop.close()
     return choices
コード例 #41
__author__ = 'Gleb'
import warnings
from datetime import timedelta
from datetime import datetime

import pandas as pd
import numpy as np
from pandas import HDFStore

warnings.filterwarnings("ignore")
prices = ['F:\\DataBase\\BestBidAsk.h5']
deals = ['F:\\DataBase\\DealsFrom27Apr.h5', 'F:\\DataBase\\Deals.h5']

prices_store = HDFStore(prices[0])
instruments = prices_store.keys()
deals_store1 = HDFStore(deals[0])
deals_store2 = HDFStore(deals[1])
sizes = Data_merger('Deals', ' Nonaggr.csv')

def get_client_order_book(aggr_id, index):
    bid_path = '/' + str(aggr_id) + '/BidQuotes'
    ask_path = '/' + str(aggr_id) + '/AskQuotes'

    after_apr25 = pd.to_datetime(index) > datetime(2015, 4, 25)
    if after_apr25:
        try:
            bid_quotes = deals_store1.select(bid_path).drop_duplicates(subset='QuoteId', take_last=False)
            bid_aval = 1
        except KeyError:
            bid_quotes = pd.DataFrame()
コード例 #42
# All HDF5 files in the path
all_files = glob(os.path.join(folder_path, "*.h5"))

# Remove the PDF only ones (relevant results are in the PDF_KS and
# PDF_Hellinger keywords)
remove_keys = ["PDF", "PDF_AD"]

# Rename keys
rename_keys = {
    "VCS_Density": "VCS_Small_Scale",
    "VCS_Velocity": "VCS_Large_Scale"
}

for f in all_files:
    store = HDFStore(f)

    # Removals
    for key in remove_keys:
        if "/" + key in store.keys():
            del store[key]

    # Rename
    for old_key in rename_keys:
        if "/" + old_key in store.keys():
            store[rename_keys[old_key]] = store[old_key].copy()
            del store[old_key]

    print("Final keys: " + str(store.keys()))

    store.close()
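Deleting or renaming keys in place does not shrink the HDF5 file on disk; the freed space is only reclaimed by repacking the file. A minimal sketch, assuming the ptrepack utility that ships with PyTables is available on the PATH:

# Hedged sketch: repack an HDF5 file after removing keys so the freed space is reclaimed.
# Assumes PyTables' ptrepack command-line tool is installed and on the PATH.
import subprocess

def repack_h5(src_path, dst_path):
    # ptrepack copies every remaining node into a fresh, compacted file
    subprocess.check_call(["ptrepack", src_path, dst_path])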
コード例 #43
class WikiStore(object):
    """
    WikiStore is a HDFStore storage for a Quandl WIKI dataset.

    The Quandl WIKI dataset can be retrieved from: https://www.quandl.com/data/WIKI-Wiki-EOD-Stock-Prices.
    """
    def __init__(self, base_dir, date_index=True):
        self.base_dir = base_dir
        assert os.path.exists(self.base_dir)
        self.date_index = date_index
        self._init()

    def keys(self):
        return self.tickers

    @lru_cache(maxsize=100)
    def __getitem__(self, item):
        df = self.store[item]
        if self.date_index:
            df.set_index('date', inplace=True)
        return df

    @staticmethod
    def store_snapshot(base_dir, snapshot_file):
        w_df = pd.read_csv(snapshot_file, parse_dates=[1])
        w_df.columns = [c.replace('-', '_') for c in w_df.columns]
        w_df.set_index('ticker', inplace=True)
        w_df.sort_index(inplace=True)

        snapshot_file = datetime.today().strftime('%Y%m%d')

        with HDFStore(os.path.join(base_dir, '{}.h5'.format(snapshot_file)),
                      'w',
                      complevel=6,
                      complib='blosc') as store:
            tickers = set(w_df.index)
            for ticker in tickers:
                df = w_df.loc[ticker, :]
                df.reset_index(inplace=True)
                df = df.drop('ticker', 1)

                store[ticker] = df

    def _init(self):
        self.store = HDFStore(latest_filename('{}/*.h5'.format(self.base_dir)))
        self.tickers = [t[1:] for t in self.store.keys()]

    def close(self):
        self.store.close()

    def tickers_column(self, tickers, col='adj_close', fun_filter=None):
        if not tickers:
            return None

        def fetch_column(ticker):
            ticker_dat = self[ticker]
            df = ticker_dat[[col]]
            df.columns = [ticker]
            if fun_filter:
                df = fun_filter(df)
            return df

        buf = [fetch_column(ticker) for ticker in tickers]

        if len(tickers) == 1:
            return buf[0]

        return buf[0].join(buf[1:])
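A hedged usage sketch for the WikiStore class above; the directory, CSV snapshot name and tickers are assumptions, and latest_filename is expected to resolve the newest *.h5 file in base_dir:

# Hypothetical usage of WikiStore (paths, file names and tickers are assumptions).
WikiStore.store_snapshot('wiki_data', 'WIKI_PRICES.csv')  # writes wiki_data/<YYYYMMDD>.h5

store = WikiStore('wiki_data')
print(store.keys()[:5])                                   # first few tickers
aapl = store['AAPL']                                      # one ticker's rows, indexed by date
closes = store.tickers_column(['AAPL', 'MSFT'], col='adj_close')
store.close()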
コード例 #44
def test_hdf5(h5_name):
    store = HDFStore(h5_name)
    for key in store.keys():
        print key

    store.close()
コード例 #45
    def populate_from_survey_data(self, fname, year=None):
        '''
        Populates a DataTable from survey data
        '''
        list_entities = self.list_entities

        if isinstance(fname, str) or isinstance(fname, unicode):
            if fname[-4:] == '.csv':
                # TODO: implement it for _num_table==3 (or remove)
                if self.num_table == 1:
                    with open(fname) as survey_data_file:
                        self.table = read_csv(survey_data_file)
                else:
                    raise Exception(
                        'For now, using three csv tables is not allowed, '
                        'although there is no major difficulty. Please '
                        'feel free to code it.')

            elif fname[-3:] == '.h5':
                store = HDFStore(fname)
                if self.num_table == 1:
                    available_years = sorted(
                        [int(x[-4:]) for x in store.keys()])
                elif self.num_table == 3:
                    available_years = (sorted(
                        [int(x[-8:-4]) for x in store.keys()]))
                # note: available_years may contain repeated entries here, but it doesn't matter

                if year is None:
                    if self.datesim is not None:
                        year_ds = self.datesim.year
                    else:
                        raise Exception(
                            'self.datesim or year should be defined')
                else:
                    year_ds = year

                yr = year_ds + 0  # to avoid pointers problem
                while yr not in available_years and yr > available_years[0]:
                    yr = yr - 1
                base_name = 'survey_' + str(yr)
                if year_ds != yr:
                    print 'Survey data for year %s not found. Using year %s' % (
                        str(year_ds), str(yr))
                else:
                    print 'Survey data for year %s found' % str(year_ds)

                if yr in available_years:
                    self.survey_year = yr

                if self.num_table == 1:
                    self.table = _survey_subset(store[str(base_name)],
                                                self.subset)

                elif self.num_table == 3:
                    for entity in self.list_entities:
                        self.table3[entity] = _survey_subset(
                            store[str(base_name) + '/' + entity], self.subset)
                store.close()

        else:
            if self.num_table == 1:
                if not isinstance(fname, DataFrame):
                    raise Exception(
                        "When num_table=1, the object given as survey data must be a pandas DataFrame"
                    )
                else:
                    self.table = _survey_subset(fname, self.subset)
            elif self.num_table == 3:
                try:
                    for entity in list_entities:
                        assert isinstance(fname[entity], DataFrame)
                        self.table3[entity] = _survey_subset(
                            fname[entity], self.subset)
                except:
                    log.error(
                        "When num_table=3, the object given as survey data"
                        " must be a dictionary of pandas DataFrame with each entity in keys"
                    )
                    raise

        missing_col = []
        var_entity = {}
        if self.num_table == 1:
            self._nrows = self.table.shape[0]
            # Initialize missing variables to their default value
            for col in self.column_by_name.itervalues():
                if col.name not in self.table:
                    missing_col.append(col.name)
                    self.table[col.name] = col._default
                try:
                    if self.table[col.name].isnull().any():
                        self.table[col.name].fillna(col._default, inplace=True)
                    self.table[col.name] = self.table[col.name].astype(
                        col._dtype)
                except:
                    log.error(
                        "Unable to read the following variable from the survey data:\n%s\n"
                        % col.name)
                    raise
            # Keeping only valid input variables
            drop_variables = list(
                set(self.table.columns) - set(self.column_by_name.keys()))
            self.table.drop(drop_variables, inplace=True, axis=1)

        elif self.num_table == 3:
            self._nrows = self.table3['ind'].shape[0]
            for ent in list_entities:
                var_entity[ent] = [
                    x for x in self.column_by_name.itervalues()
                    if x.entity == ent
                ]
                for col in var_entity[ent]:
                    if not col.name in self.table3[ent]:
                        missing_col.append(col.name)
                        self.table3[ent][col.name] = col._default
                    if self.table3[ent][col.name].isnull().any():
                        self.table3[ent][col.name].fillna(col._default,
                                                          inplace=True)
                    self.table3[ent][col.name] = self.table3[ent][
                        col.name].astype(col._dtype)
                if ent == 'foy':
                    self.table3[ent] = self.table3[ent].to_sparse(fill_value=0)

        if missing_col:
            message = "%i input variables missing\n" % len(missing_col)
            messagef = ""
            messageb = ""
            missing_col.sort()
            for var in missing_col:
                if var[0] == 'f':
                    messagef += '  - ' + var + '\n'
                elif var[0] == 'b':
                    messageb += '  - ' + var + '\n'
                else:
                    message += '  - ' + var + '\n'
            if self.print_missing:
                print Warning(message + messagef + messageb)

        for var in model.ENTITIES_INDEX:
            if ('id' + var) in missing_col:
                raise Exception('Survey data needs variable %s' % ('id' + var))

            if ('qui' + var) in missing_col:
                raise Exception('Survey data needs variable %s' %
                                ('qui' + var))

        self.gen_index(model.ENTITIES_INDEX)
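A hedged call sketch for the method above; input_table stands for an already constructed DataTable-like object, so the variable name and the survey file path are assumptions:

# Hypothetical usage (input_table is assumed to be an existing DataTable instance).
input_table.populate_from_survey_data('survey.h5', year=2006)
print(input_table.survey_year)
print(input_table._nrows)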
コード例 #46
ファイル: test_file_handling.py プロジェクト: Aathi410/Pro123
def test_multiple_open_close(setup_path):
    # gh-4409: open & close multiple times

    with ensure_clean_path(setup_path) as path:

        df = tm.makeDataFrame()
        df.to_hdf(path, "df", mode="w", format="table")

        # single
        store = HDFStore(path)
        assert "CLOSED" not in store.info()
        assert store.is_open

        store.close()
        assert "CLOSED" in store.info()
        assert not store.is_open

    with ensure_clean_path(setup_path) as path:

        if pytables._table_file_open_policy_is_strict:
            # multiples
            store1 = HDFStore(path)
            msg = (
                r"The file [\S]* is already opened\.  Please close it before "
                r"reopening in write mode\."
            )
            with pytest.raises(ValueError, match=msg):
                HDFStore(path)

            store1.close()
        else:

            # multiples
            store1 = HDFStore(path)
            store2 = HDFStore(path)

            assert "CLOSED" not in store1.info()
            assert "CLOSED" not in store2.info()
            assert store1.is_open
            assert store2.is_open

            store1.close()
            assert "CLOSED" in store1.info()
            assert not store1.is_open
            assert "CLOSED" not in store2.info()
            assert store2.is_open

            store2.close()
            assert "CLOSED" in store1.info()
            assert "CLOSED" in store2.info()
            assert not store1.is_open
            assert not store2.is_open

            # nested close
            store = HDFStore(path, mode="w")
            store.append("df", df)

            store2 = HDFStore(path)
            store2.append("df2", df)
            store2.close()
            assert "CLOSED" in store2.info()
            assert not store2.is_open

            store.close()
            assert "CLOSED" in store.info()
            assert not store.is_open

            # double closing
            store = HDFStore(path, mode="w")
            store.append("df", df)

            store2 = HDFStore(path)
            store.close()
            assert "CLOSED" in store.info()
            assert not store.is_open

            store2.close()
            assert "CLOSED" in store2.info()
            assert not store2.is_open

    # ops on a closed store
    with ensure_clean_path(setup_path) as path:

        df = tm.makeDataFrame()
        df.to_hdf(path, "df", mode="w", format="table")

        store = HDFStore(path)
        store.close()

        msg = r"[\S]* file is not open!"
        with pytest.raises(ClosedFileError, match=msg):
            store.keys()

        with pytest.raises(ClosedFileError, match=msg):
            "df" in store

        with pytest.raises(ClosedFileError, match=msg):
            len(store)

        with pytest.raises(ClosedFileError, match=msg):
            store["df"]

        with pytest.raises(ClosedFileError, match=msg):
            store.select("df")

        with pytest.raises(ClosedFileError, match=msg):
            store.get("df")

        with pytest.raises(ClosedFileError, match=msg):
            store.append("df2", df)

        with pytest.raises(ClosedFileError, match=msg):
            store.put("df3", df)

        with pytest.raises(ClosedFileError, match=msg):
            store.get_storer("df2")

        with pytest.raises(ClosedFileError, match=msg):
            store.remove("df2")

        with pytest.raises(ClosedFileError, match=msg):
            store.select("df")

        msg = "'HDFStore' object has no attribute 'df'"
        with pytest.raises(AttributeError, match=msg):
            store.df
コード例 #47
    hdf.close()


def getKrakenData(interval=1440, since=0):
    directory = krakenutl.SRCDIR
    if not os.path.exists(directory):
        os.makedirs(directory)
    for p in krakenutl.PAIRS:
        logger.debug('download data for: ' + p + ' interval: ' +
                     str(interval) + ' since:' +
                     str(krakenutl.localTimeFromEpoch(since)))
        pdata = getOhlc(p, interval, since)
        storeHdf5(pdata, krakenutl.getTagFromPair(p, interval),
                  krakenutl.getH5source())


if __name__ == '__main__':
    getKrakenData(krakenutl.DAY, STARTDATE)
    getKrakenData(krakenutl.WEEK, STARTDATE)
    getKrakenData(krakenutl.H3, STARTDATE)
    getKrakenData(krakenutl.H1, STARTDATE)
    getKrakenData(krakenutl.M30, STARTDATE)
    getKrakenData(krakenutl.M15, STARTDATE)
    getKrakenData(krakenutl.M5, STARTDATE)
    #df=getOhlc("XXBTZUSD",5,1441148619)
    #print(df)
    hdf = HDFStore(krakenutl.getH5source())
    for k in hdf.keys():
        print(k, len(hdf[k]))
    hdf.close()
コード例 #48
import os
import pdb

import numpy as np
from pandas import HDFStore # DataFrame
from openfisca_core import model


filename = os.path.join(model.DATA_DIR, 'survey.h5')
filename3 = os.path.join(model.DATA_DIR, 'survey3.h5')

store = HDFStore(filename)
output = HDFStore(filename3)

# remove from output first so that it can be overwritten
available_years = sorted([int(x[-4:]) for x in  store.keys()])
available_years = [2006]


def from_one_to_three(table, entity):
    return [
        name
        for name, column in model.column_by_name.iteritems()
        if name in table.columns and column.entity == entity
        ]


# we could take the opportunity to build the index here? It would run a bit faster,
# but above all it would be more "essential"

for year in available_years:
コード例 #49
ファイル: base.py プロジェクト: afeldmei/wsynphot
    def load_filter(cls,
                    filter_name=None,
                    wavelength_unit=None,
                    interpolation_kind='linear'):
        """

        Parameters
        ----------

        filter_name: str or None

        wavelength_unit: str or astropy.units.Unit
            for some filtersets (e.g. gemini) this can be autodetected

        interpolation_kind: str
            see scipy.interpolation.interp1d


        """
        if filter_name is None:
            filter_store = HDFStore(filter_data_fname, mode='r')
            available_filters = filter_store.keys()
            filter_store.close()
            print("Available Filters\n"
                  "-----------------\n\n" + '\n'.join(available_filters))

        else:
            filter_store = HDFStore(filter_data_fname, mode='r')
            try:
                filter = filter_store[filter_name]
            except KeyError:
                filter_store.close()
                raise ValueError(
                    'Requested filter ({0}) does not exist'.format(
                        filter_name))
            finally:
                filter_store.close()

            if 'gemini' in filter_name:
                wavelength_unit = 'nm'
            elif 'bessell' in filter_name:
                wavelength_unit = 'angstrom'

            elif 'hst' in filter_name:
                wavelength_unit = 'angstrom'

            elif 'decam' in filter_name:
                wavelength_unit = 'angstrom'

            elif 'sdss' in filter_name:
                wavelength_unit = 'angstrom'

            if wavelength_unit is None:
                raise ValueError('No "wavelength_unit" given and none '
                                 'autodetected')

            wavelength = filter.wavelength.values * u.Unit(wavelength_unit)

            return cls(wavelength,
                       filter.transmission_lambda.values,
                       interpolation_kind=interpolation_kind,
                       filter_name=filter_name)
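A hedged usage sketch; only the classmethod body is shown above, so the enclosing class name (written here as FilterCurve) and the filter key are assumptions:

# Hypothetical usage (class name and filter key are assumptions).
FilterCurve.load_filter()  # with no name, prints the available filter keys
bessell_b = FilterCurve.load_filter('bessell/b', interpolation_kind='linear')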
コード例 #50
 def get_population_choices(self, filename):
     store_pop = HDFStore(filename, 'r')
     choices = store_pop.keys()
     store_pop.close()
     return choices
コード例 #51
ファイル: hdf5_to_csv.py プロジェクト: CUUATS/landuse-model

# In[46]:

def hdf5_to_csv(filename):

    """
    Converts hdf5 files to csv

    Parameters
    ----------
    filename: string or list of strings
        Name of the hdf5 file being converted

    Returns
    -------
    Writes each hdf5 key to an individual csv file (returns nothing)
    """

    store = HDFStore(filename)
    # iterate over the keys directly rather than indexing store.keys() repeatedly
    for key in store.keys():
        store[key].to_csv(key[1:] + '.csv')
    # close the file handle once all keys have been written
    store.close()
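A hedged usage sketch for the converter above; it writes one CSV per key found in the store:

# Hypothetical call: produces one .csv file per key in sanfran_public.h5.
hdf5_to_csv('sanfran_public.h5')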


# In[39]:

# hdf5 files can also be dumped to an ASCII file with the following command at the command prompt:

# h5dump -o dset.asci -y -w 400 sanfran_public.h5

コード例 #52
def convert_format(path, design, face1, face2, output_type="csv",
                   parameters=None):
    '''
    Takes all HDF5 files in given path comparing face1 to face2 and combines
    them into a single file.

    Parameters
    ----------
    path : str
        Path where files are located.
    design : str or pandas.DataFrame
             If str, assumes a 'csv' file.
    face1 : int
        Face of the cube.
    face2: int
        Face of the cube compared to.
    output_type : str, optional
           Type of file to output.
    parameters : list, optional
                 Contains column names of design that are the parameters
                 varied in the set. If None, all columns are appended to
                 the output file.
    '''

    files = [path + f for f in os.listdir(path) if os.path.isfile(path + f)
             and str(face1) + "_" + str(face2) in f and f[:9] != "fiducial_"]
    print "Files used: %s" % (files)

    if isinstance(design, str):
        design = read_csv(design)

    if isinstance(parameters, list):
        design_df = {}
        for param in parameters:
            design_df[param] = Series(design[param])
        design_df = DataFrame(design_df)
    else:
        design_df = design

    for i, f in enumerate(files):
        store = HDFStore(f)
        data_columns = {}
        # Get data from HDF5
        for key in store.keys():
            data = store[key].sort(axis=0).sort(axis=1)
            index = data.index
            mean_data = data.mean(axis=1)
            data_columns[key[1:]] = mean_data
        store.close()

        # Add on design matrix
        for key in design_df:
            # can get nans if the file was made in excel
            design_df = design_df.dropna()
            design_df.index = index
            data_columns[key] = design_df[key]

        if i == 0:  # Create dataframe
            df = DataFrame(data_columns)
        else:  # Add on to dataframe
            data_columns = DataFrame(data_columns)
            df = concat([df, data_columns])

    filename = "distances_"+str(face1)+"_"+str(face2)

    if output_type == "csv":
        df.to_csv(path+filename+".csv")
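A hedged call sketch for convert_format; the path, design file, faces and parameter names below are assumptions:

# Hypothetical call (paths, faces and parameter names are assumptions).
convert_format('results/', 'design.csv', face1=0, face2=2,
               output_type='csv', parameters=['plaw', 'mach'])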
コード例 #53
 def get_keys(self):
     s = HDFStore(self.path)
     keys = s.keys()
     s.close()
     return keys