Example #1
def merge():
    dfs = []
    with pd.get_store('/Volumes/HDD/Users/tom/DataStorage/CPS/analyzed/clean.h5') as store:

        for k, v in store.iteritems():
            if k.startswith('/y'):
                df = store.select(k)

                df = df[(df['age'] > 18) & (df['age'] < 65)]

                dfs.append(df)

    df = pd.concat(dfs)

    with pd.get_store('/Volumes/HDD/Users/tom/DataStorage/CPS/analyzed/analyzed_store.h5') as store:
        deflator = store.select('deflator')['deflator']
        deflator.index.set_names(['qmonth'], inplace=True)

    df.loc[:, 'wage'] = df.earnwke.div(df.uhourse)
    df.loc[:, 'r_wage'] = df.wage.div(deflator, level='qmonth') * 100
    df.loc[:, 'lr_wage'] = np.log(df.r_wage)

    assert df.index.is_unique
    with pd.get_store('/Volumes/HDD/Users/tom/DataStorage/CPS/analyzed/clean.h5') as store:
        df.to_hdf(store, 'merged', format='t', append=False)
def run_all_analysis_generation(results_path):
    with pd.get_store(results_path + '.h5') as store:
        trust_observations = store.trust.dropna()


    def power_set_map(powerset_string):
        strings = powerset_string[1:].split('_')
        metrics = strings[:-3]
        t = strings[-2]
        signed = strings[-1]
        return ({
            'metrics': metrics,
            'type': t,
            'dataset': powerset_string[1:],
            'signed': signed
        })

    shared_big_h5_path = "/home/bolster/src/aietes/results/powerset_shared.h5"
    with pd.get_store(shared_big_h5_path) as store:
        keys = store.keys()
        random.shuffle(keys)
        result = map(power_set_map, keys)
        powerset_list = filter(lambda t: t['type'] == 'feats', result)
        for d in powerset_list:
            d['data'] = store[d['dataset']]

    best_weight_valences_and_runs_for_metric_subset = {}
    visited = {}
    with Parallel(n_jobs=-1) as par:
        for subset_d in tqdm(powerset_list): # This maps to a single powerset_shared.h5 dataset (comms_alt, etc)
            best_output_file = '{}.json'.format(os.path.join(json_path,subset_d['dataset']))

            if os.path.isfile(best_output_file):
                print("Skipping: File already exists, consider deleting it {}".format(subset_d['dataset']))
                visited[subset_d['dataset']]='exists'
            else:
                print"Preparing {}".format(subset_d['dataset'])
                open(best_output_file, 'a').close()

            if subset_d['dataset'] in visited:
                continue
            try:
                feat_weights = categorise_dataframe(non_zero_rows(subset_d['data']).T)
                if subset_d['metrics'] is not None:
                    best = best_of_all(feat_weights, trust_observations[subset_d['metrics']], par=par)
                else:
                    best = best_of_all(feat_weights, trust_observations, par=par)
                best_weight_valences_and_runs_for_metric_subset[subset_d['dataset']] = best


                with open(best_output_file, 'w') as f:
                    json.dump(best, f, cls=NumpyAwareJSONEncoder)
                copy(best_output_file, results_path)
            except Exception:
                print("Failed on {}".format(subset_d['dataset']))
                os.remove(best_output_file)

    return best_weight_valences_and_runs_for_metric_subset
Example #3
 def setUp(self):
     _skip_if_no_pytables()
     filename = os.path.join(path, 'features_size9_masscut2000.df')
     f = pd.read_pickle(filename)
     self.key = 'features'
     with pd.get_store('temp1.h5') as store:
         store.put(self.key, f)
     with pd.get_store('temp2.h5') as store:
         store.append(self.key, f, data_columns=['frame'])
Example #4
 def setUp(self):
     _skip_if_no_pytables()
     filename = os.path.join(path, 'features_size9_masscut2000.df')
     f = pd.read_pickle(filename)
     self.key = 'features'
     with pd.get_store('temp1.h5') as store:
         store.put(self.key, f)
     with pd.get_store('temp2.h5') as store:
         store.append(self.key, f, data_columns=['frame'])
def per_scenario_gd_mal_trusts(gd_file, mal_file):
    if not all(map(os.path.isfile, [gd_file, mal_file])):
        if all(map(os.path.isfile, map(Tools.in_results, [gd_file, mal_file]))):
            gd_file, mal_file = map(Tools.in_results, [gd_file, mal_file])
        else:
            raise OSError("Either {0} or {1} is not present".format(gd_file, mal_file))
    with pd.get_store(mal_file) as store:
        mal_trust = store.get('trust')
        map_levels(mal_trust, scenario_map)
    with pd.get_store(gd_file) as store:
        gd_trust = store.get('trust')
        map_levels(gd_trust, scenario_map)
    return gd_trust, mal_trust
Example #6
	def project_phases(self, nr_cycles = 20, freqs_of_interest = [1.8, 2.2]):

		self.assert_data_intern()
		self.read_trans_counts()

		replay_phase = np.loadtxt(os.path.join(self.analyzer.sj_dir, 'phase_delay.txt'))[0]
		with pd.get_store(self.analyzer.h5_file) as h5_file: 
			real_data = h5_file.get("/%s/tf/cycles_%s_%s"%(self.file_alias, nr_cycles, 'tf_complex_real'))
			imag_data = h5_file.get("/%s/tf/cycles_%s_%s"%(self.file_alias, nr_cycles, 'tf_complex_imag'))
			power_data = h5_file.get("/%s/tf/cycles_%s_%s"%(self.file_alias, nr_cycles, 'tf_power_Z'))

		trial_numbers = np.array(real_data.keys())
		frequencies = np.array(real_data.major_axis)
		timepoints = np.array(real_data.minor_axis)

		real_m = np.array(real_data)[:,(frequencies>freqs_of_interest[0]) & (frequencies<freqs_of_interest[1]),:].mean(axis = 1)
		imag_m = np.array(imag_data)[:,(frequencies>freqs_of_interest[0]) & (frequencies<freqs_of_interest[1]),:].mean(axis = 1)
		power_m = np.array(power_data)[:,(frequencies>freqs_of_interest[0]) & (frequencies<freqs_of_interest[1]),:].mean(axis = 1)

		expected_phase_real = np.cos(replay_phase + timepoints * self.stim_frequency * 2.0 * np.pi)
		expected_phase_imag = np.sin(replay_phase + timepoints * self.stim_frequency * 2.0 * np.pi)

		complex_data = np.array([real_m, imag_m]).transpose((1,2,0))
		template_data = np.array([expected_phase_real, expected_phase_imag]).transpose((1,0))

		projected_data = np.zeros(complex_data.shape[:-1])
		for x in range(len(complex_data)):
			projected_data[x] = np.array([np.dot(c, t)/np.dot(t,t) for c, t in zip(complex_data[x], template_data)])
		
		# plot these timecourses per trial
		f = pl.figure(figsize = (8,24))
		for x in range(len(complex_data)):
			s = f.add_subplot(len(complex_data), 1, x+1)
			pl.plot(timepoints, projected_data[x], 'k', lw = 4.0)
			pl.plot(timepoints, power_m[x], 'k--', lw = 2.0)
			s.axhline(np.median(projected_data[x]), color = 'b', lw = 3.0)
			s.axhline(np.mean(projected_data[x]), color = 'r', lw = 3.0, ls = '--', alpha = 0.6)
			if hasattr(self, 'trans_counts'):
				s.annotate('%i'%self.trans_counts[x], (0.5,0.65), textcoords = 'axes fraction')
			s.set_ylim([-2,4])
			sn.despine(offset=10)
		pl.tight_layout()
		pl.savefig(os.path.join(self.analyzer.fig_dir, self.file_alias + '_projected.pdf'))

		# save out as dataframe
		pdf = pd.DataFrame(projected_data, index = trial_numbers, columns = timepoints)
		with pd.get_store(self.analyzer.h5_file) as h5_file: 
			h5_file.put("/%s/tf/cycles_%s_%s"%(self.file_alias, nr_cycles, 'projected'), pdf)
Example #7
def write_failure_log(node, hdf5_path, node_dir, window_length,
                      unique_fhsseverity):
    with pd.get_store(hdf5_path) as store:
        failures = store.select('tacos',
                                columns=['fhsseverity'],
                                where='node == node').sort()

    node_filename = uuid.uuid4()

    with pd.HDFStore('{}/{}.h5'.format(node_dir, node_filename),
                     'w',
                     complevel=9,
                     complib='blosc') as node_store:
        targets = []
        for index in failures.index:
            target = {key: 0 for key in unique_fhsseverity}
            # Adding one second weeds out concurrent events. Should they be included?
            window = failures[index + pd.Timedelta(1, unit='s'):index +
                              pd.Timedelta(window_length, unit='m')].copy()
            window.fillna('nan', inplace=True)
            window['decay'] = window.apply(lambda row: np.exp(
                math.log(.01) * (row.index - index).seconds /
                (60 * window_length)))
            for _, row in window[::-1].iterrows():
                target[row.fhsseverity] = row.decay
            targets.append(target.values())

        failure_windows = np.array(targets)
        for i, key in enumerate(target.keys()):
            failures['error_{}'.format(key)] = failure_windows[:, i]

        node_store.append('data', failures, data_columns=True)
        node_store.append('node', node)
Example #8
def write_space(f_name, context, targets, matrix):
    """Write a vector space without creating it.

    :param pandas.DataFrame context: the column labels
    :param pandas.DataFrame targets: the row labels

    ``targets`` and ``context`` contain two columns: ``ngram`` and
    ``id``. ``ngram`` is the frame's index.

    :param pandas.DataFrame matrix: the co-occurrence counts. The frame consists
        of three columns: ``count``, ``id_target`` and ``id_context``.

        ``id_target`` and ``id_context`` form the frame's index.

    """

    assert np.isfinite(matrix['count']).all()

    assert context.reset_index().notnull().all().all(), 'There are NULL column labels.'
    assert targets.reset_index().notnull().all().all(), 'There are NULL row labels.'

    with pd.get_store(
        f_name,
        mode='w',
        complevel=9,
        complib='blosc',
    ) as store:
        # Reset index for context and targets, so the string "nan" is not
        # converted to NULL! Once https://github.com/pydata/pandas/issues/9604
        # is fixed, there is no need to reset the index.
        if context is not None:
            store['context'] = context.reset_index()

        store['targets'] = targets.reset_index()
        store['matrix'] = matrix
Example #9
def datalogger_backup(filename='default'): 
    "saves all datalogger data to filename"
    written = 0
    if filename=='default':
        filename = os.path.join(MEDIA_ROOT,'datalogger.h5')
    metadata = dict()
    data=dict()
    sensors = Sensor.objects.all()
    if sensors is None:
        raise ValueError("No sensors in datalogger!")
    for sens in sensors:
        metadata[sens.name]=sens.description
        sd = SensingDevice(name=sens.name)
        data[sens.name]=sd.getallpoints_raw()
    with pandas.get_store(filename) as store:
        for sens in sensors:
            store[sens.name] = data[sens.name]
            written+=len(data[sens.name])
    with h5py.File(filename) as the_file:
        try:
            params = the_file["params"]
        except KeyError:
            params = the_file.create_group("params")
        for key, value in metadata.iteritems():
            try:
                params[key]
            except KeyError:  
                params.create_dataset(key, data=value)
            else:
                del params[key]
                params.create_dataset(key, data=value)
    print "Backup finished! "+str(written)+" MeasurementPoints written to "+filename
Example #10
def table_slice(fname, table, bounds):
    '''
    Small utility function to get the data only between bounds

    Parameters:
    -----------

    fname: str, path to the HDF file
    table: str, key of the table to query inside the store
    bounds: dict
        key, value pairs of the form 'indexable': (min_value, max_value).
        If one of those values is None, it is not taken into account
        (i.e. the absolute min or max is used). As for range(), start is included and stop is excluded.
    Each of the bounds is joined into the 'where' keyword argument (by logical and).

    Note:
    -----
    This only works on indexable columns, a.k.a. data columns, which must be set
    explicitly when recording the table; you will see the corresponding error when you
    don't use one of those. See
    http://pandas.pydata.org/pandas-docs/stable/io.html#query-via-data-columns

    '''
    conditions = []
    for column, (min_value, max_value) in bounds.items():
        start = "{} >= {}".format(column, min_value) if min_value is not None else ""
        stop = "{} < {}".format(column, max_value) if max_value is not None else ""
        conditions.extend([start, stop])
    where = " & ".join([cnd for cnd in conditions if len(cnd)])
    with pd.get_store(fname) as store:
        if len(where):
            return store.select(table, where=where)
        else:
            return store.select(table)
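
A hypothetical call to table_slice, assuming an HDF file 'tracks.h5' whose 'trajs' table was written with data_columns=['t', 'x']; a None bound is simply dropped, as described in the docstring:

# 0 <= t < 100 and x < 50; the lower bound on x is left open
subset = table_slice('tracks.h5', 'trajs',
                     bounds={'t': (0, 100), 'x': (None, 50)})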
Example #11
def plot_correlations(filenames, limit=None):
    DataFrame = pd.DataFrame()
    index = 0
    for fileName in filenames:
        with pd.get_store(fileName, 'r') as store:
            tempDataFrame = pd.DataFrame({'Event': store.Hits.Event[:15000], 'Row' + str(index): store.Hits.Row[:15000]})
            tempDataFrame = tempDataFrame.set_index('Event')
            DataFrame = tempDataFrame.join(DataFrame)
            DataFrame = DataFrame.dropna()
            index += 1
            del tempDataFrame
    DataFrame["index"] = DataFrame.index
    DataFrame.drop_duplicates(take_last=True, inplace=True)
    del DataFrame["index"]
    correlationNames = ('Row',)  # one-element tuple so the loop iterates over names, not characters
    index = 0
    for corName in correlationNames:
        for colName in itertools.permutations(DataFrame.filter(regex=corName), 2):
            if(corName == 'Col'):
                heatmap, xedges, yedges = np.histogram2d(DataFrame[colName[0]], DataFrame[colName[1]], bins=(80, 80), range=[[1, 80], [1, 80]])
            else:
                heatmap, xedges, yedges = np.histogram2d(DataFrame[colName[0]], DataFrame[colName[1]], bins=(336, 336), range=[[1, 336], [1, 336]])
            extent = [yedges[0] - 0.5, yedges[-1] + 0.5, xedges[-1] + 0.5, xedges[0] - 0.5]
            cmap = cm.get_cmap('hot', 40)
            fig = Figure()
            FigureCanvas(fig)
            ax = fig.add_subplot(111)
            ax.imshow(heatmap, extent=extent, cmap=cmap, interpolation='nearest')
            ax.invert_yaxis()
            ax.set_xlabel(colName[0])
            ax.set_ylabel(colName[1])
            ax.set_title('Correlation plot(' + corName + ')')
            fig.savefig(colName[0] + '_' + colName[1] + '.pdf')
            index += 1
Example #12
 def save(self, filename, with_data=True):
     if not os.path.exists(filename) and not with_data:
         print "Tried to save a new file without data. Overriding false with_data argument! "
         with_data=True
     if self.data is None and with_data:
         raise ValueError("could not save curve, no data was set")
     if self.params is None:
         raise ValueError("could not save curve, no params were set")
     
     if with_data:
         with pandas.get_store(filename) as store:
             store["data"] = self._data
     
     with h5py.File(filename) as the_file:
         try:
             params = the_file["params"]
         except KeyError:
             params = the_file.create_group("params")
         for key, value in self.params.iteritems():
             if isinstance(value, basestring):
                 value = str(value)
             if isinstance(value, datetime):
                 value = value.strftime("%y/%m/%d/%H/%M/%S/%f")
             try:
                 params[key]
             except KeyError:
                 params.create_dataset(key, data=value)
             else:
                 del params[key]
                 params.create_dataset(key, data=value)
Example #13
    def time_period_for_trial_phases(self, trial_nr, trial_phases, alias):
        """the time period corresponding to the trial phases requested.
		"""
        with pd.get_store(self.input_object) as h5_file:
            table = h5_file['%s/trial_phases' % alias]
            # check whether one of the trial phases is the end or the beginning of the trial.
            # if so, then supplant the time of that phase with its trial's end or start time.
            if trial_phases[0] == 0:
                start_time = table[table['trial_start_index'] ==
                                   trial_nr]['trial_start_EL_timestamp']
            else:
                start_time = table[(
                    (table['trial_phase_index'] == trial_phases[0]) *
                    (table['trial_phase_trial']
                     == trial_nr))]['trial_phase_EL_timestamp']
            if trial_phases[-1] == -1:
                end_time = table[table['trial_start_index'] ==
                                 trial_nr]['trial_end_EL_timestamp']
            else:
                end_time = table[(
                    (table['trial_phase_index'] == trial_phases[1]) *
                    (table['trial_phase_trial']
                     == trial_nr))]['trial_phase_EL_timestamp']
            time_period = np.array([np.array(start_time),
                                    np.array(end_time)]).squeeze()
        return time_period
def test_write_hdf():

    trajs = data.brownian_trajs_df()
    trajs = Trajectories(trajs)
    tmp_store = tempfile.NamedTemporaryFile(suffix='h5')
    with pd.get_store(tmp_store.name) as store:
        store['trajs'] = trajs
Example #15
def find_attr(attr, fields=None, dd=None, settings=None):
    """
    Dictionary may lie.  Check here.

    Parameters
    ----------
    attr: str; e.g. "AGE", "RACE", "SEX"
    fields: array-like; probably columns of the dataframe.
    dd : str or DataFrame; path inside the store or the DD itself.
    settings: dict with "store_path"
    Returns
    -------

    List of str; possible matches.
    """
    if settings is None:
        settings = json.load(open('settings.txt'))

    with pd.get_store(settings["store_path"]) as store:
        if fields is not None and dd is not None:
            raise ValueError('Only one of fields and dd may be specified.')
        elif fields is None and dd is None:
            raise ValueError('One of fields and dd must be specified.')
        elif dd and isinstance(dd, str):
            dd = store.select(dd)
            fields = dd.id.tolist()
        elif dd and isinstance(dd, pd.DataFrame):
            fields = dd.id.tolist()

    match_with = re.compile(r'[\w|$%\-]*' + attr + r'[\w|$%\-]*')
    maybe_matches = (match_with.match(x) for x in fields)
    return [x.string for x in filter(None, maybe_matches)]
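
A hypothetical call to find_attr; the store path and data-dictionary key below are assumptions, not values from the original project:

settings = {"store_path": "cps_store.h5"}   # assumed store location
matches = find_attr("HOURS", dd="jan2013_dd", settings=settings)
# `matches` lists every field whose name contains the substring "HOURS"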
Example #16
def loadFullNoLabelsHDF5():
    ### DONT USE!!!!!!!!!!!!
    with pd.get_store(getDataFilePath('mystore.h5')) as store:
        filename = "opendata_essays_2014_11_05.csv"
        filepath = getDataFilePath(filename)
        chunksize = 10000
        Chunker = pd.read_csv(filepath,iterator=True,chunksize=chunksize)
        
        try:
            del store['full_no_labels']
        except KeyError:
            pass
        
        cols = ['_projectid','_teacher_acctid','title','short_description','need_statement','essay']
        chunk = Chunker.get_chunk(chunksize)
        chunk = chunk[cols]
        chunk._projectid = chunk._projectid.str.replace('"','')
        chunk._teacher_acctid = chunk._teacher_acctid.str.replace('"','')
        store.append('full_no_labels',chunk,min_itemsize=20000)
        
        for chunk in Chunker:
            chunk = chunk[cols]
            chunk._projectid = chunk._projectid.str.replace('"','')
            chunk._teacher_acctid = chunk._teacher_acctid.str.replace('"','')
            store.append('full_no_labels',chunk)
        print store
Example #17
def main():

    args = parser.parse_args()
    store = os.path.expanduser(args.hdf_store)
    data_dir = Path(os.path.expanduser(args.data_dir))

    cached = cached_games(data_dir)

    # first time. Generate the store
    if not os.path.isfile(store):
        pd.HDFStore(store)

    with pd.get_store(store) as s:

        try:
            stored = s.select('drs')['match_id'].unique()
        except KeyError:
            stored = []

    new_games = filter(lambda x: int(x.stem) not in stored, cached)

    dfs = []
    i = 0  # if no new games
    for i, game in enumerate(new_games, 1):
        dr = api.DetailsResponse.from_json(str(game))
        dfs.append(format_df(dr))
    else:
        append_to_store(store, dfs)
        print("Added {} games.".format(i))
 def __init__(self):
     with pd.get_store('/Volumes/HDD/Users/tom/DataStorage/'
                       'Comext/yearly/for_gmm.h5') as f:
         df = f.select('ctry_001')
         df = df.dropna()
         df = df[~(df == np.inf)]
         self.data = df.xs('01', level='good')
    def setUpClass(cls):
        super(MultiRunSigmaAndConfidenceGraphing, cls).setUpClass()
        #Grab results h5 file from set "good" results
        results_path_multi_run = "/home/bolster/src/aietes/results/Malicious Behaviour Trust Comparison-2016-03-24-21-10-00"
        cls.results_path = results_path_multi_run
        cls.fig_path = '/home/bolster/src/thesis/papers/active/16_MASS/figures'

        cls.target = 'Alfa'
        cls.observer = 'Bravo'
        cls.good_bev = 'Control'
        cls.trust_period = 10

        with pd.get_store(cls.results_path + '.h5') as store:
            cls.trust_observations = store.trust.drop('Tail',level='var')
            cls.trust_observations.columns.name = 'metric'
            map_levels(cls.trust_observations, {'Waypoint': cls.good_bev})
            # Convert Sample Times into proper stuff
            cls.trust_observations.reset_index(inplace=True)
            cls.trust_observations['t']*=cls.trust_period
            cls.trust_observations.set_index(['var','run','observer','t','target'], inplace=True)

        data = cls.trust_observations.xs(cls.observer, level='observer')
        cls.deviance = pd.concat([gf - gf.mean() for g, gf in data.groupby(level=['var', 'run', 't'])])
        cls.sigmas = pd.concat([(gf / (gf.std(axis=0))).abs() for g, gf in cls.deviance.groupby(level=['var', 'run', 't'])])
        # Optimise the below; it's naaaaasty for long runs
        cls.summed_sigmas = cls.sigmas.unstack('target').groupby(level=['var','run']).sum()
        cls.dixon_df = pd.concat(
            [(group_dixon_test(g, gf)) for g, gf in cls.summed_sigmas.stack('target').groupby(level=['var', 'run'])])
        cls.dixon_df['correct'] = (cls.dixon_df['target'] == cls.target) != ((cls.dixon_df['var'] == cls.good_bev) & (cls.dixon_df['target'] == 'None'))
Example #20
def get_space_targets(gold_standard, store_file):
    """Build a vector space from the store file.

    The output space contains only vectors for targets in the gold standard.

    :return: the space and its targets

    """
    with pd.get_store(store_file, mode='r') as store:
        targets = store['targets']
        matrix = store['matrix'].reset_index()
        context_len = len(store['context'])

    # Get the targets used in the gold standard together with their ids.
    targets_of_interest = pd.DataFrame({
        'ngram':
        list(set(gold_standard['Word 1']).union(gold_standard['Word 2']))
    }).merge(targets, left_on='ngram', right_index=True)
    # We are only interested in those targets which appear in the gold standard.
    matrix = matrix[matrix['id_target'].isin(targets_of_interest['id'])]

    counts = matrix['count'].values
    ij = matrix[['id_target', 'id_context']].values.T

    # Sparse *row* matrix behaves faster, because we select certain rows.
    space = csr_matrix((counts, ij), shape=(len(targets), context_len))

    return space, targets_of_interest.set_index('ngram')
Example #21
def test_write_all_tables(df, store_name):
    orca.add_table('table', df)
    orca.write_tables(store_name)

    with pd.get_store(store_name, mode='r') as store:
        for t in orca.list_tables():
            assert t in store
Example #22
def get_space_targets(gold_standard, store_file):
    """Build a vector space from the store file.

    The output space contains only vectors for targets in the gold standard.

    :return: the space and its targets

    """
    with pd.get_store(store_file, mode='r') as store:
        targets = store['targets']
        matrix = store['matrix'].reset_index()
        context_len = len(store['context'])

    # Get the targets used in the gold standard together with their ids.
    targets_of_interest = pd.DataFrame(
        {'ngram': list(set(gold_standard['Word 1']).union(gold_standard['Word 2']))}
    ).merge(targets, left_on='ngram', right_index=True)
    # We are only interested in those targets which appear in the gold standard.
    matrix = matrix[matrix['id_target'].isin(targets_of_interest['id'])]

    counts = matrix['count'].values
    ij = matrix[['id_target', 'id_context']].values.T

    # Sparse *row* matrix behaves faster, because we select certain rows.
    space = csr_matrix((counts, ij), shape=(len(targets), context_len))

    return space, targets_of_interest.set_index('ngram')
def test_run_and_write_tables(df, store_name):
    sim.add_table('table', df)

    def year_key(y):
        return '{}'.format(y)

    def series_year(y):
        return pd.Series([y] * 3, index=df.index)

    @sim.model()
    def model(year, table):
        table[year_key(year)] = series_year(year)

    sim.run(['model'], years=range(11), data_out=store_name, out_interval=3)

    with pd.get_store(store_name, mode='r') as store:
        for year in range(3, 11, 3):
            key = '{}/table'.format(year)
            assert key in store

            for x in range(year):
                pdt.assert_series_equal(
                    store[key][year_key(x)], series_year(x))

        assert 'base/table' in store

        for x in range(11):
            pdt.assert_series_equal(
                store['final/table'][year_key(x)], series_year(x))
def test_write_hdf():

    trajs = data.brownian_trajs_df()
    trajs = Trajectories(trajs)
    tmp_store = tempfile.NamedTemporaryFile(suffix='h5')
    with pd.get_store(tmp_store.name) as store:
        store['trajs'] = trajs
def frequency(column, path):
    with pd.get_store(path) as store:
        result = store.select_column('tacos', column)
        unique = len(result.unique())
        nan = sum(pd.isnull(result))
        
        nrows = store.get_storer('tacos').nrows
        return [column, [unique, 100.0 * unique / nrows, nan, 100.0 * nan / nrows]]
Example #26
def brownian_trajs_df():
    """
    """
    store_path = os.path.join(data_path, "brownian_trajectories.h5")
    with pd.get_store(store_path) as store:
        trajs = store['trajs']
    return trajs
Example #27
def frames_to_hdf(vertex_df, edge_df, fname, reset=False, vert_kwargs={}, edge_kwargs={}):
    '''
    Records the two DataFrames in the HDF file fname
    '''
    with pd.get_store(fname) as store:
        if not len(store.keys()):
            store.put('edges', edge_df,
                      format='table', **edge_kwargs)
            store.put('vertices', vertex_df,
                      format='table', **vert_kwargs)
        ## FIXME: should only remove the matching index
        elif reset:
            try:
                store.remove('vertices')
            except KeyError:
                pass
            try:
                store.remove('edges')
            except KeyError:
                pass
            store.put('vertices', vertex_df,
                      format='table', **vert_kwargs)
            store.put('edges', edge_df,
                      format='table', **edge_kwargs)
        else:
            store.append('vertices', vertex_df,
                         format='table', **vert_kwargs)
            store.append('edges', edge_df,
                         format='table', **edge_kwargs)
Example #28
def store_groupby_apply(store_file_name,
                        col_names,
                        fun,
                        df_name='df',
                        group_keys=True):

    if type(col_names) is str:
        col_names = [col_names]

    with pd.get_store(store_file_name) as store:
        groups = store.select(df_name, columns=col_names).drop_duplicates()
        df_list = []
        for tup in groups.itertuples():
            mask = [
                "{}={}".format(col, getattr(tup, col)) for col in col_names
            ]
            grp_df = store.select(df_name, where=mask)
            stats_df = fun(grp_df)
            if group_keys:
                stats_df = (stats_df.assign(**dict(
                    (col, getattr(tup, col))
                    for col in col_names)).set_index(col_names))
            df_list.append(stats_df)

    return pd.concat(df_list)
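
A hypothetical use of store_groupby_apply: summarise one group at a time without loading the whole table, assuming 'big_store.h5' contains a table 'df' whose 'year' column was written as a data column:

summary = store_groupby_apply('big_store.h5', 'year',
                              lambda grp: grp.describe(),
                              df_name='df')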
Example #29
def frames_from_hdf(fname, stamp=None,
                    vertex_kwargs={}, edge_kwargs={}):

    with pd.get_store(fname) as store:
        if stamp == -1:
            vertex_df = store.select('vertices',
                                     #where="'stamp'=stamp",
                                     **vertex_kwargs)
            stamps = vertex_df.index.get_level_values('stamp').unique()
            last = stamps.max()
            vertex_df = vertex_df.xs(last, level='stamp')
            edge_df = store.select('edges',
                                   where="'stamp'={}".format(last),
                                   **edge_kwargs)
        elif stamp is not None:
            vertex_df = store.select('vertices',
                                     where="'stamp'={}".format(stamp),
                                     **vertex_kwargs)
            edge_df = store.select('edges',
                                   where="'stamp'={}".format(stamp),
                                   **edge_kwargs)
        else:
            vertex_df = store.select('vertices',
                                     **vertex_kwargs)
            edge_df = store.select('edges',
                                   **edge_kwargs)
    return vertex_df, edge_df
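
A hypothetical round trip combining frames_from_hdf above with frames_to_hdf from Example #27, assuming vertex_df and edge_df are DataFrames indexed by a 'stamp' level; the file name is a placeholder:

frames_to_hdf(vertex_df, edge_df, 'graph_history.h5')
# read back only the frames belonging to the most recent stamp
latest_vertices, latest_edges = frames_from_hdf('graph_history.h5', stamp=-1)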
    def setUpClass(cls):
        super(SingleRunGraphing, cls).setUpClass()
        #Grab results h5 file from set "good" results
        cls.fig_path = '/home/bolster/src/thesis/papers/active/16_MASS/figures'
        results_path_good_as_of_monday210416 = "/home/bolster/src/aietes/results/Malicious Behaviour Trust Comparison-2016-03-22-01-56-00"
        results_path_multi_run = "/home/bolster/src/aietes/results/Malicious Behaviour Trust Comparison-2016-03-24-19-38-14"
        cls.results_path = results_path_multi_run

        cls.run_id = 0
        cls.target = 'Alfa'
        cls.observer = 'Bravo'
        cls.trust_period = 10

        with pd.get_store(cls.results_path + '.h5') as store:
            cls.positions = store.positions
            cls.trust_observations = store.trust.drop('Tail',level='var')
            cls.trust_observations.columns.name = 'metric'
            map_levels(cls.trust_observations, {'Waypoint': 'Control'})
            # Convert Sample Times into proper stuff
            cls.trust_observations.reset_index(inplace=True)
            cls.trust_observations['t']*=cls.trust_period
            cls.trust_observations.set_index(['var','run','observer','t','target'], inplace=True)

        data = cls.trust_observations.xs(cls.observer, level='observer').xs(cls.run_id, level='run')

        cls.deviance = pd.concat([gf - gf.mean() for g, gf in data.groupby(level=['var', 't'])])
        cls.sigmas = pd.concat([(gf / (gf.std(axis=0))).abs() for g, gf in cls.deviance.groupby(level=['var', 't'])])
    def testGetBestFullRuns(self):
        """
        Purpose of this is to get the best results for full-metric scope
        (defined as max(T_~Alfa.mean() - T_Alfa.mean()))
        A "Run" is from
            an individual node on an individual run
        This returns a run for
            each non-control behaviour

        i.e. something that will be sensibly processed by
        _inner = lambda x: map(np.nanmean,np.split(x, [1], axis=1))
        assess = lambda x: -np.subtract(*_inner(x))
        assess_run = lambda x: assess(x.xs(target_str, level='var').xs(0,level='run').values)

        :return: best run
        """
        feat_d = {
            'full': (self.joined_feat_weights, key_order),
            # 'comms':(self.comms_feat_weights, comm_keys),
            # 'phys': (self.phys_feat_weights, phys_keys),
            # 'comms_alt': (self.comms_alt_feat_weights, comm_keys_alt),
            # 'phys_alt': (self.phys_alt_feat_weights, phys_keys_alt),
        }
        with pd.get_store(results_path + '.h5') as store:
            trust_observations = store.trust.dropna()
        for feat_str, (feats, keys) in feat_d.items():
            print(feat_str)
            if keys is not None:
                best = best_of_all(feats, trust_observations[keys])
            else:
                best = best_of_all(feats, trust_observations)
            aietes.Tools.mkcpickle('best_{0}_runs'.format(feat_str), dict(best))
def find_attr(attr, fields=None, dd=None, settings=None):
    """
    Dictionary may lie.  Check here.

    Parameters
    ----------
    attr: str; e.g. "AGE", "RACE", "SEX"
    fields: array-like; probably columns of the dataframe.
    dd : str or DataFrame; path inside the store or the DD itself.
    settings: dict with "store_path"
    Returns
    -------

    List of str; possible matches.
    """
    if settings is None:
        settings = json.load(open('settings.txt'))

    with pd.get_store(settings["store_path"]) as store:
        if fields is not None and dd is not None:
            raise ValueError('Only one of fields and dd may be specified.')
        elif fields is None and dd is None:
            raise ValueError('One of fields and dd must be specified.')
        elif dd and isinstance(dd, str):
            dd = store.select(dd)
            fields = dd.id.tolist()
        elif dd and isinstance(dd, pd.DataFrame):
            fields = dd.id.tolist()

    match_with = re.compile(r'[\w|$%\-]*' + attr + r'[\w|$%\-]*')
    maybe_matches = (match_with.match(x) for x in fields)
    return [x.string for x in filter(None, maybe_matches)]
Example #33
def get_datasets():
    #datasets={}
    for dfile in DSfiles():

        with pd.get_store(dfile) as storer:
           #datasets[dfile]=storer.keys()
           yield dfile,storer.keys()
Example #34
def write_tables(fname, table_names=None, prefix=None, write_attached=False):
    """
    Writes tables to a pandas.HDFStore file.

    Parameters
    ----------
    fname : str
        File name for HDFStore. Will be opened in append mode and closed
        at the end of this function.
    table_names: list of str, optional, default None
        List of tables to write. If None, all registered tables will
        be written.
    prefix: str
        If not None, used to prefix the output table names so that
        multiple iterations can go in the same file.
    write_attached: bool, optional, default False
        If True, all columns are written out. If False, only the
        local columns will be written.

    """
    if table_names is None:
        table_names = list_tables()

    tables = (get_injectable(t) for t in table_names)
    key_template = '{}/{{}}'.format(prefix) if prefix is not None else '{}'

    with pd.get_store(fname, mode='a') as store:
        for t in tables:
            if write_attached:
                store[key_template.format(t.name)] = t.to_frame()
            else:
                store[key_template.format(t.name)] = t.local
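
A hypothetical call pattern for the prefix argument above: writing two passes into the same store keeps their tables apart under 'run1/...' and 'run2/...' keys (the prefixes and table name are made up):

write_tables('results.h5', prefix='run1')
write_tables('results.h5', table_names=['households'], prefix='run2')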
Example #35
def directed_motion_trajs_df():
    """
    """
    store_path = os.path.join(data_path, "directed_motion_trajectories.h5")
    with pd.get_store(store_path) as store:
        trajs = store['trajs']
    return trajs
Example #36
def frames_to_hdf(vertex_df,
                  edge_df,
                  fname,
                  reset=False,
                  vert_kwargs={},
                  edge_kwargs={}):
    '''
    Records the two DataFrames in the HDF file fname
    '''
    with pd.get_store(fname) as store:
        if not len(store.keys()):
            store.put('edges', edge_df, format='table', **edge_kwargs)
            store.put('vertices', vertex_df, format='table', **vert_kwargs)
        ## FIXME: should only remove the matching index
        elif reset:
            try:
                store.remove('vertices')
            except KeyError:
                pass
            try:
                store.remove('edges')
            except KeyError:
                pass
            store.put('vertices', vertex_df, format='table', **vert_kwargs)
            store.put('edges', edge_df, format='table', **edge_kwargs)
        else:
            store.append('vertices', vertex_df, format='table', **vert_kwargs)
            store.append('edges', edge_df, format='table', **edge_kwargs)
Example #37
def datalogger_backup(filename='default'):
    "saves all datalogger data to filename"
    written = 0
    if filename == 'default':
        filename = os.path.join(MEDIA_ROOT, 'datalogger.h5')
    metadata = dict()
    data = dict()
    sensors = Sensor.objects.all()
    if sensors is None:
        raise ValueError("No sensors in datalogger!")
    for sens in sensors:
        metadata[sens.name] = sens.description
        sd = SensingDevice(name=sens.name)
        data[sens.name] = sd.getallpoints_raw()
    with pandas.get_store(filename) as store:
        for sens in sensors:
            store[sens.name] = data[sens.name]
            written += len(data[sens.name])
    with h5py.File(filename) as the_file:
        try:
            params = the_file["params"]
        except KeyError:
            params = the_file.create_group("params")
        for key, value in metadata.iteritems():
            try:
                params[key]
            except KeyError:
                params.create_dataset(key, data=value)
            else:
                del params[key]
                params.create_dataset(key, data=value)
    print "Backup finished! " + str(
        written) + " MeasurementPoints written to " + filename
Example #38
def table_slice(fname, table, bounds):
    '''
    Small utility function to get the data only between bounds

    Parameters:
    -----------

    fname: str, path to the HDF file
    table: str, key of the table to query inside the store
    bounds: dict
        key, value pairs of the form 'indexable': (min_value, max_value).
        If one of those values is None, it is not taken into account
        (i.e. the absolute min or max is used). As for range(), start is included and stop is excluded.
    Each of the bounds is joined into the 'where' keyword argument (by logical and).

    Note:
    -----
    This only works on indexable columns, a.k.a. data columns, which must be set
    explicitly when recording the table; you will see the corresponding error when you
    don't use one of those. See
    http://pandas.pydata.org/pandas-docs/stable/io.html#query-via-data-columns

    '''
    conditions = []
    for column, (min_value, max_value) in bounds.items():
        start = "{} >= {}".format(column,
                                  min_value) if min_value is not None else ""
        stop = "{} < {}".format(column,
                                max_value) if max_value is not None else ""
        conditions.extend([start, stop])
    where = " & ".join([cnd for cnd in conditions if len(cnd)])
    with pd.get_store(fname) as store:
        if len(where):
            return store.select(table, where=where)
        else:
            return store.select(table)
Example #39
def write_tables(fname, models, year):
    """
    Write all tables injected into `models` to a pandas.HDFStore file.
    If year is not None it will be used to prefix the table names so that
    multiple years can go in the same file.

    Parameters
    ----------
    fname : str
        File name for HDFStore. Will be opened in append mode and closed
        at the end of this function.
    models : list of str
        Models from which to gather injected tables for saving.
    year : int or None
        If an integer, used as a prefix along with table names for
        labeling DataFrames in the HDFStore.

    """
    models = (get_model(m) for m in toolz.unique(models))
    table_names = toolz.unique(toolz.concat(m._tables_used() for m in models))
    tables = (get_table(t) for t in table_names)

    key_template = '{}/{{}}'.format(year) if year is not None else '{}'

    with pd.get_store(fname, mode='a') as store:
        for t in tables:
            store[key_template.format(t.name)] = t.to_frame()
Example #40
def frames_from_hdf(fname, stamp=None, vertex_kwargs={}, edge_kwargs={}):

    with pd.get_store(fname) as store:
        if stamp == -1:
            vertex_df = store.select(
                'vertices',
                #where="'stamp'=stamp",
                **vertex_kwargs)
            stamps = vertex_df.index.get_level_values('stamp').unique()
            last = stamps.max()
            vertex_df = vertex_df.xs(last, level='stamp')
            edge_df = store.select('edges',
                                   where="'stamp'={}".format(last),
                                   **edge_kwargs)
        elif stamp is not None:
            vertex_df = store.select('vertices',
                                     where="'stamp'={}".format(stamp),
                                     **vertex_kwargs)
            edge_df = store.select('edges',
                                   where="'stamp'={}".format(stamp),
                                   **edge_kwargs)
        else:
            vertex_df = store.select('vertices', **vertex_kwargs)
            edge_df = store.select('edges', **edge_kwargs)
    return vertex_df, edge_df
Example #41
def write_tables(fname, models, year):
    """
    Write all tables injected into `models` to a pandas.HDFStore file.
    If year is not None it will be used to prefix the table names so that
    multiple years can go in the same file.

    Parameters
    ----------
    fname : str
        File name for HDFStore. Will be opened in append mode and closed
        at the end of this function.
    models : list of str
        Models from which to gather injected tables for saving.
    year : int or None
        If an integer, used as a prefix along with table names for
        labeling DataFrames in the HDFStore.

    """
    models = (get_model(m) for m in toolz.unique(models))
    table_names = toolz.unique(toolz.concat(m._tables_used() for m in models))
    tables = (get_table(t) for t in table_names)

    key_template = '{}/{{}}'.format(year) if year is not None else '{}'

    with pd.get_store(fname, mode='a') as store:
        for t in tables:
            store[key_template.format(t.name)] = t.to_frame()
Example #42
def test_write_cooccurrence_matrix_hd5(counter, output, utterances, metadata):
    io.write_cooccurrence_matrix_hd5(counter, output, utterances, metadata)

    with pd.get_store(output) as store:

        row2id = store['row2id']
        assert row2id.ix['a'] != row2id.ix['b']
        with pytest.raises(KeyError):
            row2id.ix['Z']

        col2id = store['col2id']
        assert col2id.ix['Z'] != col2id.ix['Y']
        with pytest.raises(KeyError):
            col2id['a']

        assert tuple(store['row_labels']) == tuple('abac')
        assert tuple(store['col_labels']) == tuple('ZYXW')
        assert tuple(store['row_ids']) == tuple(row2id[r]
                                                for r in store['row_labels'])
        assert tuple(store['col_ids']) == tuple(col2id[c]
                                                for c in store['col_labels'])

        assert tuple(store['data'].values) == (3, 4, 2, 1)

        assert store.get_storer('data').attrs.metadata == metadata
Example #43
 def save(self, out_filename, out_key):
     with pd.get_store(out_filename) as out_store:
         with pd.get_store(self.filename) as store:
             while True:
                 try:
                     frame_no, labels = next(self.label_generator)
                 except StopIteration:
                     break
                 # Fetch data (redundantly) this time taking all columns.
                 frame = store.select(self.key, 'frame == %d' % frame_no)
                 frame['probe'] = -1  # an integer placeholder
                 frame['probe'].update(labels)
                 out_store.append(out_key, frame, 
                                  data_columns=frame.columns)
                 print "Frame %d written with %d probes tracked." \
                     % (frame_no, len(frame))
Example #44
def load_oldformat(filename):
    """loads the curve at filename"""
    with pandas.get_store(filename, "r") as store:
            data = store["data"]
    kwds = dict()
    with h5py.File(filename) as the_file:
        try:
            meta = the_file["meta"]
        except KeyError:
            print "In filename "+filename+" not even the meta attribute exists!"
        else:
            for key, value in meta.iteritems():
                kwds[key] = convert_from_numpy(value.value)
        dir_name, file_name = os.path.split(filename)
        file_root, file_ext = os.path.splitext(file_name)
        kwds["name"] = file_root
        if file_ext != '.h5':
            print "Error: file does not end with .h5"
        kwds["id"]=1
        kwds["date"]=datetime.fromtimestamp(os.path.getmtime(filename))
        kwds["oldformat"]=True
    curve = Curve()
    curve.set_data(data)
    curve.set_params(**kwds)
    return curve
Example #45
def test_run_and_write_tables(df, store_name):
    orca.add_table('table', df)

    def year_key(y):
        return '{}'.format(y)

    def series_year(y):
        return pd.Series([y] * 3, index=df.index, name=str(y))

    @orca.step()
    def step(iter_var, table):
        table[year_key(iter_var)] = series_year(iter_var)

    orca.run(
        ['step'], iter_vars=range(11), data_out=store_name, out_interval=3)

    with pd.get_store(store_name, mode='r') as store:
        for year in range(0, 11, 3):
            key = '{}/table'.format(year)
            assert key in store

            for x in range(year):
                pdt.assert_series_equal(
                    store[key][year_key(x)], series_year(x))

        assert 'base/table' in store

        for x in range(11):
            pdt.assert_series_equal(
                store['10/table'][year_key(x)], series_year(x))
Example #46
def main():

    args = parser.parse_args()
    store = os.path.expanduser(args.hdf_store)
    data_dir = Path(os.path.expanduser(args.data_dir))

    cached = cached_games(data_dir)

    # first time. Generate the store
    if not os.path.isfile(store):
        pd.HDFStore(store)

    with pd.get_store(store) as s:

        try:
            stored = s.select("drs")["match_id"].unique()
        except KeyError:
            stored = []

    new_games = filter(lambda x: int(x.stem) not in stored, cached)

    dfs = []
    i = 0  # if no new games
    for i, game in enumerate(new_games, 1):
        dr = api.DetailsResponse.from_json(str(game))
        dfs.append(format_df(dr))
    else:
        append_to_store(store, dfs)
        print("Added {} games.".format(i))
Example #47
 def test_get_store(self):
     pytest.importorskip('tables')
     with tm.ensure_clean() as path:
         with tm.assert_produces_warning(FutureWarning,
                                         check_stacklevel=False):
             s = pd.get_store(path)
             s.close()
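
As the FutureWarning asserted above indicates, pd.get_store is only a deprecated wrapper around HDFStore; the equivalent modern spelling opens HDFStore directly as a context manager (the path and key below are placeholders):

import pandas as pd

with pd.HDFStore('data.h5') as store:
    df = store['some_key']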
Example #48
 def sample_rate_during_period(self, time_period, alias):
     """docstring for eye_during_period"""
     with pd.get_store(self.input_object) as h5_file:
         period_block_nr = self.sample_in_block(
             sample=time_period[0],
             block_table=h5_file['%s/blocks' % alias])
         return h5_file['%s/blocks' % alias]['sample_rate'][period_block_nr]
Example #49
 def test_get_store(self):
     pytest.importorskip('tables')
     with tm.ensure_clean() as path:
         with tm.assert_produces_warning(FutureWarning,
                                         check_stacklevel=False):
             s = pd.get_store(path)
             s.close()
Example #50
 def numbered_frames(self):
     with pd.get_store(self.filename) as store:
         for frame_no in xrange(self.first_frame, 1 + self.last_frame):
             frame = store.select(self.key,
                                  'frame == %d' % frame_no,
                                  columns=self.pos_columns)
             yield frame_no, frame
Example #51
    def from_h5(cls, store_path, base_dir=None, minimum_metadata_keys=[], clean_store=False):
        """Load ObjectsIO from HDF5 file.

        Parameters
        ----------
        store_path : str
            HDF5 file path.
        base_dir : str
            Root directory

        """

        if base_dir:
            full_store_path = os.path.join(base_dir, store_path)
        else:
            full_store_path = store_path

        with pd.get_store(full_store_path) as store:
            metadata_serie = store['metadata']

        metadata = metadata_serie.to_dict()

        return cls(metadata=metadata,
                   store_path=store_path,
                   base_dir=base_dir,
                   minimum_metadata_keys=minimum_metadata_keys,
                   clean_store=clean_store)
Example #52
 def save(self, out_filename, out_key):
     with pd.get_store(out_filename) as out_store:
         with pd.get_store(self.filename) as store:
             while True:
                 try:
                     frame_no, labels = next(self.label_generator)
                 except StopIteration:
                     break
                 # Fetch data (redundantly) this time taking all columns.
                 frame = store.select(self.key, 'frame == %d' % frame_no)
                 frame['probe'] = -1  # an integer placeholder
                 frame['probe'].update(labels)
                 out_store.append(out_key,
                                  frame,
                                  data_columns=frame.columns)
                 print "Frame %d written with %d probes tracked." \
                     % (frame_no, len(frame))
Example #53
    def keys(self):
        """Return list of objects in HDF5 file.
        """

        objs = []
        with pd.get_store(self.store_path) as store:
            objs = store.keys()
        return objs