def merge():
    dfs = []
    with pd.get_store('/Volumes/HDD/Users/tom/DataStorage/CPS/analyzed/clean.h5') as store:
        for k, v in store.iteritems():
            if k.startswith('/y'):
                df = store.select(k)
                df = df[(df['age'] > 18) & (df['age'] < 65)]
                dfs.append(df)
    df = pd.concat(dfs)
    with pd.get_store('/Volumes/HDD/Users/tom/DataStorage/CPS/analyzed/analyzed_store.h5') as store:
        deflator = store.select('deflator')['deflator']
        deflator.index.set_names(['qmonth'], inplace=True)
    df.loc[:, 'wage'] = df.earnwke.div(df.uhourse)
    df.loc[:, 'r_wage'] = df.wage.div(deflator, level='qmonth') * 100
    df.loc[:, 'lr_wage'] = np.log(df.r_wage)
    assert df.index.is_unique
    with pd.get_store('/Volumes/HDD/Users/tom/DataStorage/CPS/analyzed/clean.h5') as store:
        df.to_hdf(store, 'merged', format='t', append=False)
def run_all_analysis_generation(results_path):
    with pd.get_store(results_path + '.h5') as store:
        trust_observations = store.trust.dropna()

    def power_set_map(powerset_string):
        strings = powerset_string[1:].split('_')
        metrics = strings[:-3]
        t = strings[-2]
        signed = strings[-1]
        return {
            'metrics': metrics,
            'type': t,
            'dataset': powerset_string[1:],
            'signed': signed
        }

    shared_big_h5_path = "/home/bolster/src/aietes/results/powerset_shared.h5"
    with pd.get_store(shared_big_h5_path) as store:
        keys = store.keys()
        random.shuffle(keys)
        result = map(power_set_map, keys)
        powerset_list = filter(lambda t: t['type'] == 'feats', result)
        for d in powerset_list:
            d['data'] = store[d['dataset']]

    best_weight_valences_and_runs_for_metric_subset = {}
    visited = {}
    with Parallel(n_jobs=-1) as par:
        for subset_d in tqdm(powerset_list):
            # This maps to a single powerset_shared.h5 dataset (comms_alt, etc)
            best_output_file = '{}.json'.format(os.path.join(json_path, subset_d['dataset']))
            if os.path.isfile(best_output_file):
                print("Skipping: File already exists, consider deleting it {}".format(subset_d['dataset']))
                visited[subset_d['dataset']] = 'exists'
            else:
                print("Preparing {}".format(subset_d['dataset']))
                open(best_output_file, 'a').close()
            if subset_d['dataset'] in visited:
                continue
            try:
                feat_weights = categorise_dataframe(non_zero_rows(subset_d['data']).T)
                if subset_d['metrics'] is not None:
                    best = best_of_all(feat_weights, trust_observations[subset_d['metrics']], par=par)
                else:
                    best = best_of_all(feat_weights, trust_observations, par=par)
                best_weight_valences_and_runs_for_metric_subset[subset_d['dataset']] = best
                with open(best_output_file, 'w') as f:
                    json.dump(best, f, cls=NumpyAwareJSONEncoder)
                copy(best_output_file, results_path)
            except Exception:
                print("Failed on {}".format(subset_d['dataset']))
                os.remove(best_output_file)
    return best_weight_valences_and_runs_for_metric_subset
def setUp(self):
    _skip_if_no_pytables()
    filename = os.path.join(path, 'features_size9_masscut2000.df')
    f = pd.read_pickle(filename)
    self.key = 'features'
    with pd.get_store('temp1.h5') as store:
        store.put(self.key, f)
    with pd.get_store('temp2.h5') as store:
        store.append(self.key, f, data_columns=['frame'])
def per_scenario_gd_mal_trusts(gd_file, mal_file):
    if not all(map(os.path.isfile, [gd_file, mal_file])):
        if all(map(os.path.isfile, map(Tools.in_results, [gd_file, mal_file]))):
            gd_file, mal_file = map(Tools.in_results, [gd_file, mal_file])
        else:
            raise OSError("Either {0} or {1} is not present".format(gd_file, mal_file))
    with pd.get_store(mal_file) as store:
        mal_trust = store.get('trust')
        map_levels(mal_trust, scenario_map)
    with pd.get_store(gd_file) as store:
        gd_trust = store.get('trust')
        map_levels(gd_trust, scenario_map)
    return gd_trust, mal_trust
def project_phases(self, nr_cycles=20, freqs_of_interest=[1.8, 2.2]):
    self.assert_data_intern()
    self.read_trans_counts()
    replay_phase = np.loadtxt(os.path.join(self.analyzer.sj_dir, 'phase_delay.txt'))[0]

    with pd.get_store(self.analyzer.h5_file) as h5_file:
        real_data = h5_file.get("/%s/tf/cycles_%s_%s" % (self.file_alias, nr_cycles, 'tf_complex_real'))
        imag_data = h5_file.get("/%s/tf/cycles_%s_%s" % (self.file_alias, nr_cycles, 'tf_complex_imag'))
        power_data = h5_file.get("/%s/tf/cycles_%s_%s" % (self.file_alias, nr_cycles, 'tf_power_Z'))

    trial_numbers = np.array(real_data.keys())
    frequencies = np.array(real_data.major_axis)
    timepoints = np.array(real_data.minor_axis)

    real_m = np.array(real_data)[:, (frequencies > freqs_of_interest[0]) & (frequencies < freqs_of_interest[1]), :].mean(axis=1)
    imag_m = np.array(imag_data)[:, (frequencies > freqs_of_interest[0]) & (frequencies < freqs_of_interest[1]), :].mean(axis=1)
    power_m = np.array(power_data)[:, (frequencies > freqs_of_interest[0]) & (frequencies < freqs_of_interest[1]), :].mean(axis=1)

    expected_phase_real = np.cos(replay_phase + timepoints * self.stim_frequency * 2.0 * np.pi)
    expected_phase_imag = np.sin(replay_phase + timepoints * self.stim_frequency * 2.0 * np.pi)

    complex_data = np.array([real_m, imag_m]).transpose((1, 2, 0))
    template_data = np.array([expected_phase_real, expected_phase_imag]).transpose((1, 0))

    projected_data = np.zeros(complex_data.shape[:-1])
    for x in range(len(complex_data)):
        projected_data[x] = np.array([np.dot(c, t) / np.dot(t, t) for c, t in zip(complex_data[x], template_data)])

    # plot these timecourses per trial
    f = pl.figure(figsize=(8, 24))
    for x in range(len(complex_data)):
        s = f.add_subplot(len(complex_data), 1, x + 1)
        pl.plot(timepoints, projected_data[x], 'k', lw=4.0)
        pl.plot(timepoints, power_m[x], 'k--', lw=2.0)
        s.axhline(np.median(projected_data[x]), color='b', lw=3.0)
        s.axhline(np.mean(projected_data[x]), color='r', lw=3.0, ls='--', alpha=0.6)
        if hasattr(self, 'trans_counts'):
            s.annotate('%i' % self.trans_counts[x], (0.5, 0.65), textcoords='axes fraction')
        s.set_ylim([-2, 4])
        sn.despine(offset=10)
    pl.tight_layout()
    pl.savefig(os.path.join(self.analyzer.fig_dir, self.file_alias + '_projected.pdf'))

    # save out as dataframe
    pdf = pd.DataFrame(projected_data, index=trial_numbers, columns=timepoints)
    with pd.get_store(self.analyzer.h5_file) as h5_file:
        h5_file.put("/%s/tf/cycles_%s_%s" % (self.file_alias, nr_cycles, 'projected'), pdf)
def write_failure_log(node, hdf5_path, node_dir, window_length, unique_fhsseverity):
    with pd.get_store(hdf5_path) as store:
        failures = store.select('tacos', columns=['fhsseverity'], where='node == node').sort()
    node_filename = uuid.uuid4()
    with pd.HDFStore('{}/{}.h5'.format(node_dir, node_filename), 'w', complevel=9, complib='blosc') as node_store:
        targets = []
        for index in failures.index:
            target = {key: 0 for key in unique_fhsseverity}
            # Adding one second weeds out concurrent events. Should they be included?
            window = failures[index + pd.Timedelta(1, unit='s'):index + pd.Timedelta(window_length, unit='m')].copy()
            window.fillna('nan', inplace=True)
            window['decay'] = window.apply(lambda row: np.exp(
                math.log(.01) * (row.index - index).seconds / (60 * window_length)))
            for _, row in window[::-1].iterrows():
                target[row.fhsseverity] = row.decay
            targets.append(target.values())
        failure_windows = np.array(targets)
        for i, key in enumerate(target.keys()):
            failures['error_{}'.format(key)] = failure_windows[:, i]
        node_store.append('data', failures, data_columns=True)
        node_store.append('node', node)
def write_space(f_name, context, targets, matrix):
    """Write a vector space without creating it.

    :param pandas.DataFrame context: the column labels
    :param pandas.DataFrame targets: the row labels

    ``row_labels`` and ``column_labels`` contain two columns: ``ngram`` and
    ``id``. ``ngram`` is the frame's index.

    :param pandas.DataFrame matrix: the co-occurrence counts.
        The frame consists of three columns: ``count``, ``id_target`` and
        ``id_context``. ``id_target`` and ``id_context`` form the frame's index.

    """
    assert np.isfinite(matrix['count']).all()
    assert context.reset_index().notnull().all().all(), 'There are NULL row labels.'
    assert targets.reset_index().notnull().all().all(), 'There are NULL column labels.'

    with pd.get_store(
        f_name,
        mode='w',
        complevel=9,
        complib='blosc',
    ) as store:
        # Reset index for context and targets, so the string "nan" is not
        # converted to NULL! Once https://github.com/pydata/pandas/issues/9604
        # is fixed, there is no need to reset the index.
        if context is not None:
            store['context'] = context.reset_index()
        store['targets'] = targets.reset_index()
        store['matrix'] = matrix
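# A minimal, hedged usage sketch for write_space() above, assuming the frame
# layout described in its docstring: `targets` and `context` are indexed by
# ``ngram`` with an ``id`` column, and `matrix` holds ``count`` indexed by
# (``id_target``, ``id_context``). The file name 'space_example.h5' is
# hypothetical.
def _write_space_example():
    import pandas as pd

    targets = pd.DataFrame({'id': [0, 1]}, index=pd.Index(['cat', 'dog'], name='ngram'))
    context = pd.DataFrame({'id': [0, 1]}, index=pd.Index(['runs', 'barks'], name='ngram'))
    matrix = pd.DataFrame(
        {'count': [3, 1]},
        index=pd.MultiIndex.from_tuples([(0, 0), (1, 1)], names=['id_target', 'id_context']),
    )
    write_space('space_example.h5', context, targets, matrix)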
def datalogger_backup(filename='default'):
    "saves all datalogger data to filename"
    written = 0
    if filename == 'default':
        filename = os.path.join(MEDIA_ROOT, 'datalogger.h5')
    metadata = dict()
    data = dict()
    sensors = Sensor.objects.all()
    if sensors is None:
        raise ValueError("No sensors in datalogger!")
    for sens in sensors:
        metadata[sens.name] = sens.description
        sd = SensingDevice(name=sens.name)
        data[sens.name] = sd.getallpoints_raw()
    with pandas.get_store(filename) as store:
        for sens in sensors:
            store[sens.name] = data[sens.name]
            written += len(data[sens.name])
    with h5py.File(filename) as the_file:
        try:
            params = the_file["params"]
        except KeyError:
            params = the_file.create_group("params")
        for key, value in metadata.iteritems():
            try:
                params[key]
            except KeyError:
                params.create_dataset(key, data=value)
            else:
                del params[key]
                params.create_dataset(key, data=value)
    print "Backup finished! " + str(written) + " MeasurementPoints written to " + filename
def table_slice(fname, table, bounds):
    '''
    Small utility function to get the data only between bounds.

    Parameters:
    -----------
    fname: str, path to the HDF file
    table: str, name of the table within the store
    bounds: dict
        key, value pairs of the form 'indexable': (min_value, max_value).
        If one of those values is None, it will not be taken into account
        (i.e. take the absolute min or max). As for range(), start is
        included and stop is excluded. Each of the bounds will be joined
        into the 'where' keyword argument (by logical and).

    Note:
    -----
    This will only work on indexable columns, aka data columns, which must be
    set explicitly when recording the table. See the corresponding error when
    you don't use one of those. See
    http://pandas.pydata.org/pandas-docs/stable/io.html#query-via-data-columns
    '''
    conditions = []
    for column, (min_value, max_value) in bounds.items():
        start = "{} >= {}".format(column, min_value) if min_value is not None else ""
        stop = "{} < {}".format(column, max_value) if max_value is not None else ""
        conditions.extend([start, stop])
    where = " & ".join([cnd for cnd in conditions if len(cnd)])
    with pd.get_store(fname) as store:
        if len(where):
            return store.select(table, where=where)
        else:
            return store.select(table)
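# A hedged usage sketch for table_slice() above: the store path, table name
# and the 'frame'/'x' columns are hypothetical, and the queried columns must
# have been written as data_columns for the where-clause to work.
def _table_slice_example():
    # Keep rows with 10 <= frame < 20 and x below 100.0 (no lower bound on x).
    sub = table_slice('trajectories.h5', 'trajs',
                      {'frame': (10, 20), 'x': (None, 100.0)})
    return sub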
def plot_correlations(filenames, limit=None):
    DataFrame = pd.DataFrame()
    index = 0
    for fileName in filenames:
        with pd.get_store(fileName, 'r') as store:
            tempDataFrame = pd.DataFrame({'Event': store.Hits.Event[:15000],
                                          'Row' + str(index): store.Hits.Row[:15000]})
        tempDataFrame = tempDataFrame.set_index('Event')
        DataFrame = tempDataFrame.join(DataFrame)
        DataFrame = DataFrame.dropna()
        index += 1
        del tempDataFrame
    DataFrame["index"] = DataFrame.index
    DataFrame.drop_duplicates(take_last=True, inplace=True)
    del DataFrame["index"]
    correlationNames = ('Row',)  # single-element tuple; iterating a bare string would loop over characters
    index = 0
    for corName in correlationNames:
        for colName in itertools.permutations(DataFrame.filter(regex=corName), 2):
            if corName == 'Col':
                heatmap, xedges, yedges = np.histogram2d(DataFrame[colName[0]], DataFrame[colName[1]],
                                                         bins=(80, 80), range=[[1, 80], [1, 80]])
            else:
                heatmap, xedges, yedges = np.histogram2d(DataFrame[colName[0]], DataFrame[colName[1]],
                                                         bins=(336, 336), range=[[1, 336], [1, 336]])
            extent = [yedges[0] - 0.5, yedges[-1] + 0.5, xedges[-1] + 0.5, xedges[0] - 0.5]
            cmap = cm.get_cmap('hot', 40)
            fig = Figure()
            FigureCanvas(fig)
            ax = fig.add_subplot(111)
            ax.imshow(heatmap, extent=extent, cmap=cmap, interpolation='nearest')
            ax.invert_yaxis()
            ax.set_xlabel(colName[0])
            ax.set_ylabel(colName[1])
            ax.set_title('Correlation plot(' + corName + ')')
            fig.savefig(colName[0] + '_' + colName[1] + '.pdf')
            index += 1
def save(self, filename, with_data=True):
    if not os.path.exists(filename) and not with_data:
        print "Tried to save a new file without data. Overriding false with_data argument! "
        with_data = True
    if self.data is None and with_data:
        raise ValueError("could not save curve, no data was set")
    if self.params is None:
        raise ValueError("could not save curve, no params were set")
    if with_data:
        with pandas.get_store(filename) as store:
            store["data"] = self._data
    with h5py.File(filename) as the_file:
        try:
            params = the_file["params"]
        except KeyError:
            params = the_file.create_group("params")
        for key, value in self.params.iteritems():
            if isinstance(value, basestring):
                value = str(value)
            if isinstance(value, datetime):
                value = value.strftime("%y/%m/%d/%H/%M/%S/%f")
            try:
                params[key]
            except KeyError:
                params.create_dataset(key, data=value)
            else:
                del params[key]
                params.create_dataset(key, data=value)
def time_period_for_trial_phases(self, trial_nr, trial_phases, alias):
    """the time period corresponding to the trial phases requested."""
    with pd.get_store(self.input_object) as h5_file:
        table = h5_file['%s/trial_phases' % alias]
        # check whether one of the trial phases is the end or the beginning of the trial.
        # if so, then supplant the time of that phase with its trial's end or start time.
        if trial_phases[0] == 0:
            start_time = table[table['trial_start_index'] == trial_nr]['trial_start_EL_timestamp']
        else:
            start_time = table[((table['trial_phase_index'] == trial_phases[0]) *
                                (table['trial_phase_trial'] == trial_nr))]['trial_phase_EL_timestamp']
        if trial_phases[-1] == -1:
            end_time = table[table['trial_start_index'] == trial_nr]['trial_end_EL_timestamp']
        else:
            end_time = table[((table['trial_phase_index'] == trial_phases[1]) *
                              (table['trial_phase_trial'] == trial_nr))]['trial_phase_EL_timestamp']
        time_period = np.array([np.array(start_time), np.array(end_time)]).squeeze()
    return time_period
def test_write_hdf():
    trajs = data.brownian_trajs_df()
    trajs = Trajectories(trajs)
    tmp_store = tempfile.NamedTemporaryFile(suffix='h5')
    with pd.get_store(tmp_store.name) as store:
        store['trajs'] = trajs
def find_attr(attr, fields=None, dd=None, settings=None):
    """
    Dictionary may lie. Check here.

    Parameters
    ----------
    attr : str; e.g. "AGE", "RACE", "SEX"
    fields : array-like; probably columns of the dataframe.
    dd : str or DataFrame; path inside the store or the DD itself.
    settings : dict with "store_path"

    Returns
    -------
    List of strs of possible matches.
    """
    if settings is None:
        settings = json.load(open('settings.txt'))

    with pd.get_store(settings["store_path"]) as store:
        if fields is not None and dd is not None:
            raise ValueError('Only one of fields and dd may be specified.')
        elif fields is None and dd is None:
            raise ValueError('One of fields and dd must be specified.')
        elif dd and isinstance(dd, str):
            dd = store.select(dd)
            fields = dd.id.tolist()
        elif dd and isinstance(dd, pd.DataFrame):
            fields = dd.id.tolist()

    match_with = re.compile(r'[\w|$%\-]*' + attr + r'[\w|$%\-]*')
    maybe_matches = (match_with.match(x) for x in fields)
    return [x.string for x in filter(None, maybe_matches)]
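# A hedged usage sketch for find_attr() above. The settings dict and the field
# names are hypothetical; passing `fields` directly avoids selecting a data
# dictionary from the store, although the store file is still opened for its
# path.
def _find_attr_example():
    settings = {"store_path": "cps_store.h5"}  # hypothetical store
    fields = ['A_AGE', 'PRTAGE', 'A_SEX']
    return find_attr('AGE', fields=fields, settings=settings)
    # -> ['A_AGE', 'PRTAGE']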
def loadFullNoLabelsHDF5():
    ### DONT USE!!!!!!!!!!!!
    with pd.get_store(getDataFilePath('mystore.h5')) as store:
        filename = "opendata_essays_2014_11_05.csv"
        filepath = getDataFilePath(filename)
        chunksize = 10000
        Chunker = pd.read_csv(filepath, iterator=True, chunksize=chunksize)
        try:
            del store['full_no_labels']
        except KeyError:
            pass
        cols = ['_projectid', '_teacher_acctid', 'title', 'short_description', 'need_statement', 'essay']
        chunk = Chunker.get_chunk(chunksize)
        chunk = chunk[cols]
        chunk._projectid = chunk._projectid.str.replace('"', '')
        chunk._teacher_acctid = chunk._teacher_acctid.str.replace('"', '')
        store.append('full_no_labels', chunk, min_itemsize=20000)
        for chunk in Chunker:
            chunk = chunk[cols]
            chunk._projectid = chunk._projectid.str.replace('"', '')
            chunk._teacher_acctid = chunk._teacher_acctid.str.replace('"', '')
            store.append('full_no_labels', chunk)
        print store
def main():
    args = parser.parse_args()
    store = os.path.expanduser(args.hdf_store)
    data_dir = Path(os.path.expanduser(args.data_dir))
    cached = cached_games(data_dir)
    # first time. Generate the store
    if not os.path.isfile(store):
        pd.HDFStore(store)
    with pd.get_store(store) as s:
        try:
            stored = s.select('drs')['match_id'].unique()
        except KeyError:
            stored = []
    new_games = filter(lambda x: int(x.stem) not in stored, cached)
    dfs = []
    i = 0  # if no new games
    for i, game in enumerate(new_games, 1):
        dr = api.DetailsResponse.from_json(str(game))
        dfs.append(format_df(dr))
    else:
        append_to_store(store, dfs)
    print("Added {} games.".format(i))
def __init__(self):
    with pd.get_store('/Volumes/HDD/Users/tom/DataStorage/'
                      'Comext/yearly/for_gmm.h5') as f:
        df = f.select('ctry_001')
        df = df.dropna()
        df = df[~(df == np.inf)]
    self.data = df.xs('01', level='good')
def setUpClass(cls):
    super(MultiRunSigmaAndConfidenceGraphing, cls).setUpClass()
    # Grab results h5 file from set "good" results
    results_path_multi_run = "/home/bolster/src/aietes/results/Malicious Behaviour Trust Comparison-2016-03-24-21-10-00"
    cls.results_path = results_path_multi_run
    cls.fig_path = '/home/bolster/src/thesis/papers/active/16_MASS/figures'
    cls.target = 'Alfa'
    cls.observer = 'Bravo'
    cls.good_bev = 'Control'
    cls.trust_period = 10

    with pd.get_store(cls.results_path + '.h5') as store:
        cls.trust_observations = store.trust.drop('Tail', level='var')
    cls.trust_observations.columns.name = 'metric'
    map_levels(cls.trust_observations, {'Waypoint': cls.good_bev})

    # Convert Sample Times into proper stuff
    cls.trust_observations.reset_index(inplace=True)
    cls.trust_observations['t'] *= cls.trust_period
    cls.trust_observations.set_index(['var', 'run', 'observer', 't', 'target'], inplace=True)

    data = cls.trust_observations.xs(cls.observer, level='observer')
    cls.deviance = pd.concat([gf - gf.mean() for g, gf in data.groupby(level=['var', 'run', 't'])])
    cls.sigmas = pd.concat([(gf / (gf.std(axis=0))).abs()
                            for g, gf in cls.deviance.groupby(level=['var', 'run', 't'])])

    # Optimise the below; it's naaaaasty for long runs
    cls.summed_sigmas = cls.sigmas.unstack('target').groupby(level=['var', 'run']).sum()
    cls.dixon_df = pd.concat(
        [(group_dixon_test(g, gf)) for g, gf in cls.summed_sigmas.stack('target').groupby(level=['var', 'run'])])
    cls.dixon_df['correct'] = (cls.dixon_df['target'] == cls.target) != (
        (cls.dixon_df['var'] == cls.good_bev) & (cls.dixon_df['target'] == 'None'))
def get_space_targets(gold_standard, store_file):
    """Build a vector space from the store file.

    The output space contains only vectors for targets in the gold standard.

    :return: the space and its targets
    """
    with pd.get_store(store_file, mode='r') as store:
        targets = store['targets']
        matrix = store['matrix'].reset_index()
        context_len = len(store['context'])

    # Get the targets used in the gold standard together with their ids.
    targets_of_interest = pd.DataFrame({
        'ngram': list(set(gold_standard['Word 1']).union(gold_standard['Word 2']))
    }).merge(targets, left_on='ngram', right_index=True)

    # We are only interested in the targets which appear in the gold standard.
    matrix = matrix[matrix['id_target'].isin(targets_of_interest['id'])]

    counts = matrix['count'].values
    ij = matrix[['id_target', 'id_context']].values.T

    # A sparse *row* matrix behaves faster, because we select certain rows.
    space = csr_matrix((counts, ij), shape=(len(targets), context_len))

    return space, targets_of_interest.set_index('ngram')
def test_write_all_tables(df, store_name):
    orca.add_table('table', df)
    orca.write_tables(store_name)
    with pd.get_store(store_name, mode='r') as store:
        for t in orca.list_tables():
            assert t in store
def test_run_and_write_tables(df, store_name):
    sim.add_table('table', df)

    def year_key(y):
        return '{}'.format(y)

    def series_year(y):
        return pd.Series([y] * 3, index=df.index)

    @sim.model()
    def model(year, table):
        table[year_key(year)] = series_year(year)

    sim.run(['model'], years=range(11), data_out=store_name, out_interval=3)

    with pd.get_store(store_name, mode='r') as store:
        for year in range(3, 11, 3):
            key = '{}/table'.format(year)
            assert key in store
            for x in range(year):
                pdt.assert_series_equal(
                    store[key][year_key(x)], series_year(x))
        assert 'base/table' in store
        for x in range(11):
            pdt.assert_series_equal(
                store['final/table'][year_key(x)], series_year(x))
def frequency(column, path):
    with pd.get_store(path) as store:
        result = store.select_column('tacos', column)
        unique = len(result.unique())
        nan = sum(pd.isnull(result))
        return [
            column,
            [
                unique,
                100.0 * unique / store.get_storer('tacos').nrows,
                nan,
                100.0 * nan / store.get_storer('tacos').nrows
            ]
        ]
def brownian_trajs_df():
    """ """
    store_path = os.path.join(data_path, "brownian_trajectories.h5")
    with pd.get_store(store_path) as store:
        trajs = store['trajs']
    return trajs
def frames_to_hdf(vertex_df, edge_df, fname, reset=False,
                  vert_kwargs={}, edge_kwargs={}):
    '''
    Records the two DataFrames in the HDF file `fname`.
    '''
    with pd.get_store(fname) as store:
        if not len(store.keys()):
            # note: the kwargs were swapped between edges and vertices here
            store.put('edges', edge_df, format='table', **edge_kwargs)
            store.put('vertices', vertex_df, format='table', **vert_kwargs)
        ## FIXME: should only remove the matching index
        elif reset:
            try:
                store.remove('vertices')
            except KeyError:
                pass
            try:
                store.remove('edges')
            except KeyError:
                pass
            store.put('vertices', vertex_df, format='table', **vert_kwargs)
            store.put('edges', edge_df, format='table', **edge_kwargs)
        else:
            store.append('vertices', vertex_df, format='table', **vert_kwargs)
            store.append('edges', edge_df, format='table', **edge_kwargs)
def store_groupby_apply(store_file_name, col_names, fun, df_name='df', group_keys=True):
    if type(col_names) is str:
        col_names = [col_names]
    with pd.get_store(store_file_name) as store:
        groups = store.select(df_name, columns=col_names).drop_duplicates()
        df_list = []
        for tup in groups.itertuples():
            mask = ["{}={}".format(col, getattr(tup, col)) for col in col_names]
            grp_df = store.select(df_name, where=mask)
            stats_df = fun(grp_df)
            if group_keys:
                stats_df = (stats_df
                            .assign(**dict((col, getattr(tup, col)) for col in col_names))
                            .set_index(col_names))
            df_list.append(stats_df)
        return pd.concat(df_list)
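# A hedged usage sketch for store_groupby_apply() above: an out-of-core
# group-by that loads one group at a time from the store. The store file,
# the 'df' key and the 'year' column are hypothetical; 'year' must have been
# saved as a data column for the where-clause selection to work.
def _store_groupby_apply_example():
    def yearly_mean(grp_df):
        # Reduce one group to a single-row frame of column means.
        return grp_df.mean().to_frame().T

    return store_groupby_apply('panel.h5', 'year', yearly_mean, df_name='df')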
def frames_from_hdf(fname, stamp=None, vertex_kwargs={}, edge_kwargs={}):
    with pd.get_store(fname) as store:
        if stamp == -1:
            vertex_df = store.select('vertices',
                                     # where="'stamp'=stamp",
                                     **vertex_kwargs)
            stamps = vertex_df.index.get_level_values('stamp').unique()
            last = stamps.max()
            vertex_df = vertex_df.xs(last, level='stamp')
            edge_df = store.select('edges',
                                   where="'stamp'={}".format(last),
                                   **edge_kwargs)
        elif stamp is not None:
            vertex_df = store.select('vertices',
                                     where="'stamp'={}".format(stamp),
                                     **vertex_kwargs)
            edge_df = store.select('edges',
                                   where="'stamp'={}".format(stamp),
                                   **edge_kwargs)
        else:
            vertex_df = store.select('vertices', **vertex_kwargs)
            edge_df = store.select('edges', **edge_kwargs)
    return vertex_df, edge_df
def setUpClass(cls):
    super(SingleRunGraphing, cls).setUpClass()
    # Grab results h5 file from set "good" results
    cls.fig_path = '/home/bolster/src/thesis/papers/active/16_MASS/figures'
    results_path_good_as_of_monday210416 = "/home/bolster/src/aietes/results/Malicious Behaviour Trust Comparison-2016-03-22-01-56-00"
    results_path_multi_run = "/home/bolster/src/aietes/results/Malicious Behaviour Trust Comparison-2016-03-24-19-38-14"
    cls.results_path = results_path_multi_run
    cls.run_id = 0
    cls.target = 'Alfa'
    cls.observer = 'Bravo'
    cls.trust_period = 10

    with pd.get_store(cls.results_path + '.h5') as store:
        cls.positions = store.positions
        cls.trust_observations = store.trust.drop('Tail', level='var')
    cls.trust_observations.columns.name = 'metric'
    map_levels(cls.trust_observations, {'Waypoint': 'Control'})

    # Convert Sample Times into proper stuff
    cls.trust_observations.reset_index(inplace=True)
    cls.trust_observations['t'] *= cls.trust_period
    cls.trust_observations.set_index(['var', 'run', 'observer', 't', 'target'], inplace=True)

    data = cls.trust_observations.xs(cls.observer, level='observer').xs(cls.run_id, level='run')
    cls.deviance = pd.concat([gf - gf.mean() for g, gf in data.groupby(level=['var', 't'])])
    cls.sigmas = pd.concat([(gf / (gf.std(axis=0))).abs()
                            for g, gf in cls.deviance.groupby(level=['var', 't'])])
def testGetBestFullRuns(self):
    """
    Purpose of this is to get the best results for full-metric scope
    (as defined as max(T_~Alfa.mean() - T_Alfa.mean()))

    A "Run" is from an individual node on an individual run.

    This returns a run for each non-control behaviour, i.e. something that
    will be sensibly processed by
        _inner = lambda x: map(np.nanmean, np.split(x, [1], axis=1))
        assess = lambda x: -np.subtract(*_inner(x))
        assess_run = lambda x: assess(x.xs(target_str, level='var').xs(0, level='run').values)

    :return: best run
    """
    feat_d = {
        'full': (self.joined_feat_weights, key_order),
        # 'comms': (self.comms_feat_weights, comm_keys),
        # 'phys': (self.phys_feat_weights, phys_keys),
        # 'comms_alt': (self.comms_alt_feat_weights, comm_keys_alt),
        # 'phys_alt': (self.phys_alt_feat_weights, phys_keys_alt),
    }
    with pd.get_store(results_path + '.h5') as store:
        trust_observations = store.trust.dropna()
    for feat_str, (feats, keys) in feat_d.items():
        print(feat_str)
        if keys is not None:
            best = best_of_all(feats, trust_observations[keys])
        else:
            best = best_of_all(feats, trust_observations)
        aietes.Tools.mkcpickle('best_{0}_runs'.format(feat_str), dict(best))
def get_datasets():
    # datasets = {}
    for dfile in DSfiles():
        with pd.get_store(dfile) as storer:
            # datasets[dfile] = storer.keys()
            yield dfile, storer.keys()
def write_tables(fname, table_names=None, prefix=None, write_attached=False):
    """
    Writes tables to a pandas.HDFStore file.

    Parameters
    ----------
    fname : str
        File name for HDFStore. Will be opened in append mode and closed at
        the end of this function.
    table_names : list of str, optional, default None
        List of tables to write. If None, all registered tables will be
        written.
    prefix : str
        If not None, used to prefix the output table names so that multiple
        iterations can go in the same file.
    write_attached : bool, optional, default False
        If True, all columns are written out. If False, only the local
        columns will be written.

    """
    if table_names is None:
        table_names = list_tables()

    tables = (get_injectable(t) for t in table_names)
    key_template = '{}/{{}}'.format(prefix) if prefix is not None else '{}'

    with pd.get_store(fname, mode='a') as store:
        for t in tables:
            if write_attached:
                store[key_template.format(t.name)] = t.to_frame()
            else:
                store[key_template.format(t.name)] = t.local
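# A hedged usage sketch for write_tables() above. 'households' and 'jobs' are
# hypothetical registered table names and 'run_output.h5' a hypothetical file;
# with prefix='2020' the tables land under keys like '2020/households'.
def _write_tables_example():
    write_tables('run_output.h5',
                 table_names=['households', 'jobs'],
                 prefix='2020',
                 write_attached=True)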
def directed_motion_trajs_df():
    """ """
    store_path = os.path.join(data_path, "directed_motion_trajectories.h5")
    with pd.get_store(store_path) as store:
        trajs = store['trajs']
    return trajs
def write_tables(fname, models, year):
    """
    Write all tables injected into `models` to a pandas.HDFStore file.
    If year is not None it will be used to prefix the table names so that
    multiple years can go in the same file.

    Parameters
    ----------
    fname : str
        File name for HDFStore. Will be opened in append mode and closed at
        the end of this function.
    models : list of str
        Models from which to gather injected tables for saving.
    year : int or None
        If an integer, used as a prefix along with table names for labeling
        DataFrames in the HDFStore.

    """
    models = (get_model(m) for m in toolz.unique(models))
    table_names = toolz.unique(toolz.concat(m._tables_used() for m in models))
    tables = (get_table(t) for t in table_names)

    key_template = '{}/{{}}'.format(year) if year is not None else '{}'

    with pd.get_store(fname, mode='a') as store:
        for t in tables:
            store[key_template.format(t.name)] = t.to_frame()
def test_write_cooccurrence_matrix_hd5(counter, output, utterances, metadata):
    io.write_cooccurrence_matrix_hd5(counter, output, utterances, metadata)

    with pd.get_store(output) as store:
        row2id = store['row2id']
        assert row2id.ix['a'] != row2id.ix['b']
        with pytest.raises(KeyError):
            row2id.ix['Z']

        col2id = store['col2id']
        assert col2id.ix['Z'] != col2id.ix['Y']
        with pytest.raises(KeyError):
            col2id['a']

        assert tuple(store['row_labels']) == tuple('abac')
        assert tuple(store['col_labels']) == tuple('ZYXW')

        assert tuple(store['row_ids']) == tuple(row2id[r] for r in store['row_labels'])
        assert tuple(store['col_ids']) == tuple(col2id[c] for c in store['col_labels'])

        assert tuple(store['data'].values) == (3, 4, 2, 1)
        assert store.get_storer('data').attrs.metadata == metadata
def save(self, out_filename, out_key):
    with pd.get_store(out_filename) as out_store:
        with pd.get_store(self.filename) as store:
            while True:
                try:
                    frame_no, labels = next(self.label_generator)
                except StopIteration:
                    break
                # Fetch data (redundantly) this time taking all columns.
                frame = store.select(self.key, 'frame == %d' % frame_no)
                frame['probe'] = -1  # an integer placeholder
                frame['probe'].update(labels)
                out_store.append(out_key, frame, data_columns=frame.columns)
                print "Frame %d written with %d probes tracked." \
                    % (frame_no, len(frame))
def load_oldformat(filename):
    """loads the curve at filename"""
    with pandas.get_store(filename, "r") as store:
        data = store["data"]
    kwds = dict()
    with h5py.File(filename) as the_file:
        try:
            meta = the_file["meta"]
        except KeyError:
            print "In filename " + filename + " not even the meta attribute exists!"
        else:
            for key, value in meta.iteritems():
                kwds[key] = convert_from_numpy(value.value)
    dir_name, file_name = os.path.split(filename)
    file_root, file_ext = os.path.splitext(file_name)
    kwds["name"] = file_root
    if file_ext != '.h5':
        print "Error: file name does not end with .h5"
    kwds["id"] = 1
    kwds["date"] = datetime.fromtimestamp(os.path.getmtime(filename))
    kwds["oldformat"] = True
    curve = Curve()
    curve.set_data(data)
    curve.set_params(**kwds)
    return curve
def test_run_and_write_tables(df, store_name):
    orca.add_table('table', df)

    def year_key(y):
        return '{}'.format(y)

    def series_year(y):
        return pd.Series([y] * 3, index=df.index, name=str(y))

    @orca.step()
    def step(iter_var, table):
        table[year_key(iter_var)] = series_year(iter_var)

    orca.run(
        ['step'], iter_vars=range(11), data_out=store_name, out_interval=3)

    with pd.get_store(store_name, mode='r') as store:
        for year in range(0, 11, 3):
            key = '{}/table'.format(year)
            assert key in store
            for x in range(year):
                pdt.assert_series_equal(
                    store[key][year_key(x)], series_year(x))
        assert 'base/table' in store
        for x in range(11):
            pdt.assert_series_equal(
                store['10/table'][year_key(x)], series_year(x))
def test_get_store(self):
    pytest.importorskip('tables')
    with tm.ensure_clean() as path:
        with tm.assert_produces_warning(FutureWarning,
                                        check_stacklevel=False):
            s = pd.get_store(path)
            s.close()
def sample_rate_during_period(self, time_period, alias):
    """Returns the sample rate of the block that contains the start of `time_period`."""
    with pd.get_store(self.input_object) as h5_file:
        period_block_nr = self.sample_in_block(
            sample=time_period[0], block_table=h5_file['%s/blocks' % alias])
        return h5_file['%s/blocks' % alias]['sample_rate'][period_block_nr]
def numbered_frames(self):
    with pd.get_store(self.filename) as store:
        for frame_no in xrange(self.first_frame, 1 + self.last_frame):
            frame = store.select(self.key, 'frame == %d' % frame_no,
                                 columns=self.pos_columns)
            yield frame_no, frame
def from_h5(cls, store_path, base_dir=None, minimum_metadata_keys=[], clean_store=False):
    """Load ObjectsIO from HDF5 file.

    Parameters
    ----------
    store_path : str
        HDF5 file path.
    base_dir : str
        Root directory.

    """
    if base_dir:
        full_store_path = os.path.join(base_dir, store_path)
    else:
        full_store_path = store_path

    with pd.get_store(full_store_path) as store:
        metadata_serie = store['metadata']

    metadata = metadata_serie.to_dict()

    return cls(metadata=metadata,
               store_path=store_path,
               base_dir=base_dir,
               minimum_metadata_keys=minimum_metadata_keys,
               clean_store=clean_store)
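# A hedged usage sketch for the from_h5() classmethod above, assuming it lives
# on an ObjectsIO-like class (as its docstring suggests); the directory and
# file name are hypothetical. The store only needs a 'metadata' Series for the
# constructor call to succeed.
def _from_h5_example():
    oio = ObjectsIO.from_h5('tracked_objects.h5', base_dir='/data/experiments')
    return oio.keys()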
def keys(self):
    """Return list of objects in HDF5 file.
    """
    objs = []
    with pd.get_store(self.store_path) as store:
        objs = store.keys()
    return objs