def __initialize_datasets__(self, sample_data):
    self.out = {}
    sample_data = self.__parse_input_data__(sample_data)
    sample_data = self.__convert_input_data__(sample_data)
    dims = {dset: 1 if len(data.shape) == 1 else data.shape[1]
            for dset, data in iteritems(sample_data)}
    # needed for raw_datasets only
    dtypes = {dset: get_dtype(sample_data[dset])
              for dset in self.raw_datasets}
    # init raw datasets
    for dset in self.raw_datasets:
        (group, dataset) = os.path.split(dset)
        if not group:
            group = '/'
        # FIXME at some point should become super.add_dataset(...)
        self.out[dset] = self.np2h5.add_dataset(
            group, dataset, n_columns=dims[dset],
            item_type=dtypes[dset], fixed_size=False)
    # init non-fused indexed datasets; in this implementation they are
    # all encoded in the same matrix
    if self.non_fused_datasets:
        indexed_dims = [dims[dset] for dset in self.non_fused_datasets]
        indexed_levels = [len(self.indexes[dset])
                          for dset in self.non_fused_datasets]
        dim = sum(indexed_dims)
        # smallest unsigned integer dtype compatible with all
        # indexed datasets
        d_type = type_fitting.fit_integer_type(max(indexed_levels),
                                               is_signed=False)
        # FIXME at some point should become super.add_dataset(...)
        self.out['indexed'] = self.np2h5.add_dataset(
            self.group, 'indexed_data', n_columns=dim,
            item_type=d_type, fixed_size=False)
        with h5py.File(self.filename) as f:
            # necessary to access the part of the data corresponding
            # to a particular dataset
            f[self.group].create_dataset('indexed_cumudims',
                                         data=np.cumsum(indexed_dims),
                                         dtype=np.uint64)
    # fused datasets have a separate one-dimensional dataset each
    self.key_weights = {}
    for fused_dset in self.fused_datasets:
        fused_dims = np.array(
            [dims[dset] for dset in self.fused_members[fused_dset]],
            dtype=np.uint64)
        # largest possible key, computed with Python ints so that the
        # 64-bit overflow check below is exact
        max_key = 1
        for n, d in zip(self.nb_levels[fused_dset], fused_dims):
            max_key *= int(n) ** int(d)
        max_key -= 1
        if max_key >= 2 ** 64:
            raise ValueError(
                'fused dataset %s in file %s cannot be created because '
                '64-bit keys are not sufficient to cover all possible '
                'combinations of the fused datasets'
                % (fused_dset, self.filename))
        # smallest compatible unsigned integer dtype
        d_type = type_fitting.fit_integer_type(max_key, is_signed=False)
        # FIXME at some point should become super.add_dataset(...)
        self.out[fused_dset] = self.np2h5.add_dataset(
            self.group, fused_dset, n_columns=1,
            item_type=d_type, fixed_size=False)
        nb_levels_with_multiplicity = np.concatenate(
            [np.array(n, dtype=d_type) * np.ones(d, dtype=d_type)
             for n, d in zip(self.nb_levels[fused_dset], fused_dims)])
        self.key_weights[fused_dset] = np.concatenate(
            [np.array([1], dtype=d_type),
             np.cumprod(d_type(nb_levels_with_multiplicity))[:-1]])
        with h5py.File(self.filename) as f:
            f[self.group]['fused'][fused_dset].create_dataset(
                'key_weights', data=self.key_weights[fused_dset],
                dtype=d_type)
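# A minimal standalone sketch (not part of the class) of the mixed-radix
# scheme that key_weights implements above: each column of level indices
# is weighted by the cumulative product of the numbers of levels of the
# preceding columns, so a row of indices maps to a single unsigned
# integer key and back. The helper names and the radices are
# illustrative assumptions.
import numpy as np

def encode_fused(indices, nb_levels):
    """Pack rows of level indices into single keys (mixed radix)."""
    nb_levels = np.asarray(nb_levels, dtype=np.uint64)
    weights = np.concatenate((np.ones(1, dtype=np.uint64),
                              np.cumprod(nb_levels)[:-1]))
    return np.asarray(indices, dtype=np.uint64).dot(weights)

def decode_fused(keys, nb_levels):
    """Invert encode_fused, recovering one column per dataset."""
    nb_levels = np.asarray(nb_levels, dtype=np.uint64)
    weights = np.concatenate((np.ones(1, dtype=np.uint64),
                              np.cumprod(nb_levels)[:-1]))
    cols = [(keys // w) % n for w, n in zip(weights, nb_levels)]
    return np.column_stack(cols)

# e.g. two datasets with 3 and 5 levels: key = i0 + 3 * i1
keys = encode_fused([[2, 4], [0, 1]], [3, 5])
assert (keys == [14, 3]).all()
assert (decode_fused(keys, [3, 5]) == [[2, 4], [0, 1]]).all()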
def initialize_output_dsets(self, sample_data):
    out = sample_data[1]
    if isinstance(out, collections.Mapping):  # dict, DataFrame ...
        dim = [1 if len(out[o_name].shape) == 1 else out[o_name].shape[1]
               for o_name in self.output_names]
        dtypes = [get_dtype(out[o_name]) for o_name in self.output_names]
    else:  # list, tuple ...
        # do some automatic conversion (maybe risky?)
        out = [o if hasattr(o, 'shape') else numpy.array(o) for o in out]
        dim = [1 if len(o.shape) == 1 else o.shape[1] for o in out]
        dtypes = [get_dtype(o) for o in out]
    with h5py.File(self.filename) as f:
        for o, d, t in zip(self.output_names, dim, dtypes):
            if o not in self.indexed_outputs:
                f['data'].create_dataset(
                    o, (0, d), dtype=t,
                    chunks=(chunk_size(numpy.dtype(t).itemsize, d), d),
                    maxshape=(None, d))
        indexed_o_dims = []
        indexed_o_levels = []
        # indexed outputs are stored in the order specified by
        # self.indexed_outputs
        for o in self.indexed_outputs:
            indexed_o_dims.append(dim[self.output_names.index(o)])
            indexed_o_levels.append(len(self.indexes[o]))
        d = sum(indexed_o_dims)
        # smallest unsigned integer dtype compatible with all
        # indexed_outputs
        t = type_fitting.fit_integer_type(max(indexed_o_levels),
                                          is_signed=False)
        f['data'].create_dataset(
            'indexed_outputs', (0, d), dtype=t,
            chunks=(chunk_size(numpy.dtype(t).itemsize, d), d),
            maxshape=(None, d))
        # necessary to access the part of the dataset corresponding to a
        # particular output
        f['synopsis'].create_dataset('indexed_outputs_dims',
                                     data=numpy.cumsum(indexed_o_dims),
                                     dtype=numpy.int64)
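# The chunk_size helper used above is not shown here; the following is a
# hedged sketch of what such a helper might compute: the number of rows
# per HDF5 chunk that keeps each chunk near a fixed byte budget. The
# 32 KiB target and the bounds are assumptions, not the toolbox's actual
# values.
def chunk_size_sketch(item_size, n_columns, target_bytes=32 * 1024):
    rows = target_bytes // (item_size * n_columns)
    return max(1, min(rows, 4096))  # keep chunks non-empty and bounded

# e.g. 8-byte items in 3 columns -> 1365 rows (~32 KiB) per chunk
assert chunk_size_sketch(8, 3) == 1365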
def generate_pairs(self, output=None):
    """Generate the pairs associated to the triplet list

    .. note:: This function is called by generate_triplets and should
        not be used independently
    """
    # FIXME change this to a random file name to avoid overwriting
    # problems
    # default name for output file
    if output is None:
        (basename, _) = os.path.splitext(self.database)
        output = basename + '.abx'
    # list all pairs
    all_empty = True
    try:
        _, output_tmp = tempfile.mkstemp()
        for by, db in self.by_dbs.iteritems():
            # FIXME maybe care about this case earlier?
            if self.verbose > 0:
                print("Writing AX/BX pairs to task file...")
            with h5py.File(output) as fh:
                not_empty = fh['/triplets/' + str(by)].size
            if not_empty:
                all_empty = False
                max_ind = np.max(db.index.values)
                pair_key_type = type_fitting.fit_integer_type(
                    (max_ind + 1) ** 2 - 1, is_signed=False)
                with h52np.H52NP(output) as f_in:
                    with np2h5.NP2H5(output_tmp) as f_out:
                        inp = f_in.add_dataset('triplets', str(by))
                        out = f_out.add_dataset(
                            'pairs', str(by), n_columns=1,
                            item_type=pair_key_type, fixed_size=False)
                        # FIXME replace this by a for loop by making
                        # h52np implement the iterable pattern with
                        # next() outputting inp.read()
                        try:
                            while True:
                                triplets = pair_key_type(inp.read())
                                n = triplets.shape[0]
                                ind = np.arange(n)
                                i1 = 2 * ind
                                i2 = 2 * ind + 1
                                # would need to amend np2h5 and h52np to
                                # remove the second dim...
                                pairs = np.empty(shape=(2 * n, 1),
                                                 dtype=pair_key_type)
                                # FIXME change the encoding (and
                                # type_fitting) so that A,B and B,A have
                                # the same code ... (take a=min(a,b),
                                # b=max(a,b))
                                # FIXME but allow a flag to control the
                                # behavior to be able to enforce A,X and
                                # B,X order when using asymmetrical
                                # distance functions
                                pairs[i1, 0] = triplets[:, 0] + (
                                    max_ind + 1) * triplets[:, 2]  # AX
                                pairs[i2, 0] = triplets[:, 1] + (
                                    max_ind + 1) * triplets[:, 2]  # BX
                                # FIXME do a unique here already? Do not
                                # store the inverse mapping? (could sort
                                # triplets on pair1, complete pair1,
                                # sort on pair2, complete pair2 and
                                # shuffle?)
                                out.write(pairs)
                        except StopIteration:
                            pass
                # sort pairs
                handler = h5_handler.H5Handler(output_tmp, '/pairs/',
                                               str(by))
                # memory: available RAM in MB, could be a param
                memory = 1000
                # estimate of the amount of data to be sorted
                with h5py.File(output_tmp) as fh:
                    n = fh['/pairs/' + str(by)].shape[0]
                    i = fh['/pairs/' + str(by)].dtype.itemsize
                amount = n * i  # in bytes
                # harmonize units to KB:
                memory = 1000 * memory
                amount = amount / 1000.
                # be conservative: aim at using no more than 3/4 of the
                # available memory.
                # if enough memory, take one chunk (this will do an
                # unnecessary full write and read of the file... could
                # be optimized easily)
                if amount <= 0.75 * memory:
                    # would it be beneficial to have a large
                    # o_buffer_size as well?
                    handler.sort(buffer_size=amount)
                # else take around 30 chunks if possible (this seems
                # efficient given the current implem; using a larger
                # number of chunks efficiently might be possible if the
                # chunk-reading part of the sort was cythonized?)
                elif amount / 30. <= 0.75 * memory:
                    handler.sort(buffer_size=amount / 30.)
                # else take the minimum number of chunks possible given
                # the available RAM
                else:
                    handler.sort(buffer_size=0.75 * memory)
                # FIXME should have a unique function directly instead
                # of sorting + unique?
                with h52np.H52NP(output_tmp) as f_in:
                    with np2h5.NP2H5(output) as f_out:
                        inp = f_in.add_dataset('pairs', str(by))
                        out = f_out.add_dataset(
                            'unique_pairs', str(by), n_columns=1,
                            item_type=pair_key_type, fixed_size=False)
                        try:
                            last = -1
                            while True:
                                pairs = inp.read()
                                pairs = np.unique(pairs)
                                # unique alters the shape
                                pairs = np.reshape(pairs,
                                                   (pairs.shape[0], 1))
                                if pairs[0, 0] == last:
                                    pairs = pairs[1:]
                                if pairs.size > 0:
                                    last = pairs[-1, 0]
                                    out.write(pairs)
                        except StopIteration:
                            pass
                # store base for later decoding of the pairs
                with h5py.File(output) as fh:
                    fh['/unique_pairs'].attrs[str(by)] = max_ind + 1
                store = pd.HDFStore(output)
                # use append to make use of the table format, which
                # handles strings with much less space than the
                # fixed-size format
                store.append('/feat_dbs/' + str(by), self.feat_dbs[by],
                             expectedrows=len(self.feat_dbs[by]))
                store.close()
        # FIXME generate inverse mapping to triplets (1 and 2)?
    finally:
        os.remove(output_tmp)
    if self.verbose > 0:
        print("done.")
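# A small standalone sketch of the pair encoding used above: a pair
# (a, x) of item row indices is packed as a + base * x with
# base = max_ind + 1, and the base is stored as a file attribute so the
# pairs can be decoded later. The helper names are illustrative.
import numpy as np

def encode_pairs(a, x, base):
    return a + base * x

def decode_pairs(keys, base):
    return keys % base, keys // base

base = 1000  # i.e. largest item index is 999
keys = encode_pairs(np.array([3, 42]), np.array([7, 7]), base)
assert (keys == [7003, 7042]).all()
a, x = decode_pairs(keys, base)
assert (a == [3, 42]).all() and (x == [7, 7]).all()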
def on_across_triplets(self, by, on, across, on_across_block,
                       on_across_by_values, with_regressors=True):
    """Generate all possible triplets for a given by block.

    Given an on_across_block of the database and the parameters of the
    task, this function will generate the complete set of triplets and
    the regressors.

    Parameters
    ----------
    by : int
        The block index
    on, across : int
        The task attributes
    on_across_block : list
        the block
    on_across_by_values : dict
        the actual values
    with_regressors : bool, optional
        True by default

    Returns
    -------
    triplets : numpy.ndarray
        the set of triplets generated
    regressors : numpy.ndarray
        the regressors generated
    """
    # find all possible A, B, X where A and X have the 'on' feature of
    # the block and A and B have the 'across' feature of the block
    A = np.array(on_across_block, dtype=self.types[by])
    on_set = set(self.on_blocks[by].groups[on])
    # FIXME quick fix to process the case with no across, but better
    # done in a separate loop ...
    if self.across == ['#across']:
        # in this case A is a singleton and B can be anything in the by
        # block that doesn't have the same 'on' as A
        B = np.array(
            list(set(self.by_dbs[by].index).difference(on_set)),
            dtype=self.types[by])
    else:
        B = self.across_blocks[by].groups[across]
        # remove B with the same 'on' as A
        B = np.array(list(set(B).difference(A)), dtype=self.types[by])
    # remove X with the same 'across' as A
    if type(across) is tuple:
        antiacross_set = set(self.antiacross_blocks[by][across])
        X = np.array(list(antiacross_set & on_set), dtype=self.types[by])
    else:
        X = np.array(list(on_set.difference(A)), dtype=self.types[by])
    # apply singleton filters
    db = self.by_dbs[by]
    if self.filters.A:
        A = self.filters.A_filter(on_across_by_values, db, A)
    if self.filters.B:
        B = self.filters.B_filter(on_across_by_values, db, B)
    if self.filters.X:
        X = self.filters.X_filter(on_across_by_values, db, X)
    # instantiate A, B, X regressors here
    if with_regressors:
        self.regressors.set_A_regressors(on_across_by_values, db, A)
        self.regressors.set_B_regressors(on_across_by_values, db, B)
        self.regressors.set_X_regressors(on_across_by_values, db, X)
    # A, B, X can then be combined efficiently in a full (or randomly
    # sampled) factorial design
    size = len(A) * len(B) * len(X)
    if size > 0:
        ind_type = type_fitting.fit_integer_type(size, is_signed=False)
        # if sampling in the absence of triplet filters, do it here
        if self.sampling and not self.filters.ABX:
            indices = self.sampler.sample(size, dtype=ind_type)
        else:
            indices = np.arange(size, dtype=ind_type)
        # generate triplets from indices
        iX = np.mod(indices, len(X))
        iB = np.mod(np.divide(indices, len(X)), len(B))
        iA = np.divide(indices, len(B) * len(X))
        triplets = np.column_stack((A[iA], B[iB], X[iX]))
        # apply triplet filters
        if self.filters.ABX:
            triplets = self.filters.ABX_filter(
                on_across_by_values, db, triplets)
            size = triplets.shape[0]
            # if sampling in the presence of triplet filters, do it here
            if self.sampling:
                ind_type = type_fitting.fit_integer_type(size,
                                                         is_signed=False)
                indices = self.sampler.sample(size, dtype=ind_type)
                triplets = triplets[indices, :]
    else:
        triplets = np.empty(shape=(0, 3), dtype=self.types[by])
        indices = np.empty(shape=size, dtype=np.uint8)
        iA = indices
        iB = indices
        iX = indices
    if not with_regressors:
        return triplets
    if self.regressors.ABX:
        # instantiate ABX regressors here
        self.regressors.set_ABX_regressors(on_across_by_values, db,
                                           triplets)
    # self.regressors.XXX contains either (for by and on_across_by)
    # [[scalar_output_1_dbfun_1, scalar_output_2_dbfun_1, ...],
    #  [scalar_output_1_dbfun_2, ...], ...]
    # or:
    # [[np_array_output_1_dbfun_1, np_array_output_2_dbfun_1, ...],
    #  [np_array_output_1_dbfun_2, ...], ...]
    # FIXME change manager API so that self.regressors.A contains the
    # data and not the list of dbfun_s?
    regressors = {}
    scalar_names = (self.regressors.by_names +
                    self.regressors.on_across_by_names)
    scalar_regressors = (self.regressors.by_regressors +
                         self.regressors.on_across_by_regressors)
    for names, regs in zip(scalar_names, scalar_regressors):
        for name, reg in zip(names, regs):
            regressors[name] = np.tile(np.array(reg),
                                       (np.size(triplets, 0), 1))
    for names, regs in zip(self.regressors.A_names,
                           self.regressors.A_regressors):
        for name, reg in zip(names, regs):
            regressors[name] = reg[iA]
    for names, regs in zip(self.regressors.B_names,
                           self.regressors.B_regressors):
        for name, reg in zip(names, regs):
            regressors[name] = reg[iB]
    for names, regs in zip(self.regressors.X_names,
                           self.regressors.X_regressors):
        for name, reg in zip(names, regs):
            regressors[name] = reg[iX]
    # FIXME implement this
    # for names, regs in zip(self.regressors.ABX_names,
    #                        self.regressors.ABX_regressors):
    #     for name, reg in zip(names, regs):
    #         regressors[name] = reg[indices, :]
    return triplets, regressors
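# A self-contained sketch of the factorial-design indexing above: a flat
# index over the Cartesian product A x B x X decomposes into row indices
# (iA, iB, iX) with X varying fastest, and np.column_stack then
# materializes the triplets. Integer division is written explicitly
# (the method above relies on the Python 2 semantics of np.divide);
# the array contents are made up for illustration.
import numpy as np

A = np.array([10, 11])
B = np.array([20, 21, 22])
X = np.array([30, 31])

indices = np.arange(len(A) * len(B) * len(X))
iX = indices % len(X)
iB = (indices // len(X)) % len(B)
iA = indices // (len(B) * len(X))
triplets = np.column_stack((A[iA], B[iB], X[iX]))

assert triplets.shape == (12, 3)
assert (triplets[0] == [10, 20, 30]).all()   # first cell of the design
assert (triplets[-1] == [11, 22, 31]).all()  # last cell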
def __init__(self, db_name, on, across=None, by=None, filters=None,
             regressors=None, verbose=0):
    self.verbose = verbose
    assert os.path.exists(db_name), \
        'the item file {0} was not found'.format(db_name)
    if across is None:
        across = []
    if by is None:
        by = []
    if filters is None:
        filters = []
    if regressors is None:
        regressors = []
    # check parameters
    # using several 'on' isn't supported by the toolbox
    assert isinstance(on, basestring), \
        'ON attribute must be specified by a string'
    on = [on]
    if isinstance(across, basestring):
        across = [across]
    if isinstance(by, basestring):
        by = [by]
    if verbose:
        print("Verifying input...")
    # open database
    db, db_hierarchy, feat_db = database.load(db_name,
                                              features_info=True)
    # check that required columns are present
    cols = set(db.columns)
    message = (' argument is invalid, check that all the provided '
               'attributes are defined in the database ' + db_name)
    # the argument of issuperset needs to be a list ...
    assert cols.issuperset(on), 'ON' + message
    assert cols.issuperset(across), 'ACROSS' + message
    assert cols.issuperset(by), 'BY' + message
    # FIXME add additional checks, for example that columns in BY,
    # ACROSS, ON are not the same? (see task structure notes)
    # also that location columns are not used
    for col in cols:
        assert '_' not in col, \
            col + ': you cannot use underscore in column names'
        assert '#' not in col, \
            col + ': you cannot use \'#\' in column names'
    if verbose:
        print("Input verified")
    # if 'by' or 'across' are empty, create appropriate dummy columns
    # (note that '#' is forbidden in user names for columns)
    if not by:
        db['#by'] = 0
        by = ['#by']
    if not across:
        db['#across'] = range(len(db))
        across = ['#across']
    # note that these additional columns are not in the db_hierarchy,
    # but I don't think this is problematic
    self.filters = filter_manager.FilterManager(db_hierarchy, on,
                                                across, by, filters)
    self.regressors = regressor_manager.RegressorManager(db,
                                                         db_hierarchy,
                                                         on, across, by,
                                                         regressors)
    self.sampling = False
    # prepare the database for generating the triplets
    self.by_dbs = {}
    self.feat_dbs = {}
    self.on_blocks = {}
    self.across_blocks = {}
    self.on_across_blocks = {}
    self.antiacross_blocks = {}
    by_groups = db.groupby(by)
    if self.verbose > 0:
        display = progress_display.ProgressDisplay()
        display.add('block', 'Preprocessing by block', len(by_groups))
    for by_key, by_frame in by_groups:
        if self.verbose > 0:
            display.update('block', 1)
            display.display()
        # allow to get by values as well as values of other variables
        # that are determined by these
        by_values = dict(by_frame.iloc[0])
        # apply 'by' filters
        if self.filters.by_filter(by_values):
            # get analogous feat_db
            by_feat_db = feat_db.iloc[by_frame.index]
            # drop indexes
            by_frame = by_frame.reset_index(drop=True)
            # reset_index to get an index relative to the 'by' db; the
            # original index could be conserved in an additional
            # 'index' column if necessary by removing the drop=True,
            # but this would add another constraint on the possible
            # column names
            by_feat_db = by_feat_db.reset_index(drop=True)
            # apply generic filters
            by_frame = self.filters.generic_filter(by_values, by_frame)
            self.by_dbs[by_key] = by_frame
            self.feat_dbs[by_key] = by_feat_db
            self.on_blocks[by_key] = self.by_dbs[by_key].groupby(on)
            self.across_blocks[by_key] = self.by_dbs[by_key].groupby(
                across)
            self.on_across_blocks[by_key] = self.by_dbs[by_key].groupby(
                on + across)
            if len(across) > 1:
                self.antiacross_blocks[by_key] = dict()
                for across_key in (self.across_blocks[by_key]
                                   .groups.iterkeys()):
                    b = True
                    for i, col in enumerate(across):
                        b = b * (by_frame[col] != across_key[i])
                    self.antiacross_blocks[by_key][across_key] = \
                        by_frame[b].index
    # store parameters
    self.database = db_name
    self.db = db
    self.db_hierarchy = db_hierarchy
    self.on = on
    self.across = across
    self.by = by
    # determining the appropriate numeric type to represent the index
    # (currently used only for numpy arrays and h5 storage, might also
    # be used for pandas frames)
    types = {}
    for key, db in self.by_dbs.iteritems():
        # len(db) - 1 wouldn't work here because there could be missing
        # indexes due to generic filtering
        n = np.max(db.index.values)
        types[key] = type_fitting.fit_integer_type(n, is_signed=False)
    self.types = types
    # compute some statistics about the task
    self.compute_statistics()
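# type_fitting.fit_integer_type is used throughout to pick a compact
# dtype for item and triplet indices. A minimal sketch of what such a
# helper presumably does (the toolbox's actual implementation may
# differ): return the smallest numpy integer type whose range covers
# the requested maximum value.
import numpy as np

def fit_integer_type_sketch(n, is_signed=True):
    candidates = ([np.int8, np.int16, np.int32, np.int64] if is_signed
                  else [np.uint8, np.uint16, np.uint32, np.uint64])
    for t in candidates:
        if n <= np.iinfo(t).max:
            return t
    raise ValueError('%d does not fit in a 64-bit integer type' % n)

assert fit_integer_type_sketch(255, is_signed=False) is np.uint8
assert fit_integer_type_sketch(256, is_signed=False) is np.uint16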
def score(task_file, distance_file, score_file=None,
          score_group='scores'):
    """Calculate the score of a task and put the results in a hdf5 file.

    Parameters
    ----------
    task_file : string
        The hdf5 file containing the task (with the triplets and pairs
        generated)
    distance_file : string
        The hdf5 file containing the distances between the pairs
    score_file : string, optional
        The hdf5 file that will contain the results
    """
    if score_file is None:
        (basename_task, _) = os.path.splitext(task_file)
        (basename_dist, _) = os.path.splitext(distance_file)
        score_file = basename_task + '_' + basename_dist + '.score'
    # file verification:
    assert os.path.exists(task_file), \
        'Cannot find task file ' + task_file
    assert os.path.exists(distance_file), \
        'Cannot find distance file ' + distance_file
    assert not os.path.exists(score_file), \
        'score file already exists ' + score_file
    # FIXME skip empty by datasets; this should not be necessary anymore
    # when empty datasets are filtered at the task file generation level
    with h5py.File(task_file, 'r') as t:
        bys = t['bys'][...]
        # bys = t['feat_dbs'].keys()
        n_triplets = t['triplets']['data'].shape[0]
    with h5py.File(score_file, 'w') as s:
        s.create_dataset('scores', (n_triplets, 1), dtype=np.int8)
        for n_by, by in enumerate(bys):
            with h5py.File(task_file, 'r') as t, \
                    h5py.File(distance_file, 'r') as d:
                trip_attrs = t['triplets']['by_index'][n_by]
                pair_attrs = t['unique_pairs'].attrs[by]
                # FIXME here we make the assumption that this fits into
                # memory ...
                dis = d['distances']['data'][
                    pair_attrs[1]:pair_attrs[2]][...]
                dis = np.reshape(dis, dis.shape[0])
                # FIXME idem + only unique_pairs used?
                pairs = t['unique_pairs']['data'][
                    pair_attrs[1]:pair_attrs[2]][...]
                pairs = np.reshape(pairs, pairs.shape[0])
                base = pair_attrs[0]
                pair_key_type = type_fitting.fit_integer_type(
                    base ** 2 - 1, is_signed=False)
            with h52np.H52NP(task_file) as t:
                inp = t.add_subdataset('triplets', 'data',
                                       indexes=trip_attrs)
                idx_start = trip_attrs[0]
                for triplets in inp:
                    triplets = pair_key_type(triplets)
                    idx_end = idx_start + triplets.shape[0]
                    pairs_AX = triplets[:, 0] + base * triplets[:, 2]
                    # FIXME change the encoding (and type_fitting) so
                    # that A,B and B,A have the same code ... (take
                    # a=min(a,b), b=max(a,b))
                    pairs_BX = triplets[:, 1] + base * triplets[:, 2]
                    dis_AX = dis[np.searchsorted(pairs, pairs_AX)]
                    dis_BX = dis[np.searchsorted(pairs, pairs_BX)]
                    scores = (np.int8(dis_AX < dis_BX) -
                              np.int8(dis_AX > dis_BX))
                    # 1 if X closer to A, -1 if X closer to B, 0 if
                    # equal distance (this doesn't use 0, 1/2, 1 in
                    # order to use the compact np.int8 data format)
                    s['scores'][idx_start:idx_end] = np.reshape(
                        scores, (-1, 1))
                    idx_start = idx_end
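# A tiny standalone illustration of the scoring rule above: each triplet
# scores +1 when X is closer to A, -1 when X is closer to B, and 0 on
# ties; averaging the collapsed scores and mapping through
# (mean + 1) / 2 gives a value in [0, 1]. The distances are made up.
import numpy as np

dis_AX = np.array([0.2, 0.9, 0.5])
dis_BX = np.array([0.7, 0.3, 0.5])
scores = np.int8(dis_AX < dis_BX) - np.int8(dis_AX > dis_BX)
assert (scores == [1, -1, 0]).all()
assert (np.mean(scores) + 1) / 2 == 0.5  # chance level in this toy case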
def score(task_file, distance_file, score_file=None,
          score_group="scores"):
    """Calculate the score of a task and put the results in a hdf5 file.

    Parameters
    ----------
    task_file : string
        The hdf5 file containing the task (with the triplets and pairs
        generated)
    distance_file : string
        The hdf5 file containing the distances between the pairs
    score_file : string, optional
        The hdf5 file that will contain the results
    """
    if score_file is None:
        (basename_task, _) = os.path.splitext(task_file)
        (basename_dist, _) = os.path.splitext(distance_file)
        score_file = basename_task + "_" + basename_dist + ".score"
    # file verification:
    assert os.path.exists(task_file), \
        "Cannot find task file " + task_file
    assert os.path.exists(distance_file), \
        "Cannot find distance file " + distance_file
    assert not os.path.exists(score_file), \
        "score file already exists " + score_file
    # FIXME skip empty by datasets; this should not be necessary anymore
    # when empty datasets are filtered at the task file generation level
    with h5py.File(distance_file) as d:
        bys = [by for by in d["distances"]]
    for by in bys:
        with h5py.File(task_file) as t, h5py.File(distance_file) as d:
            n = t["triplets"][by].shape[0]
            # FIXME here we make the assumption that this fits into
            # memory ...
            dis = d["distances"][by][...]
            dis = np.reshape(dis, dis.shape[0])
            # FIXME idem + only unique_pairs used?
            pairs = t["unique_pairs"][by][...]
            pairs = np.reshape(pairs, pairs.shape[0])
            base = t["unique_pairs"].attrs[by]
            pair_key_type = type_fitting.fit_integer_type(
                base ** 2 - 1, is_signed=False)
        with h52np.H52NP(task_file) as t:
            with np2h5.NP2H5(score_file) as s:
                inp = t.add_dataset("triplets", by)
                out = s.add_dataset("scores", by, n_rows=n, n_columns=1,
                                    item_type=np.int8)
                # FIXME replace this by a for loop by making h52np
                # implement the iterable pattern with next() outputting
                # inp.read()
                try:
                    while True:
                        # FIXME keep the pairs in the file?
                        triplets = pair_key_type(inp.read())
                        pairs_AX = triplets[:, 0] + base * triplets[:, 2]
                        # FIXME change the encoding (and type_fitting)
                        # so that A,B and B,A have the same code ...
                        # (take a=min(a,b), b=max(a,b))
                        pairs_BX = triplets[:, 1] + base * triplets[:, 2]
                        dis_AX = dis[np.searchsorted(pairs, pairs_AX)]
                        dis_BX = dis[np.searchsorted(pairs, pairs_BX)]
                        scores = (np.int8(dis_AX < dis_BX) -
                                  np.int8(dis_AX > dis_BX))
                        # 1 if X closer to A, -1 if X closer to B, 0 if
                        # equal distance (this doesn't use 0, 1/2, 1 in
                        # order to use the compact np.int8 data format)
                        out.write(np.reshape(scores,
                                             (scores.shape[0], 1)))
                except StopIteration:
                    pass
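# Both score variants stream triplets with the while True / inp.read() /
# StopIteration pattern that the FIXMEs above want to replace with an
# iterator. A minimal h5py-only sketch of the same chunked access
# pattern (dataset path and chunk size are illustrative):
import h5py

def iter_chunks(filename, dataset, chunk_rows=100000):
    """Yield successive row blocks of an HDF5 dataset."""
    with h5py.File(filename, 'r') as f:
        dset = f[dataset]
        for start in range(0, dset.shape[0], chunk_rows):
            yield dset[start:start + chunk_rows]

# for triplets in iter_chunks('task.abx', '/triplets/data'):
#     ...  # process one block at a time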
def collapse(scorefile, taskfile, fid):
    """Collapses the results for each triplet sharing the same on,
    across and by labels.
    """
    # wf_tmp = open('tmp_pandas.txt', 'wb')
    scorefid = h5py.File(scorefile)
    taskfid = h5py.File(taskfile)
    nkeys = len(scorefid['scores'].keys())
    # results = []
    for key_idx, key in enumerate(scorefid['scores'].keys()):
        print('collapsing {0}/{1}'.format(key_idx + 1, nkeys))
        context = key
        tfrk = taskfid['regressors'][key]
        tmp = tfrk[u'indexed_data']
        indices = np.array(tmp)
        if indices.size == 0:
            continue
        tmp = scorefid['scores'][key]
        scores_arr = np.array(tmp)
        tmp = np.ascontiguousarray(indices).view(
            np.dtype((np.void,
                      indices.dtype.itemsize * indices.shape[1])))
        n_indices = np.max(indices, 0) + 1
        if np.prod(n_indices) > 18446744073709551615:
            print("type not big enough")
        ind_type = type_fitting.fit_integer_type(np.prod(n_indices),
                                                 is_signed=False)
        # encoding the indices of a triplet to a unique index
        new_index = indices[:, 0].astype(ind_type)
        for i in range(1, len(n_indices)):
            new_index = indices[:, i] + n_indices[i] * new_index
        permut = np.argsort(new_index)
        i_unique = 0
        # collapsing the score
        key_reg = new_index[permut[0]]
        mean = np.empty((len(permut), 3))
        mean[0] = [key_reg, scores_arr[permut[0]], 0]
        i_start = 0
        for i, p in enumerate(permut[1:]):
            i += 1
            if new_index[p] != key_reg:
                mean[i_unique, 1] = (np.mean(
                    scores_arr[permut[i_start:i]]) + 1) / 2
                mean[i_unique, 2] = i - i_start
                i_start = i
                i_unique += 1
                key_reg = new_index[p]
                mean[i_unique] = [key_reg, 0, 0]
        mean[i_unique] = [
            key_reg,
            (np.mean(scores_arr[permut[i_start:i + 1]]) + 1) / 2,
            i - i_start + 1]
        mean = np.resize(mean, (i_unique + 1, 3))
        # retrieving the triplet indices from the unique index.
        tmp = npdecode(mean[:, 0], n_indices)
        regs = tfrk['indexed_datasets']
        indexes = []
        for reg in regs:
            indexes.append(tfrk['indexes'][reg][:])
        nregs = len(regs)
        for i, key in enumerate(tmp):
            aux = list()
            for j in range(nregs):
                aux.append(indexes[j][key[j]])
                # aux.append((indexes[regs[j]])[key[j]])
            score = mean[i, 1]
            n = mean[i, 2]
            result = aux + [context, score, int(n)]
            fid.write('\t'.join(map(str, result)) + '\n')
            # results.append(aux + [context, score, n])
            # wf_tmp.write('\t'.join(map(str, results[-1])) + '\n')
    scorefid.close()
    taskfid.close()
    del taskfid
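# The manual loop above computes, for each unique key of the sorted
# index, the mean score mapped into [0, 1] and the group size. A compact
# numpy equivalent of that collapsing step (a sketch; the unique()
# helper used by the newer collapse below may differ in detail):
import numpy as np

def collapse_sorted(sorted_index, sorted_scores):
    uniq, starts = np.unique(sorted_index, return_index=True)
    counts = np.diff(np.append(starts, len(sorted_index)))
    sums = np.add.reduceat(sorted_scores, starts)
    means = (sums.astype(np.float64) / counts + 1) / 2
    return means, uniq, counts

idx = np.array([0, 0, 1, 1, 1, 4])   # already sorted keys
sc = np.array([1, -1, 1, 1, 0, -1])  # scores in {-1, 0, 1}
means, uniq, counts = collapse_sorted(idx, sc)
assert (uniq == [0, 1, 4]).all() and (counts == [2, 3, 1]).all()
assert np.allclose(means, [0.5, 5. / 6, 0.])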
def collapse(scorefile, taskfile, fid):
    """Collapses the results for each triplet sharing the same on,
    across and by labels.
    """
    # We make the assumption that everything fits in memory...
    scorefid = h5py.File(scorefile)
    taskfid = h5py.File(taskfile)
    bys = taskfid['bys'][...]
    for by_idx, by in enumerate(bys):
        # print('collapsing {0}/{1}'.format(by_idx + 1, len(bys)))
        trip_attrs = taskfid['triplets']['by_index'][by_idx]
        tfrk = taskfid['regressors'][by]
        tmp = tfrk[u'indexed_data']
        indices = np.array(tmp)
        if indices.size == 0:
            continue
        tmp = scorefid['scores'][trip_attrs[0]:trip_attrs[1]]
        scores_arr = np.array(tmp)
        tmp = np.ascontiguousarray(indices).view(
            np.dtype((np.void,
                      indices.dtype.itemsize * indices.shape[1])))
        n_indices = np.max(indices, 0) + 1
        assert np.prod(n_indices) < 18446744073709551615, \
            "type not big enough"
        ind_type = fit_integer_type(np.prod(n_indices), is_signed=False)
        # encoding the indices of a triplet to a unique index
        new_index = indices[:, 0].astype(ind_type)
        for i in range(1, len(n_indices)):
            new_index = indices[:, i] + n_indices[i] * new_index
        permut = np.argsort(new_index)
        # collapsing the score
        sorted_scores = scores_arr[permut]
        sorted_index = new_index[permut]
        mean, unique_index, counts = unique(sorted_index, sorted_scores)
        # retrieving the triplet indices from the unique index.
        tmp = npdecode(unique_index, n_indices)
        regs = tfrk['indexed_datasets']
        indexes = []
        for reg in regs:
            indexes.append(tfrk['indexes'][reg][:])
        nregs = len(regs)
        for i, key in enumerate(tmp):
            aux = list()
            for j in range(nregs):
                aux.append(indexes[j][int(key[j])])
            score = mean[i]
            n = counts[i]
            result = aux + [by, score, int(n)]
            fid.write('\t'.join(map(str, result)) + '\n')
            # results.append(aux + [context, score, n])
            # wf_tmp.write('\t'.join(map(str, results[-1])) + '\n')
    scorefid.close()
    taskfid.close()
    del taskfid
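# A hedged sketch of the encode/decode pair used in collapse: the loop
# above packs each row of regressor level indices into one integer (most
# significant column first), and an npdecode-like helper inverts it. The
# helper names and the data are illustrative; the toolbox's npdecode may
# differ in detail.
import numpy as np

def encode_rows(indices, n_indices):
    indices = np.asarray(indices, dtype=np.uint64)
    n_indices = np.asarray(n_indices, dtype=np.uint64)
    keys = indices[:, 0].copy()
    for i in range(1, len(n_indices)):
        keys = indices[:, i] + n_indices[i] * keys
    return keys

def npdecode_sketch(keys, n_indices):
    keys = np.asarray(keys, dtype=np.uint64).copy()
    n_indices = np.asarray(n_indices, dtype=np.uint64)
    cols = []
    for n in n_indices[::-1]:  # last column varies fastest
        cols.append(keys % n)
        keys = keys // n
    return np.column_stack(cols[::-1])

indices = np.array([[1, 0, 2], [0, 3, 1]])
n_indices = [2, 4, 3]  # number of levels per column
keys = encode_rows(indices, n_indices)
assert (npdecode_sketch(keys, n_indices) == indices).all()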