def _from_save_dict(self, save_dict):
    """Repopulate this waterwork from a previously saved dictionary.

    Tanks are re-instantiated from the module and class names recorded in the
    save dictionary, their slots and tubes are re-registered under the saved
    names, every slot is re-attached to the tube that feeds it (or to `empty`),
    and finally the funnels and taps are rebuilt.

    Parameters
    ----------
    save_dict : dict
      The saved description of the waterwork. The keys read here are 'name',
      'tanks', 'slots', 'tubes', 'funnels' and 'taps'.

    """
    def reregister(registry, part, saved_name):
        # Replace the entry filed under the part's current name with one filed
        # under the name recorded in the save dictionary.
        del registry[part.name]
        part.name = saved_name
        registry[saved_name] = part

    with ns.NameSpace(''):
        self.name = save_dict['name']

        # NOTE(review): the module and class are taken verbatim from
        # save_dict, so loading a save dictionary imports whatever module it
        # names. Only load save dictionaries from trusted sources.
        for tank_name, tank_dict in save_dict['tanks'].items():
            module = importlib.import_module(tank_dict['__module__'])
            tank_cls = getattr(module, tank_dict['__class__'])
            self.tanks[tank_name] = tank_cls(name=tank_name, waterwork=self)

        # Restore each slot's plug and saved name.
        for saved_name, slot_dict in save_dict['slots'].items():
            owner = self.tanks[slot_dict['tank']]
            slot = owner.get_slot(slot_dict['key'])
            slot.plug = slot_dict['plug']
            reregister(self.slots, slot, saved_name)

        # Restore each tube's plug and saved name.
        for saved_name, tube_dict in save_dict['tubes'].items():
            owner = self.tanks[tube_dict['tank']]
            tube = owner.get_tube(tube_dict['key'])
            tube.plug = tube_dict['plug']
            reregister(self.tubes, tube, saved_name)

        # Reconnect every slot to the tube recorded for it; a slot with no
        # recorded tube is attached to `empty`.
        for slot_name, slot in self.slots.items():
            feeding_tube_name = save_dict['slots'][slot_name]['tube']
            if feeding_tube_name is None:
                slot.tube = empty
                continue
            feeding_tube = self.tubes[feeding_tube_name]
            feeding_tube.slot = slot
            slot.tube = feeding_tube

        self.funnels = {
            funnel_name: self.slots[funnel_name]
            for funnel_name in save_dict['funnels']
        }
        self.taps = {
            tap_name: self.tubes[tap_name]
            for tap_name in save_dict['taps']
        }
def define_waterwork(self, array=empty, return_tubes=None):
    """Define the waterworks of all sub-transforms, feeding each into the next.

    The transforms are visited in the order given by self.transform_order.
    Every transform except the last is asked to return the tube that the next
    transform should read from; that tube is cloned so that one copy keeps the
    original tube name and the other becomes the input array of the next
    transform.

    Parameters
    ----------
    array : np.ndarray or empty
      The array to be transformed. Passed to the first transform in
      self.transform_order.
    return_tubes : list of str or None
      Not used as passed in: the value is overwritten for every transform
      before it is read.

    Returns
    -------
    None
      This method has no return statement.

    Raises
    ------
    AssertionError
      If self.input_dtype is None.

    """
    assert self.input_dtype is not None, (
        "Run calc_global_values before running the transform")
    with ns.NameSpace(self.name):
        for trans_num, trans_key in enumerate(self.transform_order):
            trans = self.transforms[trans_key]
            # NOTE(review): the nesting below was reconstructed from a source
            # whose indentation was lost. Confirm which statements belong
            # inside this per-transform namespace.
            with ns.NameSpace(trans.name):
                # Every transform but the last must hand back the tube that
                # the next transform in the chain consumes.
                if trans_num < len(self.transform_order) - 1:
                    tap_key = self.tap_keys[trans_num + 1]
                    return_tubes = [self._pre(tap_key)]
                else:
                    return_tubes = None
                tubes = trans.define_waterwork(array, return_tubes)
                # Nothing was handed back, so there is nothing to chain.
                if tubes is None:
                    continue
                # Clone the returned tube: output 'a' feeds the next
                # transform, while output 'b' takes over the tube's original
                # name.
                old_name = tubes[0].name
                tubes[0].set_name("to_be_cloned")
                tube_dict, _ = td.clone(tubes[0])
                array = tube_dict['a']
                tube_dict['b'].set_name(old_name)
def __enter__(self):
    """Install this waterwork as the global default for a with block.

    Returns
    -------
    Waterwork
      This waterwork itself.

    Raises
    ------
    ValueError
      If gl._default_waterwork is already set.

    """
    already_active = gl._default_waterwork is not None
    if already_active:
        raise ValueError(
            "_default_waterwork is already set. Cannot be reset until context is exitted. Are you within the with statement of another waterwork?"
        )

    # Open a namespace named after this waterwork and keep a handle on it on
    # the instance.
    name_space = ns.NameSpace(self.name)
    self.name_space = name_space
    name_space.__enter__()

    gl._default_waterwork = self
    return self
def define_waterwork(self, array=empty, return_tubes=None):
    """Add the tanks that split documents into sentences and transform them.

    The input strings are tokenized into sentences, the slot holding the
    input strings is named 'input', and the sentences are handed to
    self.string_transform's own define_waterwork inside a namespace named
    after that transform.

    Parameters
    ----------
    array : np.ndarray or empty
      The array to be transformed.
    return_tubes : list of str or None
      Keys of tubes to look up on the waterwork and return.

    Returns
    -------
    list or None
      One entry per key in return_tubes, as given by the waterwork's
      maybe_get_tube. None when return_tubes is None.

    """
    tokenize_kwargs = dict(
        strings=array,
        tokenizer=self.sent_tokenizer,
        detokenizer=self.sent_detokenizer,
    )
    if self.keep_dims:
        # Keep the structure of the array; tokenizing adds a dimension.
        sents, sents_slots = td.tokenize(max_len=self.max_doc_len,
                                         **tokenize_kwargs)
    else:
        # Put all sentences on the same axis.
        sents, sents_slots = td.flat_tokenize(**tokenize_kwargs)
        sents_slots['ids'].set_name('doc_ids')
        sents['ids'].set_name('ids')

    sents_slots['strings'].set_name('input')

    with ns.NameSpace(self.string_transform.name):
        self.string_transform.define_waterwork(array=sents['target'])

    if return_tubes is None:
        return None

    ww = sents['target'].waterwork
    return [ww.maybe_get_tube(tube_key) for tube_key in return_tubes]
def _from_save_dict(self, save_dict):
    """Rebuild this waterwork in place from a saved dictionary.

    Every tank is recreated by calling the tank_defs function recorded for it.
    Slots and tubes are then re-registered under their saved names, and the
    tube-to-downstream-tube and slot-to-tube links are restored. Finally the
    funnels, taps and merged-tube sets are rebuilt.

    Parameters
    ----------
    save_dict : dict
      The saved description of the waterwork. The keys read here are 'name',
      'tanks', 'slots', 'tubes', 'funnels', 'taps' and 'merged'.

    """
    # Imported inside the method rather than at module level, presumably to
    # avoid a circular import - confirm.
    import wtrwrks.tanks.tank_defs as td

    with ns.NameSpace(''):
        self.name = save_dict['name']

        # Recreate each tank through the function that originally defined it.
        for tank_name in save_dict['tanks']:
            tank_dict = save_dict['tanks'][tank_name]
            func = getattr(td, tank_dict['func_name'])
            args = tank_dict.get('args', [])
            kwargs = tank_dict.get('kwargs', {})
            tubes, _ = func(*args, name=tank_name, waterwork=self, **kwargs)

            # Any tube of the new tank leads back to the tank itself. Take the
            # first key through an iterator: indexing keys() directly only
            # works where keys() returns a list (Python 2) and raises
            # TypeError on a Python 3 dict view.
            first_key = next(iter(tubes.keys()))
            self.tanks[tank_name] = tubes[first_key].tank

        # Restore each slot's plug and file it under its saved name.
        for slot_name in save_dict['slots']:
            slot_dict = save_dict['slots'][slot_name]
            tank = self.tanks[slot_dict['tank']]
            slot = tank.get_slot(slot_dict['key'])
            slot.plug = slot_dict['plug']
            del self.slots[slot.name]
            slot.name = slot_name
            self.slots[slot_name] = slot

        # Restore each tube's plug and file it under its saved name.
        for tube_name in save_dict['tubes']:
            tube_dict = save_dict['tubes'][tube_name]
            tank = self.tanks[tube_dict['tank']]
            tube = tank.get_tube(tube_dict['key'])
            tube.plug = tube_dict['plug']
            del self.tubes[tube.name]
            tube.name = tube_name
            self.tubes[tube_name] = tube

        # Downstream links can only be resolved once every tube carries its
        # saved name, hence the second pass over the tubes.
        for tube_name in save_dict['tubes']:
            downstream_tube_name = save_dict['tubes'][tube_name][
                'downstream_tube']
            if downstream_tube_name is not None:
                self.tubes[tube_name].downstream_tube = (
                    self.tubes[downstream_tube_name])

        # Reconnect every slot to the tube recorded for it; a slot with no
        # recorded tube is attached to `empty`.
        for slot_name in self.slots:
            slot_dict = save_dict['slots'][slot_name]
            slot = self.slots[slot_name]
            if slot_dict['tube'] is not None:
                tube = self.tubes[slot_dict['tube']]
                tube.slot = slot
            else:
                tube = empty
            slot.tube = tube

        self.funnels = {}
        for funnel_name in save_dict['funnels']:
            self.funnels[funnel_name] = self.slots[funnel_name]

        self.taps = {}
        for tap_name in save_dict['taps']:
            self.taps[tap_name] = self.tubes[tap_name]

        # Saved 'merged' entries map a tube name to the names of the tubes
        # merged with it; rebuild them as tube -> set of tubes.
        for key in save_dict['merged']:
            tube = self.tubes[key]
            self.merged[tube] = {
                self.tubes[k] for k in save_dict['merged'][key]
            }
def define_waterwork(self, array=empty, return_tubes=None, prefix=''):
    """Add the tanks that split a dataset by column and feed each transform.

    The array is transposed so that the column axis comes first and is
    partitioned into one sub-array per transform. Each sub-array is then
    transposed back, cast to that transform's input dtype and passed to the
    transform's own define_waterwork.

    Parameters
    ----------
    array : np.ndarray or empty
      The array to be transformed.
    return_tubes : list of str or None
      Keys of tubes to look up on the waterwork and return. Only needed when
      waterworks are being stitched together.
    prefix : str
      Additional prefix. It is joined with this transform's name and
      forwarded to every sub-transform. Defaults to no additional prefix.

    Returns
    -------
    list or None
      One entry per key in return_tubes, as given by the waterwork's
      maybe_get_tube. None when return_tubes is None.

    """
    assert self.is_calc_run, (
        "Run calc_global_values before running the transform")

    with ns.NameSpace(self.name):
        # Concatenate the columns of every transform, in transform order.
        all_cols = []
        for trans_name in self.transform_names:
            all_cols.extend(self.transform_cols[trans_name])

        # For every transform, the position of each of its columns within the
        # concatenated column list.
        indices = [
            [all_cols.index(col) for col in self.transform_cols[trans_name]]
            for trans_name in self.transform_names
        ]

        # Partitioning only works along the 0th axis, so swap the axes to put
        # the column axis first.
        perm = [1, 0]
        transposed, transposed_slots = td.transpose(a=array, axes=perm)

        # Cut the full array into the sub-arrays the individual transforms
        # operate on.
        missing_plugs = {
            'missing_cols': np.zeros((0, 1), dtype=self.input_dtype),
            'missing_array': np.zeros((0, 1), dtype=self.input_dtype),
        }
        parts, _ = td.partition_by_index(a=transposed['target'],
                                         indices=indices,
                                         tube_plugs=missing_plugs)
        parts['missing_cols'].set_name('missing_cols')
        parts['missing_array'].set_name('missing_array')
        transposed_slots['a'].set_name('array')

        # Turn the single tube of parts into one tube per transform.
        parts_list, _ = td.iter_list(parts['target'],
                                     num_entries=len(self.transforms))

        for sub_array, trans_name in zip(parts_list, self.transform_names):
            trans = self.transforms[trans_name]

            # Undo the axis swap for this sub-array.
            restored, _ = td.transpose(a=sub_array, axes=perm,
                                       name=trans_name + '-Trans')

            # Cast the sub-array to the dtype the transform expects.
            cast_plugs = {
                'input_dtype': self.input_dtype,
                'diff': np.array([], dtype=self.input_dtype),
            }
            cast, _ = td.cast(restored['target'], trans.input_dtype,
                              tube_plugs=cast_plugs,
                              name=trans_name + '-Cast')

            with ns.NameSpace(trans_name):
                trans.define_waterwork(
                    array=cast['target'],
                    prefix=os.path.join(prefix, self.name))

    if return_tubes is None:
        return None

    ww = parts['missing_array'].waterwork
    return [ww.maybe_get_tube(tube_key) for tube_key in return_tubes]
def define_waterwork(self, array=empty, return_tubes=None):
    """Add the tanks that split a dataset by column and feed each transform.

    The array is transposed so that the column axis comes first and is
    partitioned into one sub-array per transform (transforms are visited in
    sorted key order). Each sub-array is then transposed back, cast to a dtype
    chosen by the kind of transform and passed to the transform's own
    define_waterwork.

    Parameters
    ----------
    array : np.ndarray or empty
      The array to be transformed.
    return_tubes : list of str or None
      Keys of tubes to look up on the waterwork and return.

    Returns
    -------
    list or None
      One entry per key in return_tubes, as given by the waterwork's
      maybe_get_tube. None when return_tubes is None.

    Raises
    ------
    AssertionError
      If self.input_dtype is None.

    """
    assert self.input_dtype is not None, (
        "Run calc_global_values before running the transform")

    with ns.NameSpace(self.name):
        transform_names = sorted(self.transforms)
        indices = [self.transform_col_ranges[k] for k in transform_names]

        # Partitioning only works along the 0th axis, so swap the first two
        # axes and leave any further axes where they are. The permutation has
        # to list axis indices, so the trailing entries are 2 .. ndim - 1, not
        # the sizes of those dimensions.
        # Assumes self.input_shape is the shape tuple of `array` - TODO confirm.
        perm = [1, 0] + list(range(2, len(self.input_shape)))
        transp, transp_slots = td.transpose(a=array, axes=perm)

        # Cut the full array into the sub-arrays the individual transforms
        # operate on.
        parts, _ = td.partition(a=transp['target'], indices=indices)
        parts['missing_cols'].set_name('missing_cols')
        parts['missing_array'].set_name('missing_array')
        transp_slots['a'].set_name('input')

        # Turn the single tube of parts into one tube per transform.
        parts_list, _ = td.iter_list(parts['target'],
                                     num_entries=len(self.transforms))

        for part, name in zip(parts_list, transform_names):
            trans = self.transforms[name]

            # Swapping the first two axes again restores the original
            # orientation.
            trans_back, _ = td.transpose(a=part, axes=perm)
            part = trans_back['target']

            # Cast the sub-array to the dtype matching the kind of transform.
            # Transforms of any other kind receive the sub-array uncast.
            # NOTE(review): np.unicode is a deprecated alias that newer NumPy
            # releases no longer provide - confirm the supported NumPy version.
            cast_name = '-'.join([name, 'cast'])
            if isinstance(trans, nt.NumTransform):
                cast, _ = td.cast(part, np.float64, name=cast_name)
                part = cast['target']
            elif isinstance(trans, dt.DateTimeTransform):
                cast, _ = td.cast(part, np.datetime64, name=cast_name)
                part = cast['target']
            elif isinstance(trans, (st.StringTransform, ct.CatTransform)):
                cast, _ = td.cast(part, np.unicode, name=cast_name)
                part = cast['target']

            with ns.NameSpace(name):
                trans.define_waterwork(array=part)

    if return_tubes is not None:
        ww = parts['missing_array'].waterwork
        return [ww.maybe_get_tube(r_tube_key) for r_tube_key in return_tubes]