def f_materialize(self, f_node, old_record=None):
    """
    Assuming all the components in this f_node are ready, generate the Feature and the
    FTransform based on them. The function doesn't search the db; any searched result
    can be passed in through the old_record parameter.

    :param f_node: FNode
    :param old_record: a document that matches the node
    :return: (feature, f_transform)
    """
    if not isinstance(f_node, FNode):
        raise TypeError("The parameter f_node should be of the type FNode.")

    if old_record:
        if old_record["filepaths"]:
            filepaths = old_record["filepaths"]
            feature_id = old_record["_id"]
            f_transform_id = old_record["essentials"]["f_transform"]

            ih = IOHandler()
            feature = ih.load_obj_from_file(feature_id, "Feature", filepaths)
            f_transform = ih.load_obj_from_file(f_transform_id, "FTransform", filepaths)
        else:
            feature, f_transform = self.recover_with_existing_doc(f_node, old_record)
    else:
        feature, f_transform = self.create_and_record(f_node)

    return feature, f_transform
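# Illustrative usage sketch, not part of the original source: the db lookup is done by the
# caller and handed to f_materialize through old_record, so f_materialize itself never has
# to search the db. Here `fc` stands for whatever conductor object owns f_materialize and
# collect_doc (f_knit below calls them as self.fc.collect_doc / self.fc.f_materialize).
def _example_materialize(fc, f_node):
    doc = fc.collect_doc(f_node)                      # caller performs the search
    return fc.f_materialize(f_node, old_record=doc)   # reuses the doc if present, else creates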
def f_knit(self, f_node):
    if f_node.lst_fed:
        for f in f_node.lst_fed:
            self.f_subknit(f)
    if f_node.l_node:
        self.l_subknit(f_node.l_node)

    if f_node.filepaths:
        ih = IOHandler()
        feature = ih.load_obj_from_file(f_node.obj_id, "Feature", f_node.filepaths)
        f_transform_id = feature.essentials["f_transform"]
        f_transform = ih.load_obj_from_file(f_transform_id, "FTransform", f_node.filepaths)
    else:
        # TODO: The current f_materialize doesn't work with a non-empty f_transform.
        #       Fix this and remove the next line.
        self.fnode_has_empty_ftransform(f_node)

        doc = self.fc.collect_doc(f_node)
        feature, f_transform = self.fc.f_materialize(f_node, doc)

        if doc and "filepaths" in doc:
            # The object is already saved in the filepaths, so update the node.
            # Whether to save a newly created one should be decided by a higher-level function.
            f_node.filepaths = doc["filepaths"]
            obj_id = doc["_id"]
        else:
            obj_id = feature.obj_id

        if f_node.obj_id is None:
            f_node.obj_id = obj_id

    return feature, f_transform
def l_materialize(self, l_node, old_record=None):
    """
    Assuming all the components in this l_node are ready, generate the Label and the
    LTransform based on them. The function doesn't search the db; any searched result
    can be passed in through the old_record parameter.

    :param l_node: LNode
    :param old_record: a document that matches the node
    :return: (label, l_transform)
    """
    if not isinstance(l_node, LNode):
        raise TypeError("The parameter l_node should be of the type LNode.")

    if old_record:
        if old_record["filepaths"]:
            filepaths = old_record["filepaths"]
            label_id = old_record["_id"]
            l_transform_id = old_record["essentials"]["l_transform"]

            ih = IOHandler()
            label = ih.load_obj_from_file(label_id, "Label", filepaths)
            l_transform = ih.load_obj_from_file(l_transform_id, "LTransform", filepaths)
        else:
            label, l_transform = self.recover_with_existing_doc(l_node, old_record)
    else:
        label, l_transform = self.create_and_record(l_node)

    return label, l_transform
def l_knit(self, l_node):
    # TODO: need to prevent knitting a fitted l_transform somehow
    if l_node.lab_fed:
        self.l_subknit(l_node.lab_fed)

    if l_node.filepaths:
        ih = IOHandler()
        label = ih.load_obj_from_file(l_node.obj_id, "Label", l_node.filepaths)
        l_transform_id = label.l_transform
        l_transform = ih.load_obj_from_file(l_transform_id, "LTransform", l_node.filepaths)
    else:
        doc = self.lc.collect_doc(l_node)
        label, l_transform = self.lc.l_materialize(l_node, doc)

        if doc and "filepaths" in doc:
            # The object is already saved in the filepaths, so update the node.
            # Whether to save a newly created one should be decided by a higher-level function.
            l_node.filepaths = doc["filepaths"]
            obj_id = doc["_id"]
        else:
            obj_id = label.obj_id

        if l_node.obj_id is None:
            l_node.obj_id = obj_id

    return label, l_transform
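# Illustrative sketch only: a hypothetical higher-level driver around f_knit. The knit
# functions above deliberately leave the "save a newly created object?" decision to the
# caller; `builder` (the object exposing f_knit) and `store_paths` (the filepaths to write
# to) are assumed names, and save_file is assumed to be available on the returned objects
# as shown further below.
def _knit_and_save(builder, f_node, store_paths):
    feature, f_transform = builder.f_knit(f_node)
    if f_node.filepaths is None:          # nothing persisted yet, so save the fresh objects
        feature.save_file(store_paths)
        f_transform.save_file(store_paths)
        f_node.filepaths = store_paths
    return feature, f_transform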
def f_collect_components(f_node):
    ih = IOHandler()
    frame = ih.load_obj_from_file(obj_id=f_node.pipe_init.frame, element="Frame",
                                  filepaths=f_node.pipe_init.filepaths)
    label = ih.load_obj_from_file(obj_id=f_node.l_node.obj_id, element="Label",
                                  filepaths=f_node.pipe_init.filepaths)
    l_values = label.values

    lst_fed = []
    for f in f_node.lst_fed:
        fed = ih.load_obj_from_file(obj_id=f.obj_id, element="Feature",
                                    filepaths=f_node.pipe_init.filepaths)
        lst_fed.append(fed)

    if len(lst_fed) == 1:
        fed_values = lst_fed[0].values
    else:
        fed_values = np.concatenate([fed.values for fed in lst_fed], axis=1)

    prevstage = max(fed.stage for fed in lst_fed)
    return frame, l_values, fed_values, prevstage
def lst_fed(self):
    filepaths = self.__pipe_init.filepaths
    lst_fed = self.__essentials["lst_fed"]
    ih = IOHandler()
    return [ih.load_obj_from_file(fid, "Feature", filepaths) for fid in lst_fed]
def save_file(self, filepaths):
    """
    :param filepaths: list of dict
    :return:
    """
    self.set_filepaths(filepaths)
    ih = IOHandler()
    ih.save_obj2file(self)
def __init__(self, data=None, col_y=None, lst_layers=None, shuffle=False, stratified=False,
             col_selected=None, tag=None, db=None, filepaths=None, pipe_id=None):
    """
    The difference between PipeInit and ml_forest.core.constructions.core_init.CoreInit is that
    - PipeInit has initiating Nodes
    - CoreInit has initiating Features'/Labels' obj_id

    :param data: pandas.DataFrame. This needs to be a pandas data frame with a label column
    :param col_y: The name of the label column
    :param lst_layers: list. This gives the "lst_layers" to the Frame
    :param shuffle: boolean.
    :param stratified: boolean. Should not be used for a regression problem
    :param col_selected: dict. Ex: {'num': ['colname1', 'colname2'], 'cate': ['colname3'], ...}
    :param db:
    :param filepaths:
    :param pipe_id:
    """
    project = db["project"]
    if project not in root_database:
        root_database[project] = {}
        lst = [
            'Feature', 'FTransform', 'Label', 'LTransform',
            'CoreInit', 'Frame', 'PipeTestData', 'TestFeature'
        ]
        for ele in lst:
            root_database[project][ele] = []

    if pipe_id and isinstance(pipe_id, ObjectId) and filepaths:
        ih = IOHandler()
        self.core = ih.load_obj_from_file(obj_id=pipe_id, element="CoreInit", filepaths=filepaths)
    elif pipe_id and not isinstance(pipe_id, ObjectId):
        raise TypeError("The pipe_id you passed is not an ObjectId.")
    else:
        self.core = CoreInit(data, col_y, lst_layers, shuffle, stratified,
                             col_selected, tag, db, filepaths)

    init_fnodes = self.init_features
    for key in init_fnodes:
        init_fnodes[key] = FNode(self.core, obj_id=init_fnodes[key])
    self._init_fnodes = init_fnodes

    init_lnode = LNode(self.core, obj_id=self.label)
    self._init_lnode = init_lnode
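# Illustrative sketch only: building a PipeInit from scratch versus recovering one from
# storage. The column names, the project name, and the assumption that the loaded CoreInit
# exposes an ObjectId as core.obj_id are all made up for the example; lst_layers and
# filepaths are passed through untouched because their exact structure is defined elsewhere
# in the codebase. The db dict must carry a "project" key, as the constructor above requires.
def _example_build_pipe(df, lst_layers, filepaths):
    new_pipe = PipeInit(
        data=df, col_y="y", lst_layers=lst_layers, shuffle=True, stratified=True,
        col_selected={"num": ["x1", "x2"]}, tag="demo",
        db={"project": "demo_project"}, filepaths=filepaths,
    )
    # Recovering an existing pipeline only needs the saved CoreInit's id plus the filepaths.
    same_pipe = PipeInit(db={"project": "demo_project"}, filepaths=filepaths,
                         pipe_id=new_pipe.core.obj_id)
    return new_pipe, same_pipe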
def l_collect_components(l_node):
    ih = IOHandler()
    frame = ih.load_obj_from_file(obj_id=l_node.pipe_init.frame, element="Frame",
                                  filepaths=l_node.pipe_init.filepaths)
    lab_fed = ih.load_obj_from_file(obj_id=l_node.lab_fed.obj_id, element="Label",
                                    filepaths=l_node.pipe_init.filepaths)
    lab_fed = lab_fed.values
    return frame, lab_fed
def fetch(self):
    if self.obj_id is None or self.filepaths is None:
        msg = "The node doesn't have obj_id or filepaths yet. The function is designed to " \
              "fetch an obj whose location is specified in a node."
        raise ValueError(msg)

    obj_id = self.obj_id
    element = self.decide_element()
    filepaths = self.core.filepaths

    ih = IOHandler()
    obj_fetched = ih.load_obj_from_file(obj_id, element, filepaths)
    return obj_fetched
def create_grid(self, grid_dict):
    frame_id = self.__essentials["frame"]
    filepaths = self.__pipe_init.filepaths

    ih = IOHandler()
    frame = ih.load_obj_from_file(frame_id, "Frame", filepaths)

    idx = pd.MultiIndex.from_product(grid_dict.values(), names=grid_dict.keys())
    folds = frame.create_structure(self.__layer)
    evals = [e.__name__ for e in self.__evaluators]
    cols = pd.MultiIndex.from_product([evals, folds])

    r_grid = pd.DataFrame(index=idx, columns=["feature_id", "f_transform_id"])
    p_grid = pd.DataFrame(index=idx, columns=cols)
    return r_grid, p_grid
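# Illustrative call, with made-up hyper-parameter names: the keys of grid_dict become the
# level names of the MultiIndex and the values its grid points, so both returned frames are
# indexed by every parameter combination. r_grid records feature_id / f_transform_id per
# combination, while p_grid gets one column per (evaluator, fold) pair. `scheme` stands for
# the object that owns create_grid.
def _example_grid(scheme):
    return scheme.create_grid({"max_depth": [3, 5, 7], "learning_rate": [0.05, 0.1]})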
def search_for_scheme(self, db):
    """
    Search the db by this scheme's essentials and, if a matching record exists, load the
    stored scheme from its filepaths.

    :return: the loaded scheme, or None if no matching record is found
    """
    dh = DbHandler()
    docs = dh.search_by_essentials(self, db)

    if docs:
        doc = docs[0]
        obj_id = doc["_id"]
        filepaths = doc["filepaths"]
        element = self.decide_element()

        ih = IOHandler()
        scheme_loaded = ih.load_obj_from_file(obj_id, element, filepaths)
        return scheme_loaded
    else:
        return None
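# Illustrative sketch only: the intended caller pattern around search_for_scheme, reusing a
# stored scheme when one with matching essentials already exists and otherwise keeping the
# freshly built one (`scheme` and `db` are assumed to come from the surrounding pipeline code).
def _load_or_create(scheme, db):
    existing = scheme.search_for_scheme(db)
    return existing if existing is not None else scheme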
def __go(l_node):
    frame_id = l_node.core.frame
    lab_fed_id = l_node.lab_fed.obj_id
    l_transform = l_node.l_transform
    filepaths = l_node.core.filepaths

    ih = IOHandler()
    frame = ih.load_obj_from_file(frame_id, "Frame", filepaths)
    lab_fed = ih.load_obj_from_file(lab_fed_id, "Label", filepaths)

    # TODO: might need to refactor transform-with-ref better
    if has_ref(l_transform):
        l_values = l_transform.transform_with_ref(l_node)
    else:
        lflow = LFlow()
        l_values, l_transform = lflow.label_encoding_transform(frame, lab_fed, l_transform)

    return l_values, l_transform
def __go(f_node, frame_id, filepaths, label_id):
    lst_fed_id = [f.obj_id for f in f_node.lst_fed]

    ih = IOHandler()
    frame = ih.load_obj_from_file(frame_id, "Frame", filepaths)
    lst_fed = [ih.load_obj_from_file(f_id, "Feature", filepaths) for f_id in lst_fed_id]
    if label_id:
        label = ih.load_obj_from_file(label_id, "Label", filepaths)
    else:
        label = None

    f_transform = f_node.f_transform

    # TODO: might need to refactor transform-with-ref better
    if has_ref(f_transform):
        f_values, stage = f_transform.transform_with_ref(f_node)
    else:
        ff = FFlow()
        if f_transform.rise == 1:
            f_values, f_transform, stage = ff.supervised_fit_transform(frame, lst_fed, f_transform, label)
        else:
            f_values, f_transform, stage = ff.unsupervised_fit_transform(lst_fed, f_transform)

    return f_values, f_transform, stage
def return_constant_params(self, key):
    """
    At this point, self should have been loaded from an old record/storage.
    For the keys that are not in grid_dict, find the value from the loaded
    FTransforms' essentials[key].

    :param key:
    :return:
    """
    filepaths = self.pipe_init.filepaths

    param_lst = []
    ih = IOHandler()
    for ft_id in self.result_grid["f_transform_id"]:
        f_transform = ih.load_obj_from_file(ft_id, "FTransform", filepaths)
        param_lst.append(f_transform.essentials[key])

    if len(set(param_lst)) > 1:
        raise ValueError("Something is seriously wrong with the design of the Scheme family.")
    else:
        return param_lst[0]
def frame(self):
    filepaths = self.__pipe_init.filepaths
    _id = self.__essentials["frame"]
    ih = IOHandler()
    return ih.load_obj_from_file(_id, "Frame", filepaths)
def label(self):
    filepaths = self.__pipe_init.filepaths
    lid = self.__essentials["label"]
    ih = IOHandler()
    return ih.load_obj_from_file(lid, "Label", filepaths)
def update_scheme(self):
    ih = IOHandler()
    ih.save_obj2file(self)