def set_filepaths(self, filepaths):
    """Record where this object's pickled payload is stored.

    :param filepaths: list of dictionaries, each specifying where a pkl
        file is saved. Currently supports:
            [
                {'home': home, 'project': project_name},
                {'bucket': aws_bucket, 'project': project_name}
            ]
    :return: None
    :raises AttributeError: if the object has no ``obj_id`` yet (i.e. it is
        not saved in the db), or if the file paths were already set —
        resetting is deliberately disallowed.
    :raises TypeError: if ``filepaths`` is not a list, or any entry is not
        a dict.
    """
    if self.obj_id is None:
        # Storage records are keyed off the db document; refuse to save a
        # file location for an object that has no document yet.
        raise AttributeError(
            "The object doesn't have an obj_id, which means it's not saved "
            "in db yet, so it should not be saved in storage either."
        )
    if self.filepaths:
        raise AttributeError(
            "The set_filepaths method in Base does not allow resetting the file paths."
        )
    if filepaths and not isinstance(filepaths, list):
        raise TypeError(
            "Currently the collection of the file paths has to be of the list type"
        )
    if filepaths:
        for path in filepaths:
            if not isinstance(path, dict):
                raise TypeError(
                    "Currently the file paths have to be of the dictionary type"
                )
    # Persist the new locations on the db document before caching locally.
    dh = DbHandler()
    dh.update_doc(self, {"filepaths": filepaths})
    self.__filepaths = filepaths
def save_db(self, db):
    """Register this object with *db* and remember the new document id.

    :param db: dict describing the target database.
    :return: None
    """
    self.set_db(db)
    self.obj_id = DbHandler().init_doc(self)
def l_prepare_locate(self, l_node):
    """Make sure the fed label exists, then return candidate transform ids.

    Locates ``l_node.lab_fed`` first if it has no ``obj_id``, then searches
    the db for documents matching ``l_node.l_transform`` and returns the
    ``_id`` of each hit not already present in ``self.matched``.
    """
    if not l_node.lab_fed.obj_id:
        self.l_locate(l_node.lab_fed)
    candidates = DbHandler().search_by_essentials(
        l_node.l_transform, l_node.pipe_init.db)
    return [doc["_id"] for doc in candidates
            if doc["_id"] not in self.matched]
def identify_label(l_node, lst_l_transform):
    """Collect db documents for labels matching any of the given transforms.

    Builds a probe ``Label`` per transform id and gathers every matching
    document; documents that already have non-empty ``filepaths`` are
    ordered first (stable sort), so callers can prefer materialized ones.
    """
    frame = l_node.pipe_init.frame
    raw_y = l_node.lab_fed.obj_id
    dh = DbHandler()
    matches = []
    for candidate in lst_l_transform:
        probe = Label(frame=frame, l_transform=candidate,
                      raw_y=raw_y, values=None)
        matches += dh.search_by_essentials(probe, l_node.pipe_init.db)
    matches.sort(key=lambda d: not bool(d["filepaths"]))
    return matches
def f_prepare_locate(self, f_node):
    """Locate every prerequisite of *f_node*, then return candidate ids.

    Ensures each fed node and the attached label node are located, then
    searches the db for documents matching ``f_node.f_transform`` and
    returns the ``_id`` of each hit not already in ``self.matched``.
    """
    for fed in f_node.lst_fed:
        if fed.obj_id is None:
            self.f_locate(fed)
    connector = LConnector(self.l_matched)
    connector.l_locate(f_node.l_node)
    candidates = DbHandler().search_by_essentials(
        f_node.f_transform, f_node.pipe_init.db)
    return [doc["_id"] for doc in candidates
            if doc["_id"] not in self.matched]
def identify_feature(f_node, lst_f_transform):
    """Collect db documents for features matching any of the given transforms.

    Builds a probe ``Feature`` per transform id and gathers every matching
    document; documents with non-empty ``filepaths`` sort first (stable),
    so callers can prefer already-materialized ones.
    """
    frame = f_node.pipe_init.frame
    fed_ids = [fed.obj_id for fed in f_node.lst_fed]
    dh = DbHandler()
    matches = []
    for candidate in lst_f_transform:
        probe = Feature(frame=frame, f_transform=candidate,
                        lst_fed=fed_ids, label=f_node.l_node.obj_id,
                        values=None)
        matches += dh.search_by_essentials(probe, f_node.pipe_init.db)
    matches.sort(key=lambda d: not bool(d["filepaths"]))
    return matches
def search_for_scheme(self, db):
    """Load a previously saved scheme matching this object's essentials.

    Searches *db* for matching documents; when at least one exists, loads
    the object behind the first hit from its recorded file paths.

    :return: the loaded scheme object, or None when nothing matches.
    """
    docs = DbHandler().search_by_essentials(self, db)
    if not docs:
        return None
    first = docs[0]
    element = self.decide_element()
    return IOHandler().load_obj_from_file(
        first["_id"], element, first["filepaths"])
def l_locate(self, l_node, save_obtained=True):
    """Resolve *l_node* to a concrete label, reusing db/file artifacts when possible.

    Walks three tiers of reuse: (1) node already has an obj_id -> consult its
    db doc; (2) a matching doc exists in the db -> reuse or materialize it;
    (3) nothing matches -> compute from scratch via set_off_and_record.

    :param l_node: LNode to resolve (mutated in place: obj_id / filepaths).
    :param save_obtained: when True, freshly computed/materialized artifacts
        are also written to the pipeline's file storage.
    :return: (label, l_transform) when something was actually computed or
        materialized in this call, else (None, None).
    :raises TypeError: if l_node is not an LNode.
    """
    if not isinstance(l_node, LNode):
        raise TypeError(
            "The parameter l_node should be of the type LNode.")
    label_obtained, l_trans_obtained = None, None
    db = l_node.pipe_init.db
    filepaths = l_node.pipe_init.filepaths
    if l_node.obj_id is None:
        # No identity yet: search the db for an equivalent label.
        lst_l_transform = self.l_prepare_locate(l_node)
        all_docs = self.identify_label(l_node, lst_l_transform)
        if all_docs:
            # identify_label sorts docs with filepaths first, so all_docs[0]
            # is the "best" candidate.
            doc = all_docs[0]
            if doc["filepaths"]:
                # Fully materialized already — just adopt its identity.
                # update l_node
                l_node.obj_id = doc["_id"]
                l_node.filepaths = doc["filepaths"]
            elif save_obtained:
                # Doc exists but files don't: rebuild and persist the files.
                label, l_transform = self.materialize_with_existing_doc(
                    doc=doc, l_node=l_node)
                # save obtained
                l_transform.save_file(filepaths)
                label.save_file(filepaths)
                # update l_node
                l_node.obj_id = doc["_id"]
                l_node.filepaths = filepaths
                # for return
                label_obtained, l_trans_obtained = label, l_transform
            else:
                # Adopt the doc's identity without materializing files.
                # update l_node
                l_node.obj_id = doc["_id"]
        elif save_obtained:
            # Nothing in the db: compute from scratch and persist files.
            label, l_transform = self.set_off_and_record(l_node, db)
            # save obtained
            l_transform.save_file(filepaths)
            label.save_file(filepaths)
            # update l_node
            l_node.obj_id = label.obj_id
            l_node.filepaths = label.filepaths
            # for return
            label_obtained, l_trans_obtained = label, l_transform
        else:
            # Compute from scratch but skip file persistence.
            label, l_transform = self.set_off_and_record(l_node, db)
            # update l_node
            l_node.obj_id = label.obj_id
            # for return
            label_obtained, l_trans_obtained = label, l_transform
    else:
        # Node already identified: fetch its doc and fill in what's missing.
        dh = DbHandler()
        doc = dh.search_by_obj_id(obj_id=l_node.obj_id,
                                  element="Label",
                                  db=db)
        if doc["filepaths"]:
            # Use the doc's own filepaths, not pipe_init's, to prevent
            # potential errors from the two disagreeing.
            doc_filepaths = doc["filepaths"]
            # update l_node
            if l_node.filepaths is None:
                l_node.filepaths = doc_filepaths
        elif save_obtained:
            # Doc has no files: rebuild and persist them.
            label, l_transform = self.materialize_with_existing_doc(
                doc=doc, l_node=l_node)
            # save obtained
            label.save_file(filepaths)
            l_transform.save_file(filepaths)
            # update l_node
            l_node.filepaths = filepaths
            # for return
            label_obtained, l_trans_obtained = label, l_transform
        else:
            # Rebuild in memory only; node state is left untouched.
            label, l_transform = self.materialize_with_existing_doc(
                doc=doc, l_node=l_node)
            # for return
            label_obtained, l_trans_obtained = label, l_transform
    return label_obtained, l_trans_obtained
def f_locate(self, f_node, save_obtained=True):
    """Resolve *f_node* to a concrete feature, reusing db/file artifacts when possible.

    Mirrors l_locate: (1) node already has an obj_id -> consult its db doc;
    (2) a matching doc exists -> reuse or materialize it; (3) nothing
    matches -> compute from scratch via set_off_and_record.

    :param f_node: FNode to resolve (mutated in place: obj_id / filepaths).
    :param save_obtained: when True, freshly computed/materialized artifacts
        are also written to the pipeline's file storage.
    :return: (feature, f_transform) when something was actually computed or
        materialized in this call, else (None, None).
    :raises TypeError: if f_node is not an FNode, or its l_node is not an LNode.
    """
    if not isinstance(f_node, FNode):
        raise TypeError("The parameter f_node should of the type FNode.")
    feature_obtained, f_trans_obtained = None, None
    db = f_node.pipe_init.db
    filepaths = f_node.pipe_init.filepaths
    if f_node.obj_id is None:
        if not isinstance(f_node.l_node, LNode):
            raise TypeError(
                "The attribute f_node.l_node should be of the type LNode")
        # No identity yet: search the db for an equivalent feature.
        lst_f_transform = self.f_prepare_locate(f_node)
        all_docs = self.identify_feature(f_node, lst_f_transform)
        if all_docs:
            # identify_feature sorts docs with filepaths first, so
            # all_docs[0] is the "best" candidate.
            doc = all_docs[0]
            if doc["filepaths"]:
                # Fully materialized already — just adopt its identity.
                # update f_node
                f_node.obj_id = doc["_id"]
                f_node.filepaths = doc["filepaths"]
            elif save_obtained:
                # Doc exists but files don't: rebuild and persist the files.
                feature, f_transform = self.materialize_with_existing_doc(
                    f_node=f_node, doc=doc)
                # save obtained
                f_transform.save_file(filepaths)
                feature.save_file(filepaths)
                # update f_node
                f_node.obj_id = doc["_id"]
                f_node.filepaths = filepaths
                # for return
                feature_obtained, f_trans_obtained = feature, f_transform
            else:
                # Adopt the doc's identity without materializing files.
                # update f_node
                f_node.obj_id = doc["_id"]
        elif save_obtained:
            # Nothing in the db: compute from scratch and persist files.
            feature, f_transform = self.set_off_and_record(f_node, db)
            # save obtained
            f_transform.save_file(filepaths)
            feature.save_file(filepaths)
            # update f_node
            f_node.obj_id = feature.obj_id
            f_node.filepaths = feature.filepaths
            # for return
            feature_obtained, f_trans_obtained = feature, f_transform
        else:
            # Compute from scratch but skip file persistence.
            feature, f_transform = self.set_off_and_record(f_node, db)
            # update f_node
            f_node.obj_id = feature.obj_id
            # for return
            feature_obtained, f_trans_obtained = feature, f_transform
    else:
        # Node already identified: fetch its doc and fill in what's missing.
        dh = DbHandler()
        doc = dh.search_by_obj_id(obj_id=f_node.obj_id,
                                  element="Feature",
                                  db=db)
        if doc["filepaths"]:
            # Prevent potential errors resulting from different filepaths
            # in doc vs pipe_init.
            doc_filepaths = doc["filepaths"]
            # NOTE(review): doc_filepaths is never used below — the sibling
            # l_locate assigns doc_filepaths here, not pipe_init's
            # filepaths. Looks like a copy/paste slip; confirm intent.
            # update f_node
            if f_node.filepaths is None:
                f_node.filepaths = filepaths
            # TODO: we should probably remove this part since nothing is "obtained" here
            # ih = IOHandler()
            # feature = ih.load_obj_from_file(f_node.obj_id, "Feature", doc_filepaths)
            # f_transform = ih.load_obj_from_file(doc["essentials"]["f_transform"], "FTransform", doc_filepaths)
            #
            # # for return
            # feature_obtained, f_trans_obtained = feature, f_transform
        elif save_obtained:
            # Doc has no files: rebuild and persist them.
            feature, f_transform = self.materialize_with_existing_doc(
                f_node=f_node, doc=doc)
            # save obtained
            f_transform.save_file(filepaths)
            feature.save_file(filepaths)
            # update f_node
            f_node.filepaths = filepaths
            # for return
            feature_obtained, f_trans_obtained = feature, f_transform
        else:
            # Rebuild in memory only; node state is left untouched.
            feature, f_transform = self.materialize_with_existing_doc(
                f_node=f_node, doc=doc)
            # for return
            feature_obtained, f_trans_obtained = feature, f_transform
    return feature_obtained, f_trans_obtained
def __init__(self,
             data,
             col_y,
             lst_layers,
             shuffle=False,
             stratified=False,
             col_selected=None,
             tag=None,
             db=None,
             filepaths=None):
    """Initialize a pipe: build and persist the frame, label, and seed features.

    :param data: pandas.DataFrame. This needs to be a pandas data frame
        with a label column
    :param col_y: The name of the label column
    :param lst_layers: list. This gives the "lst_layers" to the Frame
    :param shuffle: boolean. Shuffle the rows before building the frame.
    :param stratified: boolean. Should not be used for a regression problem
    :param col_selected: dict. Ex: {'num': ['colname1', 'colname2'],
        'cate': ['colname3'], ...}. When falsy, all columns become one
        'raw' feature group.
    :param tag: optional tag recorded in the db for this pipe.
    :param db: passed through to save_db_file for every created object.
    :param filepaths: passed through to save_db_file for every created object.
    :raises TypeError: if data is not a DataFrame or a col_selected value
        is not a list.
    :raises KeyError: if col_y is not a column of data.
    :raises NotImplementedError: if col_selected is a list.
    :raises ValueError: for any other col_selected type.
    """
    if not isinstance(data, pd.DataFrame):
        raise TypeError(
            "The data for initializing a pipe should be of the type pandas.DataFrame"
        )
    if col_y not in data:
        raise KeyError(
            "The column name of the target: col_y provided is not in the data"
        )
    if col_selected:
        for key in col_selected:
            if not isinstance(col_selected[key], list):
                raise TypeError(
                    "All the values in the dictionary col_selected have to be lists."
                )
    super(PipeInit, self).__init__()
    self.__essentials = {}
    # Initializing the rows
    if shuffle:
        # Sample all indices without replacement == a random permutation.
        idx = np.random.choice(data.index, len(data.index), replace=False)
        data = self.shuffle_pddf_idx(data, idx)
    if stratified:
        # Stratified split may also rearrange data, hence the reassignment.
        data, frame = self.get_stratified_starter_and_frame(
            lst_layers, data, col_y)
    else:
        frame = self.get_regular_frame(lst_layers, data)
    frame.save_db_file(db=db, filepaths=filepaths)
    self.__frame = frame.obj_id
    # Initializing labels
    values = data[[col_y]].values
    label = Label(frame.obj_id, None, None, values)
    label.save_db_file(db=db, filepaths=filepaths)
    self.__label = label.obj_id
    # Initializing features (columns)
    self._column_groups = {
    }  # to collect dict like {'num': ['colname1', 'colname2'], 'cate':['colname3'], ...}
    self._init_features = {
    }  # {'num': obj_id(data['colname1', 'colname2']),
    # 'cate': obj_id(data['colname3']), ...}
    self._y_name = col_y
    if isinstance(col_selected, dict):
        # One Feature per column group, all anchored to the frame.
        for key in col_selected:
            cols = col_selected[key]
            self._column_groups[key] = cols
            values = data[cols].values
            feature = Feature(frame.obj_id, None, None, None, values=values)
            feature.stage = 0
            feature.save_db_file(db=db, filepaths=filepaths)
            self._init_features[key] = feature.obj_id
    elif not col_selected:
        # NOTE(review): an empty list/dict lands here (falsy) rather than in
        # the isinstance(list) branch below — all columns become 'raw'.
        cols = data.columns
        values = data[cols].values
        feature = Feature(frame.obj_id, None, None, None, values=values)
        feature.save_db_file(db=db, filepaths=filepaths)
        self._init_features['raw'] = feature.obj_id
    elif isinstance(col_selected, list):
        raise NotImplementedError(
            "Currently only support dictionary to initialize features")
    else:
        raise ValueError(
            "Don't know what to do with the way you specified columns")
    # Only persist the pipe itself when constructed directly (subclasses
    # are expected to finish their own initialization first).
    if type(self) == PipeInit:
        self.save_db_file(db=db, filepaths=filepaths)
        DbHandler.insert_tag(self, {"tag": tag})
        print(self.obj_id)