def set_filepaths(self, filepaths):
    """Record where this object's pickled payload is stored.

    :param filepaths: list of dictionaries, each specifying where a pkl
        file is saved. Currently supports:
            [
                {'home': home, 'project': project_name},
                {'bucket': aws_bucket, 'project': project_name}
            ]
    :return: None
    :raises AttributeError: if the object has no ``obj_id`` yet (i.e. it is
        not saved in the db), or if the file paths were already set —
        resetting is deliberately disallowed.
    :raises TypeError: if ``filepaths`` is not a list, or any entry is not
        a dict.
    """
    if self.obj_id is None:
        # Storage records are keyed off the db document; refuse to save a
        # file location for an object that has no document yet.
        raise AttributeError(
            "The object doesn't have an obj_id, which means it's not saved "
            "in db yet, so it should not be saved in storage either."
        )
    if self.filepaths:
        raise AttributeError(
            "The set_filepaths method in Base does not allow resetting the file paths."
        )
    if filepaths and not isinstance(filepaths, list):
        raise TypeError(
            "Currently the collection of the file paths has to be of the list type"
        )
    if filepaths:
        for path in filepaths:
            if not isinstance(path, dict):
                raise TypeError(
                    "Currently the file paths have to be of the dictionary type"
                )
    # Persist the new locations on the db document before caching locally.
    dh = DbHandler()
    dh.update_doc(self, {"filepaths": filepaths})
    self.__filepaths = filepaths
def save_db(self, db):
    """Register this object with *db* and remember the new document id.

    :param db: dict describing the target database.
    :return: None
    """
    self.set_db(db)
    self.obj_id = DbHandler().init_doc(self)
def l_prepare_locate(self, l_node):
    """Make sure the fed label exists, then return candidate transform ids.

    Locates ``l_node.lab_fed`` first if it has no ``obj_id``, then searches
    the db for documents matching ``l_node.l_transform`` and returns the
    ``_id`` of each hit not already present in ``self.matched``.
    """
    if not l_node.lab_fed.obj_id:
        self.l_locate(l_node.lab_fed)
    candidates = DbHandler().search_by_essentials(
        l_node.l_transform, l_node.pipe_init.db)
    return [doc["_id"] for doc in candidates
            if doc["_id"] not in self.matched]
def identify_label(l_node, lst_l_transform):
    """Collect db documents for labels matching any of the given transforms.

    Builds a probe ``Label`` per transform id and gathers every matching
    document; documents that already have non-empty ``filepaths`` are
    ordered first (stable sort), so callers can prefer materialized ones.
    """
    frame = l_node.pipe_init.frame
    raw_y = l_node.lab_fed.obj_id
    dh = DbHandler()
    matches = []
    for candidate in lst_l_transform:
        probe = Label(frame=frame, l_transform=candidate,
                      raw_y=raw_y, values=None)
        matches += dh.search_by_essentials(probe, l_node.pipe_init.db)
    matches.sort(key=lambda d: not bool(d["filepaths"]))
    return matches
def f_prepare_locate(self, f_node):
    """Locate every prerequisite of *f_node*, then return candidate ids.

    Ensures each fed node and the attached label node are located, then
    searches the db for documents matching ``f_node.f_transform`` and
    returns the ``_id`` of each hit not already in ``self.matched``.
    """
    for fed in f_node.lst_fed:
        if fed.obj_id is None:
            self.f_locate(fed)
    connector = LConnector(self.l_matched)
    connector.l_locate(f_node.l_node)
    candidates = DbHandler().search_by_essentials(
        f_node.f_transform, f_node.pipe_init.db)
    return [doc["_id"] for doc in candidates
            if doc["_id"] not in self.matched]
def identify_feature(f_node, lst_f_transform):
    """Collect db documents for features matching any of the given transforms.

    Builds a probe ``Feature`` per transform id and gathers every matching
    document; documents with non-empty ``filepaths`` sort first (stable),
    so callers can prefer already-materialized ones.
    """
    frame = f_node.pipe_init.frame
    fed_ids = [fed.obj_id for fed in f_node.lst_fed]
    dh = DbHandler()
    matches = []
    for candidate in lst_f_transform:
        probe = Feature(frame=frame, f_transform=candidate,
                        lst_fed=fed_ids, label=f_node.l_node.obj_id,
                        values=None)
        matches += dh.search_by_essentials(probe, f_node.pipe_init.db)
    matches.sort(key=lambda d: not bool(d["filepaths"]))
    return matches
def search_for_scheme(self, db):
    """Load a previously saved scheme matching this object's essentials.

    Searches *db* for matching documents; when at least one exists, loads
    the object behind the first hit from its recorded file paths.

    :return: the loaded scheme object, or None when nothing matches.
    """
    docs = DbHandler().search_by_essentials(self, db)
    if not docs:
        return None
    first = docs[0]
    element = self.decide_element()
    return IOHandler().load_obj_from_file(
        first["_id"], element, first["filepaths"])
def l_locate(self, l_node, save_obtained=True):
    """Resolve *l_node* to a concrete label, reusing db/file artifacts when possible.

    Walks three tiers of reuse: (1) node already has an obj_id -> consult its
    db doc; (2) a matching doc exists in the db -> reuse or materialize it;
    (3) nothing matches -> compute from scratch via set_off_and_record.

    :param l_node: LNode to resolve (mutated in place: obj_id / filepaths).
    :param save_obtained: when True, freshly computed/materialized artifacts
        are also written to the pipeline's file storage.
    :return: (label, l_transform) when something was actually computed or
        materialized in this call, else (None, None).
    :raises TypeError: if l_node is not an LNode.
    """
    if not isinstance(l_node, LNode):
        raise TypeError(
            "The parameter l_node should be of the type LNode.")
    label_obtained, l_trans_obtained = None, None
    db = l_node.pipe_init.db
    filepaths = l_node.pipe_init.filepaths
    if l_node.obj_id is None:
        # No identity yet: search the db for an equivalent label.
        lst_l_transform = self.l_prepare_locate(l_node)
        all_docs = self.identify_label(l_node, lst_l_transform)
        if all_docs:
            # identify_label sorts docs with filepaths first, so all_docs[0]
            # is the "best" candidate.
            doc = all_docs[0]
            if doc["filepaths"]:
                # Fully materialized already — just adopt its identity.
                # update l_node
                l_node.obj_id = doc["_id"]
                l_node.filepaths = doc["filepaths"]
            elif save_obtained:
                # Doc exists but files don't: rebuild and persist the files.
                label, l_transform = self.materialize_with_existing_doc(
                    doc=doc, l_node=l_node)
                # save obtained
                l_transform.save_file(filepaths)
                label.save_file(filepaths)
                # update l_node
                l_node.obj_id = doc["_id"]
                l_node.filepaths = filepaths
                # for return
                label_obtained, l_trans_obtained = label, l_transform
            else:
                # Adopt the doc's identity without materializing files.
                # update l_node
                l_node.obj_id = doc["_id"]
        elif save_obtained:
            # Nothing in the db: compute from scratch and persist files.
            label, l_transform = self.set_off_and_record(l_node, db)
            # save obtained
            l_transform.save_file(filepaths)
            label.save_file(filepaths)
            # update l_node
            l_node.obj_id = label.obj_id
            l_node.filepaths = label.filepaths
            # for return
            label_obtained, l_trans_obtained = label, l_transform
        else:
            # Compute from scratch but skip file persistence.
            label, l_transform = self.set_off_and_record(l_node, db)
            # update l_node
            l_node.obj_id = label.obj_id
            # for return
            label_obtained, l_trans_obtained = label, l_transform
    else:
        # Node already identified: fetch its doc and fill in what's missing.
        dh = DbHandler()
        doc = dh.search_by_obj_id(obj_id=l_node.obj_id,
                                  element="Label",
                                  db=db)
        if doc["filepaths"]:
            # Use the doc's own filepaths, not pipe_init's, to prevent
            # potential errors from the two disagreeing.
            doc_filepaths = doc["filepaths"]
            # update l_node
            if l_node.filepaths is None:
                l_node.filepaths = doc_filepaths
        elif save_obtained:
            # Doc has no files: rebuild and persist them.
            label, l_transform = self.materialize_with_existing_doc(
                doc=doc, l_node=l_node)
            # save obtained
            label.save_file(filepaths)
            l_transform.save_file(filepaths)
            # update l_node
            l_node.filepaths = filepaths
            # for return
            label_obtained, l_trans_obtained = label, l_transform
        else:
            # Rebuild in memory only; node state is left untouched.
            label, l_transform = self.materialize_with_existing_doc(
                doc=doc, l_node=l_node)
            # for return
            label_obtained, l_trans_obtained = label, l_transform
    return label_obtained, l_trans_obtained
def f_locate(self, f_node, save_obtained=True):
    """Resolve *f_node* to a concrete feature, reusing db/file artifacts when possible.

    Mirrors l_locate: (1) node already has an obj_id -> consult its db doc;
    (2) a matching doc exists -> reuse or materialize it; (3) nothing
    matches -> compute from scratch via set_off_and_record.

    :param f_node: FNode to resolve (mutated in place: obj_id / filepaths).
    :param save_obtained: when True, freshly computed/materialized artifacts
        are also written to the pipeline's file storage.
    :return: (feature, f_transform) when something was actually computed or
        materialized in this call, else (None, None).
    :raises TypeError: if f_node is not an FNode, or its l_node is not an LNode.
    """
    if not isinstance(f_node, FNode):
        raise TypeError("The parameter f_node should of the type FNode.")
    feature_obtained, f_trans_obtained = None, None
    db = f_node.pipe_init.db
    filepaths = f_node.pipe_init.filepaths
    if f_node.obj_id is None:
        if not isinstance(f_node.l_node, LNode):
            raise TypeError(
                "The attribute f_node.l_node should be of the type LNode")
        # No identity yet: search the db for an equivalent feature.
        lst_f_transform = self.f_prepare_locate(f_node)
        all_docs = self.identify_feature(f_node, lst_f_transform)
        if all_docs:
            # identify_feature sorts docs with filepaths first, so
            # all_docs[0] is the "best" candidate.
            doc = all_docs[0]
            if doc["filepaths"]:
                # Fully materialized already — just adopt its identity.
                # update f_node
                f_node.obj_id = doc["_id"]
                f_node.filepaths = doc["filepaths"]
            elif save_obtained:
                # Doc exists but files don't: rebuild and persist the files.
                feature, f_transform = self.materialize_with_existing_doc(
                    f_node=f_node, doc=doc)
                # save obtained
                f_transform.save_file(filepaths)
                feature.save_file(filepaths)
                # update f_node
                f_node.obj_id = doc["_id"]
                f_node.filepaths = filepaths
                # for return
                feature_obtained, f_trans_obtained = feature, f_transform
            else:
                # Adopt the doc's identity without materializing files.
                # update f_node
                f_node.obj_id = doc["_id"]
        elif save_obtained:
            # Nothing in the db: compute from scratch and persist files.
            feature, f_transform = self.set_off_and_record(f_node, db)
            # save obtained
            f_transform.save_file(filepaths)
            feature.save_file(filepaths)
            # update f_node
            f_node.obj_id = feature.obj_id
            f_node.filepaths = feature.filepaths
            # for return
            feature_obtained, f_trans_obtained = feature, f_transform
        else:
            # Compute from scratch but skip file persistence.
            feature, f_transform = self.set_off_and_record(f_node, db)
            # update f_node
            f_node.obj_id = feature.obj_id
            # for return
            feature_obtained, f_trans_obtained = feature, f_transform
    else:
        # Node already identified: fetch its doc and fill in what's missing.
        dh = DbHandler()
        doc = dh.search_by_obj_id(obj_id=f_node.obj_id,
                                  element="Feature",
                                  db=db)
        if doc["filepaths"]:
            # Prevent potential errors resulting from different filepaths
            # in doc vs pipe_init.
            doc_filepaths = doc["filepaths"]
            # NOTE(review): doc_filepaths is never used below — the sibling
            # l_locate assigns doc_filepaths here, not pipe_init's
            # filepaths. Looks like a copy/paste slip; confirm intent.
            # update f_node
            if f_node.filepaths is None:
                f_node.filepaths = filepaths
            # TODO: we should probably remove this part since nothing is "obtained" here
            # ih = IOHandler()
            # feature = ih.load_obj_from_file(f_node.obj_id, "Feature", doc_filepaths)
            # f_transform = ih.load_obj_from_file(doc["essentials"]["f_transform"], "FTransform", doc_filepaths)
            #
            # # for return
            # feature_obtained, f_trans_obtained = feature, f_transform
        elif save_obtained:
            # Doc has no files: rebuild and persist them.
            feature, f_transform = self.materialize_with_existing_doc(
                f_node=f_node, doc=doc)
            # save obtained
            f_transform.save_file(filepaths)
            feature.save_file(filepaths)
            # update f_node
            f_node.filepaths = filepaths
            # for return
            feature_obtained, f_trans_obtained = feature, f_transform
        else:
            # Rebuild in memory only; node state is left untouched.
            feature, f_transform = self.materialize_with_existing_doc(
                f_node=f_node, doc=doc)
            # for return
            feature_obtained, f_trans_obtained = feature, f_transform
    return feature_obtained, f_trans_obtained
def __init__(self,
             data,
             col_y,
             lst_layers,
             shuffle=False,
             stratified=False,
             col_selected=None,
             tag=None,
             db=None,
             filepaths=None):
    """Initialize a pipe: build and persist the frame, label, and seed features.

    :param data: pandas.DataFrame. This needs to be a pandas data frame
        with a label column
    :param col_y: The name of the label column
    :param lst_layers: list. This gives the "lst_layers" to the Frame
    :param shuffle: boolean. Shuffle the rows before building the frame.
    :param stratified: boolean. Should not be used for a regression problem
    :param col_selected: dict. Ex: {'num': ['colname1', 'colname2'],
        'cate': ['colname3'], ...}. When falsy, all columns become one
        'raw' feature group.
    :param tag: optional tag recorded in the db for this pipe.
    :param db: passed through to save_db_file for every created object.
    :param filepaths: passed through to save_db_file for every created object.
    :raises TypeError: if data is not a DataFrame or a col_selected value
        is not a list.
    :raises KeyError: if col_y is not a column of data.
    :raises NotImplementedError: if col_selected is a list.
    :raises ValueError: for any other col_selected type.
    """
    if not isinstance(data, pd.DataFrame):
        raise TypeError(
            "The data for initializing a pipe should be of the type pandas.DataFrame"
        )
    if col_y not in data:
        raise KeyError(
            "The column name of the target: col_y provided is not in the data"
        )
    if col_selected:
        for key in col_selected:
            if not isinstance(col_selected[key], list):
                raise TypeError(
                    "All the values in the dictionary col_selected have to be lists."
                )
    super(PipeInit, self).__init__()
    self.__essentials = {}
    # Initializing the rows
    if shuffle:
        # Sample all indices without replacement == a random permutation.
        idx = np.random.choice(data.index, len(data.index), replace=False)
        data = self.shuffle_pddf_idx(data, idx)
    if stratified:
        # Stratified split may also rearrange data, hence the reassignment.
        data, frame = self.get_stratified_starter_and_frame(
            lst_layers, data, col_y)
    else:
        frame = self.get_regular_frame(lst_layers, data)
    frame.save_db_file(db=db, filepaths=filepaths)
    self.__frame = frame.obj_id
    # Initializing labels
    values = data[[col_y]].values
    label = Label(frame.obj_id, None, None, values)
    label.save_db_file(db=db, filepaths=filepaths)
    self.__label = label.obj_id
    # Initializing features (columns)
    self._column_groups = {
    }  # to collect dict like {'num': ['colname1', 'colname2'], 'cate':['colname3'], ...}
    self._init_features = {
    }  # {'num': obj_id(data['colname1', 'colname2']),
    # 'cate': obj_id(data['colname3']), ...}
    self._y_name = col_y
    if isinstance(col_selected, dict):
        # One Feature per column group, all anchored to the frame.
        for key in col_selected:
            cols = col_selected[key]
            self._column_groups[key] = cols
            values = data[cols].values
            feature = Feature(frame.obj_id, None, None, None, values=values)
            feature.stage = 0
            feature.save_db_file(db=db, filepaths=filepaths)
            self._init_features[key] = feature.obj_id
    elif not col_selected:
        # NOTE(review): an empty list/dict lands here (falsy) rather than in
        # the isinstance(list) branch below — all columns become 'raw'.
        cols = data.columns
        values = data[cols].values
        feature = Feature(frame.obj_id, None, None, None, values=values)
        feature.save_db_file(db=db, filepaths=filepaths)
        self._init_features['raw'] = feature.obj_id
    elif isinstance(col_selected, list):
        raise NotImplementedError(
            "Currently only support dictionary to initialize features")
    else:
        raise ValueError(
            "Don't know what to do with the way you specified columns")
    # Only persist the pipe itself when constructed directly (subclasses
    # are expected to finish their own initialization first).
    if type(self) == PipeInit:
        self.save_db_file(db=db, filepaths=filepaths)
        DbHandler.insert_tag(self, {"tag": tag})
        print(self.obj_id)