Code example #1
File: well.py Project: la-mar/prodstats
    def is_producing(self,
                     status_column: str = "status",
                     producing_states: List[str] = None) -> pd.Series:

        validate_required_columns([status_column], self._obj.columns)
        producing_states = producing_states or PRODUCING_STATES
        return self._obj.status.isin(producing_states)
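
A minimal sketch of the underlying check, assuming a plain DataFrame and an illustrative stand-in for the project's PRODUCING_STATES constant:

import pandas as pd

# illustrative stand-in for the project's PRODUCING_STATES constant
PRODUCING_STATES = ["PRODUCING", "ACTIVE"]

df = pd.DataFrame({"status": ["PRODUCING", "PERMIT", "ACTIVE", None]})
print(df["status"].isin(PRODUCING_STATES).tolist())  # [True, False, True, False]
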
Code example #2
File: geom.py Project: la-mar/prodstats
    def as_stick(self,
                 geometry: str = "geom",
                 label: str = "stick") -> pd.Series:
        points = self._obj
        validate_required_columns([geometry], points.columns)
        validate_required_columns(["md"], points.index.names)
        return (points.groupby(level=0).agg(
            ["first", "last"]).stack().shapes.as_line(geometry=geometry,
                                                      label=label))
Code example #3
File: geom.py Project: la-mar/prodstats
    def relative_trajectory_angle(
        self,
        geometry: str = "geom",
        preceeding: int = 1,
        following: int = 1,
        max_soft_angle: float = 150.0,
        max_hard_angle: float = 130.0,
    ) -> gp.GeoDataFrame:
        """ calculate the central angle of a point relative to an arbitrary
            number points before and after it, element wise. The triangle formed
            by the three points is used to calculate the measurement of the angle
            touching the current point (theta).


        Keyword Arguments:
            preceeding {int} -- number of points preceeding the current point to be
                                considered when calculating the central angle. (default: {1})
            following {int} -- number of points following the current point to be
                                considered when calculating the central angle. (default: {1})

        Returns:
            gp.GeoDataFrame
        """

        points = self.as_gdf()

        validate_required_columns([geometry], points.columns)
        rel_prev = points.groupby(level=0).shift(preceeding).shapes.as_gdf()
        rel_next = points.groupby(level=0).shift(-following).shapes.as_gdf()

        hyp = rel_prev.distance(rel_next).rename("hyp")
        adj = points.distance(rel_next).rename("adj")
        opp = points.distance(rel_prev).rename("opp")

        tri = hyp.to_frame().join(adj).join(opp)  # .mul(100000)

        # law of cosines:
        #   c2 = a2 + b2 − 2ab cos(C) -> acos((adj^2 + opp^2 - hyp^2) / (2 * adj * opp))
        tri[["hyp_sq", "adj_sq",
             "opp_sq"]] = tri.loc[:, ["hyp", "adj", "opp"]].pow(2)
        tri["theta"] = (tri.adj_sq.add(tri.opp_sq).sub(tri.hyp_sq).div(
            tri.adj.mul(tri.opp).mul(2)).apply(np.arccos).apply(np.rad2deg))

        # mark the soft corners (bends)
        soft_corner_mask = (tri.theta > 0) & (tri.theta < max_soft_angle)
        tri["is_soft_corner"] = False
        tri.loc[soft_corner_mask, "is_soft_corner"] = True

        # mark the hard corners (steep bends)
        hard_corner_mask = (tri.theta > 0) & (tri.theta < max_hard_angle)  # | (tri.theta > 200)
        tri["is_hard_corner"] = False
        tri.loc[hard_corner_mask, "is_hard_corner"] = True

        return tri.loc[:,
                       ["theta", "is_soft_corner", "is_hard_corner"]].dropna(
                           how="any")
Code example #4
File: geom.py Project: la-mar/prodstats
    def mark_lateral_points(self, dip_threshold: int = None) -> pd.DataFrame:
        dip_threshold = dip_threshold or LATERAL_DIP_THRESHOLD

        points = self._obj
        validate_required_columns(["dip"], points.columns)

        # initialize the column first so it exists even when no point exceeds
        # the threshold, then mark the lateral points
        points["is_in_lateral"] = False
        points.loc[points.dip > dip_threshold, "is_in_lateral"] = True
        return points
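
The dip comparison is the whole trick; a toy run, assuming an illustrative value for LATERAL_DIP_THRESHOLD (the real constant lives in the project's configuration):

import pandas as pd

LATERAL_DIP_THRESHOLD = 85  # assumed illustrative value

points = pd.DataFrame({"dip": [2.0, 45.0, 88.0, 91.0]})
points["is_in_lateral"] = False
points.loc[points.dip > LATERAL_DIP_THRESHOLD, "is_in_lateral"] = True
print(points.is_in_lateral.tolist())  # [False, False, True, True]
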
Code example #5
File: geom.py Project: la-mar/prodstats
    def index_survey_points(self, dip_threshold: int = None) -> pd.DataFrame:
        points = self._obj
        validate_required_columns(["dip"], points.columns)
        validate_required_columns(["api14", "md"], points.index.names)

        calc_column_names = [
            "is_in_lateral",
            "is_heel_point",
            "is_mid_point",
            "is_toe_point",
        ]
        # points.loc[:, calc_column_names] = np.nan
        # add placeholders (with defaults) for columns to be calculated
        points = pd.concat([points, pd.DataFrame(columns=calc_column_names)])
        points.loc[:, calc_column_names] = False

        points["sequence"] = points.groupby(level=0).cumcount() + 1
        points = points.shapes.mark_lateral_points()

        heel_point_index = (points.loc[points.is_in_lateral].reset_index(
            level=1).groupby(level=0).first().set_index("md",
                                                        append=True).index)

        mid_point_sequence_index = (points.loc[points.is_in_lateral].groupby(
            level=0).sequence.median().apply(
                np.floor).to_frame().astype(int).set_index("sequence",
                                                           append=True).index)

        mid_point_index = (points.reset_index(level=1).set_index(
            "sequence", append=True).loc[mid_point_sequence_index].reset_index(
                level=1).set_index("md", append=True).index)

        toe_point_index = (points.loc[points.is_in_lateral].reset_index(
            level=1).groupby(level=0).last().set_index("md",
                                                       append=True).index)

        points.loc[heel_point_index, "is_heel_point"] = True
        points.loc[mid_point_index, "is_mid_point"] = True
        points.loc[toe_point_index, "is_toe_point"] = True

        heel_start_seq_by_group = points.loc[heel_point_index, "sequence"]

        # ensure all points after the heel point are marked as in the lateral.
        # The dip filter doesn't always catch all of the points.
        for api14 in points.groupby(level=0).groups:
            try:
                heel_start_seq = heel_start_seq_by_group.loc[api14].iloc[0]
                group = points.xs(api14, level=0, drop_level=False)
                group_in_lateral_index = group.loc[
                    group.sequence >= heel_start_seq].index
                points.loc[group_in_lateral_index, "is_in_lateral"] = True
            except KeyError:
                logger.debug(
                    f"{api14} has no survey points -- skipping lateral indexing"
                )

        return points
Code example #6
File: geom.py Project: la-mar/prodstats
    def as_line(self,
                geometry: str = "geom",
                label: str = "line") -> pd.Series:
        points = self._obj
        validate_required_columns([geometry], points.columns)

        # .to_numpy() is implemented by both pandas and geopandas geoarrays
        # whereas .values is not.
        return (points[geometry].groupby(level=0).apply(
            lambda x: LineString(x.to_numpy().tolist())).rename(label))
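
A self-contained sketch of the groupby-to-LineString pattern with hypothetical well IDs, using a plain pandas Series of shapely Points in place of a geopandas geometry column:

import pandas as pd
from shapely.geometry import LineString, Point

idx = pd.MultiIndex.from_tuples(
    [("api-1", 0), ("api-1", 100), ("api-2", 0), ("api-2", 50)],
    names=["api14", "md"],
)
geom = pd.Series(
    [Point(0, 0), Point(1, 1), Point(5, 5), Point(6, 4)], index=idx, name="geom"
)
# one LineString per well, connecting its points in index order
lines = geom.groupby(level=0).apply(lambda x: LineString(x.tolist())).rename("line")
print(lines.loc["api-1"].wkt)  # LINESTRING (0 0, 1 1)
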
Code example #7
File: well.py Project: la-mar/prodstats
    def melt_depths(self) -> pd.DataFrame:

        validate_required_columns(["api14"], self._obj.index.names)

        depths_melted = (self._obj.dropna(how="all").reset_index().melt(
            id_vars=["api14"], var_name="property_name"))
        depths_melted["aggregate_type"] = None
        depths_melted["name"] = depths_melted.property_name
        depths_melted = depths_melted.set_index(
            ["api14", "property_name", "aggregate_type"])
        return depths_melted
Code example #8
File: geom.py Project: la-mar/prodstats
    def find_kop(self) -> pd.Series:
        """ Return an educated guess as to the location of a survey's kickoff point, element-wise.
            The calling dataframe be a dataframe of survey points. """

        points = self._obj
        validate_required_columns(["is_in_lateral", "sequence"],
                                  points.columns)
        angles = points.shapes.relative_trajectory_angle()

        points = points.join(angles.loc[~points.is_in_lateral])
        points["theta"] = angles.theta
        points.loc[:, ["is_soft_corner", "is_hard_corner"]] = points.loc[:, [
            "is_soft_corner", "is_hard_corner"
        ]].fillna(False)
        points.loc[:, "theta"] = points.loc[:, "theta"].fillna(0)

        max_hard_corner_mask = (
            points.loc[points.is_hard_corner].sequence.groupby(
                level=0).max().rename("hard"))

        max_soft_corner_mask = (
            points.loc[points.is_soft_corner].sequence.groupby(
                level=0).max().rename("soft"))

        last_non_lateral_point_mask = (
            points.loc[~points.is_in_lateral].sequence.groupby(
                level=0).max().rename("last_non_lateral"))

        # create a frame from the masks created above
        empty = pd.DataFrame(index=points.groupby(level=0).max().index)
        joined = (empty.join(max_hard_corner_mask).join(
            max_soft_corner_mask).join(last_non_lateral_point_mask))

        # determine the sequence index of the kop by traversing the joined dataframe's columns
        # from left to right and taking the first non-na value for each row.
        kop_seq_index = (joined.fillna(
            method="bfill", axis=1).iloc[:, 0].rename("kop_seq").fillna(
                -1).astype(int).to_frame().set_index("kop_seq",
                                                     append=True).index)

        # mark kop points using sequence index
        points["is_kop"] = False
        kop_index = (points.reset_index(level=1).set_index(
            "sequence",
            append=True).loc[kop_seq_index].reset_index(level=1).set_index(
                "md", append=True).index)

        points.loc[kop_index, "is_kop"] = True

        return points.loc[:, [
            "theta", "is_soft_corner", "is_hard_corner", "is_kop"
        ]]
Code example #9
File: geom.py Project: la-mar/prodstats
    def lateral_length(self):
        points = self._obj

        if "is_in_lateral" not in points.columns:
            points = points.shapes.mark_lateral_points()

        validate_required_columns(["is_in_lateral"], points.columns)
        validate_required_columns(["md"], points.index.names)

        return (points.loc[points.is_in_lateral].reset_index(
            level=1).loc[:, "md"].groupby(level=0).agg(
                ["min", "max"]).apply(lambda row: row["max"] - row["min"],
                                      axis=1).rename("lateral_length"))
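
The computation reduces to max(md) - min(md) per well over the lateral points; a toy check with made-up measured depths:

import pandas as pd

idx = pd.MultiIndex.from_tuples(
    [("api-1", 9500), ("api-1", 12000), ("api-1", 19500)], names=["api14", "md"]
)
points = pd.DataFrame({"is_in_lateral": [False, True, True]}, index=idx)

md = points.loc[points.is_in_lateral].reset_index(level=1)["md"]
lengths = (md.groupby(level=0).agg(["min", "max"])
             .apply(lambda row: row["max"] - row["min"], axis=1))
print(lengths.to_dict())  # {'api-1': 7500}
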
Code example #10
File: geom.py Project: la-mar/prodstats
    def as_bent_stick(self,
                      geometry: str = "geom",
                      label: str = "bent_stick") -> pd.Series:
        points = self._obj
        validate_required_columns([geometry], points.columns)
        validate_required_columns(["md"], points.index.names)
        bent_stick_points = (
            pd.concat([
                points.reset_index(level=1)  # first and last points
                .groupby(level=0).agg(["first", "last"]).stack().reset_index(
                    level=1, drop=True).set_index("md", append=True),
                points.loc[points.is_kop],  # kop points
            ]).sort_index().loc[:, [geometry]])

        return bent_stick_points.shapes.as_line(geometry=geometry, label=label)
Code example #11
File: geom.py Project: la-mar/prodstats
    def as_3d(self, geometry: str = "geom") -> pd.DataFrame:
        """transform 2d points into 3d points using the MD index as Z """
        xyz = self._obj
        validate_required_columns([geometry], xyz.columns)
        validate_required_columns(["md"], xyz.index.names)

        xyz["x"] = xyz[geometry].apply(lambda x: x.x)
        xyz["y"] = xyz[geometry].apply(lambda x: x.y)

        xyz = (xyz.reset_index(level=1).set_index(
            "md", append=True, drop=False).rename(columns={"md": "z"}))
        xyz = xyz.loc[:, ["x", "y", "z", geometry]]

        # recreate the points with z values
        xyz[geometry] = xyz.apply(lambda row: Point(row.x, row.y, row.z),
                                  axis=1)
        return xyz
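
The core transformation: pull x and y from each 2D point and rebuild the point with md as z. A minimal shapely-only sketch:

from shapely.geometry import Point

pt2d, md = Point(100.0, 200.0), 9500.0  # illustrative 2D point and measured depth
pt3d = Point(pt2d.x, pt2d.y, md)
print(pt3d.has_z, pt3d.z)  # True 9500.0
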
Code example #12
File: well.py Project: la-mar/prodstats
    def merge_lateral_lengths(self) -> pd.DataFrame:
        """ Merge perfll and lateral_length into a single column, preferring perfll """

        wells = self._obj

        if "lateral_length" not in wells.columns:
            wells["lateral_length"] = np.nan

        required_columns = ["perfll", "lateral_length"]
        validate_required_columns(required_columns, wells.columns)

        # ? get lateral_length, preferring perfll over lateral_length
        latlens = wells.loc[:, required_columns]
        latlens.loc[latlens.perfll.notnull(), "lateral_length"] = np.nan
        latlens = (latlens.reset_index().melt(
            id_vars="api14",
            var_name="lateral_length_type",
            value_name="lateral_length",
        ).set_index("api14").dropna(how="any"))
        return latlens
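
A toy walkthrough of the preference logic: lateral_length is nulled wherever perfll exists, then the frame is melted so each well keeps exactly one value plus a label naming its source:

import numpy as np
import pandas as pd

wells = pd.DataFrame(
    {"perfll": [10000.0, np.nan], "lateral_length": [9800.0, 7500.0]},
    index=pd.Index(["api-1", "api-2"], name="api14"),
)
# wherever perfll exists, discard the competing lateral_length value
wells.loc[wells.perfll.notnull(), "lateral_length"] = np.nan
melted = (
    wells.reset_index()
    .melt(id_vars="api14", var_name="lateral_length_type", value_name="lateral_length")
    .set_index("api14")
    .dropna(how="any")
)
print(melted)  # api-1 keeps perfll=10000; api-2 keeps lateral_length=7500
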
Code example #13
File: well.py Project: la-mar/prodstats
    def combine_frac_parameters(self,
                                other: pd.DataFrame,
                                dropna: bool = True) -> pd.DataFrame:
        fracs = self._obj
        validate_required_columns(["fluid", "proppant"], fracs.columns)
        validate_required_columns(["fluid", "proppant"], other.columns)
        validate_required_columns(["api14"], fracs.index.names)
        validate_required_columns(["api14"], other.index.names)

        fracs = fracs.combine_first(other)
        if dropna:
            fracs = fracs[(~fracs.fluid.isna()) & (~fracs.proppant.isna())]
        return fracs
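
combine_first keeps the calling frame's values and fills its holes from other, aligning on the api14 index; a quick demonstration:

import numpy as np
import pandas as pd

idx = pd.Index(["api-1", "api-2"], name="api14")
fracs = pd.DataFrame({"fluid": [np.nan, 200.0], "proppant": [50.0, np.nan]}, index=idx)
other = pd.DataFrame({"fluid": [100.0, 999.0], "proppant": [np.nan, 60.0]}, index=idx)

combined = fracs.combine_first(other)
print(combined)
#        fluid  proppant
# api14
# api-1  100.0      50.0   <- fluid filled from other
# api-2  200.0      60.0   <- existing fluid wins; proppant filled
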
Code example #14
File: well.py Project: la-mar/prodstats
    def process_fracs(self):

        fracs = self._obj
        validate_required_columns(
            ["fluid", "proppant", "lateral_length", "lateral_length_type"],
            fracs.columns,
        )

        fracs = fracs.dropna(how="all", subset=["fluid", "proppant"])

        # TODO: validate fluid/proppant UOM and convert to BBL/LB where necessary

        # convert lb & bbl to lb/ft & bbl/ft
        per_ft = (fracs.loc[:, ["fluid", "proppant"]].div(
            fracs["lateral_length"],
            axis=0).rename(columns={
                "fluid": "fluid_bbl_ft",
                "proppant": "proppant_lb_ft"
            }))

        fracs = fracs.join(per_ft)

        # rename fluid & proppant and drop uoms
        fracs = fracs.rename(columns={
            "fluid": "fluid_bbl",
            "proppant": "proppant_lb"
        }).drop(columns=["fluid_uom", "proppant_uom"])

        fracs = fracs.dropna(
            how="all",
            subset=[
                "fluid_bbl",
                "proppant_lb",
                "lateral_length_type",
                "lateral_length",
                "fluid_bbl_ft",
                "proppant_lb_ft",
            ],
        )

        return fracs
Code example #15
def test_validate_required_columns_raise():
    with pytest.raises(KeyError):
        validate_required_columns(required=["a", "b"], columns=["a", "c", "d"])
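
The test pins down the helper's contract: raise KeyError when any required column is absent. A minimal sketch of an implementation satisfying that contract (hypothetical; the project's actual helper may differ):

from typing import Iterable

def validate_required_columns(required: Iterable[str], columns: Iterable[str]) -> None:
    # hypothetical reimplementation matching the tested contract
    missing = [c for c in required if c not in set(columns)]
    if missing:
        raise KeyError(f"Missing required columns: {missing}")
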
Code example #16
File: well.py Project: la-mar/prodstats
    def assign_status(
        self,
        # target_column: str = "new_status",
        status_column: str = "status",
        how: str = "waterfall",
        status_indicator_map: Dict[str, str] = None,
        detail: bool = False,
        as_labels: bool = False,
        empty_label_placeholder: str = ".",
    ) -> pd.DataFrame:
        """ Assign well status using indicators existing in the passed DataFrame or
            using pd.DataFrame.wells.status_indicators() to generate them if they arent
            present.

        Keyword Arguments:
            target_column {str} -- column name for status assignments (default: "new_status")
            how {str} -- assignment methodology to use, currently the only available option
                is the default. (default: "waterfall")
            status_column {str} -- name of column containing the original stati from
                the data provider (default: "status")
            detail {bool} -- return the intermediate calculations used in assignments
            status_indicator_map {Dict[str, str]} -- status_indicator_map of indicator column names
                and their corresponding status value to be used in waterfall assignment.
                The precidence of assignment is inferred from the order or items in the
                status_indicator_map (default: const.STATUS_INDICATOR_MAP).

        Raises:
            ValueError

        Returns:
            pd.DataFrame
        """
        wells = self._obj

        well_columns = [
            "status",
            "spud_date",
            "comp_date",
            "permit_date",
            "last_prod_date",
        ]
        validate_required_columns(
            well_columns,
            wells.columns,
        )

        target_column = "new_status"
        status: pd.DataFrame = wells.loc[:, well_columns]
        status = status.wells.status_indicators()

        status_indicator_map = (status_indicator_map if status_indicator_map
                                else STATUS_INDICATOR_MAP)

        # seed with keeper values from original status column
        status.loc[status.is_keeper_status,
                   target_column] = status.loc[status.is_keeper_status,
                                               status_column]

        if how == "waterfall":
            for column_name, label in status_indicator_map.items():
                selected = status.loc[status[target_column].isna()
                                      & status[column_name], column_name]

                if label is not None:
                    selected = selected.replace({True: label})

                status.loc[status[target_column].isna(),
                           target_column] = selected
        else:
            raise ValueError("Invalid how value: use 'waterfall'")

        if not detail:
            status = status.loc[:, [target_column]]

        # overwrite original status with new status
        if status_column in status.columns:
            status = status.drop(columns=[status_column])

        status = status.rename(columns={target_column: status_column})

        if as_labels:
            status = status.replace({
                "is_other": {
                    True: "OTHER"
                },
                "is_inactive_pa": {
                    True: "INACTIVE-PA"
                },
                "is_ta": {
                    True: "TA"
                },
                "is_producing": {
                    True: "PRODUCING"
                },
                "is_completed": {
                    True: "COMPLETED"
                },
                "is_duc": {
                    True: "DUC"
                },
                "is_drilling": {
                    True: "DRILLING"
                },
                "is_permit": {
                    True: "PERMIT"
                },
                "is_stale_permit": {
                    True: "STALE-PERMIT"
                },
            }).replace({False: empty_label_placeholder})

        return status
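
A condensed sketch of the waterfall itself: walk the indicator map in order, filling the target column only where it is still empty. The indicator columns and map here are illustrative stand-ins for STATUS_INDICATOR_MAP:

import pandas as pd

status = pd.DataFrame({
    "is_producing": [True, False, False],
    "is_duc": [False, True, False],
    "is_permit": [False, True, True],
})
indicator_map = {"is_producing": "PRODUCING", "is_duc": "DUC", "is_permit": "PERMIT"}

status["new_status"] = None
for column_name, label in indicator_map.items():
    # only rows that are both unassigned and flagged by this indicator
    selected = status.loc[status.new_status.isna() & status[column_name], column_name]
    status.loc[selected.index, "new_status"] = label
print(status.new_status.tolist())  # ['PRODUCING', 'DUC', 'PERMIT'] -- earlier keys win
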
Code example #17
File: well.py Project: la-mar/prodstats
    def status_indicators(
        self,
        indicators_only: bool = False,
    ) -> pd.DataFrame:
        df = self._obj

        required_columns = [
            "status",
            "spud_date",
            "comp_date",
            "permit_date",
            "last_prod_date",
        ]

        validate_required_columns(
            required_columns,
            df.columns,
        )

        """ Original logic:

        Is_GoodSymCode
            if Uppercase([IHS_Status]) not in ('OIL PRODUCER', 'OIL-WO', 'AT TOTAL DEPTH', 'WELL START', 'WELL PERMIT', 'TREATD', 'CANCEL', 'GAS PRODUCER', 'GAS-WO', 'TA') then [IHS_Status] else Null() endif

        Is_Other
            if IsNull([LastProd]) and ([SPUD_DATE] < "1971-01-01" or [SPUD_DATE] < DateTimeAdd(DateTimeToday(),-36,"months")) then "OTHER" else Null() endif
            # if last_prod is None and (spudded before 1971 or spudded < 3 years ago)

        Is_InactivePA
            if [LastProd] < DateAdd([ProductionDateCutoff],-12,"months") and !IsNull([LastProd]) then "INACTIVE-PA" else Null() endif

        Is_TA
            if [LastProd] < DateAdd([ProductionDateCutoff],-3,"months") and [LastProd] >= DateAdd([ProductionDateCutoff],-12,"months") then "TA" else Null() endif

        Is_Producing
            if [LastProd] >= DateAdd([ProductionDateCutoff],-3,"months") then "PRODUCING" else Null() endif

        Is_Completed
            if [COMP_DATE] >= DateAdd([ProductionDateCutoff],-9,"months") then "COMPLETED" else Null() endif

        Is_DUC
            if [SPUD_DATE] < DateAdd(MonthStart(DateTimeToday()),-1,"months") then "DUC" else Null() endif

        Is_Drilling
            if !IsNull([SPUD_DATE]) then "DRILLING" else Null() endif

        Is_Permit
            if [PERMIT_DATE] >= DateAdd(DateAdd(MonthStart(DateTimeToday()),-1,"months"),-36,"months") then "PERMIT" else Null() endif

        Is_StalePermit
            if [PERMIT_DATE] < DateAdd(DateAdd(MonthStart(DateTimeToday()),-1,"months"),-36,"months") then "STALE-PERMIT" else Null() endif

        Status
            if !IsNull([Is_GoodSymCode]) then [Is_GoodSymCode]
            elseif !IsNull([Is_Other]) then [Is_Other]
            elseif !IsNull([Is_InactivePA]) then [Is_InactivePA]
            elseif !IsNull([Is_TA]) then [Is_TA]
            elseif !IsNull([Is_Producing]) then [Is_Producing]
            elseif !IsNull([Is_Completed]) then [Is_Completed]
            elseif !IsNull([Is_DUC]) then [Is_DUC]
            elseif !IsNull([Is_Drilling]) then [Is_Drilling]
            elseif !IsNull([Is_Permit]) then [Is_Permit]
            elseif !IsNull([Is_StalePermit]) then [Is_StalePermit]
            else "OTHER" endif

        """  # noqa

        last_prod_norm_date = x_months_ago(3)

        to_recategorize = [
            "OIL PRODUCER",
            "OIL-WO",
            "AT TOTAL DEPTH",
            "WELL START",
            "WELL PERMIT",
            "TREATD",
            "CANCEL",
            "GAS PRODUCER",
            "GAS-WO",
            "TA",
        ]

        df.loc[~df.status.isin(to_recategorize), "is_keeper_status"] = True

        other_mask = (df.last_prod_date.isna()) & (df.spud_date <
                                                   x_months_ago(36))
        df.loc[other_mask, "is_other"] = True

        inactive_pa_mask = (~df.last_prod_date.isna()) & (
            df.last_prod_date < x_months_ago(12,
                                             relative_to=last_prod_norm_date))
        df.loc[inactive_pa_mask, "is_inactive_pa"] = True

        is_ta_mask = (df.last_prod_date < x_months_ago(
            3, relative_to=last_prod_norm_date)) & (
                df.last_prod_date >= x_months_ago(
                    12, relative_to=last_prod_norm_date))
        df.loc[is_ta_mask, "is_ta"] = True

        is_producing_mask = df.last_prod_date >= x_months_ago(
            3, relative_to=last_prod_norm_date)
        df.loc[is_producing_mask, "is_producing"] = True

        is_completed_mask = df.comp_date.notnull()
        df.loc[is_completed_mask, "is_completed"] = True

        is_duc_mask = df.spud_date < x_months_ago(
            3, relative_to=last_prod_norm_date)
        df.loc[is_duc_mask, "is_duc"] = True

        is_drilling_mask = df.spud_date.notnull()
        df.loc[is_drilling_mask, "is_drilling"] = True

        is_permit_mask = df.permit_date >= x_months_ago(36)
        df.loc[is_permit_mask, "is_permit"] = True

        is_stale_permit_mask = df.permit_date < x_months_ago(36)
        df.loc[is_stale_permit_mask, "is_stale_permit"] = True

        indicators = [
            "is_keeper_status",
            "is_other",
            "is_inactive_pa",
            "is_ta",
            "is_producing",
            "is_completed",
            "is_duc",
            "is_drilling",
            "is_permit",
            "is_stale_permit",
        ]

        df.loc[:, indicators] = df.loc[:, indicators].fillna(False)

        if indicators_only:
            return_columns = indicators
        else:
            return_columns = required_columns + indicators

        return df.loc[:, return_columns]
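
x_months_ago is not shown in this listing; a plausible sketch of its behavior as inferred from the call sites (a timestamp n months before now, or before relative_to when given). This is an assumption, not the project's actual implementation:

import pandas as pd

def x_months_ago(n: int, relative_to: pd.Timestamp = None) -> pd.Timestamp:
    # hypothetical helper inferred from usage; the project's version may differ
    anchor = relative_to if relative_to is not None else pd.Timestamp.now()
    return anchor - pd.DateOffset(months=n)

print(x_months_ago(3, relative_to=pd.Timestamp("2020-06-15")))  # 2020-03-15 00:00:00
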