Example #1
        def _categorize_cvss_data(df_sft3):
            # Categorize the CVE vulnerability data:
            #   1. Categorize the CVSS impact data.
            #   2. Compute the maximum (worst-case) impact for each software.
            #   3. Convert the worst-case impact into a simple
            #      "High/Medium/Low" classification.

            self.logger.info('\n\nEntering categorize_cvss_data\n\n')

            # Helper to compute a simplistic criticality value for one row

            def myfn6(row):
                try:
                    my_score = row['cvss_score']
                    my_ease = row['cvss_acc_cmpl_cat'] in ['LOW', 'MEDIUM']
                    my_access = row['cvss_acc_vect_cat'] in [
                        'NETWORK', 'ADJACENT_NETWORK'
                    ]
                    if (my_score > 7) and my_ease and my_access:
                        return 'High'
                    elif (my_score < 4) or ((not my_ease) and (not my_access)):
                        return 'Low'
                    else:
                        return 'Medium'
                except (KeyError, TypeError):
                    # Missing or malformed CVSS fields
                    return 'None'

            # Categorize the CVSS access-complexity data as an ordered
            # pandas category
            cats = CategoricalDtype(categories=['HIGH', 'MEDIUM', 'LOW'],
                                    ordered=True)
            df_sft3['cvss_acc_cmpl_cat'] = df_sft3[
                'cvss:access-complexity'].astype(cats)

            # Likewise for the CVSS access-vector data
            cats = CategoricalDtype(
                categories=['LOCAL', 'ADJACENT_NETWORK', 'NETWORK'],
                ordered=True)
            df_sft3['cvss_acc_vect_cat'] = df_sft3[
                'cvss:access-vector'].astype(cats)

            # convert from string to float for max comparisons
            df_sft3['cvss_score'] = pd.to_numeric(df_sft3['cvss:score'],
                                                  errors='coerce')

            # get rid of extraneous data columns
            df_sft3.drop(
                [
                    u'cvss:score', u'cvss:access-complexity',
                    u'cvss:access-vector', u'cvss:authentication',
                    u'cvss:availability-impact',
                    u'cvss:confidentiality-impact', u'cvss:integrity-impact'
                ],
                # u'vuln:security-protection'],
                # 170118 Bug fix: Sometimes not present
                inplace=True,
                axis=1)

            # rename some other columns for easier access
            df_sft4 = df_sft3.rename(
                columns={
                    'vuln:cve-id': 'cve_id',
                    'vuln:product': 'cpe_prod',
                    'cvss:source': 'cvss_src'
                })

            self.logger.info('\n\nProcessing CVE '
                             'vulnerability data: \n{0}\n{1}\n\n'.format(
                                 df_sft4.shape, df_sft4.columns))

            # Calculate maximum impact for each software

            # group vulns by software
            df_sft4_gp = df_sft4.groupby('cpe_prod')

            # compute worst case value for each software
            df_sft4_agg = df_sft4_gp.agg({
                'cvss_score': 'max',
                'cvss_acc_cmpl_cat': 'max',
                'cvss_acc_vect_cat': 'max'
            })

            self.logger.debug('\n\n Aggregated CVE vuln data '
                              'for worst case\n{0}\n{1} '.format(
                                  df_sft4_agg.shape, df_sft4_agg.columns))

            # Convert the "worst case" impact into a simple classification
            # of "Hi-Med-Low"

            # Compute the criticality for the vuln data
            df_sft4_agg['crit_X'] = df_sft4_agg.apply(myfn6, axis=1)

            self.logger.debug('\nAggregated vuln data counts '
                              'by criticality \n{0}\n\n'.format(
                                  df_sft4_agg.crit_X.value_counts()))

            # Convert the computed criticality value to an ordered
            # pandas category
            my_crit_categories = ['None', 'Low', 'Medium', 'High']
            cats = CategoricalDtype(categories=my_crit_categories,
                                    ordered=True)
            df_sft4_agg['crit_X_cat'] = df_sft4_agg['crit_X'].astype(cats)

            df_sft4_agg.drop(['crit_X'], axis=1, inplace=True)

            self.logger.info('\n\nAggregated vuln data: \n{0}\n{1}\n\n'.format(
                df_sft4_agg.shape, df_sft4_agg.columns))

            return df_sft4_agg
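
A minimal standalone sketch of the same "worst case per group" pattern on toy data (hypothetical column and product names, not the author's CVE feed):

import pandas as pd
from pandas.api.types import CategoricalDtype

# An ordered dtype lets groupby(...).agg('max') pick the worst case.
sev = CategoricalDtype(categories=['LOW', 'MEDIUM', 'HIGH'], ordered=True)
df = pd.DataFrame({
    'product': ['a', 'a', 'b'],
    'severity': pd.Series(['LOW', 'HIGH', 'MEDIUM']).astype(sev),
    'score': [3.1, 9.8, 5.0],
})
worst = df.groupby('product').agg({'severity': 'max', 'score': 'max'})
print(worst)  # 'a' -> HIGH / 9.8, 'b' -> MEDIUM / 5.0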
Example #2
def dtype():
    return CategoricalDtype(categories=[s.name for s in Present],
                            ordered=True)
Example #3
def dtype():
    return CategoricalDtype(categories=[s.name for s in CompassBearing],
                            ordered=True)
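
Examples #2 and #3 derive the dtype from an enum; a self-contained version of the pattern with a stand-in enum (Severity is hypothetical, mirroring Present and CompassBearing):

import enum

from pandas.api.types import CategoricalDtype

class Severity(enum.Enum):  # stand-in for Present / CompassBearing
    LOW = 1
    MEDIUM = 2
    HIGH = 3

def dtype():
    # Enum members iterate in declaration order, which becomes the
    # category order.
    return CategoricalDtype(categories=[s.name for s in Severity],
                            ordered=True)

print(dtype())  # categories=['LOW', 'MEDIUM', 'HIGH'], ordered=True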
Example #4
    def test_rank_categorical(self):
        # GH issue #15420 rank incorrectly orders ordered categories

        # Test ascending/descending ranking for ordered categoricals
        exp = Series([1.0, 2.0, 3.0, 4.0, 5.0, 6.0])
        exp_desc = Series([6.0, 5.0, 4.0, 3.0, 2.0, 1.0])
        ordered = Series(
            ["first", "second", "third", "fourth", "fifth", "sixth"]
        ).astype(
            CategoricalDtype(
                categories=["first", "second", "third", "fourth", "fifth", "sixth"],
                ordered=True,
            )
        )
        assert_series_equal(ordered.rank(), exp)
        assert_series_equal(ordered.rank(ascending=False), exp_desc)

        # Unordered categoricals should be ranked as objects
        unordered = Series(
            ["first", "second", "third", "fourth", "fifth", "sixth"]
        ).astype(
            CategoricalDtype(
                categories=["first", "second", "third", "fourth", "fifth", "sixth"],
                ordered=False,
            )
        )
        exp_unordered = Series([2.0, 4.0, 6.0, 3.0, 1.0, 5.0])
        res = unordered.rank()
        assert_series_equal(res, exp_unordered)

        unordered1 = Series([1, 2, 3, 4, 5, 6]).astype(
            CategoricalDtype([1, 2, 3, 4, 5, 6], False)
        )
        exp_unordered1 = Series([1.0, 2.0, 3.0, 4.0, 5.0, 6.0])
        res1 = unordered1.rank()
        assert_series_equal(res1, exp_unordered1)

        # Test na_option for rank data
        na_ser = Series(
            ["first", "second", "third", "fourth", "fifth", "sixth", np.NaN]
        ).astype(
            CategoricalDtype(
                ["first", "second", "third", "fourth", "fifth", "sixth", "seventh"],
                True,
            )
        )

        exp_top = Series([2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 1.0])
        exp_bot = Series([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0])
        exp_keep = Series([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, np.NaN])

        assert_series_equal(na_ser.rank(na_option="top"), exp_top)
        assert_series_equal(na_ser.rank(na_option="bottom"), exp_bot)
        assert_series_equal(na_ser.rank(na_option="keep"), exp_keep)

        # Test na_option for rank data with ascending False
        exp_top = Series([7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0])
        exp_bot = Series([6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 7.0])
        exp_keep = Series([6.0, 5.0, 4.0, 3.0, 2.0, 1.0, np.NaN])

        assert_series_equal(na_ser.rank(na_option="top", ascending=False), exp_top)
        assert_series_equal(na_ser.rank(na_option="bottom", ascending=False), exp_bot)
        assert_series_equal(na_ser.rank(na_option="keep", ascending=False), exp_keep)

        # Test invalid values for na_option
        msg = "na_option must be one of 'keep', 'top', or 'bottom'"

        with pytest.raises(ValueError, match=msg):
            na_ser.rank(na_option="bad", ascending=False)

        # invalid type
        with pytest.raises(ValueError, match=msg):
            na_ser.rank(na_option=True, ascending=False)

        # Test with pct=True
        na_ser = Series(["first", "second", "third", "fourth", np.NaN]).astype(
            CategoricalDtype(["first", "second", "third", "fourth"], True)
        )
        exp_top = Series([0.4, 0.6, 0.8, 1.0, 0.2])
        exp_bot = Series([0.2, 0.4, 0.6, 0.8, 1.0])
        exp_keep = Series([0.25, 0.5, 0.75, 1.0, np.NaN])

        assert_series_equal(na_ser.rank(na_option="top", pct=True), exp_top)
        assert_series_equal(na_ser.rank(na_option="bottom", pct=True), exp_bot)
        assert_series_equal(na_ser.rank(na_option="keep", pct=True), exp_keep)
Example #5
def test_read_zinc_grid():
    expected_grid_info = dict(
        view=zincio.String("chart"),
        hisStart=zincio.Datetime(pd.Timestamp("2020-05-18T00:00:00-07:00"),
                                 tz="Los_Angeles"),
        hisEnd=zincio.Datetime(pd.Timestamp("2020-05-18T01:15:00-07:00"),
                               tz="Los_Angeles"),
        hisLimit=zincio.Number(10000),
        dis=zincio.String("Mon 18-May-2020"))
    expected_column_info = dict(
        ts=dict(
            disKey=zincio.String('ui::timestamp'),
            tz=zincio.String('Los_Angeles'),
            chartFormat=zincio.String('ka'),
        ),
        v0=dict(id=zincio.Ref('p:q01b001:r:0197767d-c51944e4',
                              'Building One VAV1-01 Eff Heat SP'),
                navName=zincio.String('Eff Heat SP'),
                point=zincio.MARKER,
                his=zincio.MARKER,
                siteRef=zincio.Ref('p:q01b001:r:8fc116f8-72c5320c',
                                   'Building One'),
                equipRef=zincio.Ref('p:q01b001:r:b78a8dcc-828caa1b', None),
                curVal=zincio.Number(65.972, '°F'),
                curStatus=zincio.String('ok'),
                kind=zincio.String('Number'),
                unit=zincio.String('°F'),
                tz=zincio.String('Los_Angeles'),
                sp=zincio.MARKER,
                temp=zincio.MARKER,
                cur=zincio.MARKER,
                haystackPoint=zincio.MARKER,
                air=zincio.MARKER,
                effective=zincio.MARKER,
                heating=zincio.MARKER),
        v1=dict(
            id=zincio.Ref('p:q01b001:r:e69a7401-f4b340ff',
                          'Building One VAV1-01 Eff Occupancy'),
            navName=zincio.String('Eff Occupancy'),
            point=zincio.MARKER,
            his=zincio.MARKER,
            siteRef=zincio.Ref('p:q01b001:r:8fc116f8-72c5320c',
                               'Building One'),
            equipRef=zincio.Ref('p:q01b001:r:b78a8dcc-828caa1b',
                                'Building One VAV1-01'),
            curVal=zincio.String('Occupied'),
            curStatus=zincio.String('ok'),
            kind=zincio.String('Str'),
            tz=zincio.String('Los_Angeles'),
            sensor=zincio.MARKER,
            cur=zincio.MARKER,
            haystackPoint=zincio.MARKER,
            hisCollectCov=zincio.MARKER,
            enum=zincio.String('Nul,Occupied,Unoccupied,Bypass,Standby'),
            effective=zincio.MARKER,
            occupied=zincio.MARKER,
        ),
        v2=dict(id=zincio.Ref('p:q01b001:r:dcfe87d9-cd034388',
                              'Building One VAV1-01 Damper Pos'),
                navName=zincio.String('Damper Pos'),
                point=zincio.MARKER,
                his=zincio.MARKER,
                siteRef=zincio.Ref('p:q01b001:r:8fc116f8-72c5320c',
                                   'Building One'),
                equipRef=zincio.Ref('p:q01b001:r:b78a8dcc-828caa1b',
                                    'Building One VAV1-01'),
                curVal=zincio.Number(41.5, '%'),
                curStatus=zincio.String('ok'),
                kind=zincio.String('Number'),
                unit=zincio.String('%'),
                tz=zincio.String('Los_Angeles'),
                sensor=zincio.MARKER,
                cur=zincio.MARKER,
                damper=zincio.MARKER,
                precision=zincio.Number(1.0),
                haystackPoint=zincio.MARKER,
                air=zincio.MARKER),
        v3=dict(id=zincio.Ref('p:q01b001:r:8fab195e-58ffca99',
                              'Building One VAV1-01 Occ Heat SP Offset'),
                navName=zincio.String('Occ Heat SP Offset'),
                point=zincio.MARKER,
                his=zincio.MARKER,
                siteRef=zincio.Ref('p:q01b001:r:8fc116f8-72c5320c',
                                   'Building One'),
                equipRef=zincio.Ref('p:q01b001:r:b78a8dcc-828caa1b',
                                    'Building One VAV1-01'),
                curVal=zincio.Number(-2.394, '°C'),
                curStatus=zincio.String('ok'),
                kind=zincio.String('Number'),
                unit=zincio.String('°C'),
                tz=zincio.String('Los_Angeles'),
                sp=zincio.MARKER,
                temp=zincio.MARKER,
                cur=zincio.MARKER,
                air=zincio.MARKER,
                occ=zincio.MARKER,
                writable=zincio.MARKER,
                writeStatus=zincio.String('unknown'),
                zone=zincio.MARKER,
                hisCollectInterval=zincio.Number(5.0, 'min'),
                heating=zincio.MARKER,
                offset=zincio.MARKER,
                writeLevel=zincio.Number(8.0, None),
                haystackPoint=zincio.MARKER,
                writeVal=zincio.Number(-10.0),
                actions=zincio.String(
                    'ver:\\"3.0\\"\\ndis,expr\\n\\"Override\\",'
                    '\\"pointOverride(\\$self, \\$val, \\$duration)\\"\\n'
                    '\\"Auto\\",\\"pointAuto(\\$self)\\"\\n')),
        v4=dict(
            id=zincio.Ref('p:q01b001:r:260ce2bb-2ef5065f',
                          'Building One VAV1-01 Air Flow'),
            navName=zincio.String('Air Flow'),
            point=zincio.MARKER,
            his=zincio.MARKER,
            siteRef=zincio.Ref('p:q01b001:r:8fc116f8-72c5320c',
                               'Building One'),
            equipRef=zincio.Ref('p:q01b001:r:b78a8dcc-828caa1b',
                                'Building One VAV1-01'),
            curVal=zincio.Number(117.6611, 'cfm'),
            curStatus=zincio.String('ok'),
            kind=zincio.String('Number'),
            unit=zincio.String('cfm'),
            tz=zincio.String('Los_Angeles'),
            sensor=zincio.MARKER,
            cur=zincio.MARKER,
        ))
    expected_index = pd.DatetimeIndex([
        pd.to_datetime('2020-05-17T23:47:08-07:00'),
        pd.to_datetime('2020-05-17T23:55:00-07:00'),
        pd.to_datetime('2020-05-18T00:00:00-07:00'),
        pd.to_datetime('2020-05-18T00:05:00-07:00'),
        pd.to_datetime('2020-05-18T01:13:09-07:00'),
    ],
                                      name='ts')
    expected_dataframe = pd.DataFrame(
        index=expected_index,
        data={
            ('@p:q01b001:r:0197767d-c51944e4 '
             '"Building One VAV1-01 Eff Heat SP"'): [
                np.nan,
                68.553,
                68.554,
                69.723,
                np.nan,
            ],
            ('@p:q01b001:r:e69a7401-f4b340ff '
             '"Building One VAV1-01 Eff Occupancy"'):
            pd.Series(['Occupied', '', '', '', 'Unoccupied'],
                      index=expected_index,
                      dtype=CategoricalDtype(categories=[
                          'Nul', 'Occupied', 'Unoccupied', 'Bypass', 'Standby'
                      ])),
            ('@p:q01b001:r:dcfe87d9-cd034388 '
             '"Building One VAV1-01 Damper Pos"'): [np.nan, 3, 7, 18, np.nan],
            ('@p:q01b001:r:8fab195e-58ffca99 '
             '"Building One VAV1-01 Occ Heat SP Offset"'): [
                np.nan,
                -1.984,
                -2.203,
                5.471,
                np.nan,
            ],
            '@p:q01b001:r:260ce2bb-2ef5065f "Building One VAV1-01 Air Flow"': [
                np.nan,
                118.65,
                62.0,
                np.nan,
                np.nan,
            ],
        })
    actual = zincio.read(FULL_GRID_FILE)
    expected = zincio.Grid(version=3,
                           grid_info=expected_grid_info,
                           column_info=expected_column_info,
                           data=expected_dataframe)
    assert_grid_equal(actual, expected)
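
One detail worth noting in the expected frame above: the Eff Occupancy column declares its enum values as categories, so any value outside that set parses as missing. In isolation:

import pandas as pd
from pandas.api.types import CategoricalDtype

occ = CategoricalDtype(['Nul', 'Occupied', 'Unoccupied', 'Bypass', 'Standby'])
s = pd.Series(['Occupied', '', 'Unoccupied'], dtype=occ)
print(s.isna().tolist())  # [False, True, False] -- '' is not a declared category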
Example #6
    def add_categories(
        self, new_categories: Union[pd.Index, Any, List], inplace: bool = False
    ) -> Optional["ps.Series"]:
        """
        Add new categories.

        `new_categories` will be included at the last/highest place in the
        categories and will be unused directly after this call.

        Parameters
        ----------
        new_categories : category or list-like of category
           The new categories to be included.
        inplace : bool, default False
           Whether or not to add the categories inplace or return a copy of
           this categorical with added categories.

        Returns
        -------
        Series or None
            Categorical with new categories added or None if ``inplace=True``.

        Raises
        ------
        ValueError
            If the new categories include old categories or do not validate as
            categories

        Examples
        --------
        >>> s = ps.Series(list("abbccc"), dtype="category")
        >>> s  # doctest: +SKIP
        0    a
        1    b
        2    b
        3    c
        4    c
        5    c
        dtype: category
        Categories (3, object): ['a', 'b', 'c']

        >>> s.cat.add_categories('x')  # doctest: +SKIP
        0    a
        1    b
        2    b
        3    c
        4    c
        5    c
        dtype: category
        Categories (4, object): ['a', 'b', 'c', 'x']
        """
        from pyspark.pandas.frame import DataFrame

        if is_list_like(new_categories):
            categories = list(new_categories)  # type: List
        else:
            categories = [new_categories]

        if any(cat in self.categories for cat in categories):
            raise ValueError(
                "new categories must not include old categories: {{{cats}}}".format(
                    cats=", ".join(set(str(cat) for cat in categories if cat in self.categories))
                )
            )

        internal = self._data._psdf._internal.with_new_spark_column(
            self._data._column_label,
            self._data.spark.column,
            field=self._data._internal.data_fields[0].copy(
                dtype=CategoricalDtype(list(self.categories) + categories, ordered=self.ordered)
            ),
        )
        if inplace:
            self._data._psdf._update_internal_frame(internal)
            return None
        else:
            psser = DataFrame(internal)._psser_for(self._data._column_label)
            return psser._with_new_scol(psser.spark.column, field=psser._internal.data_fields[0])
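
The plain pandas equivalent of the accessor above, runnable without Spark:

import pandas as pd

s = pd.Series(list("abbccc"), dtype="category")
s2 = s.cat.add_categories('x')  # 'x' is appended as an unused category
print(list(s2.cat.categories))  # ['a', 'b', 'c', 'x']
print((s2 == 'x').any())        # False: only the dtype changed, not the values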
Example #7
def pd_load_acquisition_csv(acquisition_path, **kwargs):
    """ Loads acquisition data

    Returns
    -------
    PD DataFrame
    """

    cols = [
        'loan_id', 'orig_channel', 'seller_name', 'orig_interest_rate',
        'orig_upb', 'orig_loan_term', 'orig_date', 'first_pay_date',
        'orig_ltv', 'orig_cltv', 'num_borrowers', 'dti',
        'borrower_credit_score', 'first_home_buyer', 'loan_purpose',
        'property_type', 'num_units', 'occupancy_status', 'property_state',
        'zip', 'mortgage_insurance_percent', 'product_type',
        'coborrow_credit_score', 'mortgage_insurance_type',
        'relocation_mortgage_indicator'
    ]

    dtypes = {
        "loan_id":
        np.int64,
        "orig_channel":
        CategoricalDtype(['B', 'C', 'R']),
        "seller_name":
        str,
        "orig_interest_rate":
        np.float64,
        "orig_upb":
        np.int64,
        "orig_loan_term":
        np.int64,
        "orig_date":
        str,
        "first_pay_date":
        str,
        "orig_ltv":
        np.float64,
        "orig_cltv":
        np.float64,
        "num_borrowers":
        np.float64,
        "dti":
        np.float64,
        "borrower_credit_score":
        np.float64,
        "first_home_buyer":
        CategoricalDtype(['N', 'U', 'Y']),
        "loan_purpose":
        CategoricalDtype(['C', 'P', 'R', 'U']),
        "property_type":
        CategoricalDtype(['CO', 'CP', 'MH', 'PU', 'SF']),
        "num_units":
        np.int64,
        "occupancy_status":
        CategoricalDtype(['I', 'P', 'S']),
        "property_state":
        CategoricalDtype([
            'AK', 'AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 'GA',
            'HI', 'IA', 'ID', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'ME',
            'MI', 'MN', 'MO', 'MS', 'MT', 'NC', 'ND', 'NE', 'NH', 'NJ', 'NM',
            'NV', 'NY', 'OH', 'OK', 'OR', 'PA', 'PR', 'RI', 'SC', 'SD', 'TN',
            'TX', 'UT', 'VA', 'VI', 'VT', 'WA', 'WI', 'WV', 'WY'
        ]),
        "zip":
        np.int64,
        "mortgage_insurance_percent":
        np.float64,
        "product_type":
        CategoricalDtype(['FRM']),
        "coborrow_credit_score":
        np.float64,
        "mortgage_insurance_type":
        np.float64,
        "relocation_mortgage_indicator":
        CategoricalDtype(['N', 'Y']),
    }
    print(acquisition_path)

    return pd.read_csv(acquisition_path,
                       names=cols,
                       delimiter='|',
                       dtype=dtypes,
                       parse_dates=[6, 7])
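
Why pass CategoricalDtype objects in the dtype map: the parser builds categorical columns directly and treats out-of-vocabulary values as missing. A small sketch with inline data (hypothetical two-column file):

import io

import pandas as pd
from pandas.api.types import CategoricalDtype

csv = io.StringIO("loan_id|mod_flag\n1|Y\n2|N\n3|X\n")
df = pd.read_csv(csv, delimiter='|',
                 dtype={'mod_flag': CategoricalDtype(['N', 'Y'])})
print(df['mod_flag'].tolist())  # ['Y', 'N', nan] -- 'X' is outside the categories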
Example #8
def pd_load_acquisition_csv(acquisition_path, **kwargs):
    """ Loads acquisition data

    Returns
    -------
    PD DataFrame
    """

    cols = [
        'loan_id', 'orig_channel', 'seller_name', 'orig_interest_rate',
        'orig_upb', 'orig_loan_term', 'orig_date', 'first_pay_date',
        'orig_ltv', 'orig_cltv', 'num_borrowers', 'dti',
        'borrower_credit_score', 'first_home_buyer', 'loan_purpose',
        'property_type', 'num_units', 'occupancy_status', 'property_state',
        'zip', 'mortgage_insurance_percent', 'product_type',
        'coborrow_credit_score', 'mortgage_insurance_type',
        'relocation_mortgage_indicator'
    ]
    dtypes = {
        "loan_id":
        np.int64,
        "orig_channel":
        CategoricalDtype(['B', 'C', 'R']),
        "seller_name":
        str,
        "orig_interest_rate":
        np.float64,
        "orig_upb":
        np.int64,
        "orig_loan_term":
        np.int64,
        "orig_date":
        str,
        "first_pay_date":
        str,
        "orig_ltv":
        np.float64,
        "orig_cltv":
        np.float64,
        "num_borrowers":
        np.float64,
        "dti":
        np.float64,
        "borrower_credit_score":
        np.float64,
        "first_home_buyer":
        CategoricalDtype(['N', 'U', 'Y']),
        "loan_purpose":
        CategoricalDtype(['C', 'P', 'R', 'U']),
        "property_type":
        CategoricalDtype(['CO', 'CP', 'MH', 'PU', 'SF']),
        "num_units":
        np.int64,
        "occupancy_status":
        CategoricalDtype(['I', 'P', 'S']),
        "property_state":
        CategoricalDtype([
            'AK', 'AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 'GA',
            'HI', 'IA', 'ID', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'ME',
            'MI', 'MN', 'MO', 'MS', 'MT', 'NC', 'ND', 'NE', 'NH', 'NJ', 'NM',
            'NV', 'NY', 'OH', 'OK', 'OR', 'PA', 'PR', 'RI', 'SC', 'SD', 'TN',
            'TX', 'UT', 'VA', 'VI', 'VT', 'WA', 'WI', 'WV', 'WY'
        ]),
        "zip":
        np.int64,
        "mortgage_insurance_percent":
        np.float64,
        "product_type":
        CategoricalDtype(['FRM']),
        "coborrow_credit_score":
        np.float64,
        "mortgage_insurance_type":
        np.float64,
        "relocation_mortgage_indicator":
        CategoricalDtype(['N', 'Y']),
    }

    return ddf.read_csv(acquisition_path,
                        names=cols,
                        delimiter='|',
                        dtype=dtypes,
                        parse_dates=[6, 7],
                        assume_missing=True)
Example #9
def pd_load_performance_csv(performance_path, **kwargs):
    """ Loads performance data

    Returns
    -------
    PD DataFrame
    """

    cols = [
        "loan_id", "monthly_reporting_period", "servicer", "interest_rate",
        "current_actual_upb", "loan_age", "remaining_months_to_legal_maturity",
        "adj_remaining_months_to_maturity", "maturity_date", "msa",
        "current_loan_delinquency_status", "mod_flag", "zero_balance_code",
        "zero_balance_effective_date", "last_paid_installment_date",
        "foreclosed_after", "disposition_date", "foreclosure_costs",
        "prop_preservation_and_repair_costs", "asset_recovery_costs",
        "misc_holding_expenses", "holding_taxes", "net_sale_proceeds",
        "credit_enhancement_proceeds", "repurchase_make_whole_proceeds",
        "other_foreclosure_proceeds", "non_interest_bearing_upb",
        "principal_forgiveness_upb", "repurchase_make_whole_proceeds_flag",
        "foreclosure_principal_write_off_amount",
        "servicing_activity_indicator"
    ]
    dtypes = {
        "loan_id":
        np.int64,
        "monthly_reporting_period":
        str,
        "servicer":
        str,
        "interest_rate":
        np.float64,
        "current_actual_upb":
        np.float64,
        "loan_age":
        np.float64,
        "remaining_months_to_legal_maturity":
        np.float64,
        "adj_remaining_months_to_maturity":
        np.float64,
        "maturity_date":
        str,
        "msa":
        np.float64,
        "current_loan_delinquency_status":
        np.int32,
        "mod_flag":
        CategoricalDtype(['N', 'Y']),
        "zero_balance_code":
        CategoricalDtype(['01', '02', '06', '09', '03', '15', '16']),
        "zero_balance_effective_date":
        str,
        "last_paid_installment_date":
        str,
        "foreclosed_after":
        str,
        "disposition_date":
        str,
        "foreclosure_costs":
        np.float64,
        "prop_preservation_and_repair_costs":
        np.float64,
        "asset_recovery_costs":
        np.float64,
        "misc_holding_expenses":
        np.float64,
        "holding_taxes":
        np.float64,
        "net_sale_proceeds":
        np.float64,
        "credit_enhancement_proceeds":
        np.float64,
        "repurchase_make_whole_proceeds":
        np.float64,
        "other_foreclosure_proceeds":
        np.float64,
        "non_interest_bearing_upb":
        np.float64,
        "principal_forgiveness_upb":
        np.float64,
        "repurchase_make_whole_proceeds_flag":
        CategoricalDtype(['N', 'Y']),
        "foreclosure_principal_write_off_amount":
        np.float64,
        "servicing_activity_indicator":
        CategoricalDtype(['N', 'Y']),
    }

    return ddf.read_csv(performance_path,
                        names=cols,
                        delimiter='|',
                        dtype=dtypes,
                        parse_dates=[1, 8, 13, 14, 15, 16],
                        assume_missing=True)
Example #10
def cate_type(levels):
    return CategoricalDtype(categories=levels, ordered=True)
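
Usage sketch for the helper above: an ordered dtype unlocks order-aware operations such as min/max and comparisons.

import pandas as pd
from pandas.api.types import CategoricalDtype

def cate_type(levels):
    return CategoricalDtype(categories=levels, ordered=True)

s = pd.Series(['medium', 'low', 'high']).astype(
    cate_type(['low', 'medium', 'high']))
print(s.min(), s.max())      # low high
print((s > 'low').tolist())  # [True, False, True]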
Example #11
        (np.datetime64, dt.timestamp),
        (np.timedelta64, dt.interval),
    ],
)
def test_numpy_dtype(numpy_dtype, ibis_dtype):
    assert dt.dtype(np.dtype(numpy_dtype)) == ibis_dtype


@pytest.mark.parametrize(
    ('pandas_dtype', 'ibis_dtype'),
    [
        (
            DatetimeTZDtype(tz='US/Eastern', unit='ns'),
            dt.Timestamp('US/Eastern'),
        ),
        (CategoricalDtype(), dt.Category()),
    ],
)
def test_pandas_dtype(pandas_dtype, ibis_dtype):
    assert dt.dtype(pandas_dtype) == ibis_dtype


def test_series_to_ibis_literal():
    values = [1, 2, 3, 4]
    s = pd.Series(values)

    expr = ir.as_value_expr(s)
    expected = ir.sequence(list(s))
    assert expr.equals(expected)

Example #12
def convert_json_field_to_pandas_type(field):
    """
    Converts a JSON field descriptor into its corresponding NumPy / pandas type

    Parameters
    ----------
    field
        A JSON field descriptor

    Returns
    -------
    dtype

    Raises
    ------
    ValueError
        If the type of the provided field is unknown or currently unsupported

    Examples
    --------
    >>> convert_json_field_to_pandas_type({'name': 'an_int',
                                           'type': 'integer'})
    'int64'
    >>> convert_json_field_to_pandas_type({'name': 'a_categorical',
                                           'type': 'any',
                                           'constraints': {'enum': [
                                                          'a', 'b', 'c']},
                                           'ordered': True})
    'CategoricalDtype(categories=['a', 'b', 'c'], ordered=True)'
    >>> convert_json_field_to_pandas_type({'name': 'a_datetime',
                                           'type': 'datetime'})
    'datetime64[ns]'
    >>> convert_json_field_to_pandas_type({'name': 'a_datetime_with_tz',
                                           'type': 'datetime',
                                           'tz': 'US/Central'})
    'datetime64[ns, US/Central]'
    """
    typ = field['type']
    if typ == 'string':
        return 'object'
    elif typ == 'integer':
        return 'int64'
    elif typ == 'number':
        return 'float64'
    elif typ == 'boolean':
        return 'bool'
    elif typ == 'duration':
        return 'timedelta64'
    elif typ == 'datetime':
        if field.get('tz'):
            return 'datetime64[ns, {tz}]'.format(tz=field['tz'])
        else:
            return 'datetime64[ns]'
    elif typ == 'any':
        if 'constraints' in field and 'ordered' in field:
            return CategoricalDtype(categories=field['constraints']['enum'],
                                    ordered=field['ordered'])
        else:
            return 'object'

    raise ValueError("Unsupported or invalid field type: {}".format(typ))
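
Usage sketch for the converter above (assumes the function and its CategoricalDtype import are in scope; the field descriptor follows the Table Schema layout shown in the docstring):

field = {'name': 'grade', 'type': 'any',
         'constraints': {'enum': ['poor', 'fair', 'good']},
         'ordered': True}
print(convert_json_field_to_pandas_type(field))
# CategoricalDtype(categories=['poor', 'fair', 'good'], ordered=True)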
Example #13
            user_embedding_similarity,
        ]
        sim_weights = [
            cast_weight,
            director_weight,
            keywords_weight,
            overview_weight,
            user_embedding_weight,
        ]
        sim_mat = generate_weighted_similarity_matrix(arrays=sim_matrices,
                                                      weights=sim_weights)
        recommendations = get_recommendations(
            films=data,
            titles=liked_films,
            similarity_matrix=sim_mat,
            top_n=config.POSTERS_PER_ROW * config.NUM_POSTER_ROWS,
        )
        filtered_films = data[data["title"].isin(recommendations)]

        recommendation_order = CategoricalDtype(recommendations, ordered=True)
        filtered_films["title"] = filtered_films["title"].astype(
            recommendation_order)
        filtered_films = filtered_films.sort_values("title")

        display_film_posters(
            streamlit=st,
            data=filtered_films,
            num_rows=config.NUM_POSTER_ROWS,
            posters_per_row=config.POSTERS_PER_ROW,
        )
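
The fragment above uses an ordered CategoricalDtype purely as a sort key for the recommendation list; the trick in isolation (toy titles, not the app's data):

import pandas as pd
from pandas.api.types import CategoricalDtype

films = pd.DataFrame({'title': ['Alien', 'Heat', 'Up'],
                      'year': [1979, 1995, 2009]})
ranking = CategoricalDtype(['Up', 'Alien', 'Heat'], ordered=True)
films['title'] = films['title'].astype(ranking)
print(films.sort_values('title')['year'].tolist())  # [2009, 1979, 1995]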
Example #14
def get_and_filter_panos_by_osm_rid(road_id,
                                    df_edges,
                                    offset=1,
                                    vis=False,
                                    debug=False,
                                    outlier_filter=True,
                                    mul_factor=2,
                                    verbose=False):
    """Get the panos by OSM rid, and then filtered by some conditions.

    Args:
        road_id (int, optional): [description]. Defaults to 243387686.
        vis (bool, optional): [description]. Defaults to False.
        offset (int, optional): [whether the attribute `lane_num` stores the real lane count or the real lane-line count; if `lane_num` represents the line count the offset is 1, otherwise 0]. Defaults to 1.

    Returns:
        matchingPano [dataframe]: [description]
        fig [plt.figure]: Figure
    """
    # step 1: matching panos
    atts = [
        'index', 'RID', 'Name', 'geometry', 'lane_num', 'frechet_dis', 'angel',
        'osm_road_id', 'osm_road_index', 'related_pos', 'link'
    ]
    try:
        if road_id > 0:
            matching = get_panos_of_road_and_indentify_lane_type_by_id(
                road_id, df_edges, False)
            matching = matching[atts].merge(df_edges[['s', 'e']],
                                            left_on='osm_road_index',
                                            right_index=True)
            road_name = df_edges.query(f'rid=={road_id}').name.unique()[0]
        else:
            # FIXME -208128058 Gaoxin Zhong San Dao: street view was only traversed once
            df_tmp = _get_revert_df_edges(road_id, df_edges)
            road_name = df_tmp.name.unique()[0]
            matching = get_panos_of_road_and_indentify_lane_type_by_id(
                road_id, df_tmp, False)
            matching = matching[atts].merge(df_tmp[['s', 'e']],
                                            left_on='osm_road_index',
                                            right_index=True)

        if matching.shape[0] == 0:
            print(
                f"{sys._getframe(0).f_code.co_name} {road_id}, no matching recods"
            )
            return None, None
    except Exception:
        print(f"{sys._getframe(0).f_code.co_name} {road_id}, process error")
        return None, None

    rids = []
    for i in matching.RID.values:
        if i in rids:
            continue
        rids.append(i)
    rids_ordered = CategoricalDtype(rids, ordered=True)

    # filter outliers -> compute per-road-segment statistics
    points = DB_panos.query(f"RID in {rids}").dropna()
    tmp = points.groupby('RID').apply(lambda x: _panos_filter(x)).drop(
        columns='RID').reset_index()

    if outlier_filter and tmp.shape[0] != 0:
        if verbose:
            origin_size = tmp.shape[0]

        _mean, _std = tmp.lane_num.mean(), tmp.lane_num.std()
        if not np.isnan(_mean) and not np.isnan(_std):
            interval = (_mean - mul_factor * _std, _mean + mul_factor * _std)
            tmp.query(f" {interval[0]} < lane_num < {interval[1]}",
                      inplace=True)
            if verbose:
                print(
                    f"{sys._getframe(0).f_code.co_name} outlier_filter, size: {origin_size} -> {tmp.shape[0]}"
                )

    if tmp.shape[0] == 0:
        print(
            f"{sys._getframe(0).f_code.co_name} {road_id}, no matching records after filter algorithm"
        )
        return None, None

    # reorder the panos
    tmp.loc[:, 'RID'] = tmp['RID'].astype(rids_ordered)
    tmp.sort_values(by=['RID', 'Order'], inplace=True)
    tmp.reset_index(drop=True, inplace=True)

    if offset:
        tmp.loc[:, 'lane_num'] = tmp.loc[:, 'lane_num'] - 1

    if vis:
        fig, ax = map_visualize(tmp, scale=.1, color='gray', figsize=(15, 15))
        df_edges.query(f'rid =={road_id}').plot(ax=ax,
                                                linestyle='--',
                                                color='black',
                                                label='OSM road',
                                                alpha=.5)
        tmp.loc[:, 'lane_num_str'] = tmp.loc[:, 'lane_num'].astype(str)
        tmp.plot(ax=ax, column='lane_num_str', legend=True)

        _mean, _std = tmp.lane_num.mean(), tmp.lane_num.std()
        interval = (round(_mean - mul_factor * _std,
                          1), round(_mean + mul_factor * _std, 1))
        ax.set_title(
            f"{road_id}, {road_name}, mean {_mean:.1f}, std {_std:.1f}, {interval}",
            fontsize=18)
        if debug:
            try:
                fig.savefig(
                    f'../cache/matching_records/{road_name}_{road_id}.jpg',
                    dpi=300)
            except Exception:
                print(road_name, road_id)
        plt.tight_layout(pad=0.1)
        plt.close()

        return tmp, fig

    return tmp, None
Example #15
    def reorder_categories(
        self,
        new_categories: Union[pd.Index, List],
        ordered: Optional[bool] = None,
        inplace: bool = False,
    ) -> Optional["ps.Series"]:
        """
        Reorder categories as specified in new_categories.

        `new_categories` need to include all old categories and no new category
        items.

        Parameters
        ----------
        new_categories : Index-like
           The categories in new order.
        ordered : bool, optional
           Whether or not the categorical is treated as an ordered categorical.
           If not given, do not change the ordered information.
        inplace : bool, default False
           Whether or not to reorder the categories inplace or return a copy of
           this categorical with reordered categories.

           .. deprecated:: 3.2.0

        Returns
        -------
        cat : Series or None
            Categorical with reordered categories or None if ``inplace=True``.

        Raises
        ------
        ValueError
            If the new categories do not contain all old category items or any
            new ones

        See Also
        --------
        rename_categories : Rename categories.
        add_categories : Add new categories.
        remove_categories : Remove the specified categories.
        remove_unused_categories : Remove categories which are not used.
        set_categories : Set the categories to the specified ones.

        Examples
        --------
        >>> s = ps.Series(list("abbccc"), dtype="category")
        >>> s  # doctest: +SKIP
        0    a
        1    b
        2    b
        3    c
        4    c
        5    c
        dtype: category
        Categories (3, object): ['a', 'b', 'c']

        >>> s.cat.reorder_categories(['c', 'b', 'a'], ordered=True)  # doctest: +SKIP
        0    a
        1    b
        2    b
        3    c
        4    c
        5    c
        dtype: category
        Categories (3, object): ['c' < 'b' < 'a']
        """
        if inplace:
            warnings.warn(
                "The `inplace` parameter in reorder_categories is deprecated "
                "and will be removed in a future version.",
                FutureWarning,
            )

        if not is_list_like(new_categories):
            raise TypeError(
                "Parameter 'new_categories' must be list-like, was '{}'".
                format(new_categories))
        elif len(set(new_categories)) != len(set(self.categories)) or any(
                cat not in self.categories for cat in new_categories):
            raise ValueError(
                "items in new_categories are not the same as in old categories"
            )

        if ordered is None:
            ordered = self.ordered

        if new_categories == list(self.categories) and ordered == self.ordered:
            if inplace:
                return None
            else:
                psser = self._data
                return psser._with_new_scol(
                    psser.spark.column, field=psser._internal.data_fields[0])
        else:
            dtype = CategoricalDtype(categories=new_categories,
                                     ordered=ordered)
            psser = self._data.astype(dtype)

            if inplace:
                internal = self._data._psdf._internal.with_new_spark_column(
                    self._data._column_label,
                    psser.spark.column,
                    field=psser._internal.data_fields[0],
                )
                self._data._psdf._update_internal_frame(internal)
                return None
            else:
                return psser
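
The plain pandas equivalent of reorder_categories, runnable without Spark:

import pandas as pd

s = pd.Series(list("abbccc"), dtype="category")
s2 = s.cat.reorder_categories(['c', 'b', 'a'], ordered=True)
print(list(s2.cat.categories))  # ['c', 'b', 'a']
print(s2.min())                 # c -- now the lowest category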
Example #16
start_station_counter.most_common(10)

## make pandas data frame for visualizing with ggplot/plotnine:
dat_start_station_freq = pd.DataFrame(
    start_station_counter.most_common(20),
    columns = ['start_station_code', 'frequency'])
dat_start_station_freq.rename(index = dat_start_station_freq['start_station_code'], inplace = True)

## frequency series (for sorting):
## (pandas series with index that corresponds to categories):
dat_start_station_freq['frequency']

## create list for sorting:
#station_list = dat_start_station_freq['start_station_code'].value_counts().index.tolist()
station_list = dat_start_station_freq['frequency'].index.tolist()
station_cat = CategoricalDtype(categories=station_list, ordered=True)
dat_start_station_freq['start_station_code_cat'] = \
    dat_start_station_freq['start_station_code'].astype(str).astype(station_cat)

## plot counter data (frequency table, with identity relation):
## (sorting does not work here)
%matplotlib inline
ggplot(dat_start_station_freq, aes(x = 'start_station_code_cat', y = 'frequency')) + \
    geom_bar(stat = 'identity') + \
    coord_flip()

## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
## total number of trips
## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##

## total number of trips:
Example #17
    def set_categories(
        self,
        new_categories: Union[pd.Index, List],
        ordered: Optional[bool] = None,
        rename: bool = False,
        inplace: bool = False,
    ) -> Optional["ps.Series"]:
        """
        Set the categories to the specified new_categories.

        `new_categories` can include new categories (which will result in
        unused categories) or remove old categories (which results in values
        set to NaN). If `rename=True`, the categories will simply be renamed
        (fewer or more items than in the old categories will result in values
        set to NaN or in unused categories, respectively).

        This method can be used to perform more than one action of adding,
        removing, and reordering simultaneously, and is therefore faster than
        performing the individual steps via the more specialised methods.

        On the other hand, this method does not do checks (e.g., whether the
        old categories are included in the new categories on a reorder), which
        can result in surprising changes, for example when using special
        string dtypes, which do not consider an S1 string equal to a
        single-character Python string.

        Parameters
        ----------
        new_categories : Index-like
           The categories in new order.
        ordered : bool, optional
           Whether or not the categorical is treated as an ordered categorical.
           If not given, do not change the ordered information.
        rename : bool, default False
           Whether or not the new_categories should be considered as a rename
           of the old categories or as reordered categories.
        inplace : bool, default False
           Whether or not to reorder the categories in-place or return a copy
           of this categorical with reordered categories.

           .. deprecated:: 3.2.0

        Returns
        -------
        Series with reordered categories or None if inplace.

        Raises
        ------
        ValueError
            If new_categories does not validate as categories

        See Also
        --------
        rename_categories : Rename categories.
        reorder_categories : Reorder categories.
        add_categories : Add new categories.
        remove_categories : Remove the specified categories.
        remove_unused_categories : Remove categories which are not used.

        Examples
        --------
        >>> s = ps.Series(list("abbccc"), dtype="category")
        >>> s  # doctest: +SKIP
        0    a
        1    b
        2    b
        3    c
        4    c
        5    c
        dtype: category
        Categories (3, object): ['a', 'b', 'c']

        >>> s.cat.set_categories(['b', 'c'])  # doctest: +SKIP
        0    NaN
        1      b
        2      b
        3      c
        4      c
        5      c
        dtype: category
        Categories (2, object): ['b', 'c']

        >>> s.cat.set_categories([1, 2, 3], rename=True)  # doctest: +SKIP
        0    1
        1    2
        2    2
        3    3
        4    3
        5    3
        dtype: category
        Categories (3, int64): [1, 2, 3]

        >>> s.cat.set_categories([1, 2, 3], rename=True, ordered=True)  # doctest: +SKIP
        0    1
        1    2
        2    2
        3    3
        4    3
        5    3
        dtype: category
        Categories (3, int64): [1 < 2 < 3]
        """
        from pyspark.pandas.frame import DataFrame

        if inplace:
            warnings.warn(
                "The `inplace` parameter in set_categories is deprecated "
                "and will be removed in a future version.",
                FutureWarning,
            )

        if not is_list_like(new_categories):
            raise TypeError(
                "Parameter 'new_categories' must be list-like, was '{}'".
                format(new_categories))

        if ordered is None:
            ordered = self.ordered

        new_dtype = CategoricalDtype(new_categories, ordered=ordered)
        scol = self._data.spark.column

        if rename:
            new_scol = (F.when(
                scol >= len(new_categories),
                SF.lit(-1).cast(
                    self._data.spark.data_type)).otherwise(scol).alias(
                        self._data._internal.data_spark_column_names[0]))

            internal = self._data._psdf._internal.with_new_spark_column(
                self._data._column_label,
                new_scol,
                field=self._data._internal.data_fields[0].copy(
                    dtype=new_dtype),
            )

            if inplace:
                self._data._psdf._update_internal_frame(internal)
                return None
            else:
                psser = DataFrame(internal)._psser_for(
                    self._data._column_label)
                return psser._with_new_scol(
                    psser.spark.column, field=psser._internal.data_fields[0])
        else:
            psser = self._data.astype(new_dtype)
            if inplace:
                internal = self._data._psdf._internal.with_new_spark_column(
                    self._data._column_label,
                    psser.spark.column,
                    field=psser._internal.data_fields[0],
                )
                self._data._psdf._update_internal_frame(internal)
                return None
            else:
                return psser
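
The plain pandas equivalent of set_categories; values outside the new category set become NaN:

import pandas as pd

s = pd.Series(list("abbccc"), dtype="category")
s2 = s.cat.set_categories(['b', 'c'])
print(s2.isna().sum())          # 1 -- the single 'a' became NaN
print(list(s2.cat.categories))  # ['b', 'c']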
Example #18
    def test_astype_categorical_to_other(self):

        df = DataFrame({'value': np.random.randint(0, 10000, 100)})
        labels = ["{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500)]
        cat_labels = Categorical(labels, labels)

        df = df.sort_values(by=['value'], ascending=True)
        df['value_group'] = pd.cut(df.value,
                                   range(0, 10500, 500),
                                   right=False,
                                   labels=cat_labels)

        s = df['value_group']
        expected = s
        tm.assert_series_equal(s.astype('category'), expected)
        tm.assert_series_equal(s.astype(CategoricalDtype()), expected)
        msg = (r"could not convert string to float|"
               r"invalid literal for float\(\)")
        with pytest.raises(ValueError, match=msg):
            s.astype('float64')

        cat = Series(Categorical(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']))
        exp = Series(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c'])
        tm.assert_series_equal(cat.astype('str'), exp)
        s2 = Series(Categorical(['1', '2', '3', '4']))
        exp2 = Series([1, 2, 3, 4]).astype(int)
        tm.assert_series_equal(s2.astype('int'), exp2)

        # object don't sort correctly, so just compare that we have the same
        # values
        def cmp(a, b):
            tm.assert_almost_equal(np.sort(np.unique(a)),
                                   np.sort(np.unique(b)))

        expected = Series(np.array(s.values), name='value_group')
        cmp(s.astype('object'), expected)
        cmp(s.astype(np.object_), expected)

        # array conversion
        tm.assert_almost_equal(np.array(s), np.array(s.values))

        # valid conversion
        for valid in [
                lambda x: x.astype('category'),
                lambda x: x.astype(CategoricalDtype()),
                lambda x: x.astype('object').astype('category'),
                lambda x: x.astype('object').astype(CategoricalDtype())
        ]:

            result = valid(s)
            # compare series values
            # internal .categories can't be compared because it is sorted
            tm.assert_series_equal(result, s, check_categorical=False)

        # invalid conversion (these are NOT a dtype)
        msg = (r"invalid type <class 'pandas\.core\.arrays\.categorical\."
               "Categorical'> for astype")
        for invalid in [
                lambda x: x.astype(Categorical),
                lambda x: x.astype('object').astype(Categorical)
        ]:
            with pytest.raises(TypeError, match=msg):
                invalid(s)
Example #19
    def remove_categories(
        self, removals: Union[pd.Index, Any, List], inplace: bool = False
    ) -> Optional["ps.Series"]:
        """
        Remove the specified categories.

        `removals` must be included in the old categories. Values which were in
        the removed categories will be set to NaN

        Parameters
        ----------
        removals : category or list of categories
           The categories which should be removed.
        inplace : bool, default False
           Whether or not to remove the categories inplace or return a copy of
           this categorical with removed categories.

        Returns
        -------
        Series or None
            Categorical with removed categories or None if ``inplace=True``.

        Raises
        ------
        ValueError
            If the removals are not contained in the categories

        Examples
        --------
        >>> s = ps.Series(list("abbccc"), dtype="category")
        >>> s  # doctest: +SKIP
        0    a
        1    b
        2    b
        3    c
        4    c
        5    c
        dtype: category
        Categories (3, object): ['a', 'b', 'c']

        >>> s.cat.remove_categories('b')  # doctest: +SKIP
        0      a
        1    NaN
        2    NaN
        3      c
        4      c
        5      c
        dtype: category
        Categories (2, object): ['a', 'c']
        """
        if is_list_like(removals):
            categories = [cat for cat in removals if cat is not None]  # type: List
        elif removals is None:
            categories = []
        else:
            categories = [removals]

        if any(cat not in self.categories for cat in categories):
            raise ValueError(
                "removals must all be in old categories: {{{cats}}}".format(
                    cats=", ".join(
                        set(str(cat) for cat in categories if cat not in self.categories)
                    )
                )
            )

        if len(categories) == 0:
            if inplace:
                return None
            else:
                psser = self._data
                return psser._with_new_scol(
                    psser.spark.column, field=psser._internal.data_fields[0]
                )
        else:
            dtype = CategoricalDtype(
                [cat for cat in self.categories if cat not in categories], ordered=self.ordered
            )
            psser = self._data.astype(dtype)

            if inplace:
                internal = self._data._psdf._internal.with_new_spark_column(
                    self._data._column_label,
                    psser.spark.column,
                    field=psser._internal.data_fields[0],
                )
                self._data._psdf._update_internal_frame(internal)
                return None
            else:
                return psser
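
The plain pandas equivalent of remove_categories:

import pandas as pd

s = pd.Series(list("abbccc"), dtype="category")
print(s.cat.remove_categories('b').tolist())  # ['a', nan, nan, 'c', 'c', 'c']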
Example #20
def dtype():
    return CategoricalDtype()
Example #21
def pd_load_performance_csv(performance_path, **kwargs):
    """ Loads performance data

    Returns
    -------
    PD DataFrame
    """

    cols = [
        "loan_id", "monthly_reporting_period", "servicer", "interest_rate",
        "current_actual_upb", "loan_age", "remaining_months_to_legal_maturity",
        "adj_remaining_months_to_maturity", "maturity_date", "msa",
        "current_loan_delinquency_status", "mod_flag", "zero_balance_code",
        "zero_balance_effective_date", "last_paid_installment_date",
        "foreclosed_after", "disposition_date", "foreclosure_costs",
        "prop_preservation_and_repair_costs", "asset_recovery_costs",
        "misc_holding_expenses", "holding_taxes", "net_sale_proceeds",
        "credit_enhancement_proceeds", "repurchase_make_whole_proceeds",
        "other_foreclosure_proceeds", "non_interest_bearing_upb",
        "principal_forgiveness_upb", "repurchase_make_whole_proceeds_flag",
        "foreclosure_principal_write_off_amount",
        "servicing_activity_indicator"
    ]
    dtypes = {
        "loan_id":
        np.int64,
        "monthly_reporting_period":
        str,
        "servicer":
        str,
        "interest_rate":
        np.float64,
        "current_actual_upb":
        np.float64,
        "loan_age":
        np.float64,
        "remaining_months_to_legal_maturity":
        np.float64,
        "adj_remaining_months_to_maturity":
        np.float64,
        "maturity_date":
        str,
        "msa":
        np.float64,
        "current_loan_delinquency_status":
        np.int32,
        "mod_flag":
        CategoricalDtype(['N', 'Y']),
        "zero_balance_code":
        CategoricalDtype(['01', '02', '06', '09', '03', '15', '16']),
        "zero_balance_effective_date":
        str,
        "last_paid_installment_date":
        str,
        "foreclosed_after":
        str,
        "disposition_date":
        str,
        "foreclosure_costs":
        np.float64,
        "prop_preservation_and_repair_costs":
        np.float64,
        "asset_recovery_costs":
        np.float64,
        "misc_holding_expenses":
        np.float64,
        "holding_taxes":
        np.float64,
        "net_sale_proceeds":
        np.float64,
        "credit_enhancement_proceeds":
        np.float64,
        "repurchase_make_whole_proceeds":
        np.float64,
        "other_foreclosure_proceeds":
        np.float64,
        "non_interest_bearing_upb":
        np.float64,
        "principal_forgiveness_upb":
        np.float64,
        "repurchase_make_whole_proceeds_flag":
        CategoricalDtype(['N', 'Y']),
        "foreclosure_principal_write_off_amount":
        np.float64,
        "servicing_activity_indicator":
        CategoricalDtype(['N', 'Y']),
    }

    print(performance_path)

    return pd.read_csv(performance_path,
                       names=cols,
                       delimiter='|',
                       dtype=dtypes,
                       parse_dates=[1, 8, 13, 14, 15, 16])
Example #22
# Load the data.
path_train = 'data/heart_disease_train.csv'
path_test = 'data/heart_disease_test.csv'
target_column_name = "diagnosis"
gender_options = ['male', 'female']
chest_pain_options = ['typical angina', 'asymptomatic', 'non-angina pain', 'atypical angina']
fasting_blood_sugar_greater_than_120_options = [True, False]
resting_ecg_result_options = ['probable or definite left ventricular hypertrophy', 'normal', 'ST-T wave abnormality']
exercise_induced_angina_options = ['no', 'yes']
exercise_st_slope_options = ['downsloping', 'flat', 'upsloping']
fluoroscopy_vessels_colored_options = ['0', '1', '2', '3']
thallium_stress_test_options = ['fixed defect', 'normal', 'reversible defect']
diagnosis_options = ['Negative', 'Positive']
dtype = {
	'age': np.float64,
	'gender': CategoricalDtype(categories=gender_options),
	'chest_pain': CategoricalDtype(categories=chest_pain_options),
	'resting_blood_pressure': np.float64,
	'cholesterol': np.float64,
	'fasting_blood_sugar_greater_than_120': CategoricalDtype(categories=fasting_blood_sugar_greater_than_120_options),
	'resting_ecg_result': CategoricalDtype(categories=resting_ecg_result_options),
	'exercise_max_heart_rate': np.float64,
	'exercise_induced_angina': CategoricalDtype(categories=exercise_induced_angina_options),
	'exercise_st_depression': np.float64,
	'exercise_st_slope': CategoricalDtype(categories=exercise_st_slope_options),
	'fluoroscopy_vessels_colored': CategoricalDtype(categories=fluoroscopy_vessels_colored_options),
	'thallium_stress_test': CategoricalDtype(categories=thallium_stress_test_options),
	'diagnosis': CategoricalDtype(categories=diagnosis_options)
}
data_train = pd.read_csv(path_train, dtype=dtype)
data_test = pd.read_csv(path_test, dtype=dtype)
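# Note: when read_csv is given an explicit CategoricalDtype, file values
# that are not among the declared categories are loaded as NaN rather than
# raising, so a quick sanity check afterwards can be worthwhile:
assert not data_train[target_column_name].isna().any()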
Exemple #23
0
    def test_unordered_compare_equal(self):
        # 'c' is not among the declared categories, so it becomes NaN on
        # the left; both sides are then equal unordered categoricals over
        # ['a', 'b'].
        left = pd.Series(['a', 'b', 'c'],
                         dtype=CategoricalDtype(['a', 'b']))
        right = pd.Series(pd.Categorical(['a', 'b', np.nan],
                                         categories=['a', 'b']))
        tm.assert_series_equal(left, right)
Exemple #24
0
    def add_categories(self,
                       new_categories: Union[pd.Index, Any, List],
                       inplace: bool = False) -> Optional["ps.Series"]:
        """
        Add new categories.

        `new_categories` will be included at the last/highest place in the
        categories and will be unused directly after this call.

        Parameters
        ----------
        new_categories : category or list-like of category
           The new categories to be included.
        inplace : bool, default False
           Whether or not to add the categories inplace or return a copy of
           this categorical with added categories.

           .. deprecated:: 3.2.0

        Returns
        -------
        Series or None
            Categorical with new categories added or None if ``inplace=True``.

        Raises
        ------
        ValueError
            If the new categories include old categories or do not validate as
            categories

        See Also
        --------
        rename_categories : Rename categories.
        reorder_categories : Reorder categories.
        remove_categories : Remove the specified categories.
        remove_unused_categories : Remove categories which are not used.
        set_categories : Set the categories to the specified ones.

        Examples
        --------
        >>> s = ps.Series(list("abbccc"), dtype="category")
        >>> s  # doctest: +SKIP
        0    a
        1    b
        2    b
        3    c
        4    c
        5    c
        dtype: category
        Categories (3, object): ['a', 'b', 'c']

        >>> s.cat.add_categories('x')  # doctest: +SKIP
        0    a
        1    b
        2    b
        3    c
        4    c
        5    c
        dtype: category
        Categories (4, object): ['a', 'b', 'c', 'x']
        """
        from pyspark.pandas.frame import DataFrame

        if inplace:
            warnings.warn(
                "The `inplace` parameter in add_categories is deprecated "
                "and will be removed in a future version.",
                FutureWarning,
            )

        categories: List[Any]
        if is_list_like(new_categories):
            categories = list(new_categories)
        else:
            categories = [new_categories]

        if any(cat in self.categories for cat in categories):
            raise ValueError(
                "new categories must not include old categories: {{{cats}}}".
                format(cats=", ".join(
                    set(
                        str(cat) for cat in categories
                        if cat in self.categories))))

        internal = self._data._psdf._internal.with_new_spark_column(
            self._data._column_label,
            self._data.spark.column,
            field=self._data._internal.data_fields[0].copy(
                dtype=CategoricalDtype(list(self.categories) + categories,
                                       ordered=self.ordered)),
        )
        if inplace:
            self._data._psdf._update_internal_frame(internal)
            return None
        else:
            return DataFrame(internal)._psser_for(
                self._data._column_label).copy()
Exemple #25
0
def replace_cat_data(data, col_name, order):
    # Cast the column to an ordered categorical with the given category
    # order, then replace it with the integer codes (0 for the lowest
    # category; values outside `order` become -1).
    data[col_name] = data[col_name].astype(
        CategoricalDtype(categories=order, ordered=True))
    data[col_name] = data[col_name].cat.codes
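# A minimal usage sketch (the DataFrame below is illustrative):
import pandas as pd

df_sizes = pd.DataFrame({"size": ["S", "L", "M"]})
replace_cat_data(df_sizes, "size", ["S", "M", "L"])
print(df_sizes["size"].tolist())  # [0, 2, 1]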
Exemple #26
0
df = diamonds.copy()
df.head()

df.info()

df.describe().T

df["cut"].value_counts()
df["color"].value_counts()

# define the ordinal

from pandas.api.types import CategoricalDtype

# convert cut into a categorical variable and make it ordered (ordinal)
df.cut = df.cut.astype(CategoricalDtype(ordered = True))

# confirm that cut is now a categorical variable
df.dtypes

# the quality values are ordered within the category
df.cut.head(1)

# but this categorical did not pick the correct ordering
# correct ordering: (Fair < Good < Very Good < Premium < Ideal)
# it is better to specify the category order ourselves
cut_kategoriler = ["Fair", "Good", "Very Good", "Premium", "Ideal"]

# the categories argument sets the category order explicitly
df.cut = df.cut.astype(CategoricalDtype(categories = cut_kategoriler, ordered = True))
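# With an ordered categorical, comparisons and sorting now respect the
# declared order rather than alphabetical order:
df.cut.min()                  # 'Fair', the lowest declared category
df.sort_values("cut").head()  # rows ordered from Fair up to Ideal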
Exemple #27
0
    def dtype():
        # BodyPosition is assumed to be an Enum; its member names become
        # the ordered categories, in declaration order.
        return CategoricalDtype(categories=[s.name for s in BodyPosition],
                                ordered=True)
Exemple #28
0
    def rename_categories(self,
                          new_categories: Union[list, dict, Callable],
                          inplace: bool = False) -> Optional["ps.Series"]:
        """
        Rename categories.

        Parameters
        ----------
        new_categories : list-like, dict-like or callable

            New categories which will replace old categories.

            * list-like: all items must be unique and the number of items in
              the new categories must match the existing number of categories.

            * dict-like: specifies a mapping from
              old categories to new. Categories not contained in the mapping
              are passed through and extra categories in the mapping are
              ignored.

            * callable : a callable that is called on all items in the old
              categories and whose return values comprise the new categories.

        inplace : bool, default False
            Whether or not to rename the categories inplace or return a copy of
            this categorical with renamed categories.

            .. deprecated:: 3.2.0

        Returns
        -------
        cat : Series or None
            Categorical with renamed categories or None if ``inplace=True``.

        Raises
        ------
        ValueError
            If new categories are list-like and do not have the same number of
            items as the current categories, or do not validate as categories

        See Also
        --------
        reorder_categories : Reorder categories.
        add_categories : Add new categories.
        remove_categories : Remove the specified categories.
        remove_unused_categories : Remove categories which are not used.
        set_categories : Set the categories to the specified ones.

        Examples
        --------
        >>> s = ps.Series(["a", "a", "b"], dtype="category")
        >>> s.cat.rename_categories([0, 1])  # doctest: +SKIP
        0    0
        1    0
        2    1
        dtype: category
        Categories (2, int64): [0, 1]

        For dict-like ``new_categories``, extra keys are ignored and
        categories not in the dictionary are passed through

        >>> s.cat.rename_categories({'a': 'A', 'c': 'C'})  # doctest: +SKIP
        0    A
        1    A
        2    b
        dtype: category
        Categories (2, object): ['A', 'b']

        You may also provide a callable to create the new categories

        >>> s.cat.rename_categories(lambda x: x.upper())  # doctest: +SKIP
        0    A
        1    A
        2    B
        dtype: category
        Categories (2, object): ['A', 'B']
        """
        from pyspark.pandas.frame import DataFrame

        if inplace:
            warnings.warn(
                "The `inplace` parameter in rename_categories is deprecated "
                "and will be removed in a future version.",
                FutureWarning,
            )

        if is_dict_like(new_categories):
            categories = [
                cast(dict, new_categories).get(item, item)
                for item in self.categories
            ]
        elif callable(new_categories):
            categories = [new_categories(item) for item in self.categories]
        elif is_list_like(new_categories):
            if len(self.categories) != len(new_categories):
                raise ValueError(
                    "new categories need to have the same number of items as the old categories!"
                )
            categories = cast(list, new_categories)
        else:
            raise TypeError(
                "new_categories must be list-like, dict-like or callable.")

        internal = self._data._psdf._internal.with_new_spark_column(
            self._data._column_label,
            self._data.spark.column,
            field=self._data._internal.data_fields[0].copy(
                dtype=CategoricalDtype(categories=categories,
                                       ordered=self.ordered)),
        )

        if inplace:
            self._data._psdf._update_internal_frame(internal)
            return None
        else:
            psser = DataFrame(internal)._psser_for(self._data._column_label)
            return psser._with_new_scol(psser.spark.column,
                                        field=psser._internal.data_fields[0])
Exemple #29
0
    def __preprocess(self, X):
        # Drop empty features (dataset v. 1.0.0): unspsc_code, label
        X = X.drop(["label", "unspsc_code"], axis=1)

        # Use unordered categories for several columns. Listing the category
        # values explicitly keeps the encoding stable even when some values
        # are absent from a batch of source data.
        brand_types = CategoricalDtype(categories=[
            "b0", "b1", "b10", "b100", "b101", "b102", "b103", "b104", "b105",
            "b106", "b107", "b108", "b109", "b11", "b110", "b111", "b112",
            "b113", "b114", "b115", "b116", "b117", "b118", "b119", "b12",
            "b120", "b121", "b122", "b123", "b124", "b125", "b126", "b127",
            "b128", "b129", "b13", "b130", "b131", "b132", "b133", "b134",
            "b135", "b136", "b137", "b138", "b139", "b14", "b140", "b141",
            "b142", "b143", "b144", "b145", "b146", "b147", "b148", "b149",
            "b15", "b16", "b17", "b18", "b19", "b2", "b20", "b21", "b22",
            "b23", "b24", "b25", "b26", "b27", "b28", "b29", "b3", "b30",
            "b31", "b32", "b33", "b34", "b35", "b36", "b37", "b38", "b39",
            "b4", "b40", "b41", "b42", "b43", "b44", "b45", "b46", "b47",
            "b48", "b49", "b5", "b50", "b51", "b52", "b53", "b54", "b55",
            "b56", "b57", "b58", "b59", "b6", "b60", "b61", "b62", "b63",
            "b64", "b65", "b66", "b67", "b68", "b69", "b7", "b70", "b71",
            "b72", "b73", "b74", "b75", "b76", "b77", "b78", "b79", "b8",
            "b80", "b81", "b82", "b83", "b84", "b85", "b86", "b87", "b88",
            "b89", "b9", "b90", "b91", "b92", "b93", "b94", "b95", "b96",
            "b97", "b98", "b99"
        ],
                                       ordered=False)
        X["brand"] = X["brand"].astype(brand_types)
        cat1_types = CategoricalDtype(categories=[
            "baby", "clothing", "home", "kidswear", "menswear", "womenswear"
        ],
                                      ordered=False)
        X["category-1"] = X["category-1"].astype(cat1_types)
        cat2_types = CategoricalDtype(categories=[
            "home", "footwear", "nightwear", "thermals", "outerwear",
            "accessory", "uniform", "suit", "swimwear", "headgear",
            "sportswear", "costume", "clothing", "undergarments", "baby",
            "dress", "beachwear", "men-undergarments", "hosiery",
            "women-beachwear", "women-undergarments", "women-sportswear"
        ],
                                      ordered=False)
        X["category-2"] = X["category-2"].astype(cat2_types)
        cat3_types = CategoricalDtype(categories=[
            "backpack", "bikin", "body", "boxer-brief", "bra", "brief",
            "briefs", "cap", "coats", "costume", "curtain", "dress",
            "evening-dress", "fancy-dress", "flat-cap", "gloves", "hat",
            "hoodie", "jacket", "jean-shorts", "jeans", "jersey", "knit-cap",
            "knitwear", "long-sleeved-top", "mat", "overalls", "panties",
            "pants", "pillow", "pyjama", "scarf", "sheets", "shorts", "skirts",
            "snow-suit", "socks", "sport-bra", "stockings", "swimsuit",
            "T-shirt", "tie", "tights", "top", "towel", "trousers",
            "underpants", "wedding-dress"
        ],
                                      ordered=False)
        X["category-3"] = X["category-3"].astype(cat3_types)
        colour_types = CategoricalDtype(categories=[
            "Ivory", "amber", "aquamarine", "black", "blue", "blue gray",
            "bondi blue", "brown", "colourful", "dark green", "dark grey",
            "gold", "golden", "gray", "green", "grey", "indigo", "light brown",
            "light grey", "lime", "maroon", "metal", "mosaic", "mustard",
            "natural", "navy", "neon", "orange", "peach", "pink", "purple",
            "red", "silver", "teal", "turquoise", "unbleached", "unknown",
            "violet", "wheat", "white", "yellow"
        ],
                                        ordered=False)
        X["colour"] = X["colour"].astype(colour_types)
        fabric_type_types = CategoricalDtype(categories=["K", "W"],
                                             ordered=False)
        X["fabric_type"] = X["fabric_type"].astype(fabric_type_types)
        gender_types = CategoricalDtype(
            categories=["B", "G", "K", "M", "U", "Y", "W"], ordered=False)
        X["gender"] = X["gender"].astype(gender_types)
        made_in_types = CategoricalDtype(categories=[
            "AU", "BD", "BE", "BG", "BR", "CN", "CO", "CY", "DE", "DK", "EG",
            "ES", "FI", "FR", "GB", "GE", "GR", "HK", "IE", "IN", "IT", "JP",
            "KR", "LT", "LV", "ML", "MX", "PK", "RO", "SE", "TH", "TR", "TW",
            "US", "VE", "VN"
        ],
                                         ordered=False)
        X["made_in"] = X["made_in"].astype(made_in_types)
        season_types = CategoricalDtype(
            categories=["AYR", "MID", "SUM", "WIN"], ordered=False)
        X["season"] = X["season"].astype(season_types)

        # Use ordered categories for size
        size_type = CategoricalDtype(
            categories=["XS", "S", "M", "L", "XL", "XXL"], ordered=True)
        X["size"] = X["size"].astype(size_type)

        # Convert the categoricals into a one-hot vector of binary variables
        X = pd.get_dummies(X)
        #print(X)

        # Fill in 0 for NA in ftp_ columns
        X = X.fillna(0)
        #print(X)

        scaler = MinMaxScaler()
        X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
        #print(X_scaled)

        return X_scaled
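# Why the explicit category lists above matter: with a fixed
# CategoricalDtype, pd.get_dummies emits one column per declared category
# even when a batch contains no rows with that value, so the one-hot
# feature columns stay stable across batches. A minimal sketch:
import pandas as pd
from pandas.api.types import CategoricalDtype

batch = pd.DataFrame({"season": ["SUM", "SUM"]})
batch["season"] = batch["season"].astype(
    CategoricalDtype(categories=["AYR", "MID", "SUM", "WIN"], ordered=False))
print(pd.get_dummies(batch).columns.tolist())
# ['season_AYR', 'season_MID', 'season_SUM', 'season_WIN']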
Exemple #30
0
def castComplicatedColumn(df, originalCol, newCol, castDict, exclude=None, inPlace=True, tagOldColumn=False):
    '''
    df (DataFrame): dataframe to operate on
    originalCol (string): column of df to cast
    newCol (string): name for the new cast column
    castDict (dict): mapping used for the cast
    exclude (list): list of excludeBy namedtuples; rows whose colName value is
        in values get storeValue in newCol instead of the cast result
    inPlace (bool): if True, modify df in place (returns None); if False,
        operate on and return a copy
    tagOldColumn (bool): if True, rename originalCol to originalCol + unf
        after casting

    returns:
        None if inPlace is True; otherwise the modified copy of df with the
        cast column newCol

    example:
                df =    AAA    BBB    CCC
                   0    True   2      string1
                   1    True   3      string2
                   2    False  4      string3
    
            castComplicatedColumn(df, "AAA", "cast(AAA)", {True: "Yes", False: "No"})
                df =    AAA    BBB    CCC      cast(AAA)
                   0    True   2      string1  Yes
                   1    True   3      string2  Yes 
                   2    False  4      string3  No
            
            castComplicatedColumn(df, "AAA", "cast(BBB)", {2: "Valid", 3:"Valid", 4:"Invalid"}, inPlace=False) => dff
               dff =    AAA    BBB   cast(BBB)   CCC
                   0    True   2     Valid       string1
                   1    True   3     Valid       string2
                   2    False  4     Invalid     string3
            
            
            castComplicatedColumn(df, "CCC", "cast(CCC)", {"string1": "A", "string2": "B", "string3": "C"}, 
                                  exclude=[excludeBy(colName="AAA", values=[False], storeValue=NC)])
                df =    AAA    BBB    CCC      cast(CCC)
                   0    True   2      string1  A
                   1    True   3      string2  B
                   2    False  4      string3  NC
    '''
    # create copy if we don't want to modify the original df
    dff = df if inPlace else df.copy()

    # initialize a new categorical column
    dff[newCol] = np.nan
    categories = list(set(castDict.values()))
    dff[newCol] = dff[newCol].astype(CategoricalDtype(categories=categories))

    # populate new column
    dff[newCol] = applyMapRecursively(dff[originalCol], castDict.get)
    # display(dff[originalCol])

    # Recast to category since we've got three vars: True, False and None 
    # (bool can't hold this in pandas, see https://pandas.pydata.org/pandas-docs/stable/user_guide/gotchas.html#na-type-promotions for more info on casting)
    if exclude:
        for excludeEntry in exclude:
            # add to categories if need be
            if not excludeEntry.storeValue in categories:
                categories.append(excludeEntry.storeValue)
                dff[newCol] = dff[newCol].astype(CategoricalDtype(categories=categories))
            # set excluded entries to passed value
            dff.loc[(dff[excludeEntry.colName].isin(excludeEntry.values)), newCol] = excludeEntry.storeValue
    if tagOldColumn:
        if originalCol + unf in dff.columns:
            # drop any stale tagged column first; DataFrame.drop returns a
            # copy, so assign the result back
            dff = dff.drop(columns=originalCol + unf)
        dff.rename(columns={originalCol: originalCol + unf}, inplace=True)
        dff = dff.T.drop_duplicates().T  # remove duplicate columns
    if not inPlace:
        return dff
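# The helpers above (excludeBy, applyMapRecursively, unf, NC) are not
# defined in this snippet. A minimal sketch of plausible stand-ins, inferred
# from how they are used here; these are assumptions, not the original
# implementations:
from collections import namedtuple

excludeBy = namedtuple('excludeBy', ['colName', 'values', 'storeValue'])
unf = '_unf'  # hypothetical suffix for tagging the old, uncast column
NC = 'NC'     # hypothetical sentinel stored for excluded rows

def applyMapRecursively(series, fn):
    # assumed behaviour: apply fn elementwise over the Series
    return series.map(fn)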