def _categorize_cvss_data(self, df_sft3):
    # Categorize the CVSS impact data from the CVE vulnerability input,
    # calculate the maximum (worst-case) impact for each software product,
    # and convert that worst case into a simple "Hi-Med-Low" classification.
    self.logger.info('\n\nEntering categorize_cvss_data\n\n')

    # Compute a simplistic criticality value for one aggregated row.
    def myfn6(row):
        try:
            my_score = row['cvss_score']
            my_ease = row['cvss_acc_cmpl_cat'] in ['LOW', 'MEDIUM']
            my_access = row['cvss_acc_vect_cat'] in [
                'NETWORK', 'ADJACENT_NETWORK'
            ]
            if (my_score > 7) and my_ease and my_access:
                return 'High'
            elif (my_score < 4) or ((not my_ease) and (not my_access)):
                return 'Low'
            else:
                return 'Medium'
        except (KeyError, TypeError):
            return 'None'

    # Categorize the CVSS impact data as ordered pandas categories.
    cats = CategoricalDtype(categories=['HIGH', 'MEDIUM', 'LOW'],
                            ordered=True)
    df_sft3['cvss_acc_cmpl_cat'] = df_sft3[
        'cvss:access-complexity'].astype(cats)

    cats = CategoricalDtype(
        categories=['LOCAL', 'ADJACENT_NETWORK', 'NETWORK'], ordered=True)
    df_sft3['cvss_acc_vect_cat'] = df_sft3[
        'cvss:access-vector'].astype(cats)

    # Convert from string to float for max comparisons.
    df_sft3['cvss_score'] = pd.to_numeric(df_sft3['cvss:score'],
                                          errors='coerce')

    # Get rid of extraneous data columns.
    df_sft3.drop(
        [
            u'cvss:score', u'cvss:access-complexity', u'cvss:access-vector',
            u'cvss:authentication', u'cvss:availability-impact',
            u'cvss:confidentiality-impact', u'cvss:integrity-impact'
        ],  # u'vuln:security-protection' omitted: sometimes not present
        inplace=True,
        axis=1)

    # Rename some other columns for easier access.
    df_sft4 = df_sft3.rename(
        columns={
            'vuln:cve-id': 'cve_id',
            'vuln:product': 'cpe_prod',
            'cvss:source': 'cvss_src'
        })
    self.logger.info('\n\nProcessing CVE '
                     'vulnerability data: \n{0}\n{1}\n\n'.format(
                         df_sft4.shape, df_sft4.columns))

    # Calculate the maximum impact for each software product:
    # group vulns by software, then compute the worst-case value per group.
    df_sft4_gp = df_sft4.groupby('cpe_prod')
    df_sft4_agg = df_sft4_gp.agg({
        'cvss_score': max,
        'cvss_acc_cmpl_cat': max,
        'cvss_acc_vect_cat': max
    })
    self.logger.debug('\n\n Aggregated CVE vuln data '
                      'for worst case\n{0}\n{1} '.format(
                          df_sft4_agg.shape, df_sft4_agg.columns))

    # Convert the "worst case" impact into a simple "Hi-Med-Low"
    # classification.
    df_sft4_agg['crit_X'] = df_sft4_agg.apply(myfn6, axis=1)
    self.logger.debug('\nAggregated vuln data counts '
                      'by criticality \n{0}\n\n'.format(
                          df_sft4_agg.crit_X.value_counts()))

    # Convert the calculated criticality value to an ordered pandas category.
    my_crit_categories = ['None', 'Low', 'Medium', 'High']
    cats = CategoricalDtype(categories=my_crit_categories, ordered=True)
    df_sft4_agg['crit_X_cat'] = df_sft4_agg['crit_X'].astype(cats)
    df_sft4_agg.drop(['crit_X'], axis=1, inplace=True)

    self.logger.info('\n\nAggregated vuln data: \n{0}\n{1}\n\n'.format(
        df_sft4_agg.shape, df_sft4_agg.columns))
    return df_sft4_agg
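# A minimal, self-contained sketch (hypothetical data, not from the source
# above) of the property the aggregation relies on: `max` over an *ordered*
# categorical column follows the declared category order, so the per-product
# worst case falls out of a plain groupby/agg.
import pandas as pd
from pandas.api.types import CategoricalDtype

vect = CategoricalDtype(categories=['LOCAL', 'ADJACENT_NETWORK', 'NETWORK'],
                        ordered=True)
demo = pd.DataFrame({
    'cpe_prod': ['app_a', 'app_a', 'app_b'],
    'cvss_score': [4.3, 9.8, 5.0],
    'cvss_acc_vect_cat': pd.Series(
        ['LOCAL', 'NETWORK', 'ADJACENT_NETWORK'], dtype=vect),
})
worst = demo.groupby('cpe_prod').agg(
    {'cvss_score': 'max', 'cvss_acc_vect_cat': 'max'})
print(worst)  # app_a -> 9.8 / NETWORK; app_b -> 5.0 / ADJACENT_NETWORK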
def dtype(): return CategoricalDtype(categories=[s.name for s in Present], ordered=True)
def dtype(): return CategoricalDtype(categories=[s.name for s in CompassBearing], ordered=True)
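# Hedged usage sketch for the enum-backed dtype() fixtures above; the enum
# body here is a stand-in, since the real CompassBearing definition is not
# shown in this section.
import enum
import pandas as pd
from pandas.api.types import CategoricalDtype

class CompassBearing(enum.Enum):
    NORTH = 0
    EAST = 1
    SOUTH = 2
    WEST = 3

bearing_dtype = CategoricalDtype(
    categories=[s.name for s in CompassBearing], ordered=True)
s = pd.Series(['EAST', 'NORTH', 'WEST'], dtype=bearing_dtype)
print(s.min())  # NORTH -- the order follows enum declaration order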
def test_rank_categorical(self): # GH issue #15420 rank incorrectly orders ordered categories # Test ascending/descending ranking for ordered categoricals exp = Series([1.0, 2.0, 3.0, 4.0, 5.0, 6.0]) exp_desc = Series([6.0, 5.0, 4.0, 3.0, 2.0, 1.0]) ordered = Series( ["first", "second", "third", "fourth", "fifth", "sixth"] ).astype( CategoricalDtype( categories=["first", "second", "third", "fourth", "fifth", "sixth"], ordered=True, ) ) assert_series_equal(ordered.rank(), exp) assert_series_equal(ordered.rank(ascending=False), exp_desc) # Unordered categoricals should be ranked as objects unordered = Series( ["first", "second", "third", "fourth", "fifth", "sixth"] ).astype( CategoricalDtype( categories=["first", "second", "third", "fourth", "fifth", "sixth"], ordered=False, ) ) exp_unordered = Series([2.0, 4.0, 6.0, 3.0, 1.0, 5.0]) res = unordered.rank() assert_series_equal(res, exp_unordered) unordered1 = Series([1, 2, 3, 4, 5, 6]).astype( CategoricalDtype([1, 2, 3, 4, 5, 6], False) ) exp_unordered1 = Series([1.0, 2.0, 3.0, 4.0, 5.0, 6.0]) res1 = unordered1.rank() assert_series_equal(res1, exp_unordered1) # Test na_option for rank data na_ser = Series( ["first", "second", "third", "fourth", "fifth", "sixth", np.NaN] ).astype( CategoricalDtype( ["first", "second", "third", "fourth", "fifth", "sixth", "seventh"], True, ) ) exp_top = Series([2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 1.0]) exp_bot = Series([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]) exp_keep = Series([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, np.NaN]) assert_series_equal(na_ser.rank(na_option="top"), exp_top) assert_series_equal(na_ser.rank(na_option="bottom"), exp_bot) assert_series_equal(na_ser.rank(na_option="keep"), exp_keep) # Test na_option for rank data with ascending False exp_top = Series([7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0]) exp_bot = Series([6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 7.0]) exp_keep = Series([6.0, 5.0, 4.0, 3.0, 2.0, 1.0, np.NaN]) assert_series_equal(na_ser.rank(na_option="top", ascending=False), exp_top) assert_series_equal(na_ser.rank(na_option="bottom", ascending=False), exp_bot) assert_series_equal(na_ser.rank(na_option="keep", ascending=False), exp_keep) # Test invalid values for na_option msg = "na_option must be one of 'keep', 'top', or 'bottom'" with pytest.raises(ValueError, match=msg): na_ser.rank(na_option="bad", ascending=False) # invalid type with pytest.raises(ValueError, match=msg): na_ser.rank(na_option=True, ascending=False) # Test with pct=True na_ser = Series(["first", "second", "third", "fourth", np.NaN]).astype( CategoricalDtype(["first", "second", "third", "fourth"], True) ) exp_top = Series([0.4, 0.6, 0.8, 1.0, 0.2]) exp_bot = Series([0.2, 0.4, 0.6, 0.8, 1.0]) exp_keep = Series([0.25, 0.5, 0.75, 1.0, np.NaN]) assert_series_equal(na_ser.rank(na_option="top", pct=True), exp_top) assert_series_equal(na_ser.rank(na_option="bottom", pct=True), exp_bot) assert_series_equal(na_ser.rank(na_option="keep", pct=True), exp_keep)
def test_read_zinc_grid(): expected_grid_info = dict( view=zincio.String("chart"), hisStart=zincio.Datetime(pd.Timestamp("2020-05-18T00:00:00-07:00"), tz="Los_Angeles"), hisEnd=zincio.Datetime(pd.Timestamp("2020-05-18T01:15:00-07:00"), tz="Los_Angeles"), hisLimit=zincio.Number(10000), dis=zincio.String("Mon 18-May-2020")) expected_column_info = dict( ts=dict( disKey=zincio.String('ui::timestamp'), tz=zincio.String('Los_Angeles'), chartFormat=zincio.String('ka'), ), v0=dict(id=zincio.Ref('p:q01b001:r:0197767d-c51944e4', 'Building One VAV1-01 Eff Heat SP'), navName=zincio.String('Eff Heat SP'), point=zincio.MARKER, his=zincio.MARKER, siteRef=zincio.Ref('p:q01b001:r:8fc116f8-72c5320c', 'Building One'), equipRef=zincio.Ref('p:q01b001:r:b78a8dcc-828caa1b', None), curVal=zincio.Number(65.972, '°F'), curStatus=zincio.String('ok'), kind=zincio.String('Number'), unit=zincio.String('°F'), tz=zincio.String('Los_Angeles'), sp=zincio.MARKER, temp=zincio.MARKER, cur=zincio.MARKER, haystackPoint=zincio.MARKER, air=zincio.MARKER, effective=zincio.MARKER, heating=zincio.MARKER), v1=dict( id=zincio.Ref('p:q01b001:r:e69a7401-f4b340ff', 'Building One VAV1-01 Eff Occupancy'), navName=zincio.String('Eff Occupancy'), point=zincio.MARKER, his=zincio.MARKER, siteRef=zincio.Ref('p:q01b001:r:8fc116f8-72c5320c', 'Building One'), equipRef=zincio.Ref('p:q01b001:r:b78a8dcc-828caa1b', 'Building One VAV1-01'), curVal=zincio.String('Occupied'), curStatus=zincio.String('ok'), kind=zincio.String('Str'), tz=zincio.String('Los_Angeles'), sensor=zincio.MARKER, cur=zincio.MARKER, haystackPoint=zincio.MARKER, hisCollectCov=zincio.MARKER, enum=zincio.String('Nul,Occupied,Unoccupied,Bypass,Standby'), effective=zincio.MARKER, occupied=zincio.MARKER, ), v2=dict(id=zincio.Ref('p:q01b001:r:dcfe87d9-cd034388', 'Building One VAV1-01 Damper Pos'), navName=zincio.String('Damper Pos'), point=zincio.MARKER, his=zincio.MARKER, siteRef=zincio.Ref('p:q01b001:r:8fc116f8-72c5320c', 'Building One'), equipRef=zincio.Ref('p:q01b001:r:b78a8dcc-828caa1b', 'Building One VAV1-01'), curVal=zincio.Number(41.5, '%'), curStatus=zincio.String('ok'), kind=zincio.String('Number'), unit=zincio.String('%'), tz=zincio.String('Los_Angeles'), sensor=zincio.MARKER, cur=zincio.MARKER, damper=zincio.MARKER, precision=zincio.Number(1.0), haystackPoint=zincio.MARKER, air=zincio.MARKER), v3=dict(id=zincio.Ref('p:q01b001:r:8fab195e-58ffca99', 'Building One VAV1-01 Occ Heat SP Offset'), navName=zincio.String('Occ Heat SP Offset'), point=zincio.MARKER, his=zincio.MARKER, siteRef=zincio.Ref('p:q01b001:r:8fc116f8-72c5320c', 'Building One'), equipRef=zincio.Ref('p:q01b001:r:b78a8dcc-828caa1b', 'Building One VAV1-01'), curVal=zincio.Number(-2.394, '°C'), curStatus=zincio.String('ok'), kind=zincio.String('Number'), unit=zincio.String('°C'), tz=zincio.String('Los_Angeles'), sp=zincio.MARKER, temp=zincio.MARKER, cur=zincio.MARKER, air=zincio.MARKER, occ=zincio.MARKER, writable=zincio.MARKER, writeStatus=zincio.String('unknown'), zone=zincio.MARKER, hisCollectInterval=zincio.Number(5.0, 'min'), heating=zincio.MARKER, offset=zincio.MARKER, writeLevel=zincio.Number(8.0, None), haystackPoint=zincio.MARKER, writeVal=zincio.Number(-10.0), actions=zincio.String( 'ver:\\"3.0\\"\\ndis,expr\\n\\"Override\\",' '\\"pointOverride(\\$self, \\$val, \\$duration)\\"\\n' '\\"Auto\\",\\"pointAuto(\\$self)\\"\\n')), v4=dict( id=zincio.Ref('p:q01b001:r:260ce2bb-2ef5065f', 'Building One VAV1-01 Air Flow'), navName=zincio.String('Air Flow'), point=zincio.MARKER, his=zincio.MARKER, 
siteRef=zincio.Ref('p:q01b001:r:8fc116f8-72c5320c', 'Building One'), equipRef=zincio.Ref('p:q01b001:r:b78a8dcc-828caa1b', 'Building One VAV1-01'), curVal=zincio.Number(117.6611, 'cfm'), curStatus=zincio.String('ok'), kind=zincio.String('Number'), unit=zincio.String('cfm'), tz=zincio.String('Los_Angeles'), sensor=zincio.MARKER, cur=zincio.MARKER, )) expected_index = pd.DatetimeIndex([ pd.to_datetime('2020-05-17T23:47:08-07:00'), pd.to_datetime('2020-05-17T23:55:00-07:00'), pd.to_datetime('2020-05-18T00:00:00-07:00'), pd.to_datetime('2020-05-18T00:05:00-07:00'), pd.to_datetime('2020-05-18T01:13:09-07:00'), ], name='ts') expected_dataframe = pd.DataFrame( index=expected_index, data={ ('@p:q01b001:r:0197767d-c51944e4 ' '"Building One VAV1-01 Eff Heat SP"'): [ np.nan, 68.553, 68.554, 69.723, np.nan, ], ('@p:q01b001:r:e69a7401-f4b340ff ' '"Building One VAV1-01 Eff Occupancy"'): pd.Series(['Occupied', '', '', '', 'Unoccupied'], index=expected_index, dtype=CategoricalDtype(categories=[ 'Nul', 'Occupied', 'Unoccupied', 'Bypass', 'Standby' ])), ('@p:q01b001:r:dcfe87d9-cd034388 ' '"Building One VAV1-01 Damper Pos"'): [np.nan, 3, 7, 18, np.nan], ('@p:q01b001:r:8fab195e-58ffca99 ' '"Building One VAV1-01 Occ Heat SP Offset"'): [ np.nan, -1.984, -2.203, 5.471, np.nan, ], '@p:q01b001:r:260ce2bb-2ef5065f "Building One VAV1-01 Air Flow"': [ np.nan, 118.65, 62.0, np.nan, np.nan, ], }) actual = zincio.read(FULL_GRID_FILE) expected = zincio.Grid(version=3, grid_info=expected_grid_info, column_info=expected_column_info, data=expected_dataframe) assert_grid_equal(actual, expected)
def add_categories( self, new_categories: Union[pd.Index, Any, List], inplace: bool = False ) -> Optional["ps.Series"]: """ Add new categories. `new_categories` will be included at the last/highest place in the categories and will be unused directly after this call. Parameters ---------- new_categories : category or list-like of category The new categories to be included. inplace : bool, default False Whether or not to add the categories inplace or return a copy of this categorical with added categories. Returns ------- Series or None Categorical with new categories added or None if ``inplace=True``. Raises ------ ValueError If the new categories include old categories or do not validate as categories Examples -------- >>> s = ps.Series(list("abbccc"), dtype="category") >>> s # doctest: +SKIP 0 a 1 b 2 b 3 c 4 c 5 c dtype: category Categories (3, object): ['a', 'b', 'c'] >>> s.cat.add_categories('x') # doctest: +SKIP 0 a 1 b 2 b 3 c 4 c 5 c dtype: category Categories (4, object): ['a', 'b', 'c', 'x'] """ from pyspark.pandas.frame import DataFrame if is_list_like(new_categories): categories = list(new_categories) # type: List else: categories = [new_categories] if any(cat in self.categories for cat in categories): raise ValueError( "new categories must not include old categories: {{{cats}}}".format( cats=", ".join(set(str(cat) for cat in categories if cat in self.categories)) ) ) internal = self._data._psdf._internal.with_new_spark_column( self._data._column_label, self._data.spark.column, field=self._data._internal.data_fields[0].copy( dtype=CategoricalDtype(list(self.categories) + categories, ordered=self.ordered) ), ) if inplace: self._data._psdf._update_internal_frame(internal) return None else: psser = DataFrame(internal)._psser_for(self._data._column_label) return psser._with_new_scol(psser.spark.column, field=psser._internal.data_fields[0])
def pd_load_acquisition_csv(acquisition_path, **kwargs):
    """ Loads acquisition data

    Returns
    -------
    PD DataFrame
    """

    cols = [
        'loan_id', 'orig_channel', 'seller_name', 'orig_interest_rate',
        'orig_upb', 'orig_loan_term', 'orig_date', 'first_pay_date',
        'orig_ltv', 'orig_cltv', 'num_borrowers', 'dti',
        'borrower_credit_score', 'first_home_buyer', 'loan_purpose',
        'property_type', 'num_units', 'occupancy_status', 'property_state',
        'zip', 'mortgage_insurance_percent', 'product_type',
        'coborrow_credit_score', 'mortgage_insurance_type',
        'relocation_mortgage_indicator'
    ]

    dtypes = {
        "loan_id": np.int64,
        "orig_channel": CategoricalDtype(['B', 'C', 'R']),
        "seller_name": str,
        "orig_interest_rate": np.float64,
        "orig_upb": np.int64,
        "orig_loan_term": np.int64,
        "orig_date": str,
        "first_pay_date": str,
        "orig_ltv": np.float64,
        "orig_cltv": np.float64,
        "num_borrowers": np.float64,
        "dti": np.float64,
        "borrower_credit_score": np.float64,
        "first_home_buyer": CategoricalDtype(['N', 'U', 'Y']),
        "loan_purpose": CategoricalDtype(['C', 'P', 'R', 'U']),
        "property_type": CategoricalDtype(['CO', 'CP', 'MH', 'PU', 'SF']),
        "num_units": np.int64,
        "occupancy_status": CategoricalDtype(['I', 'P', 'S']),
        "property_state": CategoricalDtype([
            'AK', 'AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL',
            'GA', 'HI', 'IA', 'ID', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA',
            'MD', 'ME', 'MI', 'MN', 'MO', 'MS', 'MT', 'NC', 'ND', 'NE',
            'NH', 'NJ', 'NM', 'NV', 'NY', 'OH', 'OK', 'OR', 'PA', 'PR',
            'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VA', 'VI', 'VT', 'WA',
            'WI', 'WV', 'WY'
        ]),
        "zip": np.int64,
        "mortgage_insurance_percent": np.float64,
        "product_type": CategoricalDtype(['FRM']),
        "coborrow_credit_score": np.float64,
        "mortgage_insurance_type": np.float64,
        "relocation_mortgage_indicator": CategoricalDtype(['N', 'Y']),
    }

    print(acquisition_path)
    return pd.read_csv(acquisition_path, names=cols, delimiter='|',
                       dtype=dtypes, parse_dates=[6, 7])
def pd_load_acquisition_csv(acquisition_path, **kwargs): """ Loads acquisition data Returns ------- PD DataFrame """ cols = [ 'loan_id', 'orig_channel', 'seller_name', 'orig_interest_rate', 'orig_upb', 'orig_loan_term', 'orig_date', 'first_pay_date', 'orig_ltv', 'orig_cltv', 'num_borrowers', 'dti', 'borrower_credit_score', 'first_home_buyer', 'loan_purpose', 'property_type', 'num_units', 'occupancy_status', 'property_state', 'zip', 'mortgage_insurance_percent', 'product_type', 'coborrow_credit_score', 'mortgage_insurance_type', 'relocation_mortgage_indicator' ] dtypes = { "loan_id": np.int64, "orig_channel": CategoricalDtype(['B', 'C', 'R']), "seller_name": str, "orig_interest_rate": np.float64, "orig_upb": np.int64, "orig_loan_term": np.int64, "orig_date": str, "first_pay_date": str, "orig_ltv": np.float64, "orig_cltv": np.float64, "num_borrowers": np.float64, "dti": np.float64, "borrower_credit_score": np.float64, "first_home_buyer": CategoricalDtype(['N', 'U', 'Y']), "loan_purpose": CategoricalDtype(['C', 'P', 'R', 'U']), "property_type": CategoricalDtype(['CO', 'CP', 'MH', 'PU', 'SF']), "num_units": np.int64, "occupancy_status": CategoricalDtype(['I', 'P', 'S']), "property_state": CategoricalDtype([ 'AK', 'AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 'GA', 'HI', 'IA', 'ID', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'ME', 'MI', 'MN', 'MO', 'MS', 'MT', 'NC', 'ND', 'NE', 'NH', 'NJ', 'NM', 'NV', 'NY', 'OH', 'OK', 'OR', 'PA', 'PR', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VA', 'VI', 'VT', 'WA', 'WI', 'WV', 'WY' ]), "zip": np.int64, "mortgage_insurance_percent": np.float64, "product_type": CategoricalDtype(['FRM']), "coborrow_credit_score": np.float64, "mortgage_insurance_type": np.float64, "relocation_mortgage_indicator": CategoricalDtype(['N', 'Y']), } return ddf.read_csv(acquisition_path, names=cols, delimiter='|', dtype=dtypes, parse_dates=[6, 7], assume_missing=True)
def pd_load_performance_csv(performance_path, **kwargs): """ Loads performance data Returns ------- PD DataFrame """ cols = [ "loan_id", "monthly_reporting_period", "servicer", "interest_rate", "current_actual_upb", "loan_age", "remaining_months_to_legal_maturity", "adj_remaining_months_to_maturity", "maturity_date", "msa", "current_loan_delinquency_status", "mod_flag", "zero_balance_code", "zero_balance_effective_date", "last_paid_installment_date", "foreclosed_after", "disposition_date", "foreclosure_costs", "prop_preservation_and_repair_costs", "asset_recovery_costs", "misc_holding_expenses", "holding_taxes", "net_sale_proceeds", "credit_enhancement_proceeds", "repurchase_make_whole_proceeds", "other_foreclosure_proceeds", "non_interest_bearing_upb", "principal_forgiveness_upb", "repurchase_make_whole_proceeds_flag", "foreclosure_principal_write_off_amount", "servicing_activity_indicator" ] dtypes = { "loan_id": np.int64, "monthly_reporting_period": str, "servicer": str, "interest_rate": np.float64, "current_actual_upb": np.float64, "loan_age": np.float64, "remaining_months_to_legal_maturity": np.float64, "adj_remaining_months_to_maturity": np.float64, "maturity_date": str, "msa": np.float64, "current_loan_delinquency_status": np.int32, "mod_flag": CategoricalDtype(['N', 'Y']), "zero_balance_code": CategoricalDtype(['01', '02', '06', '09', '03', '15', '16']), "zero_balance_effective_date": str, "last_paid_installment_date": str, "foreclosed_after": str, "disposition_date": str, "foreclosure_costs": np.float64, "prop_preservation_and_repair_costs": np.float64, "asset_recovery_costs": np.float64, "misc_holding_expenses": np.float64, "holding_taxes": np.float64, "net_sale_proceeds": np.float64, "credit_enhancement_proceeds": np.float64, "repurchase_make_whole_proceeds": np.float64, "other_foreclosure_proceeds": np.float64, "non_interest_bearing_upb": np.float64, "principal_forgiveness_upb": np.float64, "repurchase_make_whole_proceeds_flag": CategoricalDtype(['N', 'Y']), "foreclosure_principal_write_off_amount": np.float64, "servicing_activity_indicator": CategoricalDtype(['N', 'Y']), } return ddf.read_csv(performance_path, names=cols, delimiter='|', dtype=dtypes, parse_dates=[1, 8, 13, 14, 15, 16], assume_missing=True)
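# Why the loaders above hand CategoricalDtype objects to read_csv: the
# low-cardinality flag columns parse straight into compact categoricals, and
# values outside the declared categories become NaN rather than silently
# minting new categories. Minimal sketch on a hypothetical two-column file.
import io
import pandas as pd
from pandas.api.types import CategoricalDtype

buf = io.StringIO("1|Y\n2|N\n3|X\n")
demo = pd.read_csv(buf, sep='|', names=['loan_id', 'mod_flag'],
                   dtype={'mod_flag': CategoricalDtype(['N', 'Y'])})
print(demo['mod_flag'].tolist())  # ['Y', 'N', nan] -- 'X' is not a category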
def cate_type(levels): return CategoricalDtype(categories=levels, ordered=True)
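# Possible use of the cate_type helper above (the levels are hypothetical;
# assumes pandas is imported as pd):
s = pd.Series(['low', 'high', 'mid']).astype(cate_type(['low', 'mid', 'high']))
print(s.sort_values().tolist())  # ['low', 'mid', 'high']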
(np.datetime64, dt.timestamp), (np.timedelta64, dt.interval), ], ) def test_numpy_dtype(numpy_dtype, ibis_dtype): assert dt.dtype(np.dtype(numpy_dtype)) == ibis_dtype @pytest.mark.parametrize( ('pandas_dtype', 'ibis_dtype'), [ ( DatetimeTZDtype(tz='US/Eastern', unit='ns'), dt.Timestamp('US/Eastern'), ), (CategoricalDtype(), dt.Category()), ], ) def test_pandas_dtype(pandas_dtype, ibis_dtype): assert dt.dtype(pandas_dtype) == ibis_dtype def test_series_to_ibis_literal(): values = [1, 2, 3, 4] s = pd.Series(values) expr = ir.as_value_expr(s) expected = ir.sequence(list(s)) assert expr.equals(expected)
def convert_json_field_to_pandas_type(field):
    """
    Converts a JSON field descriptor into its corresponding NumPy / pandas
    type

    Parameters
    ----------
    field
        A JSON field descriptor

    Returns
    -------
    dtype

    Raises
    ------
    ValueError
        If the type of the provided field is unknown or currently unsupported

    Examples
    --------
    >>> convert_json_field_to_pandas_type({'name': 'an_int',
    ...                                    'type': 'integer'})
    'int64'
    >>> convert_json_field_to_pandas_type({'name': 'a_categorical',
    ...                                    'type': 'any',
    ...                                    'constraints': {'enum': [
    ...                                        'a', 'b', 'c']},
    ...                                    'ordered': True})
    'CategoricalDtype(categories=['a', 'b', 'c'], ordered=True)'
    >>> convert_json_field_to_pandas_type({'name': 'a_datetime',
    ...                                    'type': 'datetime'})
    'datetime64[ns]'
    >>> convert_json_field_to_pandas_type({'name': 'a_datetime_with_tz',
    ...                                    'type': 'datetime',
    ...                                    'tz': 'US/Central'})
    'datetime64[ns, US/Central]'
    """
    typ = field['type']
    if typ == 'string':
        return 'object'
    elif typ == 'integer':
        return 'int64'
    elif typ == 'number':
        return 'float64'
    elif typ == 'boolean':
        return 'bool'
    elif typ == 'duration':
        return 'timedelta64'
    elif typ == 'datetime':
        if field.get('tz'):
            return 'datetime64[ns, {tz}]'.format(tz=field['tz'])
        else:
            return 'datetime64[ns]'
    elif typ == 'any':
        if 'constraints' in field and 'ordered' in field:
            return CategoricalDtype(categories=field['constraints']['enum'],
                                    ordered=field['ordered'])
        else:
            return 'object'

    raise ValueError("Unsupported or invalid field type: {}".format(typ))
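# Round-trip illustration for the converter above: an 'any'-typed field with
# enum constraints comes back as a concrete CategoricalDtype (the field
# contents are hypothetical; assumes CategoricalDtype is imported, as in the
# function above).
field = {'name': 'a_categorical', 'type': 'any',
         'constraints': {'enum': ['a', 'b', 'c']}, 'ordered': True}
assert convert_json_field_to_pandas_type(field) == CategoricalDtype(
    categories=['a', 'b', 'c'], ordered=True)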
    user_embedding_similarity,
]
sim_weights = [
    cast_weight,
    director_weight,
    keywords_weight,
    overview_weight,
    user_embedding_weight,
]
sim_mat = generate_weighted_similarity_matrix(arrays=sim_matrices,
                                              weights=sim_weights)
recommendations = get_recommendations(
    films=data,
    titles=liked_films,
    similarity_matrix=sim_mat,
    top_n=config.POSTERS_PER_ROW * config.NUM_POSTER_ROWS,
)

# Copy the slice so the categorical cast below does not modify a view of
# `data` (avoids SettingWithCopyWarning).
filtered_films = data[data["title"].isin(recommendations)].copy()
recommendation_order = CategoricalDtype(recommendations, ordered=True)
filtered_films["title"] = filtered_films["title"].astype(recommendation_order)
filtered_films = filtered_films.sort_values("title")
display_film_posters(
    streamlit=st,
    data=filtered_films,
    num_rows=config.NUM_POSTER_ROWS,
    posters_per_row=config.POSTERS_PER_ROW,
)
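# The ranking trick used above, reduced to a standalone sketch (the titles
# are made up): casting to an ordered CategoricalDtype makes sort_values
# follow an arbitrary, externally supplied order such as a recommendation
# ranking.
import pandas as pd
from pandas.api.types import CategoricalDtype

films = pd.DataFrame({'title': ['Alien', 'Heat', 'Up']})
rank = ['Heat', 'Up', 'Alien']  # e.g. model output, best first
films['title'] = films['title'].astype(CategoricalDtype(rank, ordered=True))
print(films.sort_values('title')['title'].tolist())  # ['Heat', 'Up', 'Alien']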
def get_and_filter_panos_by_osm_rid(road_id, df_edges, offset=1, vis=False,
                                    debug=False, outlier_filter=True,
                                    mul_factor=2, verbose=False):
    """Get the panos by OSM rid, and then filter them by some conditions.

    Args:
        road_id (int, optional): [description]. Defaults to 243387686.
        vis (bool, optional): [description]. Defaults to False.
        offset (int, optional): [whether the attribute `lane_num` is the
            real lane count or the lane-line count. If `lane_num` represents
            the line count, the offset is 1; otherwise the offset is 0].
            Defaults to 1.

    Returns:
        matchingPano [dataframe]: [description]
        fig [plt.figure]: Figure
    """
    # step 1: matching panos
    atts = [
        'index', 'RID', 'Name', 'geometry', 'lane_num', 'frechet_dis',
        'angel', 'osm_road_id', 'osm_road_index', 'related_pos', 'link'
    ]
    try:
        if road_id > 0:
            matching = get_panos_of_road_and_indentify_lane_type_by_id(
                road_id, df_edges, False)
            matching = matching[atts].merge(df_edges[['s', 'e']],
                                            left_on='osm_road_index',
                                            right_index=True)
            road_name = df_edges.query(f'rid=={road_id}').name.unique()[0]
        else:
            # FIXME -208128058 高新中三道: the street view was only
            # traversed once
            df_tmp = _get_revert_df_edges(road_id, df_edges)
            road_name = df_tmp.name.unique()[0]
            matching = get_panos_of_road_and_indentify_lane_type_by_id(
                road_id, df_tmp, False)
            matching = matching[atts].merge(df_tmp[['s', 'e']],
                                            left_on='osm_road_index',
                                            right_index=True)
        if matching.shape[0] == 0:
            print(
                f"{sys._getframe(0).f_code.co_name} {road_id}, no matching records"
            )
            return None, None
    except Exception:
        print(f"{sys._getframe(0).f_code.co_name} {road_id}, process error")
        return None, None

    rids = []
    for i in matching.RID.values:
        if i in rids:
            continue
        rids.append(i)
    rids_ordered = CategoricalDtype(rids, ordered=True)

    # filter outliers -> compute per-segment statistics
    points = DB_panos.query(f"RID in {rids}").dropna()
    tmp = points.groupby('RID').apply(lambda x: _panos_filter(x)).drop(
        columns='RID').reset_index()

    if outlier_filter and tmp.shape[0] != 0:
        if verbose:
            origin_size = tmp.shape[0]
        _mean, _std = tmp.lane_num.mean(), tmp.lane_num.std()
        if not np.isnan(_mean) and not np.isnan(_std):
            iterverl = (_mean - mul_factor * _std, _mean + mul_factor * _std)
            tmp.query(f" {iterverl[0]} < lane_num < {iterverl[1]}",
                      inplace=True)
            if verbose:
                print(
                    f"{sys._getframe(0).f_code.co_name} outlier_filter, size: {origin_size} -> {tmp.shape[0]}"
                )

    if tmp.shape[0] == 0:
        print(
            f"{sys._getframe(0).f_code.co_name} {road_id}, no matching records after filter algorithm"
        )
        return None, None

    # reorder the panos
    tmp.loc[:, 'RID'] = tmp['RID'].astype(rids_ordered)
    tmp.sort_values(by=['RID', 'Order'], inplace=True)
    tmp.reset_index(drop=True, inplace=True)

    if offset:
        tmp.loc[:, 'lane_num'] = tmp.loc[:, 'lane_num'] - 1

    if vis:
        fig, ax = map_visualize(tmp, scale=.1, color='gray', figsize=(15, 15))
        df_edges.query(f'rid =={road_id}').plot(ax=ax, linestyle='--',
                                                color='black',
                                                label='OSM road', alpha=.5)
        tmp.loc[:, 'lane_num_str'] = tmp.loc[:, 'lane_num'].astype(str)
        tmp.plot(ax=ax, column='lane_num_str', legend=True)
        _mean, _std = tmp.lane_num.mean(), tmp.lane_num.std()
        iterverl = (round(_mean - mul_factor * _std, 1),
                    round(_mean + mul_factor * _std, 1))
        ax.set_title(
            f"{road_id}, {road_name}, mean {_mean:.1f}, std {_std:.1f}, {iterverl}",
            fontsize=18)
        if debug:
            try:
                fig.savefig(
                    f'../cache/matching_records/{road_name}_{road_id}.jpg',
                    dpi=300)
            except Exception:
                print(road_name, road_id)
        plt.tight_layout(pad=0.1)
        plt.close()
        return tmp, fig

    return tmp, None
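# The outlier rule above (keep rows within mean +/- mul_factor * std), as a
# standalone sketch on hypothetical lane counts:
import pandas as pd

tmp = pd.DataFrame({'lane_num': [2, 3, 3, 3, 2, 9]})
mul_factor = 2
_mean, _std = tmp.lane_num.mean(), tmp.lane_num.std()
lo, hi = _mean - mul_factor * _std, _mean + mul_factor * _std
kept = tmp.query(f"{lo} < lane_num < {hi}")
print(len(tmp), '->', len(kept))  # 6 -> 5: the 9-lane outlier is dropped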
def reorder_categories(
    self,
    new_categories: Union[pd.Index, List],
    ordered: Optional[bool] = None,
    inplace: bool = False,
) -> Optional["ps.Series"]:
    """
    Reorder categories as specified in new_categories.

    `new_categories` need to include all old categories and no new category
    items.

    Parameters
    ----------
    new_categories : Index-like
        The categories in new order.
    ordered : bool, optional
        Whether or not the categorical is treated as an ordered categorical.
        If not given, do not change the ordered information.
    inplace : bool, default False
        Whether or not to reorder the categories inplace or return a copy
        of this categorical with reordered categories.

        .. deprecated:: 3.2.0

    Returns
    -------
    cat : Series or None
        Categorical with reordered categories or None if ``inplace=True``.

    Raises
    ------
    ValueError
        If the new categories do not contain all old category items or any
        new ones

    See Also
    --------
    rename_categories : Rename categories.
    add_categories : Add new categories.
    remove_categories : Remove the specified categories.
    remove_unused_categories : Remove categories which are not used.
    set_categories : Set the categories to the specified ones.

    Examples
    --------
    >>> s = ps.Series(list("abbccc"), dtype="category")
    >>> s  # doctest: +SKIP
    0    a
    1    b
    2    b
    3    c
    4    c
    5    c
    dtype: category
    Categories (3, object): ['a', 'b', 'c']

    >>> s.cat.reorder_categories(['c', 'b', 'a'], ordered=True)  # doctest: +SKIP
    0    a
    1    b
    2    b
    3    c
    4    c
    5    c
    dtype: category
    Categories (3, object): ['c' < 'b' < 'a']
    """
    if inplace:
        warnings.warn(
            "The `inplace` parameter in reorder_categories is deprecated "
            "and will be removed in a future version.",
            FutureWarning,
        )

    if not is_list_like(new_categories):
        raise TypeError(
            "Parameter 'new_categories' must be list-like, was '{}'".format(
                new_categories))
    elif len(set(new_categories)) != len(set(self.categories)) or any(
            cat not in self.categories for cat in new_categories):
        raise ValueError(
            "items in new_categories are not the same as in old categories")

    if ordered is None:
        ordered = self.ordered

    if new_categories == list(self.categories) and ordered == self.ordered:
        if inplace:
            return None
        else:
            psser = self._data
            return psser._with_new_scol(
                psser.spark.column, field=psser._internal.data_fields[0])
    else:
        dtype = CategoricalDtype(categories=new_categories, ordered=ordered)
        psser = self._data.astype(dtype)

        if inplace:
            internal = self._data._psdf._internal.with_new_spark_column(
                self._data._column_label,
                psser.spark.column,
                field=psser._internal.data_fields[0],
            )
            self._data._psdf._update_internal_frame(internal)
            return None
        else:
            return psser
start_station_counter.most_common(10) ## make pandas data frame for visualizing with ggplot/plotnine: dat_start_station_freq = pd.DataFrame( start_station_counter.most_common(20), columns = ['start_station_code', 'frequency']) dat_start_station_freq.rename(index = dat_start_station_freq['start_station_code'], inplace = True) ## frequency series (for sorting): ## (pandas series with index that corresponds to categories): dat_start_station_freq['frequency'] ## create list for sorting: #station_list = dat_start_station_freq['start_station_code'].value_counts().index.tolist() station_list = dat_start_station_freq['frequency'].index.tolist() station_cat = CategoricalDtype(categories=station_list, ordered=True) dat_start_station_freq['start_station_code_cat'] = \ dat_start_station_freq['start_station_code'].astype(str).astype(station_cat) ## plot counter data (frequency table, with identity relation): ## (sorting does not work here) %matplotlib inline ggplot(dat_start_station_freq, aes(x = 'start_station_code_cat', y = 'frequency')) + \ geom_bar(stat = 'identity') + \ coord_flip() ## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## ## total number of trips ## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## ## total number of trips:
def set_categories(
    self,
    new_categories: Union[pd.Index, List],
    ordered: Optional[bool] = None,
    rename: bool = False,
    inplace: bool = False,
) -> Optional["ps.Series"]:
    """
    Set the categories to the specified new_categories.

    `new_categories` can include new categories (which will result in
    unused categories) or remove old categories (which results in values
    set to NaN). If `rename==True`, the categories will simply be renamed
    (less or more items than in old categories will result in values set
    to NaN or in unused categories respectively).

    This method can be used to perform more than one action of adding,
    removing, and reordering simultaneously and is therefore faster than
    performing the individual steps via the more specialised methods.

    On the other hand this method does not do checks (e.g., whether the
    old categories are included in the new categories on a reorder), which
    can result in surprising changes, for example when using special
    string dtypes, which do not consider an S1 string equal to a
    single-char Python string.

    Parameters
    ----------
    new_categories : Index-like
        The categories in new order.
    ordered : bool, default False
        Whether or not the categorical is treated as an ordered categorical.
        If not given, do not change the ordered information.
    rename : bool, default False
        Whether or not the new_categories should be considered as a rename
        of the old categories or as reordered categories.
    inplace : bool, default False
        Whether or not to reorder the categories in-place or return a copy
        of this categorical with reordered categories.

        .. deprecated:: 3.2.0

    Returns
    -------
    Series with reordered categories or None if inplace.

    Raises
    ------
    ValueError
        If new_categories does not validate as categories

    See Also
    --------
    rename_categories : Rename categories.
    reorder_categories : Reorder categories.
    add_categories : Add new categories.
    remove_categories : Remove the specified categories.
    remove_unused_categories : Remove categories which are not used.

    Examples
    --------
    >>> s = ps.Series(list("abbccc"), dtype="category")
    >>> s  # doctest: +SKIP
    0    a
    1    b
    2    b
    3    c
    4    c
    5    c
    dtype: category
    Categories (3, object): ['a', 'b', 'c']

    >>> s.cat.set_categories(['b', 'c'])  # doctest: +SKIP
    0    NaN
    1      b
    2      b
    3      c
    4      c
    5      c
    dtype: category
    Categories (2, object): ['b', 'c']

    >>> s.cat.set_categories([1, 2, 3], rename=True)  # doctest: +SKIP
    0    1
    1    2
    2    2
    3    3
    4    3
    5    3
    dtype: category
    Categories (3, int64): [1, 2, 3]

    >>> s.cat.set_categories([1, 2, 3], rename=True, ordered=True)  # doctest: +SKIP
    0    1
    1    2
    2    2
    3    3
    4    3
    5    3
    dtype: category
    Categories (3, int64): [1 < 2 < 3]
    """
    from pyspark.pandas.frame import DataFrame

    if inplace:
        warnings.warn(
            "The `inplace` parameter in set_categories is deprecated "
            "and will be removed in a future version.",
            FutureWarning,
        )

    if not is_list_like(new_categories):
        raise TypeError(
            "Parameter 'new_categories' must be list-like, was '{}'".format(
                new_categories))

    if ordered is None:
        ordered = self.ordered
    new_dtype = CategoricalDtype(new_categories, ordered=ordered)
    scol = self._data.spark.column

    if rename:
        new_scol = (F.when(
            scol >= len(new_categories),
            SF.lit(-1).cast(self._data.spark.data_type)).otherwise(
                scol).alias(self._data._internal.data_spark_column_names[0]))

        internal = self._data._psdf._internal.with_new_spark_column(
            self._data._column_label,
            new_scol,
            field=self._data._internal.data_fields[0].copy(dtype=new_dtype),
        )

        if inplace:
            self._data._psdf._update_internal_frame(internal)
            return None
        else:
            psser = DataFrame(internal)._psser_for(self._data._column_label)
            return psser._with_new_scol(
                psser.spark.column, field=psser._internal.data_fields[0])
    else:
        psser = self._data.astype(new_dtype)

        if inplace:
            internal = self._data._psdf._internal.with_new_spark_column(
                self._data._column_label,
                psser.spark.column,
                field=psser._internal.data_fields[0],
            )
            self._data._psdf._update_internal_frame(internal)
            return None
        else:
            return psser
def test_astype_categorical_to_other(self): df = DataFrame({'value': np.random.randint(0, 10000, 100)}) labels = ["{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500)] cat_labels = Categorical(labels, labels) df = df.sort_values(by=['value'], ascending=True) df['value_group'] = pd.cut(df.value, range(0, 10500, 500), right=False, labels=cat_labels) s = df['value_group'] expected = s tm.assert_series_equal(s.astype('category'), expected) tm.assert_series_equal(s.astype(CategoricalDtype()), expected) msg = (r"could not convert string to float|" r"invalid literal for float\(\)") with pytest.raises(ValueError, match=msg): s.astype('float64') cat = Series(Categorical(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c'])) exp = Series(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']) tm.assert_series_equal(cat.astype('str'), exp) s2 = Series(Categorical(['1', '2', '3', '4'])) exp2 = Series([1, 2, 3, 4]).astype(int) tm.assert_series_equal(s2.astype('int'), exp2) # object don't sort correctly, so just compare that we have the same # values def cmp(a, b): tm.assert_almost_equal(np.sort(np.unique(a)), np.sort(np.unique(b))) expected = Series(np.array(s.values), name='value_group') cmp(s.astype('object'), expected) cmp(s.astype(np.object_), expected) # array conversion tm.assert_almost_equal(np.array(s), np.array(s.values)) # valid conversion for valid in [ lambda x: x.astype('category'), lambda x: x.astype(CategoricalDtype()), lambda x: x.astype('object').astype('category'), lambda x: x.astype('object').astype(CategoricalDtype()) ]: result = valid(s) # compare series values # internal .categories can't be compared because it is sorted tm.assert_series_equal(result, s, check_categorical=False) # invalid conversion (these are NOT a dtype) msg = (r"invalid type <class 'pandas\.core\.arrays\.categorical\." "Categorical'> for astype") for invalid in [ lambda x: x.astype(Categorical), lambda x: x.astype('object').astype(Categorical) ]: with pytest.raises(TypeError, match=msg): invalid(s)
def remove_categories( self, removals: Union[pd.Index, Any, List], inplace: bool = False ) -> Optional["ps.Series"]: """ Remove the specified categories. `removals` must be included in the old categories. Values which were in the removed categories will be set to NaN Parameters ---------- removals : category or list of categories The categories which should be removed. inplace : bool, default False Whether or not to remove the categories inplace or return a copy of this categorical with removed categories. Returns ------- Series or None Categorical with removed categories or None if ``inplace=True``. Raises ------ ValueError If the removals are not contained in the categories Examples -------- >>> s = ps.Series(list("abbccc"), dtype="category") >>> s # doctest: +SKIP 0 a 1 b 2 b 3 c 4 c 5 c dtype: category Categories (3, object): ['a', 'b', 'c'] >>> s.cat.remove_categories('b') # doctest: +SKIP 0 a 1 NaN 2 NaN 3 c 4 c 5 c dtype: category Categories (2, object): ['a', 'c'] """ if is_list_like(removals): categories = [cat for cat in removals if cat is not None] # type: List elif removals is None: categories = [] else: categories = [removals] if any(cat not in self.categories for cat in categories): raise ValueError( "removals must all be in old categories: {{{cats}}}".format( cats=", ".join( set(str(cat) for cat in categories if cat not in self.categories) ) ) ) if len(categories) == 0: if inplace: return None else: psser = self._data return psser._with_new_scol( psser.spark.column, field=psser._internal.data_fields[0] ) else: dtype = CategoricalDtype( [cat for cat in self.categories if cat not in categories], ordered=self.ordered ) psser = self._data.astype(dtype) if inplace: internal = self._data._psdf._internal.with_new_spark_column( self._data._column_label, psser.spark.column, field=psser._internal.data_fields[0], ) self._data._psdf._update_internal_frame(internal) return None else: return psser
def dtype(): return CategoricalDtype()
def pd_load_performance_csv(performance_path, **kwargs):
    """ Loads performance data

    Returns
    -------
    PD DataFrame
    """

    cols = [
        "loan_id", "monthly_reporting_period", "servicer", "interest_rate",
        "current_actual_upb", "loan_age",
        "remaining_months_to_legal_maturity",
        "adj_remaining_months_to_maturity", "maturity_date", "msa",
        "current_loan_delinquency_status", "mod_flag", "zero_balance_code",
        "zero_balance_effective_date", "last_paid_installment_date",
        "foreclosed_after", "disposition_date", "foreclosure_costs",
        "prop_preservation_and_repair_costs", "asset_recovery_costs",
        "misc_holding_expenses", "holding_taxes", "net_sale_proceeds",
        "credit_enhancement_proceeds", "repurchase_make_whole_proceeds",
        "other_foreclosure_proceeds", "non_interest_bearing_upb",
        "principal_forgiveness_upb", "repurchase_make_whole_proceeds_flag",
        "foreclosure_principal_write_off_amount",
        "servicing_activity_indicator"
    ]

    dtypes = {
        "loan_id": np.int64,
        "monthly_reporting_period": str,
        "servicer": str,
        "interest_rate": np.float64,
        "current_actual_upb": np.float64,
        "loan_age": np.float64,
        "remaining_months_to_legal_maturity": np.float64,
        "adj_remaining_months_to_maturity": np.float64,
        "maturity_date": str,
        "msa": np.float64,
        "current_loan_delinquency_status": np.int32,
        "mod_flag": CategoricalDtype(['N', 'Y']),
        "zero_balance_code": CategoricalDtype(
            ['01', '02', '06', '09', '03', '15', '16']),
        "zero_balance_effective_date": str,
        "last_paid_installment_date": str,
        "foreclosed_after": str,
        "disposition_date": str,
        "foreclosure_costs": np.float64,
        "prop_preservation_and_repair_costs": np.float64,
        "asset_recovery_costs": np.float64,
        "misc_holding_expenses": np.float64,
        "holding_taxes": np.float64,
        "net_sale_proceeds": np.float64,
        "credit_enhancement_proceeds": np.float64,
        "repurchase_make_whole_proceeds": np.float64,
        "other_foreclosure_proceeds": np.float64,
        "non_interest_bearing_upb": np.float64,
        "principal_forgiveness_upb": np.float64,
        "repurchase_make_whole_proceeds_flag": CategoricalDtype(['N', 'Y']),
        "foreclosure_principal_write_off_amount": np.float64,
        "servicing_activity_indicator": CategoricalDtype(['N', 'Y']),
    }

    print(performance_path)
    return pd.read_csv(performance_path, names=cols, delimiter='|',
                       dtype=dtypes, parse_dates=[1, 8, 13, 14, 15, 16])
# Load the data. path_train = 'data/heart_disease_train.csv' path_test = 'data/heart_disease_test.csv' target_column_name = "diagnosis" gender_options = ['male', 'female'] chest_pain_options = ['typical angina', 'asymptomatic', 'non-angina pain', 'atypical angina'] fasting_blood_sugar_greater_than_120_options = [True, False] resting_ecg_result_options = ['probable or definite left ventricular hypertrophy', 'normal', 'ST-T wave abnormality'] exercise_induced_angina_options = ['no', 'yes'] exercise_st_slope_options = ['downsloping', 'flat', 'upsloping'] fluoroscopy_vessels_colored_options = ['0', '1', '2', '3'] thallium_stress_test_options = ['fixed defect', 'normal', 'reversible defect'] diagnosis_options = ['Negative', 'Positive'] dtype = { 'age': np.float64, 'gender': CategoricalDtype(categories=gender_options), 'chest_pain': CategoricalDtype(categories=chest_pain_options), 'resting_blood_pressure': np.float64, 'cholesterol': np.float64, 'fasting_blood_sugar_greater_than_120': CategoricalDtype(categories=fasting_blood_sugar_greater_than_120_options), 'resting_ecg_result': CategoricalDtype(categories=resting_ecg_result_options), 'exercise_max_heart_rate': np.float64, 'exercise_induced_angina': CategoricalDtype(categories=exercise_induced_angina_options), 'exercise_st_depression': np.float64, 'exercise_st_slope': CategoricalDtype(categories=exercise_st_slope_options), 'fluoroscopy_vessels_colored': CategoricalDtype(categories=fluoroscopy_vessels_colored_options), 'thallium_stress_test': CategoricalDtype(categories=thallium_stress_test_options), 'diagnosis': CategoricalDtype(categories=diagnosis_options) } data_train = pd.read_csv(path_train, dtype=dtype) data_test = pd.read_csv(path_test, dtype=dtype)
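# Why both files above are read with the same CategoricalDtype objects:
# declaring categories up front gives train and test identical category sets
# (hence identical codes and identical one-hot columns), even when a rare
# value is missing from one split. Minimal sketch reusing one of the dtypes:
chest_pain = dtype['chest_pain']
train_col = pd.Series(['asymptomatic', 'typical angina'], dtype=chest_pain)
test_col = pd.Series(['asymptomatic'], dtype=chest_pain)
assert list(train_col.cat.categories) == list(test_col.cat.categories)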
def test_unordered_compare_equal(self): left = pd.Series(['a', 'b', 'c'], dtype=CategoricalDtype(['a', 'b'])) right = pd.Series(pd.Categorical(['a', 'b', np.nan], categories=['a', 'b'])) tm.assert_series_equal(left, right)
def add_categories(self, new_categories: Union[pd.Index, Any, List], inplace: bool = False) -> Optional["ps.Series"]: """ Add new categories. `new_categories` will be included at the last/highest place in the categories and will be unused directly after this call. Parameters ---------- new_categories : category or list-like of category The new categories to be included. inplace : bool, default False Whether or not to add the categories inplace or return a copy of this categorical with added categories. .. deprecated:: 3.2.0 Returns ------- Series or None Categorical with new categories added or None if ``inplace=True``. Raises ------ ValueError If the new categories include old categories or do not validate as categories See Also -------- rename_categories : Rename categories. reorder_categories : Reorder categories. remove_categories : Remove the specified categories. remove_unused_categories : Remove categories which are not used. set_categories : Set the categories to the specified ones. Examples -------- >>> s = ps.Series(list("abbccc"), dtype="category") >>> s # doctest: +SKIP 0 a 1 b 2 b 3 c 4 c 5 c dtype: category Categories (3, object): ['a', 'b', 'c'] >>> s.cat.add_categories('x') # doctest: +SKIP 0 a 1 b 2 b 3 c 4 c 5 c dtype: category Categories (4, object): ['a', 'b', 'c', 'x'] """ from pyspark.pandas.frame import DataFrame if inplace: warnings.warn( "The `inplace` parameter in add_categories is deprecated " "and will be removed in a future version.", FutureWarning, ) categories: List[Any] if is_list_like(new_categories): categories = list(new_categories) else: categories = [new_categories] if any(cat in self.categories for cat in categories): raise ValueError( "new categories must not include old categories: {{{cats}}}". format(cats=", ".join( set( str(cat) for cat in categories if cat in self.categories)))) internal = self._data._psdf._internal.with_new_spark_column( self._data._column_label, self._data.spark.column, field=self._data._internal.data_fields[0].copy( dtype=CategoricalDtype(list(self.categories) + categories, ordered=self.ordered)), ) if inplace: self._data._psdf._update_internal_frame(internal) return None else: return DataFrame(internal)._psser_for( self._data._column_label).copy()
def replace_cat_data(data, col_name, order): data[col_name] = data[col_name].astype( CategoricalDtype(categories=order, ordered=True)) data[col_name] = data[col_name].cat.codes
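# Possible call for the helper above (assumes pandas is imported as pd).
# Note that it mutates `data` in place and leaves integer codes, with -1
# for values absent from `order`:
df = pd.DataFrame({'size': ['S', 'L', 'M', 'XXL']})
replace_cat_data(df, 'size', order=['S', 'M', 'L', 'XL'])
print(df['size'].tolist())  # [0, 2, 1, -1]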
df = diamonds.copy()
df.head()
df.info()
df.describe().T
df["cut"].value_counts()
df["color"].value_counts()

# defining an ordinal variable
from pandas.api.types import CategoricalDtype

# convert cut into a categorical variable and make it ordered (ordinal)
df.cut = df.cut.astype(CategoricalDtype(ordered=True))

# confirm that cut is now a categorical variable
df.dtypes

# pandas ordered the quality levels on its own
df.cut.head(1)

# but this categorical variable was not ordered correctly
# correct order: Fair < Good < Very Good < Premium < Ideal
# it is better to specify our own category ordering explicitly
cut_kategoriler = ["Fair", "Good", "Very Good", "Premium", "Ideal"]

# the categories argument sets the category values
df.cut = df.cut.astype(CategoricalDtype(categories=cut_kategoriler,
                                        ordered=True))
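# with the explicit ordered dtype in place, comparisons against a category
# now follow the declared quality order, e.g. selecting the better cuts:
better_cuts = df[df.cut >= "Very Good"]
better_cuts["cut"].unique()  # Very Good, Premium, Ideal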
def dtype(): return CategoricalDtype(categories=[s.name for s in BodyPosition], ordered=True)
def rename_categories(self,
                      new_categories: Union[list, dict, Callable],
                      inplace: bool = False) -> Optional["ps.Series"]:
    """
    Rename categories.

    Parameters
    ----------
    new_categories : list-like, dict-like or callable
        New categories which will replace old categories.

        * list-like: all items must be unique and the number of items in
          the new categories must match the existing number of categories.

        * dict-like: specifies a mapping from old categories to new.
          Categories not contained in the mapping are passed through and
          extra categories in the mapping are ignored.

        * callable : a callable that is called on all items in the old
          categories and whose return values comprise the new categories.

    inplace : bool, default False
        Whether or not to rename the categories inplace or return a copy
        of this categorical with renamed categories.

        .. deprecated:: 3.2.0

    Returns
    -------
    cat : Series or None
        Categorical with renamed categories or None if ``inplace=True``.

    Raises
    ------
    ValueError
        If new categories are list-like and do not have the same number of
        items as the current categories or do not validate as categories

    See Also
    --------
    reorder_categories : Reorder categories.
    add_categories : Add new categories.
    remove_categories : Remove the specified categories.
    remove_unused_categories : Remove categories which are not used.
    set_categories : Set the categories to the specified ones.

    Examples
    --------
    >>> s = ps.Series(["a", "a", "b"], dtype="category")
    >>> s.cat.rename_categories([0, 1])  # doctest: +SKIP
    0    0
    1    0
    2    1
    dtype: category
    Categories (2, int64): [0, 1]

    For dict-like ``new_categories``, extra keys are ignored and
    categories not in the dictionary are passed through

    >>> s.cat.rename_categories({'a': 'A', 'c': 'C'})  # doctest: +SKIP
    0    A
    1    A
    2    b
    dtype: category
    Categories (2, object): ['A', 'b']

    You may also provide a callable to create the new categories

    >>> s.cat.rename_categories(lambda x: x.upper())  # doctest: +SKIP
    0    A
    1    A
    2    B
    dtype: category
    Categories (2, object): ['A', 'B']
    """
    from pyspark.pandas.frame import DataFrame

    if inplace:
        warnings.warn(
            "The `inplace` parameter in rename_categories is deprecated "
            "and will be removed in a future version.",
            FutureWarning,
        )

    if is_dict_like(new_categories):
        categories = [
            cast(dict, new_categories).get(item, item)
            for item in self.categories
        ]
    elif callable(new_categories):
        categories = [new_categories(item) for item in self.categories]
    elif is_list_like(new_categories):
        if len(self.categories) != len(new_categories):
            raise ValueError(
                "new categories need to have the same number of items as "
                "the old categories!")
        categories = cast(list, new_categories)
    else:
        raise TypeError(
            "new_categories must be list-like, dict-like or callable.")

    internal = self._data._psdf._internal.with_new_spark_column(
        self._data._column_label,
        self._data.spark.column,
        field=self._data._internal.data_fields[0].copy(
            dtype=CategoricalDtype(categories=categories,
                                   ordered=self.ordered)),
    )

    if inplace:
        self._data._psdf._update_internal_frame(internal)
        return None
    else:
        psser = DataFrame(internal)._psser_for(self._data._column_label)
        return psser._with_new_scol(psser.spark.column,
                                    field=psser._internal.data_fields[0])
def __preprocess(self, X):
    # Drop empty features (dataset v. 1.0.0): unspsc_code, label
    X = X.drop(["label", "unspsc_code"], axis=1)

    # Use unordered categories for several columns. List category values to
    # support use cases when some values are absent from a batch of source
    # data.
    brand_types = CategoricalDtype(categories=[
        "b0", "b1", "b10", "b100", "b101", "b102", "b103", "b104", "b105",
        "b106", "b107", "b108", "b109", "b11", "b110", "b111", "b112",
        "b113", "b114", "b115", "b116", "b117", "b118", "b119", "b12",
        "b120", "b121", "b122", "b123", "b124", "b125", "b126", "b127",
        "b128", "b129", "b13", "b130", "b131", "b132", "b133", "b134",
        "b135", "b136", "b137", "b138", "b139", "b14", "b140", "b141",
        "b142", "b143", "b144", "b145", "b146", "b147", "b148", "b149",
        "b15", "b16", "b17", "b18", "b19", "b2", "b20", "b21", "b22", "b23",
        "b24", "b25", "b26", "b27", "b28", "b29", "b3", "b30", "b31", "b32",
        "b33", "b34", "b35", "b36", "b37", "b38", "b39", "b4", "b40", "b41",
        "b42", "b43", "b44", "b45", "b46", "b47", "b48", "b49", "b5", "b50",
        "b51", "b52", "b53", "b54", "b55", "b56", "b57", "b58", "b59", "b6",
        "b60", "b61", "b62", "b63", "b64", "b65", "b66", "b67", "b68",
        "b69", "b7", "b70", "b71", "b72", "b73", "b74", "b75", "b76", "b77",
        "b78", "b79", "b8", "b80", "b81", "b82", "b83", "b84", "b85", "b86",
        "b87", "b88", "b89", "b9", "b90", "b91", "b92", "b93", "b94", "b95",
        "b96", "b97", "b98", "b99"
    ], ordered=False)
    X["brand"] = X["brand"].astype(brand_types)

    cat1_types = CategoricalDtype(categories=[
        "baby", "clothing", "home", "kidswear", "menswear", "womenswear"
    ], ordered=False)
    X["category-1"] = X["category-1"].astype(cat1_types)

    cat2_types = CategoricalDtype(categories=[
        "home", "footwear", "nightwear", "thermals", "outerwear",
        "accessory", "uniform", "suit", "swimwear", "headgear",
        "sportswear", "costume", "clothing", "undergarments", "baby",
        "dress", "beachwear", "men-undergarments", "hosiery",
        "women-beachwear", "women-undergarments", "women-sportswear"
    ], ordered=False)
    X["category-2"] = X["category-2"].astype(cat2_types)

    cat3_types = CategoricalDtype(categories=[
        "backpack", "bikin", "body", "boxer-brief", "bra", "brief",
        "briefs", "cap", "coats", "costume", "curtain", "dress",
        "evening-dress", "fancy-dress", "flat-cap", "gloves", "hat",
        "hoodie", "jacket", "jean-shorts", "jeans", "jersey", "knit-cap",
        "knitwear", "long-sleeved-top", "mat", "overalls", "panties",
        "pants", "pillow", "pyjama", "scarf", "sheets", "shorts", "skirts",
        "snow-suit", "socks", "sport-bra", "stockings", "swimsuit",
        "T-shirt", "tie", "tights", "top", "towel", "trousers",
        "underpants", "wedding-dress"
    ], ordered=False)
    X["category-3"] = X["category-3"].astype(cat3_types)

    colour_types = CategoricalDtype(categories=[
        "Ivory", "amber", "aquamarine", "black", "blue", "blue gray",
        "bondi blue", "brown", "colourful", "dark green", "dark grey",
        "gold", "golden", "gray", "green", "grey", "indigo", "light brown",
        "light grey", "lime", "maroon", "metal", "mosaic", "mustard",
        "natural", "navy", "neon", "orange", "peach", "pink", "purple",
        "red", "silver", "teal", "turquoise", "unbleached", "unknown",
        "violet", "wheat", "white", "yellow"
    ], ordered=False)
    X["colour"] = X["colour"].astype(colour_types)

    fabric_type_types = CategoricalDtype(categories=["K", "W"],
                                         ordered=False)
    X["fabric_type"] = X["fabric_type"].astype(fabric_type_types)

    gender_types = CategoricalDtype(
        categories=["B", "G", "K", "M", "U", "Y", "W"], ordered=False)
    X["gender"] = X["gender"].astype(gender_types)

    made_in_types = CategoricalDtype(categories=[
        "AU", "BD", "BE", "BG", "BR", "CN", "CO", "CY", "DE", "DK", "EG",
        "ES", "FI", "FR", "GB", "GE", "GR", "HK", "IE", "IN", "IT", "JP",
        "KR", "LT", "LV", "ML", "MX", "PK", "RO", "SE", "TH", "TR", "TW",
        "US", "VE", "VN"
    ], ordered=False)
    X["made_in"] = X["made_in"].astype(made_in_types)

    season_types = CategoricalDtype(categories=["AYR", "MID", "SUM", "WIN"],
                                    ordered=False)
    X["season"] = X["season"].astype(season_types)

    # Use ordered categories for size
    size_type = CategoricalDtype(
        categories=["XS", "S", "M", "L", "XL", "XXL"], ordered=True)
    X["size"] = X["size"].astype(size_type)

    # Convert the categoricals into a one-hot vector of binary variables
    X = pd.get_dummies(X)
    #print(X)

    # Fill in 0 for NA in ftp_ columns
    X = X.fillna(0)
    #print(X)

    scaler = MinMaxScaler()
    X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
    #print(X_scaled)
    return X_scaled
def castComplicatedColumn(df, originalCol, newCol, castDict, exclude=None,
                          inPlace=True, tagOldColumn=False):
    '''
    df (DataFrame): dataframe to operate on
    originalCol (string): column to cast from df
    newCol (string): new name for cast column
    castDict (dict): mapping for casting
    exclude (list): list of excludeBy namedtuples to exclude via column and
        values of said column and replace with storeValue
    inPlace: modify in place or return new df

    returns:
        inPlace=False -> DataFrame: new modified df with cast column newCol

    example:
        df =
             AAA  BBB      CCC
        0   True    2  string1
        1   True    3  string2
        2  False    4  string3

        castComplicatedColumn(df, "AAA", "cast(AAA)",
                              {True: "Yes", False: "No"})
        df =
             AAA  BBB      CCC cast(AAA)
        0   True    2  string1       Yes
        1   True    3  string2       Yes
        2  False    4  string3        No

        castComplicatedColumn(df, "AAA", "cast(BBB)",
                              {2: "Valid", 3: "Valid", 4: "Invalid"},
                              inPlace=False) => dff
        dff =
             AAA  BBB cast(BBB)      CCC
        0   True    2     Valid  string1
        1   True    3     Valid  string2
        2  False    4   Invalid  string3

        castComplicatedColumn(df, "CCC", "cast(CCC)",
                              {"string1": "A", "string2": "B",
                               "string3": "C"},
                              exclude=[excludeBy(colName="AAA",
                                                 values=[False],
                                                 storeValue=NC)])
        df =
             AAA  BBB      CCC cast(CCC)
        0   True    2  string1         A
        1   True    3  string2         B
        2  False    4  string3        NC
    '''
    # create copy if we don't want to modify the original df
    dff = df if inPlace else df.copy()

    # initialize a new categorical column
    dff[newCol] = np.nan
    categories = list(set(castDict.values()))
    dff[newCol] = dff[newCol].astype(CategoricalDtype(categories=categories))

    # populate new column
    dff[newCol] = applyMapRecursively(dff[originalCol], castDict.get)

    # Recast to category since we've got three vars: True, False and None
    # (bool can't hold this in pandas; see
    # https://pandas.pydata.org/pandas-docs/stable/user_guide/gotchas.html#na-type-promotions
    # for more info on casting)
    if exclude:
        for excludeEntry in exclude:
            # add to categories if need be
            if excludeEntry.storeValue not in categories:
                categories.append(excludeEntry.storeValue)
                dff[newCol] = dff[newCol].astype(
                    CategoricalDtype(categories=categories))
            # set excluded entries to the passed value
            dff.loc[dff[excludeEntry.colName].isin(excludeEntry.values),
                    newCol] = excludeEntry.storeValue

    if tagOldColumn:
        if originalCol + unf in dff.columns:
            dff.drop(columns=originalCol + unf, inplace=True)
        dff.rename(columns={originalCol: originalCol + unf}, inplace=True)

    dff = dff.T.drop_duplicates().T  # remove duplicate columns

    if not inPlace:
        return dff