def _validate_category_dict(category_dct, cols):
    assert isinstance(category_dct, dict)
    for k, v in category_dct.items():
        assert k in cols, f"make_dummy_cols: {k} not in allowed columns: \n{cols}\n\n"
        if not isinstance(v, pd.CategoricalDtype):
            assert isinstance(
                v, list
            ), f"categories must be list or pd.CategoricalDtype. Got {type(v)}"
            category_dct[k] = pd.CategoricalDtype(list(set(v)))
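# Hypothetical usage sketch for _validate_category_dict above; the column
# names and category values are illustrative, not from the original source.
# List values are normalized in place to pd.CategoricalDtype instances; note
# that set() means the resulting category order is not guaranteed.
import pandas as pd

example_cats = {"color": ["red", "blue", "red"]}
_validate_category_dict(example_cats, cols=["color", "size"])
assert isinstance(example_cats["color"], pd.CategoricalDtype)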
def test_categorical_groupby(self):
    df = pd.DataFrame({'A': np.arange(6), 'B': list('aabbca')})
    df['B'] = df['B'].astype(pd.CategoricalDtype(list('cab')))
    df = df.set_index('B')
    # TODO(BEAM-11190): These aggregations can be done in index partitions,
    # but it will require a little more complex logic
    with beam.dataframe.allow_non_parallel_operations():
        self._run_test(lambda df: df.groupby(level=0).sum(), df)
        self._run_test(lambda df: df.groupby(level=0).mean(), df)
def _set_ohe_categories(self):
    for col in self.ohe_cols:
        if col in self.ohe_categories.keys():
            continue  # do not override user-supplied categories
        unique_vals = self.sample_df[col].dropna().unique()
        try:
            self.ohe_categories[col] = pd.CategoricalDtype(unique_vals)
        except ValueError:
            pass
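# Illustrative note on the try/except above (not from the original source):
# pd.CategoricalDtype rejects null or duplicate categories with a ValueError,
# which is the kind of failure the method silently skips.
import numpy as np
import pandas as pd

try:
    pd.CategoricalDtype([1.0, np.nan])  # null category is not allowed
except ValueError as err:
    print(err)  # "Categorical categories cannot be null"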
def __init__(self, pandas_obj):
    # validate and assign object
    self._validate(pandas_obj)
    self._obj = pandas_obj

    # define incorporated modules - columns consisting of others will not
    # have the dtype changed
    self._INCORPORATED_MODULES = ['builtins', 'numpy', 'pandas']

    # define a possible list of null values
    self._NULL_VALS = [
        None, np.nan, 'np.nan', 'nan', np.inf, 'np.inf', 'inf', -np.inf,
        '-np.inf', '', 'n/a', 'na', 'N/A', 'NA', 'unknown', 'unk',
        'UNKNOWN', 'UNK'
    ]

    # assign dtypes and limits
    # boolean
    BOOL_STRINGS_TRUE = ['t', 'true', 'yes', 'on']
    BOOL_STRINGS_FALSE = ['f', 'false', 'no', 'off']
    # NOTE: dict.update() returns None, so building the map in one chained
    # expression would silently set _BOOL_MAP_DICT to None; build it in steps.
    self._BOOL_MAP_DICT = {i: True for i in BOOL_STRINGS_TRUE}
    self._BOOL_MAP_DICT.update({i: False for i in BOOL_STRINGS_FALSE})
    self._DTYPE_BOOL_BASE = np.bool_  # np.bool was removed in NumPy 1.24
    self._DTYPE_BOOL_NULLABLE = pd.BooleanDtype()

    # unsigned integers - base and nullable
    self._DTYPES_UINT_BASE = [np.uint8, np.uint16, np.uint32, np.uint64]
    self._DTYPES_UINT_NULLABLE = [
        pd.UInt8Dtype(), pd.UInt16Dtype(), pd.UInt32Dtype(), pd.UInt64Dtype()
    ]
    self._LIMIT_LOW_UINT = [np.iinfo(i).min for i in self._DTYPES_UINT_BASE]
    self._LIMIT_HIGH_UINT = [np.iinfo(i).max for i in self._DTYPES_UINT_BASE]

    # signed integers - base and nullable
    self._DTYPES_INT_BASE = [np.int8, np.int16, np.int32, np.int64]
    self._DTYPES_INT_NULLABLE = [
        pd.Int8Dtype(), pd.Int16Dtype(), pd.Int32Dtype(), pd.Int64Dtype()
    ]
    self._LIMIT_LOW_INT = [np.iinfo(i).min for i in self._DTYPES_INT_BASE]
    self._LIMIT_HIGH_INT = [np.iinfo(i).max for i in self._DTYPES_INT_BASE]

    # floats - nullable by default
    self._DTYPES_FLOAT = [np.float16, np.float32, np.float64]

    # datetime - nullable by default
    self._DTYPE_DATETIME = np.datetime64

    # string
    self._DTYPE_STRING = pd.StringDtype()

    # categorical - nullable by default
    self._DTYPE_CATEGORICAL = pd.CategoricalDtype()
def setLevelForEachLevelFeatureData(self):
    self.levelFeatureData = self.levelFeatureData.copy()
    self.levelFeatureData = self.levelFeatureData.astype('str')
    for col in self.levelFeatureData.columns:
        orderCategory = InputController.setLevelForEachFeature(col)
        cat_dtype = pd.CategoricalDtype(
            categories=orderCategory.split(','), ordered=True)
        self.levelFeatureData[col] = self.levelFeatureData[col].astype(
            cat_dtype)
def test_coordinate_axis_with_category_dtype(self, x):
    order = ["b", "a", "d", "c"]
    x = x.astype(pd.CategoricalDtype(order))
    ax = mpl.figure.Figure().subplots()
    s = Nominal()._setup(x, Coordinate(), ax.xaxis)
    assert_array_equal(s(x), np.array([1, 3, 0, 3], float))
    f = ax.xaxis.get_major_formatter()
    assert f.format_ticks([0, 1, 2, 3]) == order
def as_ordered_weekday(col):
    return col.astype(
        pd.CategoricalDtype(ordered=True)).cat.reorder_categories(
            [
                "Monday", "Tuesday", "Wednesday", "Thursday", "Friday",
                "Saturday", "Sunday"
            ],
            ordered=True,
        )
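# Hypothetical usage sketch for as_ordered_weekday (data is illustrative).
# Note that reorder_categories requires the inferred categories to be exactly
# the seven day names, so every weekday must appear in `col`.
import pandas as pd

days = pd.Series(["Sunday", "Tuesday", "Monday", "Friday", "Thursday",
                  "Wednesday", "Saturday"])
print(as_ordered_weekday(days).sort_values())
# sorts Monday..Sunday in weekday order instead of alphabetically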
def processData_inner(origin_data: pd.DataFrame,
                      cat_lst: list = ["job", "marital", "education",
                                       "default", "housing", "loan",
                                       "contact", "poutcome"]) -> pd.DataFrame:
    """
    Process the original data.

    @param origin_data: original data in a DataFrame;
    @param cat_lst: categorical columns to one-hot encode;
    @return: the processed DataFrame, with month encoded as ordinal codes and
        the columns in cat_lst expanded into dummy columns;
    """
    origin_data["month"] = origin_data["month"].astype(
        pd.CategoricalDtype(['jan', 'feb', 'mar', 'apr', 'may', 'jun',
                             'jul', 'aug', 'sep', 'oct', 'nov', 'dec'],
                            ordered=True)).cat.codes
    dummies_df = pd.get_dummies(origin_data, columns=cat_lst)
    return dummies_df
def _fit(self, X: Union[ArrayLike, DataFrameType],
         handle_unknown: str = "error"):
    X = check_array(X, accept_dask_dataframe=True, dtype=None,
                    preserve_pandas_dataframe=True)
    if isinstance(X, np.ndarray):
        return super(OneHotEncoder, self)._fit(
            X, handle_unknown=handle_unknown)

    is_array = isinstance(X, da.Array)
    if is_array:
        _, n_features = X.shape
    else:
        n_features = len(X.columns)

    if self.categories != "auto":
        for cats in self.categories:
            if not np.all(np.sort(cats) == np.array(cats)):
                raise ValueError("Unsorted categories are not yet supported")
        if len(self.categories) != n_features:
            raise ValueError("Shape mismatch: if n_values is an array,"
                             " it has to be of shape (n_features,).")

    self.categories_ = []
    self.dtypes_: List[Optional[pd.CategoricalDtype]] = []

    if is_array:
        for i in range(n_features):
            Xi = X[:, i]
            if self.categories == "auto":
                cats = _encode(Xi)
            else:
                cats = np.array(self.categories[i], dtype=X.dtype)
            self.categories_.append(cats)
            self.dtypes_.append(None)
    else:
        for i in range(len(X.columns)):
            Xi = X.iloc[:, i]
            if self.categories != "auto":
                categories = self.categories[i]
                Xi = Xi.astype(pd.CategoricalDtype(categories))
            else:
                if not pd.api.types.is_categorical_dtype(Xi.dtype):
                    raise ValueError(
                        "All columns must be Categorical dtype when "
                        "'categories=\"auto\"'.")
            cats = _encode(Xi, uniques=Xi.cat.categories)
            self.categories_.append(cats)
            self.dtypes_.append(Xi.dtype)

    self.categories_ = dask.compute(self.categories_)[0]
def to_pandas(self) -> pd.CategoricalDtype:
    if self.categories is None:
        categories = None
    else:
        if isinstance(self.categories,
                      (cudf.Float32Index, cudf.Float64Index)):
            categories = self.categories.dropna().to_pandas()
        else:
            categories = self.categories.to_pandas()
    return pd.CategoricalDtype(categories=categories, ordered=self.ordered)
def Bonus3(df):
    diamond = df.mask(df.eq('None')).dropna()
    diamond["Volume"] = diamond["x"] * diamond["y"] * (
        diamond["z"].astype("float32"))
    # use .loc instead of chained indexing, which raises
    # SettingWithCopyWarning and may not write through
    diamond.loc[diamond["depth"] <= 60, "Volume"] = 8
    bins = (pd.qcut(list(diamond['Volume']), q=5, precision=1)).codes
    d = pd.CategoricalDtype(['1', '2', '3', '4', '5'], ordered=True)
    category = pd.Categorical.from_codes(bins, dtype=d)
    return pd.crosstab(diamond['cut'],
                       category).apply(lambda r: r / r.sum(), axis=1)
def test_constructor_no_categories(self):
    @nb.njit
    def func():
        return pd.CategoricalDtype()

    boxed = func()
    expected = pd.CategoricalDtype(ordered=None)
    assert boxed == expected
    assert boxed.categories == expected.categories
    assert boxed.ordered == expected.ordered
def __init__(  # pylint:disable=super-init-not-called
        self,
        categories: Optional[Iterable[Any]] = None,
        ordered: bool = False) -> None:
    dtypes.Category.__init__(self, categories, ordered)
    object.__setattr__(
        self,
        "type",
        pd.CategoricalDtype(self.categories, self.ordered),
    )
def convert_datetime(data_set):
    # convert the dateTime column into weekday / hour / minute features
    try:
        data_set['weekday'] = data_set.dateTime.apply(
            lambda x: datetime.datetime.strptime(
                x, "%Y-%m-%d %H:%M:%S").date().strftime('%A'))
        data_set['hour'] = data_set.dateTime.apply(
            lambda x: datetime.datetime.strptime(
                x, "%Y-%m-%d %H:%M:%S").hour)
        # fixed typo: was `date_set['minu']`
        data_set['minu'] = data_set.dateTime.apply(
            lambda x: datetime.datetime.strptime(
                x, "%Y-%m-%d %H:%M:%S").minute)
    except Exception:
        data_set['weekday'] = data_set.dateTime.apply(
            lambda x: x.date().strftime('%A'))
        data_set['hour'] = data_set.dateTime.apply(lambda x: x.hour)
        data_set['minu'] = data_set.dateTime.apply(lambda x: x.minute)
    data_set.drop(['dateTime'], axis=1, inplace=True)

    # one-hot encode the weekday
    try:
        cat_type = CategoricalDtype(categories=WEEKS, ordered=True)
        data_set['weekday'] = data_set['weekday'].astype(cat_type)
    except Exception:
        # fallback for old pandas (< 0.23), where astype took a
        # `categories` keyword
        data_set["weekday"] = data_set.weekday.astype('category',
                                                      categories=WEEKS)
    data_set = pd.concat(
        [data_set, pd.get_dummies(data_set["weekday"], prefix='weekday')],
        axis=1)
    data_set.drop(['weekday'], axis=1, inplace=True)

    # one-hot encode the hour
    try:
        cat_type = CategoricalDtype(categories=HOURS, ordered=True)
        data_set['hour'] = data_set['hour'].astype(cat_type)
    except Exception:
        # fixed typo: was astype('hour', categories=HOURS)
        data_set["hour"] = data_set.hour.astype('category', categories=HOURS)
    data_set = pd.concat(
        [data_set, pd.get_dummies(data_set["hour"], prefix='hour')], axis=1)
    data_set.drop(['hour'], axis=1, inplace=True)

    # bin minutes into 5-minute buckets and one-hot encode
    bins = [x for x in range(0, 60, 5)]
    minu_CAT = [str(x) + "-" + str(x + 5) for x in range(0, 60, 5)]
    data_set['minu'] = np.vectorize(dict(enumerate(minu_CAT, 1)).get)(
        np.digitize(data_set['minu'], bins))
    data_set['minu'] = data_set.minu.astype(
        pd.CategoricalDtype(categories=minu_CAT))
    data_set = pd.concat(
        [data_set, pd.get_dummies(data_set["minu"], prefix='minu')], axis=1)
    data_set.drop(['minu'], axis=1, inplace=True)
    return data_set
def get_setups(
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    try:
        db_setups = pkg.load_dataset('db_setups')
        game_setups = pkg.load_dataset('game_setups')
        free_setups = pkg.load_dataset('free_setups')
        null_setups = pkg.load_dataset('null_setups')
    except Exception:
        st2 = games.get_st2()
        si2, _ = games.get_si2_sg2()
        free_setups = si2.query('type == "free"')
        null_setups = si2.query('field_content.isnull()')
        game_setups = (
            si2.query('type != "free" & field_content.notnull()').assign(
                setup_str_red=lambda r: r.field_content.str[:40],
                setup_str_blue=lambda r: r.field_content.str[60:].str[::-1],
                dmz=lambda r: r.field_content.str[40:60]))
        assert (game_setups.dmz == 'AA__AA__AAAA__AA__AA').all()
        db_setups = (
            pd.wide_to_long(
                st2.merge(game_setups).drop(
                    columns=['player_red', 'player_blue', 'dmz']),
                stubnames=['setup_str'],
                i='gid',
                j='player',
                sep='_',
                suffix='(red|blue)')
            .reset_index()
            .assign(
                result=lambda r: np.where(
                    r.player == r.winner, 'win',
                    np.where(r.winner == 'draw', 'draw', 'loss')),
                score=lambda r: np.where(
                    r.result == 'win', 1.0,
                    np.where(r.result == 'draw', 0.5, 0.0)),
                setup_str=lambda r: r.setup_str.apply(strados2.decode_setup),
                setup_obj=lambda r: r.apply(
                    lambda x: Setup(x.setup_str, x.type), axis=1))
            .astype(dtype={
                'result': pd.CategoricalDtype(
                    categories=['win', 'draw', 'loss'])
            })
            .pipe(lambda df: pd.concat(
                [df, pd.get_dummies(df.result, prefix='', prefix_sep='')],
                axis=1))
            .loc[:, [
                'gid', 'filename', 'period', 'freq', 'ext', 'type', 'player',
                'result', 'win', 'draw', 'loss', 'score', 'ending',
                'num_moves', 'num_turns', 'next_move', 'setup_str',
                'setup_obj'
            ]]
            .pipe(label.setups)
            .sort_values(['gid', 'player'])
            .reset_index(drop=True))
        assert all(db_setups.setup_obj.apply(lambda x: x.ok()))
        pkg.save_dataset(db_setups, 'db_setups')
        pkg.save_dataset(game_setups, 'game_setups')
        pkg.save_dataset(free_setups, 'free_setups')
        pkg.save_dataset(null_setups, 'null_setups')
    return db_setups, game_setups, free_setups, null_setups
def calculate_dataset_rates(files_to_process,
                            file_name_style=file_names_style['style4'],
                            group_by=PRED_EVOL_T,
                            region_order=('N-450', 'MF-NCR')):
    """
    Process a set of files containing tree summaries to produce a table of
    statistics for each data set.

    Parameters
    ----------
    files_to_process : tuple
        A tuple of tuples, each containing paths of csv files with tree
        summaries for each genotype to be processed together.
    file_name_style : regex
        A string containing a regular expression to extract information from
        the csv file name.
    group_by : str
        The column heading to group results by.
    region_order : tuple
        The genomic regions in the order to be plotted.

    Returns
    -------
    None
    """
    for file_set in files_to_process:
        all_rates = pd.DataFrame()
        out_path = None
        for file in file_set:
            # load results
            rates = pd.read_csv(file)
            region_cats = pd.CategoricalDtype(categories=region_order,
                                              ordered=True)
            rates[REGION] = rates[REGION].astype(region_cats)
            # get genotype from file name
            in_dir, _ = os.path.split(file)
            name_info = re.search(file_name_style, file).groupdict()
            genotype = name_info[REGEX_GENOTYPE]
            if out_path is None:
                # set output handle
                dataset = name_info[REGEX_DATASET]
                settings = name_info[REGEX_SETTINGS]
                out_path = os.path.join(
                    in_dir.replace('input', 'output'),
                    f'{dataset}-{settings}-{group_by}')
            # calculate rates and set df to long format for plotting
            rates = prepare_df_for_plotting(rates, group_by=group_by)
            rates[GENOTYPE] = genotype
            all_rates = pd.concat([all_rates, rates])
        stats_out_file = f'{out_path}-stats.csv'
        all_rates.to_csv(stats_out_file, index=False)
        files_done = '\n'.join(file_set)
        print(f'Completed processing set of files containing\n{files_done}\n'
              f'Model statistics table saved to {stats_out_file}\n')
def main1(args):
    print(args)
    input_coordinate_list = args.input_coordinate_list
    input_vcf = args.input_vcf
    setting_name = args.setting
    output_dir = os.path.dirname(input_coordinate_list)

    input_coordinate_list_data = pd.read_csv(input_coordinate_list,
                                             header=None, delimiter="\t")
    input_coordinate_list_data.columns = ["#CHROM", "POS"]

    # input_vcf_data = pd.read_csv(input_vcf, delimiter="\t", header=1,
    #                              skiprows=lambda x: rm_comment(x))
    input_vcf_data = []
    with open(input_vcf, "r") as fh:
        while True:
            line = fh.readline()
            if not line:
                break
            elif re.search(pattern1, line):
                pass
            else:
                input_vcf_data.append(line.rsplit("\t"))
    input_vcf_data2 = pd.DataFrame(input_vcf_data[1:])
    input_vcf_data2.columns = input_vcf_data[0]
    input_vcf_data2["POS"] = input_vcf_data2["POS"].astype(int)

    chr_index = pd.CategoricalDtype(
        ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12",
         "13", "14", "15", "16", "17", "18", "19", "20", "21", "22",
         "X", "Y"],
        ordered=True)

    # df3 = input_coordinate_list_data.set_index(["#CHROM", "POS"])
    # df3.update(input_vcf_data2.set_index(["#CHROM", "POS"]))
    # df3.reset_index()

    a = set(zip(input_coordinate_list_data["#CHROM"],
                input_coordinate_list_data["POS"]))
    b = set(zip(input_vcf_data2["#CHROM"], input_vcf_data2["POS"]))
    coordinate_list_not_in_vcf = list(a.difference(b))
    coordinate_list_in_vcf = list(a & b)

    pd_coordinate_list_not_in_vcf = pd.DataFrame(coordinate_list_not_in_vcf)
    pd_coordinate_list_in_vcf = pd.DataFrame(coordinate_list_in_vcf)
    pd_coordinate_list_not_in_vcf.columns = ["#CHROM", "POS"]
    pd_coordinate_list_in_vcf.columns = ["#CHROM", "POS"]
    pd_coordinate_list_not_in_vcf['#CHROM'] = (
        pd_coordinate_list_not_in_vcf['#CHROM'].astype(chr_index))
    pd_coordinate_list_in_vcf['#CHROM'] = (
        pd_coordinate_list_in_vcf['#CHROM'].astype(chr_index))
    pd_coordinate_list_not_in_vcf['POS'] = (
        pd_coordinate_list_not_in_vcf['POS'].astype('int'))
    pd_coordinate_list_in_vcf['POS'] = (
        pd_coordinate_list_in_vcf['POS'].astype('int'))

    pd_coordinate_list_not_in_vcf2 = pd_coordinate_list_not_in_vcf.sort_values(
        ['#CHROM', 'POS'], ascending=[True, True])
    pd_coordinate_list_in_vcf2 = pd_coordinate_list_in_vcf.sort_values(
        ['#CHROM', 'POS'], ascending=[True, True])

    pd_coordinate_list_not_in_vcf2.to_csv(
        os.path.join(output_dir,
                     "%s_coordinate_list_NOT_in_vcf.tsv" % setting_name),
        sep='\t', index=False, header=False)
    pd_coordinate_list_in_vcf2.to_csv(
        os.path.join(output_dir,
                     "%s_coordinate_list_in_vcf.tsv" % setting_name),
        sep='\t', index=False, header=False)
def test_init(self):
    t = r.ReduceMemoryTransformer(verbose=True)
    df = create_df_all()
    t.fit_transform(df)
    dtypes_expected = {
        "color": pd.CategoricalDtype(["blue", "red"]),
        "amount": np.dtype("int8"),
        "price": np.dtype("float32"),
    }
    # This transformer modifies df in place!
    self.assertDictEqual(df.dtypes.to_dict(), dtypes_expected)
def deserialize_onehot_encoder(model_dict):
    model = OneHotEncoder(**model_dict['params'])
    categories_ = list(map(lambda x: np.array(x), model_dict['categories_']))
    dtypes_ = list(
        map(lambda x: pd.CategoricalDtype(categories=x),
            model_dict['categories_']))
    model.categories_ = categories_
    model.dtypes_ = dtypes_
    return model
def sample(self, incomplete_data: pd.DataFrame) -> pd.Categorical:
    """Sample based on parent values."""
    parent_values_array = incomplete_data[self.parents].apply(
        lambda x: x.cat.codes).values
    random_vector = np.random.uniform(size=parent_values_array.shape[0])
    parent_values: List[Tuple[int, ...]] = list(
        map(tuple, parent_values_array))
    out_array = _sample_cpt(self.cumsum_array, parent_values, random_vector)
    dtype = pd.CategoricalDtype(self.levels, ordered=True)
    return pd.Categorical.from_codes(codes=out_array, dtype=dtype)
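# Illustrative sketch (not from the original source) of the from_codes call
# used above: integer codes index into the dtype's categories, with -1
# meaning missing.
import pandas as pd

dtype = pd.CategoricalDtype(["low", "mid", "high"], ordered=True)
print(pd.Categorical.from_codes([0, 2, -1], dtype=dtype))
# ['low', 'high', NaN] with ordered categories low < mid < high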
def test_read_categorical(store):
    df = pd.DataFrame({"col": ["a"]}).astype({"col": "category"})
    serialiser = ParquetSerializer()
    key = serialiser.store(store, "prefix", df)

    df = serialiser.restore_dataframe(store, key)
    assert df.dtypes["col"] == "O"

    df = serialiser.restore_dataframe(store, key, categories=["col"])
    assert df.dtypes["col"] == pd.CategoricalDtype(["a"], ordered=False)
def factorize(series: pd.Series,
              categories: List = None,
              ordered: bool = False) -> pd.Series:
    ''' factorize / make column a categorical dtype

    Parameters
    ----------
    series
        pd.Series object to be converted to categorical
    categories
        list of unique category values within pd.Series
    ordered
        If True, the categorical is ordered.

    Returns
    -------
    pd.Series
        Series with categorical data type

    Examples
    --------
    .. code-block::

        cat_order = ['Tops & Blouses', 'Beachwear', 'Footwear', 'Jeans',
                     'Sportswear']

        %%piper
        sample_sales()
        >> assign(product=lambda x: factorize(x['product'],
                                              categories=cat_order,
                                              ordered=True))
        >> group_by(['location', 'product'])
        >> summarise(Total=('actual_sales', 'sum'))
        >> unstack()
        >> flatten_cols(remove_prefix='Total')
        >> head(tablefmt='plain')

        location    Tops & Blouses    Beachwear    Footwear    Jeans    Sportswear
        London              339236       388762      274674   404440        291561
        Milan               523052       368373      444624   364343        319199
        Paris               481787       464725      383093   178117        150222
    '''
    if categories is None:
        series = series.astype('category')
    else:
        category_dtype = pd.CategoricalDtype(categories=categories,
                                             ordered=ordered)
        series = series.astype(category_dtype)
    return series
def test_align_categorical(self, l_ordered, r_ordered, expected):
    # GH-28397
    df_1 = DataFrame({
        "A": np.arange(6, dtype="int64"),
        "B": Series(list("aabbca")).astype(
            pd.CategoricalDtype(list("cab"), ordered=l_ordered)),
    }).set_index("B")
    df_2 = DataFrame({
        "A": np.arange(5, dtype="int64"),
        "B": Series(list("babca")).astype(
            pd.CategoricalDtype(list("cab"), ordered=r_ordered)),
    }).set_index("B")

    aligned_1, aligned_2 = df_1.align(df_2)
    assert isinstance(aligned_1.index, expected)
    assert isinstance(aligned_2.index, expected)
    tm.assert_index_equal(aligned_1.index, aligned_2.index)
def fetch_data_for_status_redenna_piechart(project_name, client,
                                           click_filter=None):
    counts = pd.DataFrame(
        collection.get_document(
            collection="Indicators",
            graph_name="ActualStatusBarChartIndicator",
            project=project_name,
            client=client,
        ))
    if not counts.empty:
        clusters = config.client_config[client]["clusters_reden_na"]
        cluster_types = pd.CategoricalDtype(
            categories=list(clusters.keys()), ordered=True)
        counts["cluster_redenna"] = counts["cluster_redenna"].astype(
            cluster_types)
        mask = pd.Series([True]).repeat(len(counts.index)).values
        if click_filter:
            for col, value in click_filter.items():
                mask = mask & (counts[col] == value)
            cols_filter = list(click_filter.keys())
        else:
            cols_filter = []
        cols = list(
            dict.fromkeys(cols_filter + ["cluster_redenna"])) + ["laagbouw"]
        result = counts[mask][cols + ["count"]].groupby(
            cols).sum().reset_index()
        total = (
            result.groupby("cluster_redenna")
            .sum()
            .reset_index()[["cluster_redenna", "count"]]
        )
        laagbouw = (
            result[result.laagbouw]
            .groupby("cluster_redenna")
            .sum()
            .reset_index()[["cluster_redenna", "count"]]
        )
        hoogbouw = (
            result[~result.laagbouw]
            .groupby("cluster_redenna")
            .sum()
            .reset_index()[["cluster_redenna", "count"]]
        )
    else:
        total = pd.DataFrame()
        laagbouw = pd.DataFrame()
        hoogbouw = pd.DataFrame()
    return total, laagbouw, hoogbouw
def _order_strings(str_series: pd.Series) -> pd.Series:
    '''Turn a string series of the form cNN, where c is a character and NN
    is an integer, into an ordered categorical.'''
    # get unique ints
    num_series = str_series.str.slice(start=1).astype(int)
    unique_num = np.unique(num_series)
    # prepend c to the ints
    c = str_series[0][0]
    unique_str = c + pd.Series(unique_num).astype(str)
    # categorize the original series
    category_type = pd.CategoricalDtype(unique_str, ordered=True)
    return str_series.astype(category_type)
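# Hypothetical usage sketch for _order_strings (data is illustrative): the
# categories are ordered numerically, so "q10" sorts after "q9" rather than
# before "q2" as plain string sorting would give.
import numpy as np
import pandas as pd

s = pd.Series(["q10", "q2", "q9"])
print(_order_strings(s).sort_values())
# q2, q9, q10 instead of the lexicographic q10, q2, q9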
def to_dataframe(cls, aligns: List, chrom_order: List[str] = None):
    columns = [a[0] for a in aligns[0]]
    if chrom_order:
        overrides = {
            "chrom": pd.CategoricalDtype(chrom_order, ordered=True)
        }
    else:
        overrides = {}
    dtype = cls.pandas_dtype(overrides=overrides)
    df = pd.DataFrame([a.to_tuple() for a in aligns], columns=columns)
    df = df.astype(dtype)
    return df
def as_dtype(nbtype):
    """
    Return a Pandas *dtype* instance corresponding to the given Numba type.
    NotImplementedError is raised if no correspondence is known.
    """
    nbtype = types.unliteral(nbtype)
    if isinstance(nbtype, CategoricalDtypeType):
        return pd.CategoricalDtype(categories=nbtype.categories,
                                   ordered=nbtype.ordered)
    raise NotImplementedError("%r cannot be represented as a Pandas dtype" %
                              (nbtype,))
def _draw_nr_of_contacts(distribution, is_participating, states, seed):
    """Draw the number of contacts for everyone in is_participating.

    Args:
        distribution (pandas.Series): slice of the params DataFrame with the
            distribution. The `subcategory` level of the index either
            identifies the age group specific distribution or must be the
            same for the whole slice. The `name` index level gives the
            support and the values of the Series give the probabilities.
        is_participating (pandas.Series): same index as states. True for the
            individuals that participate in the current contact model, i.e.
            for which the number of contacts should be drawn.
        states (pandas.DataFrame): sid states DataFrame.
        seed (int): seed for the random number generator.

    Returns:
        nr_of_contacts (pandas.Series): Same index as the states, values are
            the number of contacts for each person.

    """
    is_age_varying = (
        distribution.index.get_level_values("subcategory").nunique() > 1)
    if is_age_varying:
        age_labels = [f"{i}-{i + 9}" for i in range(0, 71, 10)] + ["80-100"]
        age_dtype = pd.CategoricalDtype(categories=age_labels, ordered=True)
        age_group = states["age_group"].astype(age_dtype)
        age_codes = age_group.cat.codes.to_numpy()

        probs_df = distribution.unstack().reindex(age_labels).fillna(0)
        support = probs_df.columns.to_numpy().astype(int)
        probs = probs_df.to_numpy()
        cum_probs = probs.cumsum(axis=1)

        nr_of_contacts_arr = _draw_age_varying_nr_of_contacts_numba(
            support=support,
            cum_probs=cum_probs,
            age_codes=age_codes,
            is_participating=is_participating.to_numpy(),
            seed=seed,
        )
    else:
        np.random.seed(seed)
        support = distribution.index.get_level_values("name").to_numpy()
        probs = distribution.to_numpy()
        nr_of_contacts_arr = np.where(
            is_participating.to_numpy(),
            np.random.choice(support, p=probs, size=len(states)),
            0,
        )
    return pd.Series(nr_of_contacts_arr, index=states.index)
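# Illustrative sketch (not from the original source) of the age-code lookup
# used above: casting to a fixed CategoricalDtype makes .cat.codes a stable
# integer index into the probability table, with -1 for unseen labels.
import pandas as pd

age_labels = [f"{i}-{i + 9}" for i in range(0, 71, 10)] + ["80-100"]
age_dtype = pd.CategoricalDtype(categories=age_labels, ordered=True)
ages = pd.Series(["0-9", "30-39", "80-100"]).astype(age_dtype)
print(ages.cat.codes.to_numpy())  # [0, 3, 8]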
def find_common_type_cat(types):
    if all(isinstance(t, pandas.CategoricalDtype) for t in types):
        if all(t.ordered for t in types):
            # np.unique already returns sorted unique values; the stray `[0]`
            # in the original reduced the result to a single scalar, which
            # np.sort cannot handle
            return pandas.CategoricalDtype(
                np.sort(np.unique([c for t in types for c in t.categories])),
                ordered=True,
            )
        return union_categoricals(
            [pandas.Categorical([], dtype=t) for t in types],
            sort_categories=all(t.ordered for t in types),
        ).dtype
    else:
        return find_common_type(types)
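# Hypothetical usage sketch for find_common_type_cat (inputs illustrative;
# assumes the helper above and its imports are in scope): two ordered
# categorical dtypes collapse to one ordered dtype over the sorted union of
# their categories.
import pandas

a = pandas.CategoricalDtype(["a", "b"], ordered=True)
b = pandas.CategoricalDtype(["b", "c"], ordered=True)
print(repr(find_common_type_cat([a, b])))
# CategoricalDtype(categories=['a', 'b', 'c'], ordered=True, ...)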
def _generate_value(dtype, fill_value):
    # special handling for datetime64 and timedelta64
    dispatch = {
        np.datetime64: pd.Timestamp,
        np.timedelta64: pd.Timedelta,
        pd.CategoricalDtype.type: lambda x: pd.CategoricalDtype([x]),
        # for object, we do not know the actual dtype,
        # just convert to str for common usage
        np.object_: lambda x: str(fill_value),
    }
    # otherwise, just use dtype.type itself to convert
    convert = dispatch.get(dtype.type, dtype.type)
    return convert(fill_value)
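# Illustrative sketch (not from the original source) of how the dispatch
# table maps a numpy dtype to a pandas/numpy value constructor.
import numpy as np
import pandas as pd

print(_generate_value(np.dtype("datetime64[ns]"), "2020-01-01"))
# Timestamp('2020-01-01 00:00:00'), via pd.Timestamp
print(_generate_value(np.dtype("float64"), "1.5"))
# 1.5, via np.float64 (the dtype.type fallback)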