Code example #1
def _validate_category_dict(category_dct, cols):
    assert isinstance(category_dct, dict)
    for k, v in category_dct.items():
        assert k in cols, f"make_dummy_cols: {k} not in allowed columns: \n{cols}\n\n"
        if not isinstance(v, pd.CategoricalDtype):
            assert isinstance(
                v, list
            ), f"categories must be list or pd.CategoricalDtype. Got {type(v)}"
            category_dct[k] = pd.CategoricalDtype(list(set(v)))
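A minimal usage sketch (hypothetical column names; assumes pandas is imported as pd): a plain list value is deduplicated and normalized in place to a pd.CategoricalDtype, while a value that is already a dtype is left untouched.

import pandas as pd

cats = {
    "color": ["red", "blue", "red"],               # plain list: deduplicated and wrapped
    "size": pd.CategoricalDtype(["S", "M", "L"]),  # already a dtype: left as-is
}
_validate_category_dict(cats, cols=["color", "size"])
assert isinstance(cats["color"], pd.CategoricalDtype)  # mutated in place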
Code example #2
File: frames_test.py Project: ymatzki/beam
 def test_categorical_groupby(self):
     df = pd.DataFrame({'A': np.arange(6), 'B': list('aabbca')})
     df['B'] = df['B'].astype(pd.CategoricalDtype(list('cab')))
     df = df.set_index('B')
     # TODO(BEAM-11190): These aggregations can be done in index partitions, but
     # it will require a little more complex logic
     with beam.dataframe.allow_non_parallel_operations():
         self._run_test(lambda df: df.groupby(level=0).sum(), df)
         self._run_test(lambda df: df.groupby(level=0).mean(), df)
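For reference, the plain pandas behavior this test pins down, as a standalone sketch (no Beam required): grouping on a categorical index produces one group per category, ordered by the declared category order ('c', 'a', 'b') rather than alphabetically.

import numpy as np
import pandas as pd

df = pd.DataFrame({'A': np.arange(6), 'B': list('aabbca')})
df['B'] = df['B'].astype(pd.CategoricalDtype(list('cab')))
print(df.set_index('B').groupby(level=0).sum())
# index comes out in category order: c, a, b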
Code example #3
 def _set_ohe_categories(self):
     for col in self.ohe_cols:
         if col in self.ohe_categories:
             continue  # to not override user supplied categories
         unique_vals = self.sample_df[col].dropna().unique()
         try:
             self.ohe_categories[col] = pd.CategoricalDtype(unique_vals)
         except ValueError:
             pass
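The try/except above guards against inputs that pd.CategoricalDtype rejects; for instance, categories must be unique and non-null. A quick illustration (the failing calls are left commented out):

import numpy as np
import pandas as pd

pd.CategoricalDtype(["a", "b"])        # fine
# both of these raise ValueError:
# pd.CategoricalDtype(["a", "a"])      # categories must be unique
# pd.CategoricalDtype(["a", np.nan])   # categories cannot contain nulls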
Code example #4
    def __init__(self, pandas_obj):
        # validate and assign object
        self._validate(pandas_obj)
        self._obj = pandas_obj

        # define incorporated modules - columns consisting of others will not have the dtype changed
        self._INCORPORATED_MODULES = ['builtins', 'numpy', 'pandas']

        # define a possible list of null values
        self._NULL_VALS = [
            None, np.nan, 'np.nan', 'nan', np.inf, 'np.inf', 'inf', -np.inf,
            '-np.inf', '', 'n/a', 'na', 'N/A', 'NA', 'unknown', 'unk',
            'UNKNOWN', 'UNK'
        ]

        # assign dtypes and limits
        # boolean
        BOOL_STRINGS_TRUE = ['t', 'true', 'yes', 'on']
        BOOL_STRINGS_FALSE = ['f', 'false', 'no', 'off']
        self._BOOL_MAP_DICT = {i: True for i in BOOL_STRINGS_TRUE}
        # dict.update() returns None, so chaining it off the literal above
        # would assign None to the attribute; update in a separate step
        self._BOOL_MAP_DICT.update({i: False for i in BOOL_STRINGS_FALSE})
        self._DTYPE_BOOL_BASE = np.bool_
        self._DTYPE_BOOL_NULLABLE = pd.BooleanDtype()
        # unsigned integers - base and nullable
        self._DTYPES_UINT_BASE = [np.uint8, np.uint16, np.uint32, np.uint64]
        self._DTYPES_UINT_NULLABLE = [
            pd.UInt8Dtype(),
            pd.UInt16Dtype(),
            pd.UInt32Dtype(),
            pd.UInt64Dtype()
        ]
        self._LIMIT_LOW_UINT = [
            np.iinfo(i).min for i in self._DTYPES_UINT_BASE
        ]
        self._LIMIT_HIGH_UINT = [
            np.iinfo(i).max for i in self._DTYPES_UINT_BASE
        ]
        # signed integers - base and nullable
        self._DTYPES_INT_BASE = [np.int8, np.int16, np.int32, np.int64]
        self._DTYPES_INT_NULLABLE = [
            pd.Int8Dtype(),
            pd.Int16Dtype(),
            pd.Int32Dtype(),
            pd.Int64Dtype()
        ]
        self._LIMIT_LOW_INT = [np.iinfo(i).min for i in self._DTYPES_INT_BASE]
        self._LIMIT_HIGH_INT = [np.iinfo(i).max for i in self._DTYPES_INT_BASE]
        # floats - nullable by default
        self._DTYPES_FLOAT = [np.float16, np.float32, np.float64]
        # datetime - nullable by default
        self._DTYPE_DATETIME = np.datetime64
        # string
        self._DTYPE_STRING = pd.StringDtype()
        # categorical - nullable by default
        self._DTYPE_CATEGORICAL = pd.CategoricalDtype()
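A sketch of how limit lists like these are typically used (the helper name smallest_uint_dtype is hypothetical, not part of the class above): walk the dtypes from narrowest to widest and return the first whose range covers the data.

import numpy as np
import pandas as pd

def smallest_uint_dtype(series: pd.Series):
    """Return the narrowest unsigned-integer dtype that can hold `series`."""
    lo, hi = series.min(), series.max()
    for dtype in [np.uint8, np.uint16, np.uint32, np.uint64]:
        info = np.iinfo(dtype)
        if info.min <= lo and hi <= info.max:
            return dtype
    raise ValueError("values out of unsigned integer range")

smallest_uint_dtype(pd.Series([0, 300]))  # -> np.uint16 (300 > 255)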
Code example #5
 def setLevelForEachLevelFeatureData(self):
     self.levelFeatureData = self.levelFeatureData.copy()
     self.levelFeatureData = self.levelFeatureData.astype('str')
     for col in self.levelFeatureData.columns:
         orderCategory = InputController.setLevelForEachFeature(col)
         cat_dtype = pd.CategoricalDtype(
             categories=orderCategory.split(','), ordered=True)
         self.levelFeatureData[col] = self.levelFeatureData[col].astype(
             cat_dtype)
Code example #6
    def test_coordinate_axis_with_category_dtype(self, x):

        order = ["b", "a", "d", "c"]
        x = x.astype(pd.CategoricalDtype(order))
        ax = mpl.figure.Figure().subplots()
        s = Nominal()._setup(x, Coordinate(), ax.xaxis)
        assert_array_equal(s(x), np.array([1, 3, 0, 3], float))
        f = ax.xaxis.get_major_formatter()
        assert f.format_ticks([0, 1, 2, 3]) == order
Code example #7
def as_ordered_weekday(col):
    # note: .cat.reorder_categories() requires the new list to match the
    # existing categories exactly, so it fails when some weekday is absent
    # from the data; declaring the full ordered dtype up front avoids that.
    return col.astype(
        pd.CategoricalDtype(
            [
                "Monday", "Tuesday", "Wednesday", "Thursday", "Friday",
                "Saturday", "Sunday"
            ],
            ordered=True,
        ))
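A usage sketch: after the conversion, sorting follows calendar order rather than alphabetical order.

import pandas as pd

col = pd.Series(["Friday", "Monday", "Sunday"])
print(as_ordered_weekday(col).sort_values())
# Monday, Friday, Sunday -- calendar order, not alphabetical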
Code example #8
def processData_inner(origin_data: pd.DataFrame,
                      cat_lst: list = ["job", "marital", "education",
                                       "default", "housing", "loan",
                                       "contact", "poutcome"]
                      ) -> pd.DataFrame:
    """
    Process the original data
    @param origin_data: original data in DataFrame object;
    @param cat_lst: categorical columns to one-hot encode;
    @return: the processed DataFrame of dummy columns;
    """
    origin_data["month"] = origin_data["month"].astype(
        pd.CategoricalDtype(['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul',
                             'aug', 'sep', 'oct', 'nov', 'dec'],
                            ordered=True)).cat.codes
    dummies_df = pd.get_dummies(origin_data, columns=cat_lst)
    return dummies_df
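Note the trailing .cat.codes: it replaces each month name with its position in the ordered category list (0 for 'jan' through 11 for 'dec'), and any value outside the list becomes -1. A quick check:

import pandas as pd

s = pd.Series(["mar", "jan", "xxx"])
months = pd.CategoricalDtype(['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul',
                              'aug', 'sep', 'oct', 'nov', 'dec'],
                             ordered=True)
print(s.astype(months).cat.codes.tolist())  # [2, 0, -1]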
Code example #9
    def _fit(self,
             X: Union[ArrayLike, DataFrameType],
             handle_unknown: str = "error"):
        X = check_array(X,
                        accept_dask_dataframe=True,
                        dtype=None,
                        preserve_pandas_dataframe=True)
        if isinstance(X, np.ndarray):
            return super(OneHotEncoder,
                         self)._fit(X, handle_unknown=handle_unknown)

        is_array = isinstance(X, da.Array)

        if is_array:
            _, n_features = X.shape
        else:
            n_features = len(X.columns)

        if self.categories != "auto":
            for cats in self.categories:
                if not np.all(np.sort(cats) == np.array(cats)):
                    raise ValueError(
                        "Unsorted categories are not yet supported")
            if len(self.categories) != n_features:
                raise ValueError("Shape mismatch: if n_values is an array,"
                                 " it has to be of shape (n_features,).")

        self.categories_ = []
        self.dtypes_: List[Optional[pd.CategoricalDtype]] = []

        if is_array:
            for i in range(n_features):
                Xi = X[:, i]
                if self.categories == "auto":
                    cats = _encode(Xi)
                else:
                    cats = np.array(self.categories[i], dtype=X.dtype)
                self.categories_.append(cats)
                self.dtypes_.append(None)
        else:
            for i in range(len(X.columns)):
                Xi = X.iloc[:, i]
                if self.categories != "auto":
                    categories = self.categories[i]
                    Xi = Xi.astype(pd.CategoricalDtype(categories))
                else:
                    if not pd.api.types.is_categorical_dtype(Xi.dtype):
                        raise ValueError(
                            "All columns must be Categorical dtype when "
                            "'categories=\"auto\"'.")

                cats = _encode(Xi, uniques=Xi.cat.categories)
                self.categories_.append(cats)
                self.dtypes_.append(Xi.dtype)

        self.categories_ = dask.compute(self.categories_)[0]
Code example #10
File: dtypes.py Project: zkh2016/cudf
 def to_pandas(self) -> pd.CategoricalDtype:
     if self.categories is None:
         categories = None
     else:
         if isinstance(self.categories,
                       (cudf.Float32Index, cudf.Float64Index)):
             categories = self.categories.dropna().to_pandas()
         else:
             categories = self.categories.to_pandas()
     return pd.CategoricalDtype(categories=categories, ordered=self.ordered)
Code example #11
def Bonus3(df):
    diamond = df.mask(df.eq('None')).dropna()
    diamond["Volume"] = diamond["x"] * diamond["y"] * (
        diamond["z"].astype("float32"))
    diamond["Volume"][diamond["depth"] <= 60] = 8
    bins = (pd.qcut(list(diamond['Volume']), q=5, precision=1)).codes
    d = pd.CategoricalDtype(['1', '2', '3', '4', '5'], ordered=True)
    category = pd.Categorical.from_codes(bins, dtype=d)
    return pd.crosstab(diamond['cut'], category).apply(lambda r: r / r.sum(),
                                                       axis=1)
Code example #12
    def test_constructor_no_categories(self):
        @nb.njit
        def func():
            return pd.CategoricalDtype()

        boxed = func()
        expected = pd.CategoricalDtype(ordered=None)
        assert (boxed == expected)
        assert (boxed.categories == expected.categories)
        assert (boxed.ordered == expected.ordered)
Code example #13
File: pandas_engine.py Project: tfwillems/pandera
 def __init__(  # pylint:disable=super-init-not-called
         self,
         categories: Optional[Iterable[Any]] = None,
         ordered: bool = False) -> None:
     dtypes.Category.__init__(self, categories, ordered)
     object.__setattr__(
         self,
         "type",
         pd.CategoricalDtype(self.categories, self.ordered),
     )
Code example #14
def convert_datetime(data_set):
    # convert the dateTime column to weekday / hour / minute
    try:
        data_set['weekday'] = data_set.dateTime.apply(
            lambda x: datetime.datetime.strptime(
                x, "%Y-%m-%d %H:%M:%S").date().strftime('%A'))
        data_set['hour'] = data_set.dateTime.apply(
            lambda x: datetime.datetime.strptime(x, "%Y-%m-%d %H:%M:%S").hour)
        data_set['minu'] = data_set.dateTime.apply(
            lambda x: datetime.datetime.strptime(x, "%Y-%m-%d %H:%M:%S").minute)
    except (TypeError, ValueError):
        # dateTime already holds datetime objects rather than strings
        data_set['weekday'] = data_set.dateTime.apply(
            lambda x: x.date().strftime('%A'))
        data_set['hour'] = data_set.dateTime.apply(lambda x: x.hour)
        data_set['minu'] = data_set.dateTime.apply(lambda x: x.minute)

    data_set.drop(['dateTime'], axis=1, inplace=True)

    # one-hot encode the weekday
    try:
        cat_type = CategoricalDtype(categories=WEEKS, ordered=True)
        data_set['weekday'] = data_set['weekday'].astype(cat_type)
    except Exception:
        # fallback for very old pandas, where astype took categories directly
        data_set["weekday"] = data_set.weekday.astype('category',
                                                      categories=WEEKS)
    data_set = pd.concat(
        [data_set,
         pd.get_dummies(data_set["weekday"], prefix='weekday')],
        axis=1)
    data_set.drop([str('weekday')], axis=1, inplace=True)

    # one-hot encode the hour
    try:
        cat_type = CategoricalDtype(categories=HOURS, ordered=True)
        data_set['hour'] = data_set['hour'].astype(cat_type)
    except Exception:
        # fallback for very old pandas, where astype took categories directly
        data_set["hour"] = data_set.hour.astype('category', categories=HOURS)
    data_set = pd.concat(
        [data_set, pd.get_dummies(data_set["hour"], prefix='hour')], axis=1)
    data_set.drop([str('hour')], axis=1, inplace=True)

    # bin minutes into 5-minute buckets & one-hot encode
    bins = list(range(0, 60, 5))
    minu_CAT = [f"{x}-{x + 5}" for x in range(0, 60, 5)]

    data_set['minu'] = np.vectorize(
        dict(enumerate(minu_CAT, 1)).get)(np.digitize(data_set['minu'], bins))
    data_set['minu'] = data_set.minu.astype(
        pd.CategoricalDtype(categories=minu_CAT))
    data_set = pd.concat(
        [data_set, pd.get_dummies(data_set["minu"], prefix='minu')], axis=1)

    data_set.drop([str('minu')], axis=1, inplace=True)

    return data_set
Code example #15
File: setups.py Project: rhalbersma/gravon
def get_setups(
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    try:
        db_setups = pkg.load_dataset('db_setups')
        game_setups = pkg.load_dataset('game_setups')
        free_setups = pkg.load_dataset('free_setups')
        null_setups = pkg.load_dataset('null_setups')
    except Exception:
        st2 = games.get_st2()
        si2, _ = games.get_si2_sg2()
        free_setups = si2.query('type == "free"')
        null_setups = si2.query('field_content.isnull()')
        game_setups = (
            si2.query('type != "free" & field_content.notnull()').assign(
                setup_str_red=lambda r: r.field_content.str[:40],
                setup_str_blue=lambda r: r.field_content.str[60:].str[::-1],
                dmz=lambda r: r.field_content.str[40:60]))
        assert (game_setups.dmz == 'AA__AA__AAAA__AA__AA').all()
        db_setups = (pd.wide_to_long(
            st2.merge(game_setups).drop(
                columns=['player_red', 'player_blue', 'dmz']),
            stubnames=['setup_str'],
            i='gid',
            j='player',
            sep='_',
            suffix='(red|blue)').reset_index().assign(
                result=lambda r: np.where(
                    r.player == r.winner, 'win',
                    np.where(r.winner == 'draw', 'draw', 'loss')),
                score=lambda r: np.where(
                    r.result == 'win', 1.0,
                    np.where(r.result == 'draw', 0.5, 0.0)),
                setup_str=lambda r: r.setup_str.apply(strados2.decode_setup),
                setup_obj=lambda r: r.apply(
                    lambda x: Setup(x.setup_str, x.type), axis=1)
            ).astype(
                dtype={
                    'result':
                    pd.CategoricalDtype(categories=['win', 'draw', 'loss'])
                }
            ).pipe(lambda df: pd.concat(
                [df, pd.get_dummies(df.result, prefix='', prefix_sep='')],
                axis=1
            )).loc[:, [
                'gid', 'filename', 'period', 'freq', 'ext', 'type', 'player',
                'result', 'win', 'draw', 'loss', 'score', 'ending',
                'num_moves', 'num_turns', 'next_move', 'setup_str', 'setup_obj'
            ]].pipe(label.setups).sort_values(['gid', 'player'
                                               ]).reset_index(drop=True))
        assert all(db_setups.setup_obj.apply(lambda x: x.ok()))
        pkg.save_dataset(db_setups, 'db_setups')
        pkg.save_dataset(game_setups, 'game_setups')
        pkg.save_dataset(free_setups, 'free_setups')
        pkg.save_dataset(null_setups, 'null_setups')
    return db_setups, game_setups, free_setups, null_setups
Code example #16
def calculate_dataset_rates(files_to_process,
                            file_name_style=file_names_style['style4'],
                            group_by=PRED_EVOL_T,
                            region_order=('N-450', 'MF-NCR')):
    """
    Process set of files containing tree summaries to produce a table
    containing the statistics for each data set.

    Parameters
    ----------
    files_to_process : tuple
        A tuple containing tuples, each containing paths of csv files with tree
        summaries for each genotype to be processed together.
    file_name_style : regex
        A string containing a regular expression to extract information from
        the csv file name.
    group_by : str
        The column heading to be used to group results by.
    region_order : tuple
        A tuple containing the list of genomic regions in the order to be
        plotted.

    Returns
    -------
    None
    """
    for file_set in files_to_process:
        all_rates = pd.DataFrame()
        out_path = None
        for file in file_set:
            # load results
            rates = pd.read_csv(file)
            region_cats = pd.CategoricalDtype(categories=region_order,
                                              ordered=True)
            rates[REGION] = rates[REGION].astype(region_cats)
            # get genotype from file name
            in_dir, _ = os.path.split(file)
            name_info = re.search(file_name_style, file).groupdict()
            genotype = name_info[REGEX_GENOTYPE]
            if out_path is None:
                # set output handle
                dataset = name_info[REGEX_DATASET]
                settings = name_info[REGEX_SETTINGS]
                out_path = os.path.join(in_dir.replace('input', 'output'),
                                        f'{dataset}-{settings}-{group_by}')
            # calculate rates and set df to long format for plotting
            rates = prepare_df_for_plotting(rates, group_by=group_by)
            rates[GENOTYPE] = genotype
            all_rates = pd.concat([all_rates, rates])
        stats_out_file = f'{out_path}-stats.csv'
        all_rates.to_csv(stats_out_file, index=False)
        files_done = '\n'.join(file_set)
        print(f'Completed processing set of files containing\n{files_done}\n'
              f'Model statistics table saved to {stats_out_file}\n')
Code example #17
def main1(args):
    print(args)
    input_coordinate_list = args.input_coordinate_list
    input_vcf = args.input_vcf
    setting_name = args.setting
    
    output_dir = os.path.dirname(input_coordinate_list)
    input_coordinate_list_data=pd.read_csv(input_coordinate_list,header=None,delimiter="\t")
    input_coordinate_list_data.columns = ["#CHROM","POS"]
#    input_vcf_data=pd.read_csv(input_vcf,delimiter="\t", header = 1, skiprows=lambda x: rm_comment(x))
    input_vcf_data =[]
    with open(input_vcf, "r") as fh:
        while True:
            line = fh.readline()
            if not line:
                break
            elif re.search(pattern1, line):
                pass
            else:
                input_vcf_data.append(line.rsplit("\t"))
    
    input_vcf_data2 =  pd.DataFrame(input_vcf_data[1:]) 
    input_vcf_data2.columns = input_vcf_data[0]
    input_vcf_data2["POS"] = input_vcf_data2["POS"].astype(int)   
    chr_index = pd.CategoricalDtype(["1","2","3","4","5","6","7","8","9","10","11","12","13","14","15","16","17","18","19","20","21","22","X","Y"], ordered = True)

#    df3 = input_coordinate_list_data.set_index(["#CHROM","POS"])
#    df3.update(input_vcf_data2.set_index(["#CHROM","POS"]))
#    df3.reset_index()

    a = set(zip(input_coordinate_list_data["#CHROM"],
                input_coordinate_list_data["POS"]))
    b = set(zip(input_vcf_data2["#CHROM"], input_vcf_data2["POS"]))
    coordinate_list_not_in_vcf = list(a.difference(b))
    coordinate_list_in_vcf = list(a & b)
    pd_coordinate_list_not_in_vcf = pd.DataFrame(coordinate_list_not_in_vcf)
    pd_coordinate_list_in_vcf = pd.DataFrame(coordinate_list_in_vcf)

    pd_coordinate_list_not_in_vcf.columns = ["#CHROM","POS"]
    pd_coordinate_list_in_vcf.columns = ["#CHROM","POS"]

    pd_coordinate_list_not_in_vcf['#CHROM'] = pd_coordinate_list_not_in_vcf['#CHROM'].astype(chr_index)
    pd_coordinate_list_in_vcf['#CHROM'] = pd_coordinate_list_in_vcf['#CHROM'].astype(chr_index)

    pd_coordinate_list_not_in_vcf['POS'] = pd_coordinate_list_not_in_vcf['POS'].astype('int')
    pd_coordinate_list_in_vcf['POS'] =  pd_coordinate_list_in_vcf['POS'].astype('int')



    pd_coordinate_list_not_in_vcf2 = pd_coordinate_list_not_in_vcf.sort_values(['#CHROM','POS'], ascending=[True, True])
    pd_coordinate_list_in_vcf2 = pd_coordinate_list_in_vcf.sort_values(['#CHROM','POS'], ascending=[True, True])
    

    pd_coordinate_list_not_in_vcf2.to_csv(os.path.join(output_dir,"%s_coordinate_list_NOT_in_vcf.tsv"%(setting_name)), sep='\t',index=False, header=False)
    pd_coordinate_list_in_vcf2.to_csv(os.path.join(output_dir,"%s_coordinate_list_in_vcf.tsv"%(setting_name)), sep='\t',index=False, header=False)
Code example #18
 def test_init(self):
     t = r.ReduceMemoryTransformer(verbose=True)
     df = create_df_all()
     t.fit_transform(df)
     dtypes_expected = {
         "color": pd.CategoricalDtype(["blue", "red"]),
         "amount": np.dtype("int8"),
         "price": np.dtype("float32"),
     }
     # This transformer modifies df in place!
     self.assertDictEqual(df.dtypes.to_dict(), dtypes_expected)
Code example #19
def deserialize_onehot_encoder(model_dict):
    model = OneHotEncoder(**model_dict['params'])
    categories_ = list(map(lambda x: np.array(x), model_dict['categories_']))
    dtypes_ = list(
        map(lambda x: pd.CategoricalDtype(categories=x),
            model_dict['categories_']))

    model.categories_ = categories_
    model.dtypes_ = dtypes_

    return model
Code example #20
File: parameters.py Project: Stoffle/BayNet
 def sample(self, incomplete_data: pd.DataFrame) -> pd.Categorical:
     """Sample based on parent values."""
     parent_values_array = incomplete_data[self.parents].apply(
         lambda x: x.cat.codes).values
     random_vector = np.random.uniform(size=parent_values_array.shape[0])
     parent_values: List[Tuple[int,
                               ...]] = list(map(tuple, parent_values_array))
     out_array = _sample_cpt(self.cumsum_array, parent_values,
                             random_vector)
     dtype = pd.CategoricalDtype(self.levels, ordered=True)
     return pd.Categorical.from_codes(codes=out_array, dtype=dtype)
Code example #21
File: test_parquet.py Project: seanahmad/kartothek
def test_read_categorical(store):
    df = pd.DataFrame({"col": ["a"]}).astype({"col": "category"})

    serialiser = ParquetSerializer()
    key = serialiser.store(store, "prefix", df)

    df = serialiser.restore_dataframe(store, key)
    assert df.dtypes["col"] == "O"

    df = serialiser.restore_dataframe(store, key, categories=["col"])
    assert df.dtypes["col"] == pd.CategoricalDtype(["a"], ordered=False)
Code example #22
def factorize(series: pd.Series,
              categories: Optional[List] = None,
              ordered: bool = False) -> pd.Series:
    ''' factorize / make column a categorical dtype

    Parameters
    ----------
    series
        pd.Series object to be converted to categorical
    categories
        list of unique category values within pd.Series
    ordered
        If true, categorical is ordered.


    Returns
    -------
    pd.Series
        Returned series with categorical data type

    Examples
    --------

    .. code-block::

        cat_order = ['Tops & Blouses', 'Beachwear',
                     'Footwear', 'Jeans', 'Sportswear']

        %%piper
        sample_sales()
        >> assign(product=lambda x: factorize(x['product'],
                                              categories=cat_order,
                                              ordered=True))
        >> group_by(['location', 'product'])
        >> summarise(Total=('actual_sales', 'sum'))
        >> unstack()
        >> flatten_cols(remove_prefix='Total')
        >> head(tablefmt='plain')

        location Tops & Blouses Beachwear  Footwear   Jeans Sportswear
        London           339236    388762    274674  404440     291561
        Milan            523052    368373    444624  364343     319199
        Paris            481787    464725    383093  178117     150222

    '''
    if categories is None:
        series = series.astype('category')
    else:
        category_dtype = pd.CategoricalDtype(categories=categories,
                                             ordered=ordered)
        series = series.astype(category_dtype)

    return series
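Outside the piper pipeline, the same call in plain pandas (a minimal sketch):

import pandas as pd

s = pd.Series(['Jeans', 'Beachwear', 'Footwear'])
s = factorize(s, categories=['Tops & Blouses', 'Beachwear',
                             'Footwear', 'Jeans', 'Sportswear'],
              ordered=True)
print(s.sort_values().tolist())  # ['Beachwear', 'Footwear', 'Jeans']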
Code example #23
    def test_align_categorical(self, l_ordered, r_ordered, expected):
        # GH-28397
        df_1 = DataFrame({
            "A":
            np.arange(6, dtype="int64"),
            "B":
            Series(list("aabbca")).astype(
                pd.CategoricalDtype(list("cab"), ordered=l_ordered)),
        }).set_index("B")
        df_2 = DataFrame({
            "A":
            np.arange(5, dtype="int64"),
            "B":
            Series(list("babca")).astype(
                pd.CategoricalDtype(list("cab"), ordered=r_ordered)),
        }).set_index("B")

        aligned_1, aligned_2 = df_1.align(df_2)
        assert isinstance(aligned_1.index, expected)
        assert isinstance(aligned_2.index, expected)
        tm.assert_index_equal(aligned_1.index, aligned_2.index)
Code example #24
def fetch_data_for_status_redenna_piechart(project_name, client, click_filter=None):

    counts = pd.DataFrame(
        collection.get_document(
            collection="Indicators",
            graph_name="ActualStatusBarChartIndicator",
            project=project_name,
            client=client,
        )
    )

    if not counts.empty:
        clusters = config.client_config[client]["clusters_reden_na"]
        cluster_types = pd.CategoricalDtype(
            categories=list(clusters.keys()), ordered=True
        )
        counts["cluster_redenna"] = counts["cluster_redenna"].astype(cluster_types)

        mask = pd.Series([True]).repeat(len(counts.index)).values
        if click_filter:
            for col, value in click_filter.items():
                mask = mask & (counts[col] == value)
            cols_filter = list(click_filter.keys())
        else:
            cols_filter = []

        cols = list(dict.fromkeys(cols_filter + ["cluster_redenna"])) + ["laagbouw"]
        result = counts[mask][cols + ["count"]].groupby(cols).sum().reset_index()

        total = (
            result.groupby("cluster_redenna")
            .sum()
            .reset_index()[["cluster_redenna", "count"]]
        )
        laagbouw = (
            result[result.laagbouw]
            .groupby("cluster_redenna")
            .sum()
            .reset_index()[["cluster_redenna", "count"]]
        )
        hoogbouw = (
            result[~result.laagbouw]
            .groupby("cluster_redenna")
            .sum()
            .reset_index()[["cluster_redenna", "count"]]
        )

    else:
        total = pd.DataFrame()
        laagbouw = pd.DataFrame()
        hoogbouw = pd.DataFrame()

    return total, laagbouw, hoogbouw
Code example #25
def _order_strings(str_series: pd.Series) -> pd.Series:
    '''Turn a string series of the form cNN, where c is a character and
       NN is an integer, into an ordered categorical'''
    # get unique ints
    num_series = str_series.str.slice(start = 1).astype(int)
    unique_num = np.unique(num_series)
    # append c to ints
    c = str_series.iloc[0][0]  # leading character, taken from the first row
    unique_str = c + pd.Series(unique_num).astype(str)
    # categorize the original series
    category_type = pd.CategoricalDtype(unique_str, ordered = True)
    return str_series.astype(category_type)
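A quick illustration of the effect: sorting now respects the numeric suffix instead of plain string order.

import pandas as pd

s = pd.Series(["c10", "c2", "c1"])
print(_order_strings(s).sort_values().tolist())
# ['c1', 'c2', 'c10'] -- not the lexicographic ['c1', 'c10', 'c2']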
Code example #26
 def to_dataframe(cls, aligns: List, chrom_order: Optional[List[str]] = None):
     columns = [a[0] for a in aligns[0]]
     if chrom_order:
         overrides = {
             "chrom": pd.CategoricalDtype(chrom_order, ordered=True)
         }
     else:
         overrides = {}
     dtype = cls.pandas_dtype(overrides=overrides)
     df = pd.DataFrame([a.to_tuple() for a in aligns], columns=columns)
     df = df.astype(dtype)
     return df
Code example #27
File: pandas_support.py Project: ls-pepper/sdc
def as_dtype(nbtype):
    """
    Return a Pandas *dtype* instance corresponding to the given Numba type.
    NotImplementedError is raised if no correspondence is known.
    """
    nbtype = types.unliteral(nbtype)
    if isinstance(nbtype, CategoricalDtypeType):
        return pd.CategoricalDtype(categories=nbtype.categories,
                                   ordered=nbtype.ordered)

    raise NotImplementedError("%r cannot be represented as a Pandas dtype" %
                              (nbtype, ))
Code example #28
def _draw_nr_of_contacts(distribution, is_participating, states, seed):
    """Draw the number of contacts for everyone in a is_participating.

    Args:
        distribution (pandas.Series): slice of the params DataFrame with
            the distribution. The `subcategory` level of the index either
            identifies the age group specific distribution or must be the
            same for the whole slice. The `name` index level gives the support
            and the values of the Series give the probabilities.
        is_participating (pandas.Series): same index as states. True for the individuals
            that participate in the current contact model, i.e. for which the
            number of contacts should be drawn.
        states (pandas.DataFrame): sid states DataFrame.
        seed (int): seed for the random number generator.

    Returns:
        nr_of_contacts (pandas.Series): Same index as the states, values are
            the number of contacts for each person.

    """
    is_age_varying = distribution.index.get_level_values(
        "subcategory").nunique() > 1

    if is_age_varying:
        age_labels = [f"{i}-{i + 9}" for i in range(0, 71, 10)] + ["80-100"]
        age_dtype = pd.CategoricalDtype(categories=age_labels, ordered=True)
        age_group = states["age_group"].astype(age_dtype)
        age_codes = age_group.cat.codes.to_numpy()

        probs_df = distribution.unstack().reindex(age_labels).fillna(0)
        support = probs_df.columns.to_numpy().astype(int)
        probs = probs_df.to_numpy()
        cum_probs = probs.cumsum(axis=1)

        nr_of_contacts_arr = _draw_age_varying_nr_of_contacts_numba(
            support=support,
            cum_probs=cum_probs,
            age_codes=age_codes,
            is_participating=is_participating.to_numpy(),
            seed=seed,
        )

    else:
        np.random.seed(seed)
        support = distribution.index.get_level_values("name").to_numpy()
        probs = distribution.to_numpy()
        nr_of_contacts_arr = np.where(
            is_participating.to_numpy(),
            np.random.choice(support, p=probs, size=len(states)),
            0,
        )

    return pd.Series(nr_of_contacts_arr, index=states.index)
Code example #29
def find_common_type_cat(types):
    if all(isinstance(t, pandas.CategoricalDtype) for t in types):
        if all(t.ordered for t in types):
            return pandas.CategoricalDtype(
                # sorted union of all categories across the input dtypes
                np.sort(np.unique([c for t in types for c in t.categories])),
                ordered=True,
            )
        return union_categoricals(
            [pandas.Categorical([], dtype=t) for t in types],
            sort_categories=all(t.ordered for t in types),
        ).dtype
    else:
        return find_common_type(types)
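A sanity check of the ordered branch (assuming find_common_type_cat and its imports, numpy as np and pandas, are in scope as in the snippet):

import pandas

a = pandas.CategoricalDtype(["a", "c"], ordered=True)
b = pandas.CategoricalDtype(["b"], ordered=True)
print(find_common_type_cat([a, b]))
# CategoricalDtype with the sorted union ['a', 'b', 'c'], ordered=True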
Code example #30
def _generate_value(dtype, fill_value):
    # special handle for datetime64 and timedelta64
    dispatch = {
        np.datetime64: pd.Timestamp,
        np.timedelta64: pd.Timedelta,
        pd.CategoricalDtype.type: lambda x: pd.CategoricalDtype([x]),
        # for object, we do not know the actual dtype,
        # just convert to str for common usage
        np.object_: str,
    }
    # otherwise, just use dtype.type itself to convert
    convert = dispatch.get(dtype.type, dtype.type)
    return convert(fill_value)
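For instance (a sketch; the dtypes are chosen for illustration): datetime64 dispatches to pd.Timestamp, while a plain numeric dtype falls through to dtype.type itself.

import numpy as np
import pandas as pd

print(_generate_value(np.dtype("datetime64[ns]"), "2021-01-01"))
# Timestamp('2021-01-01 00:00:00')
print(_generate_value(np.dtype("float64"), "1.5"))
# 1.5, converted by np.float64 itself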