Code example #1
File: structured.py  Project: gil2abir/fastai
def scale_vars(df, mapper):
    warnings.filterwarnings('ignore', category=sklearn.exceptions.DataConversionWarning)
    if mapper is None:
        map_f = [([n],StandardScaler()) for n in df.columns if is_numeric_dtype(df[n])]
        mapper = DataFrameMapper(map_f).fit(df)
    df[mapper.transformed_names_] = mapper.transform(df)
    return mapper
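A minimal usage sketch (mine, not from the project), assuming the imports used by fastai's structured.py (pandas, sklearn_pandas.DataFrameMapper, sklearn.preprocessing.StandardScaler) are in scope. The mapper fitted on the training frame can be reused to scale a validation frame with the training statistics:

import pandas as pd
train = pd.DataFrame({'age': [22.0, 35.0, 58.0], 'city': ['NY', 'SF', 'NY']})
valid = pd.DataFrame({'age': [41.0, 29.0], 'city': ['SF', 'NY']})
mapper = scale_vars(train, None)   # fits a StandardScaler per numeric column and scales train in place
scale_vars(valid, mapper)          # reuses the fitted mapper so valid is scaled with train's statistics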
Code example #2
File: rfpimp.py  Project: canard0328/malss
def feature_dependence_matrix(rf, X_train, n_samples=5000):
    """
    Given training observation independent variables in X_train (a dataframe),
    compute the feature importance using each var as a dependent variable.
    We retrain a random forest for each var as target using the others as
    independent vars.  Only numeric columns are considered.

    By default, sample up to 5000 observations to compute feature dependencies.

    :return: a non-symmetric data frame with the dependence matrix where each row is the importance of each var to the row's var used as a model target.
    """
    numcols = [col for col in X_train if is_numeric_dtype(X_train[col])]

    X_train = sample_rows(X_train, n_samples)

    df_dep = pd.DataFrame(index=X_train.columns, columns=['Dependence']+X_train.columns.tolist())
    for i in range(len(numcols)):
        col = numcols[i]
        X, y = X_train.drop(col, axis=1), X_train[col]
        rf.fit(X,y)
        #imp = rf.feature_importances_
        imp = permutation_importances_raw(rf, X, y, oob_regression_r2_score, n_samples)
        imp = np.insert(imp, i, 1.0)
        df_dep.iloc[i] = np.insert(imp, 0, rf.oob_score_) # add overall dependence

    return df_dep
Code example #3
def _check_Xy(X: pd.DataFrame,
              y: pd.Series, *,
              norm_y=False) -> Tuple[pd.Series, pd.Series]:
    if np.ndim(X) == 1:
        X = pd.Series(X).to_frame()
    elif np.ndim(X) == 2:
        X = pd.DataFrame(X)

    assert X.ndim == 2
    assert np.ndim(y) == 1
    assert len(X) == len(y)

    valid = ~X.isnull().any(1).values
    X = pd.Series(list(zip(*X.values[valid].T)),
                  name=tuple(X.columns)).astype('category')
    y = pd.Series(y).reset_index(drop=True)[valid]

    if is_object_dtype(y):
        y = pd.Categorical(y)

    if norm_y:
        assert is_numeric_dtype(y)
        y = (y - y.mean()) / y.std()

    return X, y
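A small illustration of my own (assuming the pandas version this snippet targets, plus numpy, pandas and the Tuple / dtype helpers it imports): rows with missing X values are dropped, X is collapsed into a single categorical Series of row tuples, and norm_y=True standardizes a numeric y:

import numpy as np, pandas as pd
X = pd.DataFrame({'a': [1, 2, np.nan], 'b': ['x', 'y', 'z']})
y = pd.Series([10.0, 20.0, 30.0])
Xc, yc = _check_Xy(X, y, norm_y=True)   # the row with NaN in X is dropped; yc is standardized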
Code example #4
File: density.py  Project: jwhendy/plotnine
def get_var_type(col):
    """
    Return var_type (for KDEMultivariate) of the column

    Parameters
    ----------
    col : pandas.Series
        A dataframe column.

    Returns
    -------
    out : str
        One of ['c', 'o', 'u'].

    See Also
    --------
    The origin of the character codes is
    :class:`statsmodels.nonparametric.kernel_density.KDEMultivariate`.
    """
    if pdtypes.is_numeric_dtype(col):
        # continuous
        return 'c'
    elif pdtypes.is_categorical_dtype(col):
        # ordered or unordered
        return 'o' if col.cat.ordered else 'u'
    else:
        # unordered if unsure, e.g. string columns that
        # are not categorical
        return 'u'
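For illustration (my own example, assuming pdtypes is pandas.api.types as in plotnine), the three codes map to dtypes like this:

import pandas as pd
get_var_type(pd.Series([0.5, 1.5, 2.5]))                              # 'c' -> continuous
get_var_type(pd.Series(['lo', 'hi'], dtype='category'))               # 'u' -> unordered categorical
get_var_type(pd.Series(pd.Categorical(['lo', 'hi'], ordered=True)))   # 'o' -> ordered categorical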
Code example #5
File: data_prep.py  Project: xiangnanyue/Pyod
def missing_check(df, imputation="zero", verbose=True):
    '''check the missing percentage. Impute the missing values if necessary
    Note: for numerical variables and categorical variables we should handle
    differently

    :param df:
    :param imputation: "zero" or "mean"
    :return:
    '''
    n_df = df.shape[0]
    cols = df.columns.tolist()
    if verbose:
        print("\nMissing value check and imputation starts...")

    for col in cols:
        non_missing = n_df - np.count_nonzero(df[col].isnull().values)
        mis_perc = 100 - float(non_missing) / n_df * 100

        if mis_perc > 0:
            if verbose:
                print("    {col} missing percentage is {miss}%" \
                      .format(col=col, miss=mis_perc))
            # impute missing values in categorical vars with the string 'NaN'
            if not is_numeric_dtype(df[col]):
                df[col].fillna('NaN', inplace=True)
                continue
            # impute num variables
            if imputation == "mean":
                df[col].fillna(df[col].mean(), inplace=True)
            else:
                df[col].fillna(int(0), inplace=True)
    return df
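A usage sketch of my own: numeric NaNs are imputed with the column mean (or zero), while missing values in non-numeric columns are replaced with the literal string 'NaN':

import numpy as np, pandas as pd
df = pd.DataFrame({'salary': [100.0, np.nan, 300.0], 'dept': ['it', None, 'hr']})
df = missing_check(df, imputation="mean")
# df['salary'] -> [100.0, 200.0, 300.0]; df['dept'] -> ['it', 'NaN', 'hr']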
Code example #6
File: structured.py  Project: SiddharthTiwari/fastai
def fix_missing(df, col, name, na_dict):
    """ Fill missing data in a column of df with the median, and add a {name}_na column
    which specifies if the data was missing.
    Parameters:
    -----------
    df: The data frame that will be changed.
    col: The column of data to fix by filling in missing data.
    name: The name of the new filled column in df.
    na_dict: A dictionary of values to create na's of and the value to insert. If
        name is not a key of na_dict, the median will fill any missing data. Also,
        if name is not a key of na_dict and there is no missing data in col, then
        no {name}_na column is created.
    Examples:
    ---------
    >>> df = pd.DataFrame({'col1' : [1, np.NaN, 3], 'col2' : [5, 2, 2]})
    >>> df
       col1 col2
    0     1    5
    1   nan    2
    2     3    2
    >>> fix_missing(df, df['col1'], 'col1', {})
    >>> df
       col1 col2 col1_na
    0     1    5   False
    1     2    2    True
    2     3    2   False
    >>> df = pd.DataFrame({'col1' : [1, np.NaN, 3], 'col2' : [5, 2, 2]})
    >>> df
       col1 col2
    0     1    5
    1   nan    2
    2     3    2
    >>> fix_missing(df, df['col2'], 'col2', {})
    >>> df
       col1 col2
    0     1    5
    1   nan    2
    2     3    2
    >>> df = pd.DataFrame({'col1' : [1, np.NaN, 3], 'col2' : [5, 2, 2]})
    >>> df
       col1 col2
    0     1    5
    1   nan    2
    2     3    2
    >>> fix_missing(df, df['col1'], 'col1', {'col1' : 500})
    >>> df
       col1 col2 col1_na
    0     1    5   False
    1   500    2    True
    2     3    2   False
    """
    if is_numeric_dtype(col):
        if pd.isnull(col).sum() or (name in na_dict):
            df[name+'_na'] = pd.isnull(col)
            filler = na_dict[name] if name in na_dict else col.median()
            df[name] = col.fillna(filler)
            na_dict[name] = filler
    return na_dict
Code example #7
File: serializers.py  Project: wq/django-rest-pandas
    def compute_boxplot(self, series):
        """
        Compute boxplot for given pandas Series.
        """
        from matplotlib.cbook import boxplot_stats
        series = series[series.notnull()]
        if len(series.values) == 0:
            return {}
        elif not is_numeric_dtype(series):
            return self.non_numeric_stats(series)
        stats = boxplot_stats(list(series.values))[0]
        stats['count'] = len(series.values)
        stats['fliers'] = "|".join(map(str, stats['fliers']))
        return stats
Code example #8
File: visual_utils.py  Project: robertnishihara/ray
def generate_plotly_dim_dict(df, field):
    dim_dict = {}
    dim_dict["label"] = field
    column = df[field]
    if is_numeric_dtype(column):
        dim_dict["values"] = column
    elif is_string_dtype(column):
        texts = column.unique()
        dim_dict["values"] = [
            np.argwhere(texts == x).flatten()[0] for x in column
        ]
        dim_dict["tickvals"] = list(range(len(texts)))
        dim_dict["ticktext"] = texts
    else:
        raise Exception("Unidentifiable Type")

    return dim_dict
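A hedged usage sketch (not from the Ray source): each returned dict can serve as one entry of the dimensions list of a Plotly parallel-coordinates trace:

import pandas as pd
df = pd.DataFrame({'lr': [0.1, 0.01, 0.001], 'optimizer': ['adam', 'sgd', 'adam']})
dims = [generate_plotly_dim_dict(df, field) for field in df.columns]
# e.g. plotly.graph_objs.Parcoords(dimensions=dims) plots 'lr' as numbers and
# 'optimizer' as integer-coded ticks labelled 'adam'/'sgd'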
Code example #9
File: structured.py  Project: gil2abir/fastai
def numericalize(df, col, name, max_n_cat):
    """ Changes the column col from a categorical type to its integer codes.

    Parameters:
    -----------
    df: A pandas dataframe. df[name] will be filled with the integer codes from
        col.

    col: The column you wish to change into the categories.
    name: The column name you wish to insert into df. This column will hold the
        integer codes.

    max_n_cat: If col has no more than max_n_cat categories it is left as a
        categorical type (so it can be one-hot encoded later) rather than converted
        to its integer codes. If max_n_cat is None, then col will always be
        converted.

    Examples:
    ---------
    >>> df = pd.DataFrame({'col1' : [1, 2, 3], 'col2' : ['a', 'b', 'a']})
    >>> df
       col1 col2
    0     1    a
    1     2    b
    2     3    a

    note the type of col2 is string

    >>> train_cats(df)
    >>> df

       col1 col2
    0     1    a
    1     2    b
    2     3    a

    now the type of col2 is category { a : 1, b : 2}

    >>> numericalize(df, df['col2'], 'col3', None)

       col1 col2 col3
    0     1    a    1
    1     2    b    2
    2     3    a    1
    """
    if not is_numeric_dtype(col) and ( max_n_cat is None or col.nunique()>max_n_cat):
        df[name] = col.cat.codes+1
Code example #10
def table_from_frame(df, *, force_nominal=False):

    def _is_discrete(s):
        return (is_categorical_dtype(s) or
                is_object_dtype(s) and (force_nominal or
                                        s.nunique() < s.size**.666))

    def _is_datetime(s):
        if is_datetime64_any_dtype(s):
            return True
        try:
            if is_object_dtype(s):
                pd.to_datetime(s, infer_datetime_format=True)
                return True
        except Exception:
            pass
        return False

    attrs, metas = [], []
    X, M = [], []

    for name, s in df.items():
        name = str(name)
        if _is_discrete(s):
            discrete = s.astype('category').cat
            attrs.append(DiscreteVariable(name, discrete.categories.astype(str).tolist()))
            X.append(discrete.codes.replace(-1, np.nan).values)
        elif _is_datetime(s):
            tvar = TimeVariable(name)
            attrs.append(tvar)
            s = pd.to_datetime(s, infer_datetime_format=True)
            X.append(s.astype('str').map(tvar.parse).values)
        elif is_numeric_dtype(s):
            attrs.append(ContinuousVariable(name))
            X.append(s.values)
        else:
            metas.append(StringVariable(name))
            M.append(s.values.astype(object))

    MAX_LENGTH = max(len(X[0]) if X else 0,
                     len(M[0]) if M else 0)
    return Table.from_numpy(Domain(attrs, None, metas),
                            np.column_stack(X) if X else np.empty(
                                (MAX_LENGTH, 0)),
                            None, np.column_stack(M) if M else None)
Code example #11
def _table_from_numpy(x):
    def _to2d(x):
        if x.ndim <= 1:
            return np.c_[x]
        if x.ndim == 2:
            return x
        return None
        # When the shitty internals get fixed, this below will work
        # 2d array of (n-2)d-list fields
        x2d = np.empty(x.shape[:2], dtype=object)
        x2d[:] = x.tolist()
        return x2d

    x = _to2d(x)
    # 2d or str arrays etc. not supported
    if x is None or not is_numeric_dtype(x):
        return None
    return Table.from_numpy(None, x)
Code example #12
File: kollekshuns.py  Project: INCF/pybids
    def resample(self, sampling_rate=None, variables=None, force_dense=False,
                 in_place=False, kind='linear'):
        ''' Resample all dense variables (and optionally, sparse ones) to the
        specified sampling rate.

        Args:
            sampling_rate (int, float): Target sampling rate (in Hz). If None,
                uses the instance sampling rate.
            variables (list): Optional list of Variables to resample. If None,
                all variables are resampled.
            force_dense (bool): if True, all sparse variables will be forced to
                dense.
            in_place (bool): When True, all variables are overwritten in-place.
                When False, returns resampled versions of all variables.
            kind (str): Argument to pass to scipy's interp1d; indicates the
                kind of interpolation approach to use. See interp1d docs for
                valid values.
        '''

        # Store old sampling rate-based variables
        sampling_rate = sampling_rate or self.sampling_rate

        _variables = {}

        for name, var in self.variables.items():
            if variables is not None and name not in variables:
                continue
            if isinstance(var, SparseRunVariable):
                if force_dense and is_numeric_dtype(var.values):
                    _variables[name] = var.to_dense(sampling_rate)
            else:
                # None if in_place; no update needed
                _var = var.resample(sampling_rate,
                                    inplace=in_place,
                                    kind=kind)
                if not in_place:
                    _variables[name] = _var

        if in_place:
            for k, v in _variables.items():
                self.variables[k] = v
            self.sampling_rate = sampling_rate
        else:
            return _variables
Code example #13
File: test_dataframes.py  Project: eike-welk/clair
def test__write_frame__read_frame():
    print("Start")
    from django.db import utils
    from econdata.models import Listing
    from libclair.dataframes import write_frame_create, read_frame, write_frame
    
    # Create a DataFrame and write it into the database
    fr1 = pd.DataFrame([{'id':'foo-1', 'site':'a', 'id_site':'1', 'title':'The 1st record.'},
                        {'id':'foo-2', 'site':'a', 'id_site':'2', 'title':'The 2nd record.'}])
    print('\nfr1:\n', fr1)
    write_frame_create(fr1, Listing, delete=True)
    # The records already exist. Creating them again, without deleting them, 
    # must raise an exception.
    with pytest.raises(utils.IntegrityError):
        write_frame_create(fr1, Listing)
    
    # Read the records, that were just created, from the database.
    # Read a few additional empty columns.
    qset = Listing.objects.filter(id__in=['foo-1', 'foo-2'])
    fr2 = read_frame(qset, ['id', 'title', 'time', 'price'])
    print('\nfr2:\n', fr2)
    
    assert pd_types.is_string_dtype(fr2['title'])
    assert pd_types.is_datetime64_any_dtype(fr2['time'])
    assert pd_types.is_numeric_dtype(fr2['price'])
    assert fr2['id'][0] == 'foo-1'
    assert fr2['id'][1] == 'foo-2'
    assert fr2['title'][0] == 'The 1st record.'
    assert fr2['title'][1] == 'The 2nd record.'
    
    # Change the dataframe
    fr2['time'] = [pd.Timestamp('2017-01-01 12:00+0'), 
                   pd.Timestamp('2017-01-02 12:00+0'),]
    fr2['price'] = [101.0, 102.0,]
    print('\nfr2:\n', fr2)
    # Update the records in the database
    write_frame(fr2, Listing)
    
    # Read the updated records from the database.
    qset = Listing.objects.filter(id__in=['foo-1', 'foo-2'])
    fr3 = read_frame(qset, ['id', 'title', 'time', 'price'])
    print('\nfr3:\n', fr3)
    assert_frames_equal(fr2, fr3)
Code example #14
    def _get_columns_info(self, stats):
        column_info = {}
        column_info[self.TYPE_CONSTANT] = stats['uniques'][stats['uniques'] == 1].index
        column_info[self.TYPE_BOOL] = stats['uniques'][stats['uniques'] == 2].index
        rest_columns = self.get_columns(
            self.df, self.EXCLUDE,
            column_info['constant'].union(column_info['bool']))
        column_info[self.TYPE_NUMERIC] = pd.Index(
            [c for c in rest_columns if types.is_numeric_dtype(self.df[c])])
        rest_columns = self.get_columns(
            self.df[rest_columns], self.EXCLUDE, column_info['numeric'])
        column_info[self.TYPE_DATE] = pd.Index(
            [c for c in rest_columns if types.is_datetime64_dtype(self.df[c])])
        rest_columns = self.get_columns(
            self.df[rest_columns], self.EXCLUDE, column_info['date'])
        unique_columns = stats['uniques'][rest_columns] == stats['counts'][rest_columns]
        column_info[self.TYPE_UNIQUE] = stats['uniques'][rest_columns][unique_columns].index
        column_info[self.TYPE_CATEGORICAL] = stats['uniques'][rest_columns][~unique_columns].index
        return column_info
Code example #15
File: anaddbnc.py  Project: gmatteo/abipy
    def plot_elastic_properties(self, fontsize=10, **kwargs):
        """
        Args:
            fontsize: legend and label fontsize.

        Returns: |matplotlib-Figure|
        """
        df = self.get_elastic_dataframe(with_geo=False, abspath=False, with_params=False)
        from pandas.api.types import is_numeric_dtype
        keys = [k for k in df.keys() if is_numeric_dtype(df[k])]
        # list.index raises ValueError when the item is absent, so use a membership test
        if "fitted_to_structure" in keys:
            keys.remove("fitted_to_structure")

        num_plots, ncols, nrows = len(keys), 1, 1
        if num_plots > 1:
            ncols = 3
            nrows = (num_plots // ncols) + (num_plots % ncols)

        ax_list, fig, plt = get_axarray_fig_plt(None, nrows=nrows, ncols=ncols,
                                                sharex=False, sharey=False, squeeze=False)
        ax_list = ax_list.ravel()

        for ix, (key, ax) in enumerate(zip(keys, ax_list)):
            irow, icol = divmod(ix, ncols)
            xn = range(len(df.index))
            ax.plot(xn, df[key], marker="o")
            ax.grid(True)
            ax.set_xticks(xn)
            ax.set_ylabel(key, fontsize=fontsize)
            ax.set_xticklabels([])

        ax.set_xticklabels(self.keys(), fontsize=fontsize)
        rotate_ticklabels(ax, 15)

        if ix != len(ax_list) -1:
            for ix in range(ix + 1, len(ax_list)):
                ax_list[ix].axis('off')

        return fig
Code example #16
File: rfpimp.py  Project: canard0328/malss
def oob_dependences(rf, X_train, n_samples=5000):
    """
    Given a random forest model, rf, and training observation independent
    variables in X_train (a dataframe), compute the OOB R^2 score using each var
    as a dependent variable. We retrain rf for each var. Only numeric columns are considered.

    By default, sample up to 5000 observations to compute feature dependencies.

    :return: Return a DataFrame with Feature/Dependence values for each variable. Feature is the dataframe index.
    """
    numcols = [col for col in X_train if is_numeric_dtype(X_train[col])]

    X_train = sample_rows(X_train, n_samples)

    df_dep = pd.DataFrame(columns=['Feature','Dependence'])
    df_dep = df_dep.set_index('Feature')
    for col in numcols:
        X, y = X_train.drop(col, axis=1), X_train[col]
        rf.fit(X, y)
        df_dep.loc[col] = rf.oob_score_
    df_dep = df_dep.sort_values('Dependence', ascending=False)
    return df_dep
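Hedged usage sketch (mine): the forest must be built with oob_score=True so that rf.oob_score_ exists after each refit; sample_rows is an rfpimp helper assumed to be in scope, and X_train is any feature DataFrame:

from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators=100, oob_score=True, n_jobs=-1)
dep = oob_dependences(rf, X_train)
print(dep.head())   # highest OOB R^2 first: the most predictable (i.e. most redundant) features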
Code example #17
    def contains_op(cls, series: pd.Series, state: dict) -> bool:
        return pdt.is_numeric_dtype(series)
Code example #18
def table_from_frame(df, *, force_nominal=False):
    """
    Convert pandas.DataFrame to Orange.data.Table

    Parameters
    ----------
    df : pandas.DataFrame
    force_nominal : boolean
        If True, interpret ALL string columns as nominal (DiscreteVariable).

    Returns
    -------
    Table
    """

    def _is_discrete(s):
        return (is_categorical_dtype(s) or
                is_object_dtype(s) and (force_nominal or
                                        s.nunique() < s.size**.666))

    def _is_datetime(s):
        if is_datetime64_any_dtype(s):
            return True
        try:
            if is_object_dtype(s):
                pd.to_datetime(s, infer_datetime_format=True)
                return True
        except Exception:  # pylint: disable=broad-except
            pass
        return False

    # If df index is not a simple RangeIndex (or similar), put it into data
    if not (df.index.is_integer() and (df.index.is_monotonic_increasing or
                                       df.index.is_monotonic_decreasing)):
        df = df.reset_index()

    attrs, metas = [], []
    X, M = [], []

    # Iter over columns
    for name, s in df.items():
        name = str(name)
        if _is_discrete(s):
            discrete = s.astype('category').cat
            attrs.append(DiscreteVariable(name, discrete.categories.astype(str).tolist()))
            X.append(discrete.codes.replace(-1, np.nan).values)
        elif _is_datetime(s):
            tvar = TimeVariable(name)
            attrs.append(tvar)
            s = pd.to_datetime(s, infer_datetime_format=True)
            X.append(s.astype('str').replace('NaT', np.nan).map(tvar.parse).values)
        elif is_numeric_dtype(s):
            attrs.append(ContinuousVariable(name))
            X.append(s.values)
        else:
            metas.append(StringVariable(name))
            M.append(s.values.astype(object))

    return Table.from_numpy(Domain(attrs, None, metas),
                            np.column_stack(X) if X else np.empty((df.shape[0], 0)),
                            None,
                            np.column_stack(M) if M else None)
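A small usage sketch, assuming Orange's data classes (Table, Domain, ContinuousVariable, DiscreteVariable, TimeVariable, StringVariable) are importable as in the original module:

import pandas as pd
df = pd.DataFrame({'height': [1.7, 1.8], 'color': ['red', 'blue'],
                   'when': pd.to_datetime(['2020-01-01', '2020-02-01'])})
table = table_from_frame(df, force_nominal=True)
# 'height' -> ContinuousVariable, 'when' -> TimeVariable,
# 'color' -> DiscreteVariable (force_nominal=True marks string columns as nominal)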
Code example #19
File: structured.py  Project: gil2abir/fastai
def proc_df(df, y_fld=None, skip_flds=None, ignore_flds=None, do_scale=False, na_dict=None,
            preproc_fn=None, max_n_cat=None, subset=None, mapper=None):
    """ proc_df takes a data frame df and splits off the response variable, and
    changes the df into an entirely numeric dataframe.

    Parameters:
    -----------
    df: The data frame you wish to process.

    y_fld: The name of the response variable

    skip_flds: A list of fields that are dropped from df.

    ignore_flds: A list of fields that are ignored during processing.

    do_scale: Standardizes each column in df. Takes Boolean values (True, False).

    na_dict: a dictionary of na columns to add. Na columns are also added if there
        are any missing values.

    preproc_fn: A function that gets applied to df.

    max_n_cat: The maximum number of categories to break into dummy values, instead
        of integer codes.

    subset: Takes a random subset of size subset from df.

    mapper: If do_scale is True, the mapper stores the values used for scaling
        the variables during training time (mean and standard deviation).

    Returns:
    --------
    [x, y, nas, mapper(optional)]:

        x: x is the transformed version of df. x will not have the response variable
            and is entirely numeric.

        y: y is the response variable

        nas: returns a dictionary of which nas it created, and the associated median.

        mapper: A DataFrameMapper which stores the mean and standard deviation of the corresponding continuous
        variables, which is then used for scaling during test time.

    Examples:
    ---------
    >>> df = pd.DataFrame({'col1' : [1, 2, 3], 'col2' : ['a', 'b', 'a']})
    >>> df
       col1 col2
    0     1    a
    1     2    b
    2     3    a

    note the type of col2 is string

    >>> train_cats(df)
    >>> df

       col1 col2
    0     1    a
    1     2    b
    2     3    a

    now the type of col2 is category { a : 1, b : 2}

    >>> x, y, nas = proc_df(df, 'col1')
    >>> x

       col2
    0     1
    1     2
    2     1

    >>> data = DataFrame(pet=["cat", "dog", "dog", "fish", "cat", "dog", "cat", "fish"],
                 children=[4., 6, 3, 3, 2, 3, 5, 4],
                 salary=[90, 24, 44, 27, 32, 59, 36, 27])

    >>> mapper = DataFrameMapper([(:pet, LabelBinarizer()),
                          ([:children], StandardScaler())])

    >>> round(fit_transform!(mapper, copy(data)), 2)

    8x4 Array{Float64,2}:
    1.0  0.0  0.0   0.21
    0.0  1.0  0.0   1.88
    0.0  1.0  0.0  -0.63
    0.0  0.0  1.0  -0.63
    1.0  0.0  0.0  -1.46
    0.0  1.0  0.0  -0.63
    1.0  0.0  0.0   1.04
    0.0  0.0  1.0   0.21
    """
    if not ignore_flds: ignore_flds=[]
    if not skip_flds: skip_flds=[]
    if subset: df = get_sample(df,subset)
    ignored_flds = df.loc[:, ignore_flds]
    df.drop(ignore_flds, axis=1, inplace=True)
    df = df.copy()
    if preproc_fn: preproc_fn(df)
    if y_fld is None: y = None
    else:
        if not is_numeric_dtype(df[y_fld]): df[y_fld] = df[y_fld].cat.codes
        y = df[y_fld].values
        skip_flds += [y_fld]
    df.drop(skip_flds, axis=1, inplace=True)

    if na_dict is None: na_dict = {}
    for n,c in df.items(): na_dict = fix_missing(df, c, n, na_dict)
    if do_scale: mapper = scale_vars(df, mapper)
    for n,c in df.items(): numericalize(df, c, n, max_n_cat)
    df = pd.get_dummies(df, dummy_na=True)
    df = pd.concat([ignored_flds, df], axis=1)
    res = [df, y, na_dict]
    if do_scale: res = res + [mapper]
    return res
Code example #20
File: test_datasets.py  Project: RamyaGuru/matminer
    def universal_dataset_check(self, dataset_name, object_headers=None,
                                numeric_headers=None, bool_headers=None,
                                test_func=None):

        # "Hard" integrity checks that take a long time.
        # These tests only run if the MATMINER_DATASET_FULL_TEST
        # environment variable is set to True
        if do_complete_test:
            # Get rid of dataset if it's on the disk already
            data_path = os.path.join(
                self.dataset_dir,
                dataset_name + "." + self.dataset_dict[dataset_name][
                    'file_type'
                ]
            )
            if os.path.exists(data_path):
                os.remove(data_path)

            # Test that dataset can be downloaded
            load_dataset(dataset_name)
            self.assertTrue(os.path.exists(data_path))

            # Test that data is now available and has all its elements
            df = load_dataset(dataset_name, download_if_missing=False)
            self.assertEqual(
                len(df), self.dataset_dict[dataset_name]["num_entries"]
            )

            # Test all columns are there
            self.assertEqual(sorted(list(df)), sorted(
                [header for header in
                 self.dataset_dict[dataset_name]['columns'].keys()]
            ))

            # Test each column for appropriate type
            if object_headers is None:
                object_headers = []
            if numeric_headers is None:
                numeric_headers = []
            if bool_headers is None:
                bool_headers = []

            df = load_dataset(dataset_name, download_if_missing=False)
            if object_headers:
                self.assertTrue(is_object_dtype(df[object_headers].values))
            if numeric_headers:
                self.assertTrue(is_numeric_dtype(df[numeric_headers].values))
            if bool_headers:
                self.assertTrue(is_bool_dtype(df[bool_headers].values))

            # Make sure all columns are accounted for
            column_headers = object_headers + numeric_headers + bool_headers
            self.assertEqual(sorted(list(df)), sorted(column_headers))

            # Run tests unique to the dataset
            if test_func is not None:
                test_func(df)

        # "Soft" check that just makes sure the dataset download page is active
        # This runs when on a system with the CI environment var present
        # (e.g. when running a continuous integration VCS system)
        else:
            download_page = requests.head(
                self.dataset_dict[dataset_name]["url"]
            )
            self.assertTrue(download_page.ok)
Code example #21
    def format_series(self, series: pd.Series) -> pd.Series:
        ret = series.map(self._formatter.format, na_action="ignore")
        # Pandas will still think all-NA is number.
        if is_numeric_dtype(ret):
            ret = ret.astype(object)
        return ret
Code example #22
def spstd(x):
    return np.sqrt(sp.stats.moment(x.dropna(), 2)) if is_numeric_dtype(x) else np.nan
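A quick illustration of my own (assuming numpy, scipy and pandas are imported as np, sp and pd): spstd returns the population standard deviation of the non-missing values, and NaN for non-numeric input:

import numpy as np, pandas as pd
spstd(pd.Series([1.0, 2.0, 3.0, np.nan]))   # ~0.816, the sqrt of the 2nd central moment of [1, 2, 3]
spstd(pd.Series(list('abc')))               # nan for a non-numeric column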
Code example #23
File: base.py  Project: tyarkoni/featureX
def merge_results(results, format='wide', timing=True, metadata=True,
                  extractor_names=True, object_id=True, aggfunc=None,
                  invalid_results='ignore', **to_df_kwargs):
    ''' Merges a list of ExtractorResults instances and returns a pandas DF.

    Args:
        results (list, tuple): A list of ExtractorResult instances to merge.
        format (str): Format to return the data in. Can be either 'wide' or
            'long'. In the wide case, every extracted feature is a column,
            and every Stim is a row. In the long case, every row contains a
            single Stim/Extractor/feature combination.
        timing (bool, str): Whether or not to include columns for onset,
            order, and duration.
        metadata (bool): if True, includes Stim metadata columns in the
            returned DataFrame. These columns include 'stim_name', 'class',
            'filename', 'history', and 'source_file'. Note that these values
            are often long strings, so the returned DF will be considerably
            larger.
        extractor_names (str, bool): How to handle extractor names when
            returning results. The specific behavior depends on whether format
            is 'long' or 'wide'. Valid values include:

                - 'prepend' or True: In both 'long' and 'wide' formats,
                  feature names will be prepended with the Extractor name
                  (e.g., "FaceExtractor#face_likelihood").
                - 'drop' or False: In both 'long' and 'wide' formats, extractor
                  names will be omitted entirely from the result. Note that
                  this can create feature name conflicts when merging results
                  from multiple Extractors, so is generally discouraged.
                - 'column': In 'long' format, extractor name will be included
                  as a separate column. Not valid for 'wide' format (and will
                  raise an error).
                - 'multi': In 'wide' format, a MultiIndex will be used for the
                  columns, with the first level of the index containing the
                  Extractor name and the second level containing the feature
                  name. This value is invalid if format='long' (and will raise
                  an error).

        object_id (bool): If True, attempts to intelligently add an
            'object_id' column that differentiates between multiple objects in
            the results that may share onsets/orders/durations (and would
            otherwise be impossible to distinguish). This frequently occurs for
            ImageExtractors that identify multiple target objects (e.g., faces)
            within a single ImageStim. Default is 'auto', which includes the
            'object_id' column if and only if it has a non-constant value.
        aggfunc (str, Callable): If format='wide' and extractor_names='drop',
            it's possible for name clashes between features to occur. In such
            cases, the aggfunc argument is passed onto pandas' pivot_table
            function, and specifies how to aggregate multiple values for the
            same index. Can be a callable or any string value recognized by
            pandas. By default (None), 'mean' will be used for numeric columns
            and 'first' will be used for object/categorical columns.
        invalid_results (str): Specifies desired action for treating elements
            of the passed in results argument that are not ExtractorResult
            objects. Valid values include:
                - 'ignore' will ignore them and merge the valid
                    ExtractorResults.
                - 'fail' will raise an exception on any invalid input


    Returns: a pandas DataFrame. For format details, see 'format' argument.
    '''

    results = flatten(results)

    _timing = True if timing == 'auto' else timing
    _object_id = True if object_id == 'auto' else object_id

    if extractor_names is True:
        extractor_names = 'prepend'
    elif extractor_names is False:
        extractor_names = 'drop'

    dfs = []
    for r in results:
        if isinstance(r, ExtractorResult):
            dfs.append(r.to_df(timing=_timing, metadata=metadata,
                               format='long', extractor_name=True,
                               object_id=_object_id, **to_df_kwargs))
        elif invalid_results == 'fail':
            raise ValueError("At least one of the provided results was not an "
                             "ExtractorResult. Set the invalid_results "
                             "parameter to 'ignore' if you wish to ignore "
                             "this.")

    if len(dfs) == 0:
        return pd.DataFrame()

    data = pd.concat(dfs, axis=0).reset_index(drop=True)

    if object_id == 'auto' and data['object_id'].nunique() == 1:
        data = data.drop('object_id', axis=1)

    if extractor_names in ['prepend', 'multi']:
        data['feature'] = data['extractor'] + '#' + data['feature'].astype(str)

    if extractor_names != 'column':
        data = data.drop('extractor', axis=1)

    if format == 'wide':
        ind_cols = {'stim_name', 'onset', 'order', 'duration', 'object_id',
                    'class', 'filename', 'history', 'source_file'}
        ind_cols = list(ind_cols & set(data.columns))
        # pandas groupby/index operations can't handle NaNs in index, (see
        # issue at https://github.com/pandas-dev/pandas/issues/3729), so we
        # replace NaNs with a placeholder and then re-substitute after
        # pivoting.
        dtypes = data[ind_cols].dtypes
        data[ind_cols] = data[ind_cols].fillna('PlAcEholdER')

        # Set default aggfunc based on column type, otherwise bad things happen
        if aggfunc is None:
            aggfunc = 'mean' if is_numeric_dtype(data['value']) else 'first'

        data = data.pivot_table(index=ind_cols, columns='feature',
                                values='value', aggfunc=aggfunc).reset_index()
        data.columns.name = None  # vestigial--is set to 'feature'
        data[ind_cols] = data[ind_cols].replace('PlAcEholdER', np.nan)
        data[ind_cols] = data[ind_cols].astype(dict(zip(ind_cols, dtypes)))

    if timing == 'auto' and 'onset' in data.columns:
        if data['onset'].isnull().all():
            data = data.drop(['onset', 'order', 'duration'], axis=1)

    if 'onset' in data.columns:
        key = [('onset', ''), ('order', ''), ('duration', '')] \
            if isinstance(data.columns, pd.MultiIndex) \
            else ['onset', 'order', 'duration']
        data = data.sort_values(key).reset_index(drop=True)

    if extractor_names == 'multi':
        if format == 'long':
            raise ValueError("Invalid extractor_names value 'multi'. When "
                             "format is 'long', extractor_names must be "
                             "one of 'drop', 'prepend', or 'column'.")
        data.columns = pd.MultiIndex.from_tuples(
            [c.split('#') for c in data.columns])
    return data
Code example #24
def lambda_handler(event, context):

    bucket = event['Records'][0]['s3']['bucket']['name']
    aws_key = urllib.parse.unquote_plus(event['Records'][0]['s3']['object']['key'], encoding='utf-8')

    bucketcsvfile = s3.get_object(Bucket=bucket, Key=aws_key)
    csvfile = pd.read_csv(bucketcsvfile['Body'])

    #column headers validation
    validation_headers_list = ['id','first_name','last_name','salary','department']
    file_headers_list = list(csvfile.columns.values)
    headers_comparison = (validation_headers_list == file_headers_list)
    data_type_validation = {}

    #data type validation
    if (headers_comparison):
        data_type_validation['id'] = is_numeric_dtype(csvfile['id'])
        data_type_validation['first_name'] = is_string_dtype(csvfile['first_name'])
        data_type_validation['last_name'] = is_string_dtype(csvfile['last_name'])
        data_type_validation['salary'] = is_numeric_dtype(csvfile['salary'])
        data_type_validation['department'] = is_string_dtype(csvfile['department'])
        print(data_type_validation)

    if (headers_comparison and all(data_type_validation.values())):
        print("Validated Headers and data types in File!")

        try:
            connection_rds = psycopg2.connect("dbname={} user={} host={} password={} port={} sslmode={}".format(db_database, db_user, db_host, db_pw, db_port, db_sslmode))
            cursor_rds = connection_rds.cursor()
            print("Connection to DB successful") 
            
            temp_table_query = "create temporary table employee_staging ( like employees ) on commit drop"
            cursor_rds.execute(temp_table_query)
            
            upload_s3_file_query = "select * from fn_load_s3_file('{}');".format(aws_key)
            cursor_rds.execute(upload_s3_file_query)
            
            record_processing_query = """insert into employees (id, first_name, last_name, salary, department)
                                         select id, first_name, last_name, salary, department
                                         from employee_staging
                                         on conflict (id)
                                         do update set first_name = excluded.first_name
                                                        ,last_name = excluded.last_name
                                                        ,salary = excluded.salary
                                                        ,department = excluded.department""" 
                                                        
            cursor_rds.execute(record_processing_query)
    
        except :
            connection_rds.rollback()
            raise
        
        else: 
            connection_rds.commit()
            
        finally:
            connection_rds.close()
            message = {"Processed_file": aws_key}
            response = sns.publish(
                TargetArn=os.environ["sns_topic"],
                Message=json.dumps(message)
            )
        return {'processed_file': aws_key}
    
    else:
        if (validation_headers_list != file_headers_list):
            print("Please check columns in file. File headers order should be: id,first_name,last_name,salary,department")
        elif (all(data_type_validation.values()) == False):
            columns_to_check = []
            for key in data_type_validation.keys():
                if data_type_validation[key] is False:
                    columns_to_check.append(key)
            print("Please check the data in columns: " + str(columns_to_check))
        return {'Error': "File Error"}
Code example #25
File: data.py  Project: mariusgruenewald/linearmodels
    def __init__(
        self,
        x: "PanelDataLike",
        var_name: str = "x",
        convert_dummies: bool = True,
        drop_first: bool = True,
        copy: bool = True,
    ):
        self._var_name = var_name
        self._convert_dummies = convert_dummies
        self._drop_first = drop_first
        self._panel: Optional[_Panel] = None
        self._shape: Optional[Tuple[int, int, int]] = None
        index_names = ["entity", "time"]
        if isinstance(x, PanelData):
            x = x.dataframe
        self._original = x

        if not isinstance(x, (Series, DataFrame, np.ndarray)):
            try:
                from xarray import DataArray

                if isinstance(x, DataArray):
                    if x.ndim not in (2, 3):
                        raise ValueError(
                            "Only 2-d or 3-d DataArrays are supported")
                    if x.ndim == 2:
                        x = x.to_pandas()
                    else:
                        items: List[Hashable] = np.asarray(
                            x.coords[x.dims[0]]).tolist()
                        major: List[Hashable] = np.asarray(
                            x.coords[x.dims[1]]).tolist()
                        minor: List[Hashable] = np.asarray(
                            x.coords[x.dims[2]]).tolist()
                        values = x.values
                        x = panel_to_frame(values, items, major, minor, True)
            except ImportError:
                pass

        if isinstance(x, Series) and isinstance(x.index, MultiIndex):
            x = DataFrame(x)
        elif isinstance(x, Series):
            raise ValueError(
                "Series can only be used with a 2-level MultiIndex")

        if isinstance(x, DataFrame):
            if isinstance(x.index, MultiIndex):
                if len(x.index.levels) != 2:
                    raise ValueError("DataFrame input must have a "
                                     "MultiIndex with 2 levels")
                if isinstance(self._original, (DataFrame, PanelData, Series)):
                    for i in range(2):
                        index_names[
                            i] = x.index.levels[i].name or index_names[i]
                self._frame = x
                if copy:
                    self._frame = self._frame.copy()
            else:
                self._frame = DataFrame({var_name: x.T.stack(dropna=False)})
        elif isinstance(x, np.ndarray):
            if x.ndim not in (2, 3):
                raise ValueError("2 or 3-d array required for numpy input")
            if x.ndim == 2:
                x = x[None, :, :]

            k, t, n = x.shape
            var_str = var_name + ".{0:0>" + str(int(np.log10(k) + 0.01)) + "}"
            variables = [var_name] if k == 1 else [
                var_str.format(i) for i in range(k)
            ]
            entity_str = "entity.{0:0>" + str(int(np.log10(n) + 0.01)) + "}"
            entities = [entity_str.format(i) for i in range(n)]
            time = list(range(t))
            assert isinstance(x, np.ndarray)
            x = x.astype(np.float64, copy=False)
            panel = _Panel.from_array(x,
                                      items=variables,
                                      major_axis=time,
                                      minor_axis=entities)
            self._fake_panel = panel
            self._frame = panel.to_frame()
        else:
            raise TypeError("Only ndarrays, DataFrames or DataArrays are "
                            "supported")
        if convert_dummies:
            self._frame = expand_categoricals(self._frame, drop_first)
            self._frame = self._frame.astype(np.float64, copy=False)

        time_index = Series(self.index.levels[1])
        if not (is_numeric_dtype(time_index.dtype)
                or is_datetime64_any_dtype(time_index.dtype)):
            raise ValueError("The index on the time dimension must be either "
                             "numeric or date-like")
        # self._k, self._t, self._n = self.panel.shape
        self._k, self._t, self._n = self.shape
        self._frame.index.set_names(index_names, inplace=True)
Code example #26
def make_summary(in_data, feature, outcome):

    # Get statistics for all observations
    summary_total = pd.DataFrame(columns=['Statistic', 'Total'])
    summary_total.loc[0] = ['observations', len(in_data.index)]
    summary_total.loc[1] = ['non missing', in_data[feature].count()]

    if is_numeric_dtype(in_data[feature]):
        summary_total.loc[2] = ['missing', in_data[feature].isnull().sum()]
    else:
        summary_total.loc[2] = ['missing', in_data[feature].isna().sum()]

    summary_total.loc[3] = ['unique', in_data[feature].nunique()]
    summary_total_desc = in_data[feature].describe().to_frame()

    # Format dataframe
    summary_total_desc.insert(0, 'Statistic', summary_total_desc.index)
    summary_total_desc.rename({feature: 'Total'}, axis=1, inplace=True)

    if is_numeric_dtype(in_data[feature]) and in_data[feature].nunique() > 2:
        all_total = pd.concat([summary_total, summary_total_desc.iloc[1:]],
                              ignore_index=True)
    else:
        all_total = pd.concat([summary_total, summary_total_desc.iloc[2:]],
                              ignore_index=True)

    # Get statistics by outcome value
    summary_outcome = in_data.groupby(outcome)[outcome].count().to_frame()
    summary_outcome.rename({outcome: 'observations'}, axis=1, inplace=True)

    nm = in_data.groupby([outcome]).agg({feature: ['count']})
    nm.columns = ["non missing"]
    summary_outcome = summary_outcome.merge(nm,
                                            how='outer',
                                            left_index=True,
                                            right_index=True)

    miss = in_data[feature].isnull().groupby(in_data[outcome]).sum().to_frame()
    miss.columns = ["missing"]
    summary_outcome = summary_outcome.merge(miss,
                                            how='outer',
                                            left_index=True,
                                            right_index=True)

    nu = in_data.groupby([outcome]).agg({feature: ['nunique']})
    nu.columns = ["unique"]
    summary_outcome = summary_outcome.merge(nu,
                                            how='inner',
                                            left_index=True,
                                            right_index=True)

    # Format dataframe
    summary_outcome_desc = in_data.groupby(outcome)[feature].describe()
    if is_numeric_dtype(in_data[feature]):
        summary_outcome = summary_outcome.merge(summary_outcome_desc.drop(
            'count', axis=1),
                                                how='outer',
                                                left_index=True,
                                                right_index=True)
    else:
        summary_outcome = summary_outcome.merge(summary_outcome_desc.drop(
            ['count', 'unique'], axis=1),
                                                how='outer',
                                                left_index=True,
                                                right_index=True)
    summary_outcome_trans = summary_outcome.transpose()
    new_columns = [
        outcome + ' = ' + str(list(summary_outcome_trans)[0]),
        outcome + ' = ' + str(list(summary_outcome_trans)[1])
    ]
    summary_outcome_trans.columns = new_columns

    # Merge total and by outcome statistics
    summary = all_total.merge(summary_outcome_trans,
                              how='outer',
                              left_on='Statistic',
                              right_index=True)

    try:
        summary.to_html('summary ' + feature + '.html',
                        index=False,
                        float_format=lambda x: '%10.2f' % x)
    except:
        print(summary)
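A hedged usage sketch (my own); the outcome is expected to be binary, since the transposed per-outcome table is labelled with exactly two column names:

import numpy as np, pandas as pd
data = pd.DataFrame({'age': [25, 32, np.nan, 47, 51, 29],
                     'churn': [0, 1, 0, 1, 1, 0]})
make_summary(data, feature='age', outcome='churn')
# writes 'summary age.html' with total and per-outcome statistics (or prints the table if writing fails)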
Code example #27
#data eda.
profile = train.profile_report(title='Pandas Profiling Report')
profile.to_file(output_file="report/data_eda_output.html")

#train test split.
train_labels = train['survived']
train = train.drop(columns=['survived'])

X_train, X_test, y_train, y_test = train_test_split(train,
                                                    train_labels,
                                                    test_size=0.2)

#data preprocess.transform
maxunique = 1024
unique_stat = X_train.nunique()
numeric_cols = [i for i in X_train.columns if is_numeric_dtype(X_train[i])]
categorical_cols = [
    i for i in X_train.columns
    if i not in numeric_cols and unique_stat[i] < maxunique
]

mapper = DataFrameMapper([
    (categorical_cols, [
        SimpleImputer(strategy='constant', fill_value='missing'),
        OneHotEncoder(handle_unknown='ignore')
    ]),
    (numeric_cols, [SimpleImputer(strategy='median'),
                    StandardScaler()]),
],
                         df_out=True)
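One way to apply the mapper defined above (an illustrative continuation, not part of the original script):

X_train_proc = mapper.fit_transform(X_train)   # returns a DataFrame because df_out=True
X_test_proc = mapper.transform(X_test)         # reuses the imputers/encoders fitted on the training split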
Code example #28
    def make_chart(self, table: pd.DataFrame,
                   input_columns: Dict[str, Any]) -> Chart:
        """Create a Chart ready for charting, or raise GentleValueError.

        Features:
        * Error if X column is missing
        * Error if X column does not have two values
        * Error if X column is all-NaN
        * Error if too many X values in text mode (since we can't chart them)
        * X column can be number or date
        * Missing X dates lead to missing records
        * Missing X floats lead to missing records
        * Missing Y values are omitted
        * Error if no Y columns chosen
        * Error if a Y column is the X column
        * Error if a Y column has fewer than 1 non-missing value
        * Default title, X and Y axis labels
        """
        x_series, mask = self._make_x_series_and_mask(table, input_columns)

        if not self.y_columns:
            raise GentleValueError(
                i18n.trans("noYAxisError.message",
                           "Please choose a Y-axis column"))

        y_serieses = []
        for ycolumn in self.y_columns:
            if ycolumn.column == self.x_column:
                raise GentleValueError(
                    i18n.trans(
                        "sameAxesError.message",
                        "You cannot plot Y-axis column {column_name} because it is the X-axis column",
                        {"column_name": ycolumn.column},
                    ))

            series = table[ycolumn.column]

            if not is_numeric_dtype(series.dtype):
                raise GentleValueError(
                    i18n.trans(
                        "axisNotNumericError.message",
                        'Cannot plot Y-axis column "{column_name}" because it is not numeric. '
                        "Convert it to a number before plotting it.",
                        {"column_name": ycolumn.column},
                    ))

            series = series[mask]  # line up with x_series
            series.reset_index(drop=True, inplace=True)

            # Find how many Y values can actually be plotted on the X axis. If
            # there aren't going to be any Y values on the chart, raise an
            # error.
            if not series.count():
                raise GentleValueError(
                    i18n.trans(
                        "emptyAxisError.message",
                        'Cannot plot Y-axis column "{column_name}" because it has no values',
                        {"column_name": ycolumn.column},
                    ))

            y_serieses.append(
                YSeries(series, ycolumn.color,
                        input_columns[ycolumn.column].format))

        title = self.title or "Line Chart"
        x_axis_label = self.x_axis_label or x_series.name
        if len(y_serieses) == 1:
            y_axis_label = self.y_axis_label or y_serieses[0].name
        else:
            y_axis_label = self.y_axis_label

        return Chart(
            title=title,
            x_axis_label=x_axis_label,
            x_axis_tick_format=x_series.d3_tick_format,
            y_axis_label=y_axis_label,
            x_series=x_series,
            y_serieses=y_serieses,
            y_axis_tick_format=y_serieses[0].d3_tick_format,
        )
Code example #29
    def _prophet(request, context):
        """
        Provide a timeseries forecast using Facebook's Prophet library. Scalar function.
        :param request: an iterable sequence of RowData
        :param context: not used for now
        :return: the forecasted value for each row
        :
        :Qlik expression example:
        :<AAI Connection Name>.Prophet(MonthStartDate, sum(Value), 'return=yhat, freq=MS, debug=true')
        :The third argument in the Qlik expression is a string of parameters. 
        :This should take the form of a comma separated string:
        :e.g 'return=yhat, freq=MS, debug=true' or 'return=yhat_upper, freq=MS'
        :
        :<AAI Connection Name>.Prophet_Holidays(ForecastDate, sum(Value), Holiday, 'return=yhat, freq=D, debug=true')
        :In the holidays variant the third argument is a field containing the holiday name or NULL for each row.
        :
        :Parameters accepted for the Prophet() function are: cap, floor, changepoint_prior_scale, interval_width, 
        :lower_window, upper_window 
        :
        :Parameters accepted for the make_future_dataframe() function are: freq
        :
        :For more information on these parameters go here: https://facebook.github.io/prophet/docs/quick_start.html
        :
        :Additional parameters used are: return, take_log, debug, load_script
        :
        :cap = 1000 : A logistic growth model can be defined using cap and floor. Values should be double or integer
        :changepoint_prior_scale = 0.05 : Decrease if the trend changes are being overfit, increase for underfit
        :interval_width = 0.08 : Set the width of the uncertainty intervals
        :lower_window = 1 : Only used with holidays. Extend the holiday by certain no. of days prior to the date.
        :upper_window = 1 : Only used with holidays. Extend the holiday by certain no. of days after the date.
        :freq = MS : The frequency of the time series. e.g. MS for Month Start. See the possible options here:
        :          : http://pandas.pydata.org/pandas-docs/stable/timeseries.html#offset-aliases
        :return = yhat : Any of the options in the forecast result. You can see these options with debug=true
        :              : yhat, yhat_upper, yhat_lower : Forecast, upper and lower limits
        :              : y_then_yhat, y_then_yhat_upper, y_then_yhat_lower : Return forecast only for forecast periods
        :              : trend, trend_upper, trend_lower : Trend component of the timeseries
        :              : seasonal, seasonal_upper, seasonal_lower: Seasonal component of the timeseries 
        :take_log = false : Apply logarithm to the values before the forecast. Default is true
        :debug = true : Print execution information to the terminal and logs in ..\logs\Prophet Log <n>.txt
        """

        # Get a list from the generator object so that it can be iterated over multiple times
        request_list = [request_rows for request_rows in request]

        # Calculate timings for the components of the forecasting
        # The results will be stored in ..\logs\Prophet Performance Log.txt
        # The request_list line above is not timed as the generator can only be iterated once
        # ProphetForQlik.timeit(request_list)

        # Create an instance of the ProphetForQlik class
        # This will take the request data from Qlik and prepare it for forecasting
        predictor = ProphetForQlik(request_list, context)

        # Calculate the forecast and store in a Pandas series
        forecast = predictor.predict()

        # Check if the response is a DataFrame.
        # This occurs when the load_script=true argument is passed in the Qlik expression.
        response_is_df = isinstance(forecast, pd.DataFrame)

        # Set the data types of the output
        if response_is_df:
            dtypes = []
            for dt in forecast.dtypes:
                dtypes.append('num' if is_numeric_dtype(dt) else 'str')
        else:
            dtypes = ['num']

        # Get the response as SSE.Rows
        response_rows = utils.get_response_rows(forecast.values.tolist(),
                                                dtypes)

        # Get the number of bundles in the request
        num_request_bundles = len(request_list)

        # Get the number of rows in the response
        num_rows = len(response_rows)

        # Calculate the number of rows to send per bundle
        if num_rows >= num_request_bundles:
            rows_per_bundle = num_rows // num_request_bundles
        else:
            rows_per_bundle = num_rows

        # Stream response as BundledRows
        for i in range(0, num_rows, rows_per_bundle):
            # Yield Row data as Bundled rows
            yield SSE.BundledRows(rows=response_rows[i:i + rows_per_bundle])
Code example #30
def check_events(events):
    """Test that the events data describes a valid experimental paradigm

    It is valid if the events data has 'onset' and 'duration' columns.

    Parameters
    ----------
    events : pandas DataFrame
        Events data that describes a functional experimental paradigm.

    Returns
    -------
    trial_type : array of shape (n_events,), dtype='s'
        Per-event experimental conditions identifier.
        Defaults to np.repeat('dummy', len(onsets)).

    onset : array of shape (n_events,), dtype='f'
        Per-event onset time (in seconds)

    duration : array of shape (n_events,), dtype='f'
        Per-event duration (in seconds);
        defaults to zeros(n_events) when no duration is provided.

    modulation : array of shape (n_events,), dtype='f'
        Per-event modulation;
        defaults to ones(n_events) when no modulation is provided.

    """
    # Check that events is a Pandas DataFrame
    if not isinstance(events, pd.DataFrame):
        raise TypeError("Events should be a Pandas DataFrame. "
                        "A {} was provided instead.".format(type(events)))
    # Column checks
    for col_name in ['onset', 'duration']:
        if col_name not in events.columns:
            raise ValueError("The provided events data "
                             "has no {} column.".format(col_name))

    # Make a copy of the dataframe
    events_copy = events.copy()

    # Handle missing trial types
    if 'trial_type' not in events_copy.columns:
        warnings.warn("'trial_type' column not found "
                      "in the given events data.")
        events_copy['trial_type'] = 'dummy'

    # Handle modulation
    if 'modulation' in events_copy.columns:
        print("A 'modulation' column was found in "
              "the given events data and is used.")
    else:
        events_copy['modulation'] = 1

    # Warn for each unexpected column that will
    # not be used afterwards
    unexpected_columns = set(events_copy.columns).difference(VALID_FIELDS)
    for unexpected_column in unexpected_columns:
        warnings.warn(("Unexpected column `{}` in events "
                       "data will be ignored.").format(unexpected_column))

    # Make sure we have a numeric type for duration
    if not is_numeric_dtype(events_copy['duration']):
        try:
            events_copy = events_copy.astype({'duration': float})
        except ValueError:
            raise ValueError("Could not cast duration to float "
                             "in events data.")

    # Handle duplicate events
    # Two events are duplicates if they have the same:
    #   - trial type
    #   - onset
    COLUMN_DEFINING_EVENT_IDENTITY = ['trial_type', 'onset', 'duration']

    # Duplicate handling strategy
    # Sum the modulation values of duplicate events
    STRATEGY = {'modulation': np.sum}

    cleaned_events = events_copy.groupby(
        COLUMN_DEFINING_EVENT_IDENTITY,
        sort=False).agg(STRATEGY).reset_index()

    # If there are duplicates, give a warning
    if len(cleaned_events) != len(events_copy):
        warnings.warn("Duplicated events were detected. "
                      "Amplitudes of these events will be summed. "
                      "You might want to verify your inputs.")

    trial_type = cleaned_events['trial_type'].values
    onset = cleaned_events['onset'].values
    duration = cleaned_events['duration'].values
    modulation = cleaned_events['modulation'].values
    return trial_type, onset, duration, modulation
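
A hedged usage sketch for the function above with a toy events table (the values are invented): the two duplicated 'a' events share the same trial type, onset and duration, so their modulations are summed.

import pandas as pd

events = pd.DataFrame({
    'onset':      [0.0, 0.0, 10.0],
    'duration':   [1.0, 1.0, 1.0],
    'trial_type': ['a', 'a', 'b'],
    'modulation': [1.0, 2.0, 1.0],
})

# Assuming check_events is importable from the module above.
trial_type, onset, duration, modulation = check_events(events)
print(trial_type.tolist(), modulation.tolist())   # ['a', 'b'] [3.0, 1.0]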
コード例 #31
def isNumeric(colData):
    return is_numeric_dtype(colData)
コード例 #32
def sample_row(
    X: pd.DataFrame,
    filter_rows_with_na: bool = False,
    random_state: int = 42,
    max_field_len: int = 50,
) -> pd.DataFrame:
    """Sample a row from pandas dataframe.

    Extracts the column name, datatype, minimum and maximum values for each
    column in the supplied dataframe. The orientation is row-based (as opposed to `df.sample(1)`), which allows
    for better printing when a dataset contains many features. This function is useful when providing a
    sample row in technical model documentation.

    Example:

    ```python
    from probatus.utils import sample_row
    from sklearn.datasets import load_iris

    iris = load_iris(as_frame=True).get('data')
    sample = sample_row(iris, filter_rows_with_na=False, random_state=12)
    print(sample.to_markdown())
    ```

    ??? info "Example output"

        | column            | dtype   |   sample |   range_low |   range_high |
        |:------------------|:--------|---------:|------------:|-------------:|
        | sepal length (cm) | float64 |      5   |         4.3 |          7.9 |
        | sepal width (cm)  | float64 |      3.5 |         2   |          4.4 |
        | petal length (cm) | float64 |      1.3 |         1   |          6.9 |
        | petal width (cm)  | float64 |      0.3 |         0.1 |          2.5 |

    Args:
        X (DataFrame):
            Pandas DataFrame to be sampled
        filter_rows_with_na (bool):
            if true, rows with na values are not considered for sampling
        random_state (int):
            Optional random state to ensure reproducibility
        max_field_len (int):
            Maximum number of characters for fields, beyond
            which any text is truncated

    Returns:
        (pd.DataFrame): A Pandas DataFrame containing the sampled row
    """
    # Input validation
    assert type(X) == pd.DataFrame, "X should be pandas DataFrame"
    assert X.empty is False, "X should not be an empty DataFrame"
    assert type(
        filter_rows_with_na) == bool, "filter_rows_with_na should be a boolean"
    assert type(random_state) == int, "random_state should be an integer"
    assert type(max_field_len) == int, "max_field_len should be an integer"

    # Create new empty df
    sample_df = pd.DataFrame()

    # Convert dtypes of pandas to ensure detection of data types
    X_dtypes = X.convert_dtypes()

    # Sample row from X
    sample_row = X.sample(1, random_state=random_state)
    if filter_rows_with_na:
        try:
            sample_row = X.dropna().sample(1, random_state=random_state)
        except ValueError:
            logging.info(
                "sample_row(): No rows without NaN found, sampling from all rows.."
            )

    # Sample every column of X
    for i, col in enumerate(sample_row.columns):
        # Extract sample from X if not all samples are nan
        sample = sample_row[col].values[0]

        # If datatype allows it, extract low and high range
        if is_numeric_dtype(X_dtypes[col]):
            low = X[col].min(skipna=True)
            high = X[col].max(skipna=True)
        else:
            low = ""
            high = ""

            # Shorten sampled datapoint if too long
            if isinstance(sample, str) and len(sample) > max_field_len:
                sample = sample[:(max_field_len // 2) -
                                1] + "..." + sample[(-max_field_len // 2) + 2:]

        # Add new row to sample_df
        row_df = pd.DataFrame({
            "column": [col],
            "dtype": [X[col].dtype],
            "sample": [sample],
            "range_low": [low],
            "range_high": [high],
        })
        sample_df = pd.concat([sample_df, row_df], ignore_index=True)

    sample_df = sample_df.set_index(["column"])
    return sample_df
コード例 #33
def vars_from_df(df, role=None, force_nominal=False):
    if role is None and hasattr(df, 'orange_role'):
        role = df.orange_role
    df = _reset_index(df)

    cols = [], [], []
    exprs = [], [], []
    vars_ = [], [], []

    for column in df.columns:
        s = df[column]
        _role = Role.Attribute if role is None else role
        if hasattr(df, 'orange_variables') and column in df.orange_variables:
            original_var = df.orange_variables[column]
            var = original_var.copy(compute_value=None)
            expr = None
        elif _is_datetime(s):
            var = TimeVariable(str(column))
            expr = _convert_datetime
        elif _is_discrete(s, force_nominal):
            discrete = s.astype("category").cat
            var = DiscreteVariable(str(column),
                                   discrete.categories.astype(str).tolist())
            expr = to_categorical
        elif is_numeric_dtype(s):
            var = ContinuousVariable(
                # set number of decimals to 0 if int, otherwise keep the default behaviour
                str(column),
                number_of_decimals=(0 if is_integer_dtype(s) else None))
            expr = None
        else:
            if role is not None and role != Role.Meta:
                raise ValueError("String variable must be in metas.")
            _role = Role.Meta
            var = StringVariable(str(column))
            expr = lambda s, _: np.asarray(
                # cast to object so that fillna can replace NaN with StringVariable.Unknown,
                # ensuring that all values end up as strings
                s.astype(object).fillna(StringVariable.Unknown).astype(str),
                dtype=object)

        cols[_role].append(column)
        exprs[_role].append(expr)
        vars_[_role].append(var)

    xym = []
    for a_vars, a_cols, a_expr in zip(vars_, cols, exprs):
        if not a_cols:
            arr = None if a_cols != cols[0] else np.empty((df.shape[0], 0))
        elif not any(a_expr):
            # if the role uses every dataframe column, the table can share memory with the dataframe
            a_df = df if all(c in a_cols for c in df.columns) else df[a_cols]
            if all(isinstance(a, SparseDtype) for a in a_df.dtypes):
                arr = csr_matrix(a_df.sparse.to_coo())
            else:
                arr = np.asarray(a_df)
        else:
            # we'll have to copy the table to resolve any expressions
            arr = np.array([
                expr(df[col], var) if expr else np.asarray(df[col])
                for var, col, expr in zip(a_vars, a_cols, a_expr)
            ]).T
        xym.append(arr)

    # Let the tables share memory with pandas frame
    if xym[1] is not None and xym[1].ndim == 2 and xym[1].shape[1] == 1:
        xym[1] = xym[1][:, 0]

    return xym, Domain(*vars_)
コード例 #34
def infer_variable_types(df, link_vars, variable_types, time_index,
                         secondary_time_index):
    '''Infer variable types from dataframe

    Args:
        df (DataFrame): Input DataFrame
        link_vars (list[]): Linked variables
        variable_types (dict[str -> dict[str -> type]]) : An entity's
            variable_types dict maps string variable ids to types (:class:`.Variable`)
            or (type, kwargs) to pass keyword arguments to the Variable.
        time_index (str or None): Name of time_index column
        secondary_time_index (dict[str: [str]]): Dictionary of secondary time columns
            that each map to a list of columns that depend on that secondary time
    '''
    # TODO: set pk and pk types here
    inferred_types = {}
    vids_to_assume_datetime = [time_index]
    if len(list(secondary_time_index.keys())):
        vids_to_assume_datetime.append(list(secondary_time_index.keys())[0])
    inferred_type = vtypes.Unknown
    for variable in df.columns:
        if variable in variable_types:
            continue
        elif isinstance(df, dd.DataFrame):
            msg = 'Variable types cannot be inferred from Dask DataFrames, ' \
                  'use variable_types to provide type metadata for entity'
            raise ValueError(msg)
        elif is_instance(df, ks, 'DataFrame'):
            msg = 'Variable types cannot be inferred from Koalas DataFrames, ' \
                  'use variable_types to provide type metadata for entity'
            raise ValueError(msg)
        elif variable in vids_to_assume_datetime:
            if col_is_datetime(df[variable]):
                inferred_type = vtypes.Datetime
            else:
                inferred_type = vtypes.Numeric

        elif variable in link_vars:
            inferred_type = vtypes.Categorical

        elif df[variable].dtype == "object":
            if not len(df[variable]):
                inferred_type = vtypes.Categorical
            elif col_is_datetime(df[variable]):
                inferred_type = vtypes.Datetime
            else:
                inferred_type = vtypes.Categorical

                # heuristics to predict whether this is something other than categorical
                sample = df[variable].sample(min(10000, len(df[variable])))

                # catch cases where object dtype cannot be interpreted as a string
                try:
                    avg_length = sample.str.len().mean()
                    if avg_length > 50:
                        inferred_type = vtypes.NaturalLanguage
                except AttributeError:
                    pass

        elif df[variable].dtype == "bool":
            inferred_type = vtypes.Boolean

        elif pdtypes.is_categorical_dtype(df[variable].dtype):
            inferred_type = vtypes.Categorical

        elif pdtypes.is_numeric_dtype(df[variable].dtype):
            inferred_type = vtypes.Numeric

        elif col_is_datetime(df[variable]):
            inferred_type = vtypes.Datetime

        elif len(df[variable]):
            sample = df[variable] \
                .sample(min(10000, df[variable].nunique(dropna=False)))

            unique = sample.unique()
            percent_unique = sample.size / len(unique)

            if percent_unique < .05:
                inferred_type = vtypes.Categorical
            else:
                inferred_type = vtypes.Numeric

        inferred_types[variable] = inferred_type

    return inferred_types
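
A simplified, pandas-only sketch of the same heuristics (the helper name and thresholds below are mine, chosen to mirror the function above): booleans map to Boolean, long object strings to NaturalLanguage, other objects to Categorical, and numeric dtypes to Numeric.

import pandas as pd
from pandas.api.types import is_numeric_dtype

def rough_infer(series, avg_len_threshold=50):
    # Simplified stand-in for the featuretools heuristics above (hypothetical helper).
    if series.dtype == "bool":
        return "Boolean"
    if series.dtype == "object":
        sample = series.sample(min(10000, len(series)))
        try:
            if sample.str.len().mean() > avg_len_threshold:
                return "NaturalLanguage"
        except AttributeError:
            pass
        return "Categorical"
    if is_numeric_dtype(series.dtype):
        return "Numeric"
    return "Unknown"

df = pd.DataFrame({"age": [25, 31, 47], "city": ["Oslo", "Lima", "Oslo"]})
print({col: rough_infer(df[col]) for col in df})   # {'age': 'Numeric', 'city': 'Categorical'}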
コード例 #35
ファイル: general.py プロジェクト: sinhmd/tmap
def process_metadata_beta(data, metadata, drop_threshold=0.6, verbose=1):
    # align metadata on data.index via .loc rather than reindex(), which would create many NaN rows for missing indices
    metadata = metadata.loc[data.index, :]
    if metadata.shape[0] == 0:
        logger("Couldn't find corresponding index from data into metadata",
               verbose=1)
        return
    # split columns into numeric and categorical
    numeric_cols = [
        col for col in metadata.columns
        if is_numeric_dtype(metadata.loc[:, col])
    ]
    str_cols = [
        col for col in metadata.columns
        if is_string_dtype(metadata.loc[:, col])
    ]
    sub_numeric = metadata.loc[:, numeric_cols]
    sub_str = metadata.loc[:, str_cols]
    if numeric_cols:
        # fill NaN values in numeric columns
        drop_cols = []
        na_percent = sub_numeric.count(0) / sub_numeric.shape[0]
        drop_cols += list(sub_numeric.columns[na_percent <= drop_threshold])
        ### drop columns with too many NaN values.
        logger('drop cols with nan values over %s percent : ' % drop_threshold,
               ','.join(drop_cols),
               '\n\n',
               verbose=verbose)
        sub_numeric = sub_numeric.loc[:, na_percent > drop_threshold]
        sub_numeric = sub_numeric.fillna(
            {col: sub_numeric.median()[col]
             for col in sub_numeric.columns})
    if str_cols:
        # one hot / get dummy categorical cols
        drop_cols = []
        num_cat = np.array(
            [len(set(sub_str.loc[:, col])) for col in sub_str.columns])
        #### num_cat == 1
        drop_cols += list(sub_str.columns[num_cat == 1])
        #### num_cat >= sub_str.shape[0] * drop_threshold
        drop_cols += list(
            sub_str.columns[num_cat >= sub_str.shape[0] * drop_threshold])

        sub_str = sub_str.loc[:, sub_str.columns.difference(drop_cols)]
        if sub_str.shape[1] != 0:
            sub_str = pd.get_dummies(sub_str)
        # drop_cols += list(sub_str.columns[sub_str.sum(0) <= sub_str.shape[0] * drop_threshold])
        # sub_str = sub_str.loc[:, sub_str.sum(0) <= sub_str.shape[0] * drop_threshold]
        logger('drop cols which are meaningless or have too many values',
               ','.join(drop_cols),
               '\n\n',
               verbose=verbose)
    # merge and output
    if sub_numeric.shape[1] == 0 and sub_str.shape[1] == 0:
        final_metadata = None
        logger("No columns survived.......", verbose=1)
    elif sub_str.shape[1] == 0:
        final_metadata = sub_numeric
    elif sub_numeric.shape[1] == 0:
        final_metadata = sub_str
    else:
        final_metadata = pd.concat([sub_numeric, sub_str], axis=1)
    return final_metadata
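
A compact, pandas-only sketch of the same preprocessing idea (toy column names; the drop-threshold logic is omitted): split numeric and string columns, median-impute the numeric ones, and one-hot encode the string ones before concatenating.

import pandas as pd
from pandas.api.types import is_numeric_dtype, is_string_dtype

meta = pd.DataFrame({
    "age":   [30.0, None, 45.0],
    "group": ["case", "control", "case"],
})

num = meta[[c for c in meta.columns if is_numeric_dtype(meta[c])]]
cat = meta[[c for c in meta.columns if is_string_dtype(meta[c])]]

num = num.fillna(num.median())    # median imputation, as in the function above
cat = pd.get_dummies(cat)         # one-hot encoding, as in the function above

print(pd.concat([num, cat], axis=1).columns.tolist())   # ['age', 'group_case', 'group_control']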
コード例 #36
def isInteger(colData):
    # Note: this checks for any numeric dtype, not only integer dtypes.
    return is_numeric_dtype(colData)
コード例 #37
ファイル: pandas_compat.py プロジェクト: PrimozGodec/orange3
def table_from_frame(df, *, force_nominal=False):
    """
    Convert pandas.DataFrame to Orange.data.Table

    Parameters
    ----------
    df : pandas.DataFrame
    force_nominal : boolean
        If True, interpret ALL string columns as nominal (DiscreteVariable).

    Returns
    -------
    Table
    """

    def _is_discrete(s):
        return (is_categorical_dtype(s) or
                is_object_dtype(s) and (force_nominal or
                                        s.nunique() < s.size**.666))

    def _is_datetime(s):
        if is_datetime64_any_dtype(s):
            return True
        try:
            if is_object_dtype(s):
                pd.to_datetime(s, infer_datetime_format=True)
                return True
        except Exception:  # pylint: disable=broad-except
            pass
        return False

    # If df index is not a simple RangeIndex (or similar), put it into data
    if not (df.index.is_integer() and (df.index.is_monotonic_increasing or
                                       df.index.is_monotonic_decreasing)):
        df = df.reset_index()

    attrs, metas = [], []
    X, M = [], []

    # Iter over columns
    for name, s in df.items():
        name = str(name)
        if _is_discrete(s):
            discrete = s.astype('category').cat
            attrs.append(DiscreteVariable(name, discrete.categories.astype(str).tolist()))
            X.append(discrete.codes.replace(-1, np.nan).values)
        elif _is_datetime(s):
            tvar = TimeVariable(name)
            attrs.append(tvar)
            s = pd.to_datetime(s, infer_datetime_format=True)
            X.append(s.astype('str').replace('NaT', np.nan).map(tvar.parse).values)
        elif is_numeric_dtype(s):
            attrs.append(ContinuousVariable(name))
            X.append(s.values)
        else:
            metas.append(StringVariable(name))
            M.append(s.values.astype(object))

    return Table.from_numpy(Domain(attrs, None, metas),
                            np.column_stack(X) if X else np.empty((df.shape[0], 0)),
                            None,
                            np.column_stack(M) if M else None)
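
The _is_discrete heuristic above treats an object column as nominal when its number of distinct values is below size ** (2/3); a quick pandas-only check of that rule on toy data (the helper name is mine):

import pandas as pd
from pandas.api.types import is_categorical_dtype, is_object_dtype

def looks_discrete(s, force_nominal=False):
    # Same rule as _is_discrete above, just outside of Orange.
    return (is_categorical_dtype(s) or
            is_object_dtype(s) and (force_nominal or s.nunique() < s.size ** .666))

s = pd.Series(["a", "b", "a", "b", "a", "c", "a", "b"])
print(looks_discrete(s))   # True: 3 distinct values < 8 ** 0.666 (about 4)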
コード例 #38
    def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping): 
        if column_mapping:
            date_column = column_mapping.get('datetime')
            id_column = column_mapping.get('id')
            target_column = column_mapping.get('target')
            prediction_column = column_mapping.get('prediction')
            num_feature_names = column_mapping.get('numerical_features')
            target_names = column_mapping.get('target_names')
            if num_feature_names is None:
                num_feature_names = []
            else:
                num_feature_names = [name for name in num_feature_names if is_numeric_dtype(reference_data[name])] 

            cat_feature_names = column_mapping.get('categorical_features')
            if cat_feature_names is None:
                cat_feature_names = []
            else:
                cat_feature_names = [name for name in cat_feature_names if is_numeric_dtype(reference_data[name])] 
        
        else:
            date_column = 'datetime' if 'datetime' in reference_data.columns else None
            id_column = None
            target_column = 'target' if 'target' in reference_data.columns else None
            prediction_column = 'prediction' if 'prediction' in reference_data.columns else None

            utility_columns = [date_column, id_column, target_column, prediction_column]

            num_feature_names = list(set(reference_data.select_dtypes([np.number]).columns) - set(utility_columns))
            cat_feature_names = list(set(reference_data.select_dtypes([np.object]).columns) - set(utility_columns))

            target_names = None

        if production_data is not None and target_column is not None and prediction_column is not None:
            production_data.replace([np.inf, -np.inf], np.nan, inplace=True)
            production_data.dropna(axis=0, how='any', inplace=True)

            array_prediction = production_data[prediction_column].to_numpy()

            prediction_ids = np.argmax(array_prediction, axis=-1)
            prediction_labels = [prediction_column[x] for x in prediction_ids]
            
            #plot support bar
            metrics_matrix = metrics.classification_report(production_data[target_column], prediction_labels, 
                output_dict=True)
            metrics_frame = pd.DataFrame(metrics_matrix)
            support = metrics_frame.iloc[-1:,:-3].values[0]

            fig = go.Figure()

            fig.add_trace(go.Bar(x=metrics_frame.columns.tolist()[:-3], 
                y=metrics_frame.iloc[-1:,:-3].values[0], marker_color=red, name='Support'))

            fig.update_layout(
                xaxis_title = "Class",
                yaxis_title = "Number of Objects",
            )

            support_bar_json = json.loads(fig.to_json())

            self.wi = BaseWidgetInfo(
                title=self.title,
                type="big_graph",
                details="",
                alertStats=AlertStats(),
                alerts=[],
                alertsPosition="row",
                insights=[],
                size=1,
                params={
                    "data": support_bar_json['data'],
                    "layout": support_bar_json['layout']
                },
                additionalGraphs=[],
            )
        else:
            self.wi = None
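
A small, plotting-free sketch of how the per-class support values used for the bar chart can be pulled out of sklearn's classification_report (toy labels; the last three columns are the accuracy / macro avg / weighted avg aggregates that get dropped, as in the code above):

import pandas as pd
from sklearn import metrics

y_true = ["cat", "dog", "dog", "cat", "dog"]
y_pred = ["cat", "dog", "cat", "cat", "dog"]

metrics_frame = pd.DataFrame(metrics.classification_report(y_true, y_pred, output_dict=True))

# The last row is 'support'; the last three columns are the aggregate scores.
support = metrics_frame.iloc[-1:, :-3].values[0]
print(metrics_frame.columns.tolist()[:-3], support)   # ['cat', 'dog'] [2. 3.]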
コード例 #39
def to_numeric_array(series):
    if not is_numeric_dtype(series):
        if not hasattr(series, 'cat'):
            series = series.astype('category')
        return series.cat.codes.values
    return series.values
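
A quick usage sketch (assuming to_numeric_array is importable as defined above): numeric columns pass through unchanged, anything else is mapped to its category codes.

import pandas as pd

print(to_numeric_array(pd.Series([1.5, 2.0, 3.5])))         # [1.5 2.  3.5]
print(to_numeric_array(pd.Series(["low", "high", "low"])))  # [1 0 1] (codes of the alphabetically ordered categories)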
コード例 #40
ファイル: backend.py プロジェクト: JackyP/punditkit
    def classifier_pair_plot(self,
                             df,
                             num_features=4,
                             regression_quantiles=10):
        """ Returns a pair plot of top features for a classifier
        """

        # TODO: find replace
        df_default = self.typical_feature_values
        pipe = self.numeric_pipe
        transformer = self.transform_numeric
        features = self.features
        top_features = self.top_features[:num_features]
        response = self.response

        dim_features = len(features)

        mpl.rcParams.update(mpl.rcParamsDefault)
        mpl.rcParams.update({"font.size": 6})  # For readability
        # df_not_na = df[top_features + [response]].dropna()
        vals_df = self.transform_numeric.transform(df[features])
        vals_default = pd.DataFrame(transformer.transform(df_default))

        is_numeric = is_numeric_dtype(df[response])

        if is_numeric:
            pal0 = sns.color_palette("GnBu_d", regression_quantiles)
            pal1 = sns.color_palette("GnBu_r", regression_quantiles)
        else:
            pal0 = sns.color_palette("muted", df[response].nunique())
            pal1 = sns.color_palette("bright", df[response].nunique())
        # g = sns.PairGrid(df_not_na, vars=top_features, hue=response, palette=pal)

        gs = gridspec.GridSpec(num_features, num_features)
        fig = plt.figure()

        # Dummy estimator that predicts categoricals even for regression
        class DummyEstimator:
            model = pipe
            is_num = is_numeric

            def predict(self, X):
                y = self.model.predict(X)
                if self.is_num:
                    return pd.qcut(y,
                                   regression_quantiles,
                                   labels=False,
                                   duplicates="drop")
                else:
                    return np.searchsorted(pipe.classes_, y)

        estimator = DummyEstimator()

        if is_numeric:
            resp_plt = pd.qcut(df[response],
                               regression_quantiles,
                               labels=False,
                               duplicates="drop").values
        else:
            resp_plt = df[response].astype("category").cat.codes.values

        # Diagonals: Numeric - Dist Plot, Categorical - Bar Plot
        lims = []
        for row_i in range(0, num_features):
            ax = plt.subplot(gs[row_i, row_i])
            feature_name = top_features[row_i]
            f_ind_i = features.index(feature_name)
            # plt.hist(x=df.iloc[:, f_ind_i])
            if is_numeric_dtype(df.loc[:, feature_name]):
                ax = sns.distplot(df.loc[:, feature_name].dropna())
            else:
                # Truncate long category names to avoid cluttering the axis
                kwargs = {
                    feature_name: lambda df: df[feature_name].str.slice(0, 5)
                }
                df.groupby(feature_name).size().reset_index(
                    name="counts").assign(**kwargs).plot.bar(ax=ax,
                                                             x=feature_name,
                                                             y="counts",
                                                             legend=False)

            plt.ylabel("Count")
            plt.xlabel(feature_name)
            lims += [plt.xlim()]
            # plt.xlim((df.iloc[:, f_ind_i].min(), df.iloc[:, f_ind_i].max()))

        # Upper:
        indices = zip(*np.triu_indices(num_features, k=1))

        # indices = zip(*np.triu_indices_from(g.axes, 1))
        for row_i, col_j in indices:

            # ax = g.axes[row_i, col_j]
            ax = plt.subplot(gs[row_i, col_j])
            feature_name_i = top_features[row_i.item()]
            feature_name_j = top_features[col_j.item()]
            f_ind_i = features.index(feature_name_i)
            f_ind_j = features.index(feature_name_j)
            filler_ind = [
                x for x in list(range(0, dim_features))
                if x not in [f_ind_i, f_ind_j]
            ]
            # mlxtend decision regions requires a dummy set of variables if there
            # are more than two features.
            if len(features) > 2:
                filler = vals_default[filler_ind].to_dict("records")[0]
            else:
                filler = None

            mpl.rcParams.update({"contour.negative_linestyle": "dotted"})
            plot_decision_regions(
                X=transformer.transform(df[features].values),
                y=resp_plt,
                feature_index=(f_ind_j, f_ind_i),
                filler_feature_values=filler,
                clf=estimator,
                colors=",".join(pal0.as_hex()),
                hide_spines=False,
            )

            if len(filler_ind) > 0:
                plt.scatter(
                    x=vals_df[:, features.index(feature_name_j)],
                    y=vals_df[:, features.index(feature_name_i)],
                    s=3,
                    c=resp_plt,
                    cmap=ListedColormap(pal1.as_hex()),
                )
            plt.xlabel(features[f_ind_j])
            plt.ylabel(features[f_ind_i])
            plt.xlim(lims[col_j])
            plt.ylim(lims[row_i])

        # Lower Diag
        indices = zip(*np.tril_indices(num_features, k=-1))
        for row_i, col_j in indices:
            ax = plt.subplot(gs[row_i, col_j])
            ax.set_facecolor("grey")
            f_ind_i = features.index(top_features[row_i.item()])
            f_ind_j = features.index(top_features[col_j.item()])
            plt.xlim(lims[col_j])
            plt.ylim(lims[row_i])
            plt.xlabel(features[f_ind_j])
            plt.ylabel(features[f_ind_i])

        fig.tight_layout()
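
The pair plot above handles regression targets by binning them into quantiles with pd.qcut (both in DummyEstimator.predict and for resp_plt); a toy illustration of that binning:

import numpy as np
import pandas as pd

y = pd.Series(np.arange(20, dtype=float))
bins = pd.qcut(y, 10, labels=False, duplicates="drop")
print(bins.tolist())   # [0, 0, 1, 1, ..., 9, 9] -- each consecutive pair of values falls into one decile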
コード例 #41
ファイル: plot.py プロジェクト: wanliu2019/tmap
def vis_progressX(graph,
                  simple=False,
                  mode='file',
                  color=None,
                  _color_SAFE=None,
                  min_size=10,
                  max_size=40,
                  **kwargs):
    """
    For dynamically visualizing the tmap construction process, this builds an interactive graph based on `plotly` with a slider to present the process from ordination to graph step by step. Currently, it doesn't provide any API for overriding the number of steps from ordination to graph. It may be implemented in the future.

    If you want to draw a simple graph with edges and nodes instead of the full process, try the ``simple`` param.

    This visualization function is mainly based on plotly, an interactive Python graphing library. The ``mode`` param provides multiple return types for different purposes. There are three modes to choose from: "file", which returns an html file created by plotly; "obj", which returns a reusable python dict object; and "web", which is normally used in notebooks and makes inline visualization possible.

    The color handling in this function is a little complex because of the multiple sub-figures. Currently, it uses the ``tmap.tda.plot.Color`` class to auto-generate colors from a given array. More detail on how colors are auto-generated can be found in the annotation of ``tmap.tda.plot.Color``.

    In this function, two kinds of color need to be handled.

        * First, the colors and displayed text values of the sample points should follow the given ``color`` param. The color could be **any array** which represents some measurement of Nodes or Samples. **It doesn't have to be a SAFE score**.

        * Second, the ``_color_SAFE`` param should be a ``Color`` built from a nodes-length array, which is normally a SAFE score.

    :param tmap.tda.Graph.Graph graph:
    :param str mode: [file|obj|web]
    :param bool simple:
    :param color:
    :param _color_SAFE:
    :param kwargs:
    :return:
    """
    node_pos = graph.nodePos
    # shape is average projected_data (node x lens)
    sample_pos = graph.data
    # shape is raw projected_data (sample x lens)
    nodes = graph.nodes
    sizes = graph.size

    sample_names = np.array(graph.sample_names.astype(str))
    minmax_scaler = MinMaxScaler(feature_range=(min_size, max_size))
    mms_color = MinMaxScaler(feature_range=[0, 1])

    scaled_size = minmax_scaler.fit_transform(
        np.array([sizes[_] for _ in range(len(nodes))]).reshape(-1, 1))

    # init some empty values if color wasn't given
    target_v_raw = [0 for _ in nodes]
    target_v = [0 for _ in nodes]
    target_colors = ['blue' for _ in nodes]
    sample_colors = ['red' for _ in sample_names]
    cat2color = defaultdict(lambda: 'blue')
    legend_names = []

    if color is None or type(color) == str:
        color = 'red' if color is None else color
        color_map = {node_id: color for node_id in graph.nodes}
        target2colors = (np.zeros(
            (len(graph.nodes), 1)), [color] * len(graph.nodes))
    else:
        color_map, target2colors = color.get_colors(graph.nodes)
        if types.is_numeric_dtype(target2colors[0]):
            target_v = mms_color.fit_transform(target2colors[0]).ravel()
        else:
            target_v = []
        target_v_raw = target2colors[0].ravel()
        target_colors = target2colors[1]

        sample_colors, cat2color = color.get_sample_colors()
        if color.dtype == 'categorical':
            legend_names = target2colors[0][:, 0]

    # For calculating the dynamic process, the samples need to be duplicated first.
    # reconstructing the ori_MDS into the samples_pos
    # reconstructing the node_pos into the center_pos
    sample_tmp = []
    center_tmp = []
    text_tmp = []
    duplicated_sample_colors = []
    for n in nodes:
        sample_tmp.append(sample_pos[nodes[n]['sample'], :])
        center_tmp.append(np.repeat(node_pos[[n], :], sizes[n], axis=0))
        text_tmp.append(sample_names[nodes[n]['sample']])
        if color is not None:
            duplicated_sample_colors += list(
                np.repeat(color_map.get(n, 'blue'), sizes[n]))
        else:
            duplicated_sample_colors += ["blue"] * sizes[n]
    duplicated_sample_pos = np.concatenate(sample_tmp, axis=0)
    duplicated_node_pos = np.concatenate(center_tmp, axis=0)
    duplicated_samples_text = np.concatenate(text_tmp, axis=0)
    assert duplicated_sample_pos.shape[0] == duplicated_node_pos.shape[
        0] == duplicated_samples_text.shape[0] == len(duplicated_sample_colors)
    # To visualize the movement of samples, each sample must be duplicated (once per node it belongs to), which requires reconstructing the positions and text.

    # prepare edge data
    xs = []
    ys = []
    for edge in graph.edges:
        xs += [node_pos[edge[0], 0], node_pos[edge[1], 0], None]
        ys += [node_pos[edge[0], 1], node_pos[edge[1], 1], None]

    # If _color_SAFE is given (and simple != True), two kinds of color are present:
    # one based on the original data and one on the SAFE-transformed data. Use the second one.
    if _color_SAFE is not None:
        safe_color, safe_t2c = _color_SAFE.get_colors(graph.nodes)
        # former is a dict which key is node id and values is node color
        # second is a tuple (node values, node color)
        target_SAFE_color = [safe_color[_] for _ in graph.nodes]
        target_SAFE_raw_v = safe_t2c[0].ravel()  # raw node values
    else:
        target_SAFE_color = []
        target_SAFE_raw_v = []

    # prepare node & samples text
    node_text = c_node_text(nodes, sample_names, target_v_raw)
    ### samples text
    samples_text = ['sample ID:%s' % _ for _ in sample_names]

    node_line = go.Scatter(
        # ordination line
        visible=False,
        x=xs,
        y=ys,
        marker=dict(color="#8E9DA2", opacity=0.7),
        line=dict(width=1),
        hoverinfo='skip',
        showlegend=False,
        mode="lines")

    node_marker = go.Scatter(
        # node position
        visible=False,
        x=node_pos[:, 0],
        y=node_pos[:, 1],
        hovertext=node_text,
        hoverinfo="text",
        marker=dict(color=target_colors, size=scaled_size, opacity=1),
        showlegend=False,
        mode="markers")

    sample_marker = go.Scatter(visible=True,
                               x=sample_pos[:, 0],
                               y=sample_pos[:, 1],
                               marker=dict(color=sample_colors),
                               hovertext=samples_text,
                               hoverinfo="text",
                               showlegend=False,
                               mode="markers")
    # After all prepared work have been finished.
    # Append all traces instance into fig
    if simple:
        fig = plotly.tools.make_subplots(1, 1)
        node_line['visible'] = True
        node_marker['visible'] = True
        fig.append_trace(node_line, 1, 1)

        if color is not None and type(color) != str:
            if color.dtype == 'numerical':
                # with continuous legend bar
                # A dict which includes values of node to color
                # For make continuous color legend
                nv2c = dict(zip(target_v, target_colors))
                colorscale = []
                for _ in sorted(set(target_v)):
                    colorscale.append([_, nv2c[_]])
                colorscale[-1][0] = 1  # the last value must be 1
                colorscale[0][0] = 0  # the first value must be 0

                node_marker['marker']['color'] = target2colors[0].ravel()
                # use the original data in target2colors rather than the normalized target_v;
                # otherwise normalized values would be displayed, which would confuse the reader.
                node_marker['marker']['colorscale'] = colorscale
                node_marker['marker']['cmin'] = target2colors[0].min()
                node_marker['marker']['cmax'] = target2colors[0].max()
                node_marker['marker']['showscale'] = True
                fig.append_trace(node_marker, 1, 1)
            else:  # if color.dtype == 'categorical'
                for cat in np.unique(legend_names):
                    # legend_names is guaranteed to be populated here (it is set above for categorical colors)
                    # categories are automatically sorted in alphabetical order
                    node_marker = go.Scatter(
                        # node position
                        visible=True,
                        x=node_pos[legend_names == cat, 0],
                        y=node_pos[legend_names == cat, 1],
                        text=np.array(node_text)[legend_names == cat],
                        hoverinfo="text",
                        marker=dict(color=cat2color[cat],
                                    size=scaled_size[legend_names == cat, 0],
                                    opacity=1),
                        name=str(cat),
                        showlegend=True,
                        mode="markers")
                    fig.append_trace(node_marker, 1, 1)
        elif type(color) == str:
            node_marker['marker']['color'] = color
            fig.append_trace(node_marker, 1, 1)
        else:
            fig.append_trace(node_marker, 1, 1)
        fig.layout.hovermode = "closest"
    else:
        fig = plotly.tools.make_subplots(
            rows=2,
            cols=2,
            specs=[[{
                'rowspan': 2
            }, {}], [None, {}]],
        )
        # original place or ordination place
        fig.append_trace(sample_marker, 1, 1)

        # dynamic process to generate 5 binning positions
        n_step = 5
        for s in range(1, n_step + 1):
            # s = 1: move 1/steps
            # s = steps: move to node position.
            fig.append_trace(
                go.Scatter(visible=False,
                           x=duplicated_sample_pos[:, 0] +
                           ((duplicated_node_pos - duplicated_sample_pos) /
                            n_step * s)[:, 0],
                           y=duplicated_sample_pos[:, 1] +
                           ((duplicated_node_pos - duplicated_sample_pos) /
                            n_step * s)[:, 1],
                           marker=dict(color=duplicated_sample_colors),
                           hoverinfo="text",
                           hovertext=duplicated_samples_text,
                           showlegend=False,
                           mode="markers"), 1, 1)

        # Order is important, do not change it!
        # The last 5 traces must remain visible at all times.
        fig.append_trace(node_line, 1, 1)
        fig.append_trace(node_marker, 1, 1)
        node_line['visible'] = True
        node_marker['visible'] = True
        sample_marker['visible'] = True
        fig.append_trace(node_line, 2, 2)
        if _color_SAFE is not None:
            node_text = c_node_text(nodes, sample_names, target_SAFE_raw_v)
            node_marker['hovertext'] = node_text
            node_marker['marker']['color'] = target_SAFE_color
        fig.append_trace(node_marker, 2, 2)
        fig.append_trace(sample_marker, 1, 2)
        ############################################################
        steps = []
        for i in range(n_step + 1):
            step = dict(
                method='restyle',
                args=['visible', [False] * (n_step + 3) + [True, True, True]],
            )
            if i >= n_step:
                step["args"][1][-5:] = [
                    True
                ] * 5  # The last 5 traces must remain visible at all times.
            else:
                step['args'][1][i] = True  # Toggle i'th trace to "visible"
            steps.append(step)

        sliders = [
            dict(active=0,
                 currentvalue={"prefix": "status: "},
                 pad={"t": 20},
                 steps=steps)
        ]
        ############################################################
        layout = dict(
            sliders=sliders,
            width=2000,
            height=1000,
            xaxis1={  # "range": [0, 1],
                "domain": [0, 0.5]
            },
            yaxis1={  # "range": [0, 1],
                "domain": [0, 1]
            },
            xaxis2={  # "range": [0, 1],
                "domain": [0.6, 0.9]
            },
            yaxis2={  # "range": [0, 1],
                "domain": [0.5, 1]
            },
            xaxis3={  # "range": [0, 1],
                "domain": [0.6, 0.9]
            },
            yaxis3={  # "range": [0, 1],
                "domain": [0, 0.5]
            },
            hovermode="closest")
        fig.layout.update(layout)

    return write_figure(fig, mode, **kwargs)
コード例 #42
ファイル: everything.py プロジェクト: morganics/jupyter-utils
def fix_missing(df, col, name, na_dict):
    """ Fill missing data in a column of df with the median, and add a {name}_na column
    which specifies if the data was missing.

    Parameters:
    -----------
    df: The data frame that will be changed.

    col: The column of data to fix by filling in missing data.

    name: The name of the new filled column in df.

    na_dict: A dictionary mapping column names to the value to insert for their missing data. If
        name is not a key of na_dict the median will fill any missing data. Also,
        if name is not a key of na_dict and there is no missing data in col, then
        no {name}_na column is created.


    Examples:
    ---------
    >>> df = pd.DataFrame({'col1' : [1, np.NaN, 3], 'col2' : [5, 2, 2]})
    >>> df
       col1 col2
    0     1    5
    1   nan    2
    2     3    2

    >>> fix_missing(df, df['col1'], 'col1', {})
    >>> df
       col1 col2 col1_na
    0     1    5   False
    1     2    2    True
    2     3    2   False


    >>> df = pd.DataFrame({'col1' : [1, np.NaN, 3], 'col2' : [5, 2, 2]})
    >>> df
       col1 col2
    0     1    5
    1   nan    2
    2     3    2

    >>> fix_missing(df, df['col2'], 'col2', {})
    >>> df
       col1 col2
    0     1    5
    1   nan    2
    2     3    2


    >>> df = pd.DataFrame({'col1' : [1, np.NaN, 3], 'col2' : [5, 2, 2]})
    >>> df
       col1 col2
    0     1    5
    1   nan    2
    2     3    2

    >>> fix_missing(df, df['col1'], 'col1', {'col1' : 500})
    >>> df
       col1 col2 col1_na
    0     1    5   False
    1   500    2    True
    2     3    2   False
    """
    if is_numeric_dtype(col):
        if pd.isnull(col).sum() or (name in na_dict):
            df[name + '_na'] = pd.isnull(col)
            filler = na_dict[name] if name in na_dict else col.median()
            df[name] = col.fillna(filler)
            na_dict[name] = filler
    return na_dict
コード例 #43
def pandas_to_table(df):
    # type: (pd.DataFrame) -> Orange.data.Table
    """
    Convert a pandas.DataFrame to a Orange.data.Table instance.
    """
    index = df.index
    if not isinstance(index, pd.RangeIndex):
        df = df.reset_index()

    columns = []  # type: List[Tuple[Orange.data.Variable, np.ndarray]]

    for header, series in df.items():  # type: (Any, pd.Series)
        if pdtypes.is_categorical(series):
            coldata = series.values  # type: pd.Categorical
            categories = [str(c) for c in coldata.categories]
            var = Orange.data.DiscreteVariable.make(
                str(header), values=categories, ordered=coldata.ordered
            )
            # Remap the coldata into the var.values order/set
            coldata = pd.Categorical(
                coldata, categories=var.values, ordered=coldata.ordered
            )
            codes = coldata.codes
            assert np.issubdtype(codes.dtype, np.integer)
            orangecol = np.array(codes, dtype=np.float)
            orangecol[codes < 0] = np.nan
        elif pdtypes.is_datetime64_any_dtype(series):
            # Check that this converts tz local to UTC
            series = series.astype(np.dtype("M8[ns]"))
            coldata = series.values  # type: np.ndarray
            assert coldata.dtype == "M8[ns]"
            mask = np.isnat(coldata)
            orangecol = coldata.astype(np.int64) / 10 ** 9
            orangecol[mask] = np.nan
            var = Orange.data.TimeVariable.make(str(header))
            var.have_date = var.have_time = 1
        elif pdtypes.is_object_dtype(series):
            coldata = series.values
            assert isinstance(coldata, np.ndarray)
            orangecol = coldata
            var = Orange.data.StringVariable.make(str(header))
        elif pdtypes.is_integer_dtype(series):
            coldata = series.values
            var = Orange.data.ContinuousVariable.make(str(header))
            var.number_of_decimals = 0
            orangecol = coldata.astype(np.float64)
        elif pdtypes.is_numeric_dtype(series):
            orangecol = series.values.astype(np.float64)
            var = Orange.data.ContinuousVariable.make(str(header))
            var._out_format = "%.15g"
        else:
            warnings.warn(
                "Column '{}' with dtype: {} skipped."
                .format(header, series.dtype),
                UserWarning
            )
            continue
        columns.append((var, orangecol))

    cols_x = [(var, col) for var, col in columns if var.is_primitive()]
    cols_m = [(var, col) for var, col in columns if not var.is_primitive()]

    variables = [v for v, _ in cols_x]
    if cols_x:
        X = np.column_stack([a for _, a in cols_x])
    else:
        X = np.empty((df.shape[0], 0), dtype=np.float)
    metas = [v for v, _ in cols_m]
    if cols_m:
        M = np.column_stack([a for _, a in cols_m])
    else:
        M = None

    domain = Orange.data.Domain(variables, metas=metas)
    return Orange.data.Table.from_numpy(domain, X, None, M)
コード例 #44
ファイル: everything.py プロジェクト: morganics/jupyter-utils
def proc_df(df,
            y_fld=None,
            skip_flds=None,
            ignore_flds=None,
            do_scale=False,
            na_dict=None,
            preproc_fn=None,
            max_n_cat=None,
            subset=None,
            mapper=None):
    """ proc_df takes a data frame df and splits off the response variable, and
    changes the df into an entirely numeric dataframe.

    Parameters:
    -----------
    df: The data frame you wish to process.

    y_fld: The name of the response variable

    skip_flds: A list of fields that are dropped from df.

    ignore_flds: A list of fields that are ignored during processing.

    do_scale: Standardizes each column in df. Takes Boolean Values(True,False)

    na_dict: a dictionary of na columns to add. Na columns are also added if there
        are any missing values.

    preproc_fn: A function that gets applied to df.

    max_n_cat: The maximum number of categories to break into dummy values, instead
        of integer codes.

    subset: Takes a random subset of size subset from df.

    mapper: If do_scale is set as True, the mapper variable
        calculates the values used for scaling of variables during training time (mean and standard deviation).

    Returns:
    --------
    [x, y, nas, mapper(optional)]:

        x: x is the transformed version of df. x will not have the response variable
            and is entirely numeric.

        y: y is the response variable

        nas: returns a dictionary of which nas it created, and the associated median.

        mapper: A DataFrameMapper which stores the mean and standard deviation of the corresponding continuous
        variables which is then used for scaling during test time.

    Examples:
    ---------
    >>> df = pd.DataFrame({'col1' : [1, 2, 3], 'col2' : ['a', 'b', 'a']})
    >>> df
       col1 col2
    0     1    a
    1     2    b
    2     3    a

    note the type of col2 is string

    >>> train_cats(df)
    >>> df

       col1 col2
    0     1    a
    1     2    b
    2     3    a

    now the type of col2 is category { a : 1, b : 2}

    >>> x, y, nas = proc_df(df, 'col1')
    >>> x

       col2
    0     1
    1     2
    2     1

    >>> data = DataFrame(pet=["cat", "dog", "dog", "fish", "cat", "dog", "cat", "fish"],
                 children=[4., 6, 3, 3, 2, 3, 5, 4],
                 salary=[90, 24, 44, 27, 32, 59, 36, 27])

    >>> mapper = DataFrameMapper([(:pet, LabelBinarizer()),
                          ([:children], StandardScaler())])

    >>>round(fit_transform!(mapper, copy(data)), 2)

    8x4 Array{Float64,2}:
    1.0  0.0  0.0   0.21
    0.0  1.0  0.0   1.88
    0.0  1.0  0.0  -0.63
    0.0  0.0  1.0  -0.63
    1.0  0.0  0.0  -1.46
    0.0  1.0  0.0  -0.63
    1.0  0.0  0.0   1.04
    0.0  0.0  1.0   0.21
    """
    if not ignore_flds: ignore_flds = []
    if not skip_flds: skip_flds = []
    if subset: df = get_sample(df, subset)
    else: df = df.copy()
    ignored_flds = df.loc[:, ignore_flds]
    df.drop(ignore_flds, axis=1, inplace=True)
    if preproc_fn: preproc_fn(df)
    if y_fld is None: y = None
    else:
        if not is_numeric_dtype(df[y_fld]): df[y_fld] = df[y_fld].cat.codes
        y = df[y_fld].values
        skip_flds += [y_fld]
    df.drop(skip_flds, axis=1, inplace=True)

    if na_dict is None: na_dict = {}
    else: na_dict = na_dict.copy()
    na_dict_initial = na_dict.copy()
    for n, c in df.items():
        na_dict = fix_missing(df, c, n, na_dict)
    if len(na_dict_initial.keys()) > 0:
        df.drop([
            a + '_na'
            for a in list(set(na_dict.keys()) - set(na_dict_initial.keys()))
        ],
                axis=1,
                inplace=True)
    if do_scale: mapper = scale_vars(df, mapper)
    for n, c in df.items():
        numericalize(df, c, n, max_n_cat)
    df = pd.get_dummies(df, dummy_na=True)
    df = pd.concat([ignored_flds, df], axis=1)
    res = [df, y, na_dict]
    if do_scale: res = res + [mapper]
    return res
コード例 #45
ファイル: analysis.py プロジェクト: gasongjian/reportgen
def dtype_detection(data,category_detection=True,StructureText_detection=True,\
datetime_to_category=True,criterion='sqrt',min_mean_counts=5,fix=False):
    '''Detect the data type of a single variable in the data.
    The data types are divided into the following kinds:
    1. number: numeric
    2. category: factor
    3. datetime: datetime type
    4. text: free text
    5. text_st: structured text, e.g. IDs
    6. group_number: continuous

    parameter
    ---------
    data: pd.Series, one-dimensional only
    # if data is given, the function may change the dtype of the original data
    category_detection: bool, use nunique to detect whether the variable is a factor
    StructureText_detection: bool, detect structured text, e.g. every value in the column contains a separator such as "-"
    datetime_to_category: whether a datetime series with too few unique values is converted into a factor variable
    criterion: string or int, optional (default="sqrt", i.e. the square root of the sample size)
        supports 'sqrt': square root of the sample size, int: an absolute number, float in (0, 1): a fraction of the sample size
        when detecting factor variables, a feature is treated as a factor if its nunique is smaller than criterion
    min_mean_counts: default 5, when a numeric variable is judged to be a factor, the average count per category must be greater than min_mean_counts
    fix: bool, whether to also return the data with the corrected dtypes


    return:
    result: dict{
        'name': column name,
        'vtype': variable type,
        'ordered': whether it is an ordered factor,
        'categories': all factor levels}

    '''

    assert len(data.shape)==1
    data=data.copy()
    data=pd.Series(data)
    dtype,name,n_sample=data.dtype,data.name,data.count()

    min_mean_counts=5
    if criterion=='sqrt':
        max_nuniques=np.sqrt(n_sample)
    elif isinstance(criterion,int):
        max_nuniques=criterion
    elif isinstance(criterion,float) and (0<criterion<1):
        max_nuniques=criterion
    else:
        max_nuniques=np.sqrt(n_sample)
    ordered=False
    categories=[]
    if is_numeric_dtype(dtype):
        vtype='number'
        ordered=False
        categories=[]
        # Correct misclassified dtypes, e.g. convert 1.0, 2.0, 3.0 to 1, 2, 3
        if data.dropna().astype(np.int64).sum()==data.dropna().sum():
            data[data.notnull()]=data[data.notnull()].astype(np.int64)
        if category_detection:
            nunique=len(data.dropna().unique())
            mean_counts=data.value_counts().median()
            if nunique<max_nuniques and mean_counts>=min_mean_counts:
                data=data.astype('category')
                ordered=data.cat.ordered
                vtype='category'
                categories=list(data.dropna().cat.categories)
        result={'name':name,'vtype':vtype,'ordered':ordered,'categories':categories}
    elif is_string_dtype(dtype):
        # Handle datetime types
        tmp=data.map(lambda x: np.nan if '%s'%x == 'nan' else len('%s'%x))
        tmp=tmp.dropna().astype(np.int64)
        if not(any(data.dropna().map(is_number))) and 7<tmp.max()<20 and tmp.std()<0.1:
            try:
                data=pd.to_datetime(data)
            except :
                pass
        # Handle possible factor types
        # (convert to category when datetime_to_category is True and there are few unique values)
        if datetime_to_category:
            if len(data.dropna().unique())<np.sqrt(n_sample):
                data=data.astype('category')
        else:
            nunique=len(data.dropna().unique())
            #print(data.dtype)
            if not(is_categorical_dtype(data.dtype)) and not(np.issubdtype(data.dtype,np.datetime64)) and nunique<max_nuniques:
                data=data.astype('category')

        # If not a factor type, convert percentage strings to floats, e.g. 21.12% --> 0.2112
        if is_string_dtype(data.dtype) and not(is_categorical_dtype(data.dtype)) and all(data.str.contains('%')):
            data=data.str.strip('%').astype(np.float64)/100

        if is_categorical_dtype(data.dtype):
            vtype='category'
            categories=list(data.cat.categories)
            ordered=data.cat.ordered
        # datetime format
        elif np.issubdtype(data.dtype,np.datetime64):
            vtype='datetime'
        # check whether it is structured text
        elif StructureText_detection and tmp.dropna().std()==0:
            # not iterable (hence not a string)
            if not(isinstance(data.dropna().iloc[0],Iterable)):
                vtype='text'
            else:
                k=set(list(data.dropna().iloc[0]))
                for x in data:
                    if isinstance(x,str) and len(x)>0:
                        k&=set(list(x))
                if len(k)>0:
                    vtype='text_st'
                else:
                    vtype='text'
        elif is_numeric_dtype(data.dtype):
            vtype='number'
            ordered=False
            categories=[]
        else:
            vtype='text'
        result={'name':name,'vtype':vtype,'ordered':ordered,'categories':categories}
    elif is_datetime64_any_dtype(dtype):
        vtype='datetime'
        result={'name':name,'vtype':vtype,'ordered':ordered,'categories':categories}
    else:
        print('unknown dtype!')
        result=None

    if fix:
        return result,data
    else:
        return result
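
A toy, pandas-only illustration of the category-detection rule used above for numeric columns (nunique below the square root of the sample size and a high enough average count per level):

import numpy as np
import pandas as pd

s = pd.Series([1, 2, 1, 2, 3, 1, 2, 3, 1, 2] * 10)   # 100 values, 3 levels

n_sample = s.count()
nunique = len(s.dropna().unique())
mean_counts = s.value_counts().median()

print(nunique < np.sqrt(n_sample) and mean_counts >= 5)   # True: 3 < 10 and the median level count is 40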
コード例 #46
ファイル: simple_imputer.py プロジェクト: sravanroy/datawig
    def fit(self,
            train_df: pd.DataFrame,
            test_df: pd.DataFrame = None,
            ctx: mx.context = get_context(),
            learning_rate: float = 4e-3,
            num_epochs: int = 10,
            patience: int = 3,
            test_split: float = .1,
            weight_decay: float = 0.,
            batch_size: int = 16,
            final_fc_hidden_units: List[int] = None,
            calibrate: bool = True) -> Any:
        """

        Trains and stores imputer model

        :param train_df: training data as dataframe
        :param test_df: test data as dataframe; if not provided, a ratio of test_split of the
                            training data are used as test data
        :param ctx: List of mxnet contexts (if no gpu's available, defaults to [mx.cpu()])
                    User can also pass in a list gpus to be used, ex. [mx.gpu(0), mx.gpu(2), mx.gpu(4)]
        :param learning_rate: learning rate for stochastic gradient descent (default 4e-4)
        :param num_epochs: maximal number of training epochs (default 10)
        :param patience: used for early stopping; after [patience] epochs with no improvement,
                            training is stopped. (default 3)
        :param test_split: if no test_df is provided this is the ratio of test data to be held
                            separate for determining model convergence
        :param weight_decay: regularizer (default 0)
        :param batch_size: batch size (default 16)
        :param final_fc_hidden_units: list dimensions for FC layers after the final concatenation

        """
        self.check_data_types(train_df)

        data_encoders = []
        data_columns = []

        if len(self.string_columns) > 0:
            string_feature_column = "ngram_features-" + rand_string(10)
            if self.is_explainable:
                data_encoders += [
                    TfIdfEncoder(input_columns=self.string_columns,
                                 output_column=string_feature_column,
                                 max_tokens=self.num_hash_buckets,
                                 tokens=self.tokens)
                ]
            else:
                data_encoders += [
                    BowEncoder(input_columns=self.string_columns,
                               output_column=string_feature_column,
                               max_tokens=self.num_hash_buckets,
                               tokens=self.tokens)
                ]
            data_columns += [
                BowFeaturizer(field_name=string_feature_column,
                              max_tokens=self.num_hash_buckets)
            ]

        if len(self.numeric_columns) > 0:
            numerical_feature_column = "numerical_features-" + rand_string(10)

            data_encoders += [
                NumericalEncoder(input_columns=self.numeric_columns,
                                 output_column=numerical_feature_column)
            ]

            data_columns += [
                NumericalFeaturizer(
                    field_name=numerical_feature_column,
                    numeric_latent_dim=self.numeric_latent_dim,
                    numeric_hidden_layers=self.numeric_hidden_layers)
            ]

        label_column = []

        if is_numeric_dtype(train_df[self.output_column]):
            label_column = [
                NumericalEncoder(self.output_column, normalize=True)
            ]
            logger.info("Assuming numeric output column: {}".format(
                self.output_column))
        else:
            label_column = [
                CategoricalEncoder(self.output_column,
                                   max_tokens=self.num_labels)
            ]
            logger.info("Assuming categorical output column: {}".format(
                self.output_column))

        # to make consecutive calls to .fit() continue where the previous call finished
        if self.imputer is None:
            self.imputer = Imputer(data_encoders=data_encoders,
                                   data_featurizers=data_columns,
                                   label_encoders=label_column,
                                   output_path=self.output_path)

        self.output_path = self.imputer.output_path

        self.imputer = self.imputer.fit(
            train_df,
            test_df,
            ctx,
            learning_rate,
            num_epochs,
            patience,
            test_split,
            weight_decay,
            batch_size,
            final_fc_hidden_units=final_fc_hidden_units,
            calibrate=calibrate)
        self.save()

        return self
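A minimal usage sketch (not part of the original file) of how the fit method above is typically driven; the constructor arguments follow datawig's documented SimpleImputer interface, and the dataframe and column names are invented for illustration.

import pandas as pd
from datawig import SimpleImputer

# Toy training data; "category" is the column whose missing values we impute.
train_df = pd.DataFrame({
    "title": ["red shoe", "blue shirt", "green hat", "black shoe"],
    "price": [49.0, 19.0, 9.0, 59.0],
    "category": ["shoes", "apparel", "accessories", "shoes"],
})

imputer = SimpleImputer(
    input_columns=["title", "price"],   # string and numeric input columns
    output_column="category",           # column to impute
    output_path="imputer_model")        # directory where the trained model is stored

imputer.fit(train_df=train_df, num_epochs=5, batch_size=16)
imputed = imputer.predict(train_df)     # returns the frame with imputed columns appended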
Code example #47
import numpy as np
import pandas as pd
from pandas.api.types import is_numeric_dtype
from scipy import stats
from statsmodels.formula.api import ols


def groupCompare(variables, group, dataframe, number_groups):
    ### Declare empty variables to hold column names
    NormallyDistributed = []
    NonNormallyDistributed = []
    statistic = []
    p_value = []
    types = []
    ### Loop through all columns of a dataframe and check normality
    for col in dataframe.columns:
        if is_numeric_dtype(dataframe[col]):  ## Numeric check
            data = dataframe[np.isfinite(
                dataframe[col]
            )]  ## Drop NAs (the shapiro will not calculate statistic if NAs present)
            r, p = stats.shapiro(
                data[col])  ### If less than 0.05 non normally distributed
            if p < 0.05:
                NonNormallyDistributed.append(col)
            else:
                NormallyDistributed.append(col)
    for var in variables:
        if number_groups > 2:
            if var in NormallyDistributed:  ## Normally distributed then do ANOVA
                data = dataframe[np.isfinite(dataframe[var])]
                variable = data[var].dropna()
                comp = data[group]  ### comparison of interest
                anova = ols("variable ~ C(comp)",
                            data=data).fit()  ### run anova
                r = anova.rsquared_adj  ## extract overall model adjusted r statistic
                p = anova.f_pvalue  ## extract overall model p-value
                statistic.append(r)
                p_value.append(p)
                types.append("ANOVA")
            elif var in NonNormallyDistributed:  ### Non normally distributed then do Kruskal Wallis
                data = dataframe[np.isfinite(dataframe[var])]
                ### declare the three series
                v1 = data[data[group] == 0][var]
                v2 = data[data[group] == 1][var]
                v3 = data[data[group] == 2][var]
                r, p = stats.kruskal(v1, v2, v3)  ### run Kruskal wallis
                statistic.append(r)
                p_value.append(p)
                types.append("Kruskal-Wallis")
            else:  ### In case any variables were labelled incorrectly
                statistic.append("NA")
                p_value.append("NA")
                types.append("NA")
        elif number_groups == 2:
            if var in NormallyDistributed:  ## Normally distributed then do ttest
                data = dataframe[np.isfinite(dataframe[var])]
                levels = data[group].unique()
                v1 = data[data[group] == levels[0]][var]
                v2 = data[data[group] == levels[1]][var]
                r, p = stats.ttest_ind(v1, v2)
                statistic.append(r)
                p_value.append(p)
                types.append("t-test")
            elif var in NonNormallyDistributed:  ### Non normally distributed then do Mann-Whitney
                data = dataframe[np.isfinite(dataframe[var])]
                levels = data[group].unique()
                v1 = data[data[group] == levels[0]][var]
                v2 = data[data[group] == levels[1]][var]
                r, p = stats.mannwhitneyu(v1, v2)  ### run Mann-Whitney U test
                statistic.append(r)
                p_value.append(p)
                types.append("Mann-Whitney")
            else:  ### In case any variables were labelled incorrectly
                statistic.append("NA")
                p_value.append("NA")
                types.append("NA")
    ### Combine results on dataframe
    results = pd.DataFrame(data=np.zeros(
        (len(variables), 0)))  # empty dataframe
    results["Variable"] = variables  # variable names
    results["Statistic"] = statistic  # statistic
    results["Pvalue"] = p_value  # p_value
    results["Type"] = types  # type of statistical test used
    return results
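A self-contained example (synthetic data, invented column names) of calling groupCompare for a three-level grouping; it assumes numpy, pandas, scipy.stats and statsmodels' ols are importable as used in the function above.

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
clinical_df = pd.DataFrame({
    "age": rng.normal(60, 10, 90),            # roughly normal -> ANOVA branch
    "score": rng.exponential(2.0, 90),        # skewed -> Kruskal-Wallis branch
    "diagnosis_group": np.repeat([0, 1, 2], 30),
})

results = groupCompare(variables=["age", "score"],
                       group="diagnosis_group",
                       dataframe=clinical_df,
                       number_groups=3)
print(results.sort_values("Pvalue"))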
Code example #48
import pandas as pd
from pandas.api import types as pdtypes


def Select_Variables(dfTrain,
                     dfTest,
                     variables=[],
                     center_floats=True,
                     scale_floats=True,
                     max_pct_na=0.3):
    """
        Descr: 
            To do machine learning, we need to have two data (train and test)
            with the same structure.
            Some variables must be centered and scaled. In this case,
            test's variables must be scaled with train's features,
            in order to not to slant prediction.
        In:
            - dfTrain : dataframe for training
            - dfTest : dataframe for tests
            - variables : list of UNCHANGED variables to select.
            - scales : list of centered-and-scaled float variables to select.
                If one of those variables is not a float, its unchanged !
                
        Note :
            If a name in 'variables' or 'scales' does not exists in dfTrain or
            dfTest, then a error will be returned !
            
        Out :
            Two dataframes with the same structure : resTrain, resTest
    """
    msgerr = "'{}' is not contained in {} !"
    ## Init
    resTrain = pd.DataFrame(index=dfTrain.index)
    resTest = pd.DataFrame(index=dfTest.index)
    ## Loop over the selected variables
    for ivar in variables:
        ## Errors
        if not (ivar in list(dfTrain)):
            raise ValueError(msgerr.format(ivar, 'dfTrain'))
        elif not (ivar in list(dfTest)):
            raise ValueError(msgerr.format(ivar, 'dfTest'))
        xtrain = dfTrain[ivar]
        xtest = dfTest[ivar]
        ## if differents types : error
        if (xtrain.dtype != xtest.dtype):
            raise ValueError('''Variable {} is not of the same type in
                             dfTrain and in dfTest !'''.format(ivar))
        ## if too many missing values
        pct_na_train = xtrain.isnull().mean()
        pct_na_test = xtest.isnull().mean()

        if (pct_na_train <= max_pct_na and pct_na_test <= max_pct_na):
            ## Tests on types
            is_num = pdtypes.is_numeric_dtype(xtrain)
            is_str = pdtypes.is_categorical_dtype(
                xtrain) or pdtypes.is_string_dtype(xtrain)
            ## IF FLOAT
            if is_num:
                ## Mean Features
                moy = xtrain.mean()
                stderr = xtrain.std()
                ## Skip useless variables having no variation
                if (stderr != 0):
                    xtrain = xtrain.fillna(moy)
                    # Fill test NAs with the train mean so that no test
                    # statistics leak into the preprocessing
                    xtest = xtest.fillna(moy)
                    if center_floats:
                        xtrain -= moy
                        xtest -= moy
                    if scale_floats:
                        xtrain /= stderr
                        xtest /= stderr
                    ## Add to Data
                    resTrain[ivar] = pd.Series(xtrain, index=dfTrain.index)
                    resTest[ivar] = pd.Series(xtest, index=dfTest.index)
            ## IF CATEG
            elif is_str:
                iDummTrain = pd.get_dummies(xtrain, prefix=ivar)
                iDummTest = pd.get_dummies(xtest, prefix=ivar)
                resTrain = pd.concat([resTrain, iDummTrain], axis=1)
                resTest = pd.concat([resTest, iDummTest], axis=1)

    ## Results
    return resTrain, resTest
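A small sketch (toy frames, invented column names) of calling Select_Variables; the numeric columns are centered and scaled with the train statistics while the string column is one-hot encoded.

import numpy as np
import pandas as pd

train = pd.DataFrame({"age": [25.0, 32.0, 40.0, 51.0],
                      "income": [30.0, 45.0, np.nan, 60.0],
                      "city": ["A", "B", "A", "C"]})
test = pd.DataFrame({"age": [29.0, 61.0],
                     "income": [48.0, 52.0],
                     "city": ["B", "A"]})

resTrain, resTest = Select_Variables(train, test,
                                     variables=["age", "income", "city"],
                                     center_floats=True,
                                     scale_floats=True,
                                     max_pct_na=0.3)

Note that get_dummies is applied to each frame separately, so a category present only in the train data (here "C") yields a dummy column in resTrain with no counterpart in resTest; aligning those columns afterwards is left to the caller.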
Code example #49
    def __init__(
            self,
            x,
            chrm="CHR",
            bp="BP",
            p="P",
            snp="SNP",
            gene="GENE",
            annotation=None,
            logp=True
    ):
        """
        Keyword arguments:
        - dataframe (dataframe; required): A pandas dataframe which
        must contain at least the following three columns:
            - the chromosome number
            - genomic base-pair position
            - a numeric quantity to plot such as a p-value or zscore
        - chrm (string; default 'CHR'): A string denoting the column name for the
        chromosome.  This column must be float or integer.  Minimum number
        of chromosomes required is 1. If you have X, Y, or MT chromosomes,
        be sure to renumber these 23, 24, 25, etc.
        - bp (string; default 'BP'): A string denoting the column name for the
        chromosomal position.
        - p (string; default 'P'): A string denoting the column name for the
        float quantity to be plotted on the y-axis. This column must be
        numeric. This does not have to be a p-value. It can be any
        numeric quantity such as peak heights, bayes factors, test
        statistics. If it is not a p-value, make sure to set logp=False.
        - snp (string; default 'SNP'): A string denoting the column name for the
        SNP names (e.g. rs number). More generally, this column could be
        anything that identifies each point being plotted. For example, in
        an epigenome-wide association study (EWAS) this could be the probe
        name or cg number. This column should contain strings. This
        argument is optional, however it is necessary to specify if you
        want to highlight points on the plot using the highlight argument
        in the figure method.
        - gene (string; default 'GENE'): A string denoting the column name for the
        GENE names. This column could be a string or a float. More
        generally, it could be any annotation information that you want
        to include in the plot.
        - annotation (string; optional): A string denoting the column name for
        an annotation. This column could be a string or a float.  This
        could be any annotation information that you want to include in
        the plot (e.g. zscore, effect size, minor allele frequency).
        - logp (bool; default True): If True, the -log10 of the p-value is
        plotted.  It isn't very useful to plot raw p-values; however,
        plotting the raw value could be useful for other genome-wide plots
        (e.g., peak heights, Bayes factors, test statistics, other
        "scores", etc.).

        Returns:
        - A ManhattanPlot object."""

        # checking the validity of the arguments

        # Make sure you have chrm, bp and p columns and that they are of
        # numeric type
        if chrm not in x.columns.values:
            raise KeyError("Column %s not found in 'x' data.frame" % chrm)
        else:
            if not is_numeric_dtype(x[chrm].dtype):
                raise TypeError("%s column should be numeric. Do you have "
                                "'X', 'Y', 'MT', etc? If so change to "
                                "numbers and try again." % chrm)

        if bp not in x.columns.values:
            raise KeyError("Column %s not found in 'x' data.frame" % bp)
        else:
            if not is_numeric_dtype(x[bp].dtype):
                raise TypeError("%s column should be numeric type" % bp)

        if p not in x.columns.values:
            raise KeyError("Column %s not found in 'x' data.frame" % p)
        else:
            if not is_numeric_dtype(x[p].dtype):
                raise TypeError("%s column should be numeric type" % p)

        # Create a new DataFrame with columns named after chrm, bp, and p.
        self.data = pd.DataFrame(data=x[[chrm, bp, p]])

        if snp is not None:
            if snp not in x.columns.values:
                # Warn if you don't have a snp column
                raise KeyError(
                    "snp argument specified as %s but column not found in "
                    "'x' data.frame" % snp)
            else:
                # If the input DataFrame has a snp column, add it to the new
                # DataFrame
                self.data[snp] = x[snp]

        if gene is not None:
            if gene not in x.columns.values:
                # Warn if you don't have a gene column
                raise KeyError(
                    "gene argument specified as %s but column not found in "
                    "'x' data.frame" % gene)
            else:
                # If the input DataFrame has a gene column, add it to the new
                # DataFrame
                self.data[gene] = x[gene]

        if annotation is not None:
            if annotation not in x.columns.values:
                # Warn if you don't have an annotation column
                raise KeyError(
                    "annotation argument specified as %s but column not "
                    "found in 'x' data.frame" % annotation
                )
            else:
                # If the input DataFrame has an annotation column, add it to the new
                # DataFrame
                self.data[annotation] = x[annotation]

        self.xlabel = ""
        self.ticks = []
        self.ticksLabels = []
        self.nChr = len(x[chrm].unique())
        self.chrName = chrm
        self.pName = p
        self.snpName = snp
        self.geneName = gene
        self.annotationName = annotation
        self.logp = logp

        # Set positions, ticks, and labels for plotting

        self.index = 'INDEX'
        self.pos = 'POSITION'

        # Fixes the bug where one chromosome is missing by adding a sequential
        # index column.
        idx = 0
        for i in self.data[chrm].unique():
            idx = idx + 1
            self.data.loc[self.data[chrm] == i, self.index] = int(idx)
        # Set the type to be the same as provided for chrm column
        self.data[self.index] = \
            self.data[self.index].astype(self.data[chrm].dtype)

        # This section sets up positions and ticks. Ticks should be placed in
        # the middle of a chromosome. The new pos column is added that keeps
        # a running sum of the positions of each successive chromosome.
        # For example:
        # chrm bp pos
        # 1   1  1
        # 1   2  2
        # 2   1  3
        # 2   2  4
        # 3   1  5

        if self.nChr == 1:
            # For a single chromosome
            self.data[self.pos] = self.data[bp]
            self.ticks.append(int(len(self.data[self.pos]) / 2.) + 1)
            self.xlabel = "Chromosome %s position" % (self.data[chrm].unique())
            self.ticksLabels = self.ticks
        else:
            # For multiple chromosomes
            lastbase = 0
            for i in self.data[self.index].unique():
                if i == 1:
                    self.data.loc[self.data[self.index] == i, self.pos] = \
                        self.data.loc[self.data[self.index] == i, bp].values
                else:
                    prevbp = self.data.loc[self.data[self.index] == i - 1, bp]
                    # Shift the basepair position by the largest bp of the
                    # current chromosome
                    lastbase = lastbase + prevbp.iat[-1]

                    self.data.loc[self.data[self.index] == i, self.pos] = \
                        self.data.loc[self.data[self.index] == i, bp].values \
                        + lastbase

                tmin = min(self.data.loc[self.data[self.index] == i, self.pos])
                tmax = max(self.data.loc[self.data[self.index] == i, self.pos])
                self.ticks.append(int((tmin + tmax) / 2.) + 1)

            self.xlabel = 'Chromosome'
            self.data[self.pos] = self.data[self.pos].astype(
                self.data[bp].dtype)

            if self.nChr > 10:  # To avoid crowded labels
                self.ticksLabels = [
                    t if np.mod(int(t), 2)  # Label only every other chromosome
                    else ''
                    for t in self.data[chrm].unique()
                ]
            else:
                self.ticksLabels = self.data[chrm].unique()  # All the ticks
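A hypothetical construction of the object above (synthetic GWAS-style data, all values invented); the class name ManhattanPlot is taken from the docstring's return description.

import numpy as np
import pandas as pd

gwas = pd.DataFrame({
    "CHR": np.repeat([1, 2, 3], 100),            # chromosome number
    "BP": np.tile(np.arange(1, 101), 3),         # base-pair position
    "P": np.random.uniform(1e-8, 1.0, 300),      # quantity plotted on the y-axis
    "SNP": ["rs%d" % i for i in range(300)],
    "GENE": ["geneX"] * 300,
})

mp = ManhattanPlot(gwas, chrm="CHR", bp="BP", p="P", snp="SNP", gene="GENE")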
Code example #50
    def calculate(self, reference_data: pd.DataFrame,
                  production_data: pd.DataFrame, column_mapping):
        if column_mapping:
            date_column = column_mapping.get('datetime')
            id_column = column_mapping.get('id')
            target_column = column_mapping.get('target')
            prediction_column = column_mapping.get('prediction')
            num_feature_names = column_mapping.get('numerical_features')
            #target_names = column_mapping.get('target_names')
            if num_feature_names is None:
                num_feature_names = []
            else:
                num_feature_names = [
                    name for name in num_feature_names
                    if is_numeric_dtype(reference_data[name])
                ]

            cat_feature_names = column_mapping.get('categorical_features')
            if cat_feature_names is None:
                cat_feature_names = []
            else:
                cat_feature_names = [
                    name for name in cat_feature_names
                    if is_numeric_dtype(reference_data[name])
                ]

        else:
            date_column = 'datetime' if 'datetime' in reference_data.columns else None
            id_column = None
            target_column = 'target' if 'target' in reference_data.columns else None
            prediction_column = 'prediction' if 'prediction' in reference_data.columns else None

            utility_columns = [
                date_column, id_column, target_column, prediction_column
            ]

            num_feature_names = list(
                set(reference_data.select_dtypes([np.number]).columns) -
                set(utility_columns))
            cat_feature_names = list(
                set(reference_data.select_dtypes([object]).columns) -
                set(utility_columns))

            #target_names = None

        if production_data is not None and target_column is not None and prediction_column is not None:
            production_data.replace([np.inf, -np.inf], np.nan, inplace=True)
            production_data.dropna(axis=0, how='any', inplace=True)

            array_prediction = production_data[prediction_column].to_numpy()

            prediction_ids = np.argmax(array_prediction, axis=-1)
            prediction_labels = [prediction_column[x] for x in prediction_ids]

            #plot support bar
            graphs = []

            for label in prediction_column:
                fig = go.Figure()

                fig.add_trace(
                    go.Scatter(x=np.random.random(production_data[
                        production_data[target_column] == label].shape[0]),
                               y=production_data[production_data[target_column]
                                                 == label][label],
                               mode='markers',
                               name=str(label),
                               marker=dict(size=6, color=red)))

                fig.add_trace(
                    go.Scatter(
                        x=np.random.random(production_data[
                            production_data[target_column] != label].shape[0]),
                        y=production_data[
                            production_data[target_column] != label][label],
                        mode='markers',
                        name='others',
                        marker=dict(size=6, color=grey)))

                fig.update_layout(yaxis_title="Probability",
                                  xaxis=dict(range=(-2, 3),
                                             showticklabels=False))

                fig_json = json.loads(fig.to_json())

                graphs.append({
                    "id": "tab_" + str(label),
                    "title": str(label),
                    "graph": {
                        "data": fig_json["data"],
                        "layout": fig_json["layout"],
                    }
                })

            self.wi = BaseWidgetInfo(
                title=self.title,
                type="tabbed_graph",
                details="",
                alertStats=AlertStats(),
                alerts=[],
                alertsPosition="row",
                insights=[],
                size=1,
                params={"graphs": graphs},
                additionalGraphs=[],
            )
        else:
            self.wi = None
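A tiny, self-contained illustration (all names invented) of the argmax step used above: the prediction columns hold per-class probabilities, and the predicted label is the class whose column has the highest probability.

import numpy as np
import pandas as pd

prediction_column = ["cat", "dog", "bird"]       # class labels = probability columns
production_data = pd.DataFrame({"cat": [0.7, 0.1],
                                "dog": [0.2, 0.8],
                                "bird": [0.1, 0.1],
                                "target": ["cat", "dog"]})

array_prediction = production_data[prediction_column].to_numpy()
prediction_ids = np.argmax(array_prediction, axis=-1)
prediction_labels = [prediction_column[x] for x in prediction_ids]
print(prediction_labels)   # ['cat', 'dog']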
Code example #51
File: backend.py  Project: JackyP/punditkit
    def fit_data(self, df, valid_percentage=0.3):
        X = df[self.features]
        y = df[self.response]

        self.response_is_numeric = is_numeric_dtype(df[self.response])

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=valid_percentage, random_state=42)

        # Drop NaN in response.
        X_train = X_train[~pd.isnull(y_train)]
        y_train = y_train[~pd.isnull(y_train)]

        X_test = X_test[~pd.isnull(y_test)]
        y_test = y_test[~pd.isnull(y_test)]

        self.X = X
        self.y = y
        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.y_test = y_test

        # Train
        self.pipe.fit(X_train, y_train)

        # Importance
        self.perm_imp = permutation_importance(
            self.pipe,
            self.X_train,
            self.y_train,
            n_repeats=10,
            random_state=42,
            n_jobs=2,
        )

        self.sorted_idx = self.perm_imp.importances_mean.argsort()
        self.perm_imp_labels = X_train.columns[self.sorted_idx]
        self.top_features = X_train.columns[self.sorted_idx].tolist()
        self.top_features.reverse()

        # Explainer
        self.X_numeric = self.transform_numeric.transform(X_train)

        categorical_feature_indices = [
            self.features.index(x) for x in self.categorical_features
        ]

        if self.response_is_numeric:
            self.explainer = lime.lime_tabular.LimeTabularExplainer(
                self.X_numeric,
                feature_names=self.features,
                class_names=[self.response],
                discretize_continuous=True,
                categorical_features=categorical_feature_indices,
                mode="regression",
            )

        else:
            self.explainer = lime.lime_tabular.LimeTabularExplainer(
                self.X_numeric,
                feature_names=self.features,
                class_names=self.pipe.classes_.tolist(),
                discretize_continuous=True,
                categorical_features=categorical_feature_indices,
            )

        # Typical Feature Values
        typical_feature_values_list = []
        for col in self.features:
            if col in self.numeric_features:
                mn = df[col].min().item()
                mx = df[col].max().item()
                defv = df[col].mean()

                if mx > mn:
                    sp = (mx - mn) / 100
                    sp = round(sp, 1 - int(floor(log10(abs(sp)))) - 1)

                    if df[col].dtype == np.int64:
                        defv = np.ceil(defv).astype(int).item()
                        sp = np.ceil(sp).astype(int).item()

            else:
                defv = df[col].mode().item()
            typical_feature_values_list += [defv]

        self.typical_feature_values = pd.DataFrame(
            [typical_feature_values_list], columns=self.features)
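A self-contained sketch (toy regression data, invented names) of the importance step inside fit_data: fit a model on the training split, run sklearn's permutation_importance, and order the feature names by mean importance.

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split

X = pd.DataFrame({"f1": np.random.rand(200), "f2": np.random.rand(200)})
y = 3 * X["f1"] + np.random.normal(0, 0.1, 200)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                    random_state=42)
model = RandomForestRegressor(random_state=42).fit(X_train, y_train)

perm_imp = permutation_importance(model, X_train, y_train,
                                  n_repeats=10, random_state=42, n_jobs=2)
sorted_idx = perm_imp.importances_mean.argsort()
top_features = X_train.columns[sorted_idx].tolist()
top_features.reverse()
print(top_features)   # most informative feature first, e.g. ['f1', 'f2']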
Code example #52
from pandas.api.types import is_numeric_dtype


def f(df, column):
    if not is_numeric_dtype(df[column]):
        dtype = df[column].dtype
        msg = (f"Expected type of column {column} to be numeric"
               f" but found {dtype} instead!")
        raise ValueError(msg)
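A hypothetical usage of the check above: it passes silently for numeric columns and raises a ValueError for anything else.

import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})
f(df, "a")   # numeric column: no error
f(df, "b")   # raises ValueError: Expected type of column b to be numeric ...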