def scale_vars(df, mapper):
    """Standardize the numeric columns of ``df`` in place.

    Parameters
    ----------
    df : the data frame whose columns are overwritten with scaled values.
    mapper : a fitted ``DataFrameMapper`` to reuse, or None. When None, a new
        mapper with one ``StandardScaler`` per numeric column is fitted on
        ``df``.

    Returns
    -------
    The mapper that was applied (the one passed in, or the newly fitted one).
    """
    # Installs a warnings filter for sklearn's DataConversionWarning.
    warnings.filterwarnings(
        'ignore', category=sklearn.exceptions.DataConversionWarning)
    if mapper is None:
        scalers = []
        for name in df.columns:
            if is_numeric_dtype(df[name]):
                scalers.append(([name], StandardScaler()))
        mapper = DataFrameMapper(scalers).fit(df)
    scaled = mapper.transform(df)
    df[mapper.transformed_names_] = scaled
    return mapper
def feature_dependence_matrix(rf, X_train, n_samples=5000):
    """
    Given training observation independent variables in X_train (a dataframe),
    compute the feature importance using each var as a dependent variable.
    We retrain a random forest for each var as target using the others as
    independent vars.  Only numeric columns are used as targets.

    By default, sample up to 5000 observations to compute feature dependencies.

    :param rf: model that is refit once per numeric column; it must expose
               ``fit`` and ``oob_score_``.
    :param X_train: dataframe of independent variables.
    :param n_samples: number of rows passed to ``sample_rows`` and to the
                      permutation-importance helper.
    :return: a non-symmetric data frame with the dependence matrix where each
             row is the importance of each var to the row's var used as a
             model target. Rows of non-numeric columns are left unfilled.
    """
    numcols = [col for col in X_train if is_numeric_dtype(X_train[col])]
    X_train = sample_rows(X_train, n_samples)
    all_cols = X_train.columns
    df_dep = pd.DataFrame(index=all_cols,
                          columns=['Dependence'] + all_cols.tolist())
    for col in numcols:
        # Position of the target among *all* columns. Using the position
        # within numcols would misplace the self-entry and the output row as
        # soon as a non-numeric column precedes a numeric one.
        pos = all_cols.get_loc(col)
        X, y = X_train.drop(col, axis=1), X_train[col]
        rf.fit(X, y)
        imp = permutation_importances_raw(rf, X, y, oob_regression_r2_score,
                                          n_samples)
        # A variable is fully dependent on itself.
        imp = np.insert(imp, pos, 1.0)
        # Prepend the overall dependence (OOB score of the refit model).
        df_dep.iloc[pos] = np.insert(imp, 0, rf.oob_score_)
    return df_dep
def _check_Xy(X: pd.DataFrame, y: pd.Series, *, norm_y=False) -> Tuple[pd.Series, pd.Series]: if np.ndim(X) == 1: X = pd.Series(X).to_frame() elif np.ndim(X) == 2: X = pd.DataFrame(X) assert X.ndim == 2 assert np.ndim(y) == 1 assert len(X) == len(y) valid = ~X.isnull().any(1).values X = pd.Series(list(zip(*X.values[valid].T)), name=tuple(X.columns)).astype('category') y = pd.Series(y).reset_index(drop=True)[valid] if is_object_dtype(y): y = pd.Categorical(y) if norm_y: assert is_numeric_dtype(y) y = (y - y.mean()) / y.std() return X, y
def get_var_type(col):
    """
    Return var_type (for KDEMultivariate) of the column

    Parameters
    ----------
    col : pandas.Series
        A dataframe column.

    Returns
    -------
    out : str
        One of ['c', 'o', 'u']: continuous, ordered, unordered.

    See Also
    --------
    The origin of the character codes is
    :class:`statsmodels.nonparametric.kernel_density.KDEMultivariate`.
    """
    if pdtypes.is_numeric_dtype(col):
        # continuous
        return 'c'
    # ``is_categorical_dtype`` is deprecated in recent pandas releases; the
    # isinstance check on the dtype is the documented replacement.
    if isinstance(col.dtype, pdtypes.CategoricalDtype):
        # ordered or unordered
        return 'o' if col.cat.ordered else 'u'
    # unordered if unsure, e.g string columns that are not categorical
    return 'u'
def missing_check(df, imputation="zero", verbose=True):
    '''Report the missing percentage of every column and impute missing values.

    Non-numeric columns are filled with the string 'NaN'. Numeric columns are
    filled with the column mean when ``imputation == "mean"`` and with 0
    otherwise. Columns are reassigned on ``df`` itself.

    :param df: data frame to check; modified in place.
    :param imputation: "zero" or "mean"
    :param verbose: print progress and per-column missing percentages.
    :return: the same ``df`` object, with missing values imputed.
    '''
    n_df = df.shape[0]
    if verbose:
        print("\nMissing value check and imputation starts...")
    if n_df == 0:
        # Nothing to impute; also avoids dividing by zero below.
        return df

    for col in df.columns.tolist():
        present = n_df - int(df[col].isnull().sum())
        if present == n_df:
            continue
        mis_perc = 100 - float(present) / n_df * 100
        if verbose:
            print(" {col} missing percentage is {miss}%"
                  .format(col=col, miss=mis_perc))

        # Assign the filled column back instead of a chained
        # ``fillna(inplace=True)``, which may act on a temporary copy.
        if not is_numeric_dtype(df[col]):
            # impute categorical var by the literal string 'NaN'
            df[col] = df[col].fillna('NaN')
        elif imputation == "mean":
            # ``mean()`` must be called: passing the bound method is an error
            df[col] = df[col].fillna(df[col].mean())
        else:
            df[col] = df[col].fillna(int(0))
    return df
def fix_missing(df, col, name, na_dict):
    """ Fill missing data in a numeric column of df and flag the filled rows.

    Non-numeric columns are left untouched. For a numeric column, when it
    contains missing values or `name` is already a key of `na_dict`, a boolean
    column `{name}_na` is added to df and the missing entries are replaced.

    Parameters:
    -----------
    df: The data frame that will be changed.

    col: The column of data to fix by filling in missing data.

    name: The name under which the filled column is stored in df.

    na_dict: A dictionary mapping column names to the value used to fill
        their missing entries. If name is not a key of na_dict the median of
        col is used and recorded in na_dict. If name is not a key of na_dict
        and col has no missing data, no {name}_na column is created.

    Returns:
    --------
    na_dict, updated with the fill value when a fill took place.

    Examples:
    ---------
    >>> df = pd.DataFrame({'col1' : [1, np.nan, 3], 'col2' : [5, 2, 2]})
    >>> fix_missing(df, df['col1'], 'col1', {})
    {'col1': 2.0}
    >>> df
       col1  col2  col1_na
    0   1.0     5    False
    1   2.0     2     True
    2   3.0     2    False

    >>> df = pd.DataFrame({'col1' : [1, np.nan, 3], 'col2' : [5, 2, 2]})
    >>> fix_missing(df, df['col1'], 'col1', {'col1' : 500})
    {'col1': 500}
    >>> df
        col1  col2  col1_na
    0    1.0     5    False
    1  500.0     2     True
    2    3.0     2    False
    """
    if not is_numeric_dtype(col):
        return na_dict

    null_mask = pd.isnull(col)
    has_preset = name in na_dict
    if null_mask.sum() or has_preset:
        df[name + '_na'] = null_mask
        fill_value = na_dict[name] if has_preset else col.median()
        df[name] = col.fillna(fill_value)
        na_dict[name] = fill_value
    return na_dict
def compute_boxplot(self, series):
    """
    Compute boxplot statistics for the non-null values of a pandas Series.

    Returns an empty dict when no non-null values remain, the result of
    ``self.non_numeric_stats`` for non-numeric data, and otherwise the
    matplotlib ``boxplot_stats`` dict extended with a ``count`` entry and
    with ``fliers`` joined into a single '|'-separated string.
    """
    from matplotlib.cbook import boxplot_stats

    observed = series[series.notnull()]
    values = observed.values
    n_obs = len(values)
    if n_obs == 0:
        return {}
    if not is_numeric_dtype(observed):
        return self.non_numeric_stats(observed)

    summary = boxplot_stats(list(values))[0]
    summary['count'] = n_obs
    summary['fliers'] = "|".join(str(flier) for flier in summary['fliers'])
    return summary
def generate_plotly_dim_dict(df, field):
    """Build the dimension dict describing column ``field`` of ``df``.

    Numeric columns are passed through as ``values``. String columns are
    encoded as integer codes in order of first appearance, with ``tickvals``
    and ``ticktext`` giving the code-to-text mapping.

    Raises ``Exception("Unidentifiable Type")`` for any other dtype.
    """
    column = df[field]
    dim_dict = {"label": field}
    if is_numeric_dtype(column):
        dim_dict["values"] = column
    elif is_string_dtype(column):
        texts = column.unique()
        # One dict lookup per row instead of scanning ``texts`` with
        # np.argwhere for every element.
        code_of = {text: code for code, text in enumerate(texts)}
        dim_dict["values"] = [code_of[x] for x in column]
        dim_dict["tickvals"] = list(range(len(texts)))
        dim_dict["ticktext"] = texts
    else:
        raise Exception("Unidentifiable Type")
    return dim_dict
def numericalize(df, col, name, max_n_cat):
    """ Store the integer codes of a categorical column in df[name].

    Nothing happens when col is numeric. Otherwise df[name] is set to the
    category codes of col shifted by one (so a missing value, code -1,
    becomes 0), provided max_n_cat is None or col has more than max_n_cat
    distinct values.

    Parameters:
    -----------
    df: A pandas dataframe. df[name] will be filled with the integer codes
        from col.

    col: The column you wish to change into its category codes. A
        non-numeric col must have a categorical dtype.

    name: The column name you wish to insert into df. This column will hold
        the integer codes.

    max_n_cat: Columns with at most max_n_cat distinct values are left
        unchanged. If max_n_cat is None, col is always converted.

    Examples:
    ---------
    >>> df = pd.DataFrame({'col1' : [1, 2, 3],
    ...                    'col2' : pd.Categorical(['a', 'b', 'a'])})
    >>> numericalize(df, df['col2'], 'col3', None)
    >>> df
       col1 col2  col3
    0     1    a     1
    1     2    b     2
    2     3    a     1
    """
    if is_numeric_dtype(col):
        return
    if max_n_cat is not None and col.nunique() <= max_n_cat:
        return
    df[name] = col.cat.codes + 1
def table_from_frame(df, *, force_nominal=False):
    """
    Convert a pandas.DataFrame into a Table.

    Columns are mapped, in this order of precedence, to DiscreteVariable
    (categorical, or object columns that are few-valued or forced nominal),
    TimeVariable (datetime or datetime-parsable object columns),
    ContinuousVariable (numeric columns) and StringVariable metas (the rest).

    Parameters
    ----------
    df : pandas.DataFrame
    force_nominal : boolean
        If True, interpret ALL object columns as nominal (DiscreteVariable).

    Returns
    -------
    Table
    """
    def _is_discrete(s):
        return (is_categorical_dtype(s) or
                (is_object_dtype(s) and
                 (force_nominal or s.nunique() < s.size**.666)))

    def _is_datetime(s):
        if is_datetime64_any_dtype(s):
            return True
        try:
            if is_object_dtype(s):
                pd.to_datetime(s, infer_datetime_format=True)
                return True
        except Exception:  # deliberately best-effort: unparsable -> not a date
            pass
        return False

    attrs, metas = [], []
    X, M = [], []

    for name, s in df.items():
        name = str(name)
        if _is_discrete(s):
            discrete = s.astype('category').cat
            attrs.append(DiscreteVariable(
                name, discrete.categories.astype(str).tolist()))
            # code -1 marks a missing value
            X.append(discrete.codes.replace(-1, np.nan).values)
        elif _is_datetime(s):
            tvar = TimeVariable(name)
            attrs.append(tvar)
            s = pd.to_datetime(s, infer_datetime_format=True)
            # Missing datetimes stringify to 'NaT'; turn them into NaN before
            # parsing so they are not handed to the parser as date strings.
            X.append(s.astype('str').replace('NaT', np.nan)
                     .map(tvar.parse).values)
        elif is_numeric_dtype(s):
            attrs.append(ContinuousVariable(name))
            X.append(s.values)
        else:
            metas.append(StringVariable(name))
            M.append(s.values.astype(object))

    # Row count for the empty attribute matrix, used only when X is empty.
    n_rows = len(M[0]) if M else 0
    return Table.from_numpy(Domain(attrs, None, metas),
                            np.column_stack(X) if X else np.empty((n_rows, 0)),
                            None,
                            np.column_stack(M) if M else None)
def _table_from_numpy(x): def _to2d(x): if x.ndim <= 1: return np.c_[x] if x.ndim == 2: return x return None # When the shitty internals get fixed, this below will work # 2d array of (n-2)d-list fields x2d = np.empty(x.shape[:2], dtype=object) x2d[:] = x.tolist() return x2d x = _to2d(x) # 2d or str arrays etc. not supported if x is None or not is_numeric_dtype(x): return None return Table.from_numpy(None, x)
def resample(self, sampling_rate=None, variables=None, force_dense=False,
             in_place=False, kind='linear'):
    ''' Resample all dense variables (and optionally, sparse ones) to the
    specified sampling rate.

    Args:
        sampling_rate (int, float): Target sampling rate (in Hz). If None
            (or otherwise falsy), uses the instance sampling rate.
        variables (list): Optional list of Variables to resample. If None,
            all variables are resampled.
        force_dense (bool): if True, sparse variables with numeric values
            are converted to dense; other sparse variables are skipped.
        in_place (bool): When True, all variables are overwritten
            in-place and the instance sampling rate is updated. When
            False, returns a dict of the resampled variables keyed by name.
        kind (str): Argument to pass to scipy's interp1d; indicates the
            kind of interpolation approach to use. See interp1d docs for
            valid values.
    '''
    sampling_rate = sampling_rate or self.sampling_rate

    resampled = {}
    for name, var in self.variables.items():
        selected = variables is None or name in variables
        if not selected:
            continue

        if not isinstance(var, SparseRunVariable):
            # The call returns None when inplace=True; nothing to store then.
            new_var = var.resample(sampling_rate, inplace=in_place, kind=kind)
            if not in_place:
                resampled[name] = new_var
        elif force_dense and is_numeric_dtype(var.values):
            resampled[name] = var.to_dense(sampling_rate)

    if not in_place:
        return resampled

    for name, var in resampled.items():
        self.variables[name] = var
    self.sampling_rate = sampling_rate
def test__write_frame__read_frame():
    """Round-trip a DataFrame through the database: create, read back,
    update, and read back again."""
    print("Start")
    from django.db import utils
    from econdata.models import Listing
    from libclair.dataframes import write_frame_create, read_frame, write_frame

    record_ids = ['foo-1', 'foo-2']
    read_columns = ['id', 'title', 'time', 'price']

    # Create a DataFrame and write it into the database
    records = [
        {'id': 'foo-1', 'site': 'a', 'id_site': '1',
         'title': 'The 1st record.'},
        {'id': 'foo-2', 'site': 'a', 'id_site': '2',
         'title': 'The 2nd record.'},
    ]
    fr1 = pd.DataFrame(records)
    print('\nfr1:\n', fr1)
    write_frame_create(fr1, Listing, delete=True)

    # The records already exist. Creating them again, without deleting them,
    # must raise an exception.
    with pytest.raises(utils.IntegrityError):
        write_frame_create(fr1, Listing)

    # Read the records that were just created from the database,
    # together with a few additional empty columns.
    qset = Listing.objects.filter(id__in=record_ids)
    fr2 = read_frame(qset, read_columns)
    print('\nfr2:\n', fr2)
    assert pd_types.is_string_dtype(fr2['title'])
    assert pd_types.is_datetime64_any_dtype(fr2['time'])
    assert pd_types.is_numeric_dtype(fr2['price'])
    for row, expected_id in enumerate(record_ids):
        assert fr2['id'][row] == expected_id
    for row, expected_title in enumerate(['The 1st record.',
                                          'The 2nd record.']):
        assert fr2['title'][row] == expected_title

    # Change the dataframe
    fr2['time'] = [pd.Timestamp('2017-01-01 12:00+0'),
                   pd.Timestamp('2017-01-02 12:00+0')]
    fr2['price'] = [101.0, 102.0]
    print('\nfr2:\n', fr2)
    # Update the records in the database
    write_frame(fr2, Listing)

    # Read the updated records from the database.
    qset = Listing.objects.filter(id__in=record_ids)
    fr3 = read_frame(qset, read_columns)
    print('\nfr3:\n', fr3)
    assert_frames_equal(fr2, fr3)
def _get_columns_info(self, stats): column_info = {} column_info[self.TYPE_CONSTANT] = stats['uniques'][stats['uniques'] == 1].index column_info[self.TYPE_BOOL] = stats['uniques'][stats['uniques'] == 2].index rest_columns = self.get_columns(self.df, self.EXCLUDE, column_info['constant'].union(column_info['bool'])) column_info[self.TYPE_NUMERIC] = pd.Index([c for c in rest_columns if types.is_numeric_dtype(self.df[c])]) rest_columns = self.get_columns( self.df[rest_columns], self.EXCLUDE, column_info['numeric']) column_info[self.TYPE_DATE] = pd.Index([c for c in rest_columns if types.is_datetime64_dtype(self.df[c])]) rest_columns = self.get_columns( self.df[rest_columns], self.EXCLUDE, column_info['date']) unique_columns = stats['uniques'][rest_columns] == stats['counts'][rest_columns] column_info[self.TYPE_UNIQUE] = stats['uniques'][rest_columns][unique_columns].index column_info[self.TYPE_CATEGORICAL] = stats['uniques'][rest_columns][~unique_columns].index return column_info
def plot_elastic_properties(self, fontsize=10, **kwargs):
    """
    Plot every numeric column of the elastic dataframe (except
    ``fitted_to_structure``) in its own subplot, one point per dataframe row.

    Args:
        fontsize: legend and label fontsize.

    Returns: |matplotlib-Figure|
    """
    df = self.get_elastic_dataframe(with_geo=False, abspath=False,
                                    with_params=False)
    from pandas.api.types import is_numeric_dtype
    keys = [k for k in df.keys() if is_numeric_dtype(df[k])]
    # ``list.index`` raises ValueError when the item is absent (it never
    # returns -1), so test for membership before removing.
    if "fitted_to_structure" in keys:
        keys.remove("fitted_to_structure")

    num_plots, ncols, nrows = len(keys), 1, 1
    if num_plots > 1:
        ncols = 3
        # NOTE: may allocate more rows than strictly needed; the unused axes
        # are switched off below.
        nrows = (num_plots // ncols) + (num_plots % ncols)

    ax_list, fig, plt = get_axarray_fig_plt(None, nrows=nrows, ncols=ncols,
                                            sharex=False, sharey=False,
                                            squeeze=False)
    ax_list = ax_list.ravel()

    xn = range(len(df.index))
    for key, ax in zip(keys, ax_list):
        ax.plot(xn, df[key], marker="o")
        ax.grid(True)
        ax.set_xticks(xn)
        ax.set_ylabel(key, fontsize=fontsize)
        ax.set_xticklabels(self.keys(), fontsize=fontsize)
        rotate_ticklabels(ax, 15)

    # Hide the axes that received no plot (also safe when there are no keys).
    n_used = min(len(keys), len(ax_list))
    for ax in ax_list[n_used:]:
        ax.axis('off')

    return fig
def oob_dependences(rf, X_train, n_samples=5000):
    """
    Given a random forest model, rf, and training observation independent
    variables in X_train (a dataframe), compute the OOB R^2 score using each
    var as a dependent variable. rf is refit once per numeric column, with
    all remaining columns as predictors.

    By default, sample up to 5000 observations to compute feature dependencies.

    :return: a DataFrame indexed by Feature with a single Dependence column,
             sorted by decreasing dependence.
    """
    numeric_features = [name for name in X_train
                        if is_numeric_dtype(X_train[name])]
    X_train = sample_rows(X_train, n_samples)

    df_dep = pd.DataFrame(columns=['Feature', 'Dependence']).set_index('Feature')
    for target in numeric_features:
        predictors = X_train.drop(target, axis=1)
        response = X_train[target]
        rf.fit(predictors, response)
        df_dep.loc[target] = rf.oob_score_

    return df_dep.sort_values('Dependence', ascending=False)
def contains_op(cls, series: pd.Series, state: dict) -> bool:
    """Report whether ``series`` has a numeric dtype.

    ``state`` is accepted for interface compatibility and is not consulted.
    """
    is_numeric = pdt.is_numeric_dtype(series)
    return is_numeric
def table_from_frame(df, *, force_nominal=False):
    """
    Convert pandas.DataFrame to Orange.data.Table

    Columns are mapped, in this order of precedence, to DiscreteVariable
    (categorical, or object columns that are few-valued or forced nominal),
    TimeVariable (datetime or datetime-parsable object columns),
    ContinuousVariable (numeric columns) and StringVariable metas (the rest).
    A non-trivial index is turned into regular columns first.

    Parameters
    ----------
    df : pandas.DataFrame
    force_nominal : boolean
        If True, interpret ALL string columns as nominal (DiscreteVariable).

    Returns
    -------
    Table
    """
    def _is_discrete(s):
        return (is_categorical_dtype(s) or
                (is_object_dtype(s) and
                 (force_nominal or s.nunique() < s.size**.666)))

    def _is_datetime(s):
        if is_datetime64_any_dtype(s):
            return True
        try:
            if is_object_dtype(s):
                pd.to_datetime(s, infer_datetime_format=True)
                return True
        except Exception:  # pylint: disable=broad-except
            pass
        return False

    # If df index is not a simple RangeIndex (or similar), put it into data.
    # ``inferred_type == 'integer'`` is what the deprecated
    # ``Index.is_integer()`` evaluated.
    index_is_integer = df.index.inferred_type == 'integer'
    if not (index_is_integer and
            (df.index.is_monotonic_increasing or
             df.index.is_monotonic_decreasing)):
        df = df.reset_index()

    attrs, metas = [], []
    X, M = [], []

    # Iter over columns
    for name, s in df.items():
        name = str(name)
        if _is_discrete(s):
            discrete = s.astype('category').cat
            attrs.append(DiscreteVariable(
                name, discrete.categories.astype(str).tolist()))
            # code -1 marks a missing value
            X.append(discrete.codes.replace(-1, np.nan).values)
        elif _is_datetime(s):
            tvar = TimeVariable(name)
            attrs.append(tvar)
            s = pd.to_datetime(s, infer_datetime_format=True)
            # missing datetimes stringify to 'NaT'; map them to NaN first
            X.append(s.astype('str').replace('NaT', np.nan)
                     .map(tvar.parse).values)
        elif is_numeric_dtype(s):
            attrs.append(ContinuousVariable(name))
            X.append(s.values)
        else:
            metas.append(StringVariable(name))
            M.append(s.values.astype(object))

    return Table.from_numpy(Domain(attrs, None, metas),
                            np.column_stack(X) if X
                            else np.empty((df.shape[0], 0)),
                            None,
                            np.column_stack(M) if M else None)
def proc_df(df, y_fld=None, skip_flds=None, ignore_flds=None, do_scale=False, na_dict=None,
            preproc_fn=None, max_n_cat=None, subset=None, mapper=None):
    """ proc_df takes a data frame df and splits off the response variable, and
    changes the df into an entirely numeric dataframe. The caller's df and
    skip_flds are not modified.

    Parameters:
    -----------
    df: The data frame you wish to process.

    y_fld: The name of the response variable

    skip_flds: A list of fields that are dropped from df.

    ignore_flds: A list of fields that are ignored during processing and
        re-attached, unchanged, in front of the processed columns.

    do_scale: Standardizes each column in df. Takes Boolean Values(True,False)

    na_dict: a dictionary of na columns to add. Na columns are also added if there
        are any missing values.

    preproc_fn: A function that gets applied to df (in place).

    max_n_cat: Categorical columns with more than max_n_cat distinct values
        are replaced by their integer codes; the remaining ones are expanded
        into dummy columns. If None, all are replaced by integer codes.

    subset: Takes a random subset of size subset from df.

    mapper: If do_scale is set as True, the mapper variable
        calculates the values used for scaling of variables during training time
        (mean and standard deviation).

    Returns:
    --------
    [x, y, nas, mapper(optional)]:

        x: x is the transformed version of df. x will not have the response variable
            and is entirely numeric.

        y: y is the response variable (None when y_fld is None)

        nas: returns a dictionary of which nas it created, and the associated median.

        mapper: A DataFrameMapper which stores the mean and standard deviation of the
            corresponding continuous variables which is then used for scaling of during
            test-time. Only present when do_scale is True.

    Examples:
    ---------
    >>> df = pd.DataFrame({'col1' : [1, 2, 3], 'col2' : ['a', 'b', 'a']})
    >>> train_cats(df)   # col2 becomes a category: {a : 1, b : 2}
    >>> x, y, nas = proc_df(df, 'col1')
    >>> x
       col2
    0     1
    1     2
    2     1
    """
    if not ignore_flds:
        ignore_flds = []
    if not skip_flds:
        skip_flds = []
    if subset:
        df = get_sample(df, subset)
    # Copy BEFORE the in-place drop below, otherwise the ignored fields would
    # be removed from the caller's data frame.
    df = df.copy()
    ignored_flds = df.loc[:, ignore_flds]
    df.drop(ignore_flds, axis=1, inplace=True)
    if preproc_fn:
        preproc_fn(df)
    if y_fld is None:
        y = None
    else:
        if not is_numeric_dtype(df[y_fld]):
            df[y_fld] = df[y_fld].cat.codes
        y = df[y_fld].values
        # Build a new list rather than ``+=`` so the caller's skip_flds list
        # is left untouched.
        skip_flds = skip_flds + [y_fld]
    df.drop(skip_flds, axis=1, inplace=True)

    if na_dict is None:
        na_dict = {}
    for n, c in df.items():
        na_dict = fix_missing(df, c, n, na_dict)
    if do_scale:
        mapper = scale_vars(df, mapper)
    for n, c in df.items():
        numericalize(df, c, n, max_n_cat)
    df = pd.get_dummies(df, dummy_na=True)
    df = pd.concat([ignored_flds, df], axis=1)
    res = [df, y, na_dict]
    if do_scale:
        res = res + [mapper]
    return res
def universal_dataset_check(self, dataset_name, object_headers=None,
                            numeric_headers=None, bool_headers=None,
                            test_func=None):
    """Check the integrity of one dataset.

    When the module-level ``do_complete_test`` flag is false, only verify
    that the dataset's download URL answers a HEAD request successfully.
    Otherwise delete any local copy, re-download the dataset, and verify its
    length, column names and column dtypes against ``self.dataset_dict``.

    Args:
        dataset_name: key of the dataset in ``self.dataset_dict``.
        object_headers: columns expected to have object dtype.
        numeric_headers: columns expected to have a numeric dtype.
        bool_headers: columns expected to have bool dtype.
        test_func: optional callable run on the loaded dataframe for
            dataset-specific checks.
    """
    # "Soft" check that just makes sure the dataset download page is active.
    if not do_complete_test:
        download_page = requests.head(self.dataset_dict[dataset_name]["url"])
        self.assertTrue(download_page.ok)
        return

    # "Hard" integrity checks that take a long time.
    # Get rid of the dataset if it's on the disk already
    file_name = dataset_name + "." + self.dataset_dict[dataset_name]['file_type']
    data_path = os.path.join(self.dataset_dir, file_name)
    if os.path.exists(data_path):
        os.remove(data_path)

    # Test that the dataset can be downloaded
    load_dataset(dataset_name)
    self.assertTrue(os.path.exists(data_path))

    # Test that data is now available and has all its elements
    df = load_dataset(dataset_name, download_if_missing=False)
    self.assertEqual(len(df), self.dataset_dict[dataset_name]["num_entries"])

    # Test all columns are there
    expected_columns = list(self.dataset_dict[dataset_name]['columns'].keys())
    self.assertEqual(sorted(list(df)), sorted(expected_columns))

    # Test each column for appropriate type
    object_headers = [] if object_headers is None else object_headers
    numeric_headers = [] if numeric_headers is None else numeric_headers
    bool_headers = [] if bool_headers is None else bool_headers

    df = load_dataset(dataset_name, download_if_missing=False)
    dtype_checks = (
        (object_headers, is_object_dtype),
        (numeric_headers, is_numeric_dtype),
        (bool_headers, is_bool_dtype),
    )
    for headers, has_dtype in dtype_checks:
        if headers:
            self.assertTrue(has_dtype(df[headers].values))

    # Make sure all columns are accounted for
    column_headers = object_headers + numeric_headers + bool_headers
    self.assertEqual(sorted(list(df)), sorted(column_headers))

    # Run tests unique to the dataset
    if test_func is not None:
        test_func(df)
def format_series(self, series: pd.Series) -> pd.Series:
    """Apply ``self._formatter.format`` to every non-NA value of ``series``.

    NA entries are left as they are. If the mapped result still has a numeric
    dtype (which is the case for an all-NA input), it is cast to ``object``.
    """
    formatted = series.map(self._formatter.format, na_action="ignore")
    if not is_numeric_dtype(formatted):
        return formatted
    # Pandas will still think all-NA is number.
    return formatted.astype(object)
def spstd(x):
    """Square root of the second central moment of the non-null values of
    ``x``; NaN when ``x`` is not numeric."""
    if not is_numeric_dtype(x):
        return np.nan
    second_moment = sp.stats.moment(x.dropna(), 2)
    return np.sqrt(second_moment)
def merge_results(results, format='wide', timing=True, metadata=True,
                  extractor_names=True, object_id=True, aggfunc=None,
                  invalid_results='ignore', **to_df_kwargs):
    ''' Merges a list of ExtractorResults instances and returns a pandas DF.

    Args:
        results (list, tuple): A list of ExtractorResult instances to merge.
        format (str): Format to return the data in. Can be either 'wide' or
            'long'. In the wide case, every extracted feature is a column,
            and every Stim is a row. In the long case, every row contains a
            single Stim/Extractor/feature combination.
        timing (bool, str): Whether or not to include columns for onset,
            order, and duration. If 'auto', the columns are requested and
            then dropped when every onset is missing.
        metadata (bool): if True, includes Stim metadata columns in the
            returned DataFrame. These columns include 'stim_name', 'class',
            'filename', 'history', and 'source_file'. Note that these values
            are often long strings, so the returned DF will be considerably
            larger.
        extractor_names (str, bool): How to handle extractor names when
            returning results. The specific behavior depends on whether format
            is 'long' or 'wide'. Valid values include:

            - 'prepend' or True: In both 'long' and 'wide' formats, feature
              names will be prepended with the Extractor name (e.g.,
              "FaceExtractor#face_likelihood").
            - 'drop' or False: In both 'long' and 'wide' formats, extractor
              names will be omitted entirely from the result. Note that this
              can create feature name conflicts when merging results from
              multiple Extractors, so is generally discouraged.
            - 'column': In 'long' format, extractor name will be included as
              a separate column. Intended for 'long' format only.
            - 'multi': In 'wide' format, a MultiIndex will be used for the
              columns, with the first level of the index containing the
              Extractor name and the second level containing the feature
              name. This value is invalid if format='long' (and will raise
              an error).
        object_id (bool, str): If True, adds an 'object_id' column that
            differentiates between multiple objects in the results that may
            share onsets/orders/durations (and would otherwise be impossible
            to distinguish). This frequently occurs for ImageExtractors that
            identify multiple target objects (e.g., faces) within a single
            ImageStim. If 'auto', the column is kept if and only if it has a
            non-constant value.
        aggfunc (str, Callable): If format='wide' and extractor_names='drop',
            it's possible for name clashes between features to occur. In such
            cases, the aggfunc argument is passed onto pandas' pivot_table
            function, and specifies how to aggregate multiple values for the
            same index. Can be a callable or any string value recognized by
            pandas. By default (None), 'mean' will be used for numeric columns
            and 'first' will be used for object/categorical columns.
        invalid_results (str): Specifies desired action for treating elements
            of the passed in results argument that are not ExtractorResult
            objects. Valid values include:

            - 'ignore' will ignore them and merge the valid ExtractorResults.
            - 'fail' will raise an exception on any invalid input

    Returns: a pandas DataFrame. For format details, see 'format' argument.
        An empty DataFrame is returned when there is nothing to merge.

    Raises:
        ValueError: if invalid_results='fail' and a non-ExtractorResult is
            found, or if extractor_names='multi' is combined with
            format='long'.
    '''

    results = flatten(results)

    _timing = True if timing == 'auto' else timing
    _object_id = True if object_id == 'auto' else object_id

    if extractor_names is True:
        extractor_names = 'prepend'
    elif extractor_names is False:
        extractor_names = 'drop'

    dfs = []
    for r in results:
        if isinstance(r, ExtractorResult):
            dfs.append(r.to_df(timing=_timing, metadata=metadata,
                               format='long', extractor_name=True,
                               object_id=_object_id, **to_df_kwargs))
        elif invalid_results == 'fail':
            # Adjacent literals are concatenated verbatim, so each fragment
            # needs its own trailing space.
            raise ValueError("At least one of the provided results was not an "
                             "ExtractorResult. Set the invalid_results "
                             "parameter to 'ignore' if you wish to ignore "
                             "this.")

    if len(dfs) == 0:
        return pd.DataFrame()

    data = pd.concat(dfs, axis=0).reset_index(drop=True)

    if object_id == 'auto' and data['object_id'].nunique() == 1:
        data = data.drop('object_id', axis=1)

    if extractor_names in ['prepend', 'multi']:
        data['feature'] = data['extractor'] + '#' + data['feature'].astype(str)

    if extractor_names != 'column':
        data = data.drop('extractor', axis=1)

    if format == 'wide':
        ind_cols = {'stim_name', 'onset', 'order', 'duration', 'object_id',
                    'class', 'filename', 'history', 'source_file'}
        ind_cols = list(ind_cols & set(data.columns))
        # pandas groupby/index operations can't handle NaNs in index, (see
        # issue at https://github.com/pandas-dev/pandas/issues/3729), so we
        # replace NaNs with a placeholder and then re-substitute after
        # pivoting.
        dtypes = data[ind_cols].dtypes
        data[ind_cols] = data[ind_cols].fillna('PlAcEholdER')

        # Set default aggfunc based on column type, otherwise bad things happen
        if aggfunc is None:
            aggfunc = 'mean' if is_numeric_dtype(data['value']) else 'first'

        data = data.pivot_table(index=ind_cols, columns='feature',
                                values='value', aggfunc=aggfunc).reset_index()
        data.columns.name = None  # vestigial--is set to 'feature'
        data[ind_cols] = data[ind_cols].replace('PlAcEholdER', np.nan)
        data[ind_cols] = data[ind_cols].astype(dict(zip(ind_cols, dtypes)))

    if timing == 'auto' and 'onset' in data.columns:
        if data['onset'].isnull().all():
            data = data.drop(['onset', 'order', 'duration'], axis=1)

    if 'onset' in data.columns:
        key = [('onset', ''), ('order', ''), ('duration', '')] \
            if isinstance(data.columns, pd.MultiIndex) \
            else ['onset', 'order', 'duration']
        data = data.sort_values(key).reset_index(drop=True)

    if extractor_names == 'multi':
        if format == 'long':
            raise ValueError("Invalid extractor_names value 'multi'. When "
                             "format is 'long', extractor_names must be "
                             "one of 'drop', 'prepend', or 'column'.")
        data.columns = pd.MultiIndex.from_tuples(
            [c.split('#') for c in data.columns])

    return data
def lambda_handler(event, context):
    """Validate a CSV file uploaded to S3 and load it into the employees table.

    The bucket and object key are read from the first record of ``event``.
    The file must have exactly the headers id, first_name, last_name, salary,
    department (in that order) with numeric id/salary and string name and
    department columns. A valid file is loaded through a staging table and
    upserted into ``employees``, and a notification is published to the SNS
    topic named by the ``sns_topic`` environment variable.

    Returns ``{'processed_file': key}`` on success and
    ``{'Error': "File Error"}`` when validation fails. Database errors are
    rolled back and re-raised.
    """
    record = event['Records'][0]['s3']
    bucket = record['bucket']['name']
    aws_key = urllib.parse.unquote_plus(record['object']['key'],
                                        encoding='utf-8')
    bucketcsvfile = s3.get_object(Bucket=bucket, Key=aws_key)
    csvfile = pd.read_csv(bucketcsvfile['Body'])

    # column headers validation
    validation_headers_list = ['id', 'first_name', 'last_name', 'salary',
                               'department']
    file_headers_list = list(csvfile.columns.values)
    headers_comparison = (validation_headers_list == file_headers_list)

    # data type validation
    data_type_validation = {}
    if headers_comparison:
        data_type_validation['id'] = is_numeric_dtype(csvfile['id'])
        data_type_validation['first_name'] = is_string_dtype(csvfile['first_name'])
        data_type_validation['last_name'] = is_string_dtype(csvfile['last_name'])
        data_type_validation['salary'] = is_numeric_dtype(csvfile['salary'])
        data_type_validation['department'] = is_string_dtype(csvfile['department'])
        print(data_type_validation)
    types_ok = all(data_type_validation.values())

    if not (headers_comparison and types_ok):
        if not headers_comparison:
            print("Please check columns in file. File headers order should be: id,first_name,last_name,salary,department")
        else:
            columns_to_check = [key for key, ok in data_type_validation.items()
                                if ok is False]
            print("Please check the data in columns: " + str(columns_to_check))
        return {'Error': "File Error"}

    print("Validated Headers and data types in File!")
    # Connect outside the try block: if the connection itself fails there is
    # nothing to roll back or close. Keyword arguments avoid building a DSN
    # string by hand, which breaks on values containing spaces or quotes.
    connection_rds = psycopg2.connect(dbname=db_database, user=db_user,
                                      host=db_host, password=db_pw,
                                      port=db_port, sslmode=db_sslmode)
    try:
        cursor_rds = connection_rds.cursor()
        print("Connection to DB successful")
        temp_table_query = "create temporary table employee_staging ( like employees ) on commit drop"
        cursor_rds.execute(temp_table_query)
        # The object key comes from the event and is untrusted: pass it as a
        # bound parameter instead of formatting it into the SQL text.
        upload_s3_file_query = "select * from fn_load_s3_file(%s);"
        cursor_rds.execute(upload_s3_file_query, (aws_key,))
        record_processing_query = """insert into employees (id, first_name, last_name, salary, department)
            select id, first_name, last_name, salary, department
            from employee_staging
            on conflict (id) do update
            set first_name = excluded.first_name
               ,last_name = excluded.last_name
               ,salary = excluded.salary
               ,department = excluded.department"""
        cursor_rds.execute(record_processing_query)
    except Exception:
        connection_rds.rollback()
        raise
    else:
        connection_rds.commit()
    finally:
        connection_rds.close()

    message = {"Processed_file": aws_key}
    sns.publish(
        TargetArn=os.environ["sns_topic"],
        Message=json.dumps(message)
    )
    return {'processed_file': aws_key}
def __init__(
    self,
    x: "PanelDataLike",
    var_name: str = "x",
    convert_dummies: bool = True,
    drop_first: bool = True,
    copy: bool = True,
):
    """Normalize panel input into a DataFrame stored on ``self._frame``.

    Parameters
    ----------
    x : PanelData, Series, DataFrame, ndarray or xarray DataArray
        Input data. A Series must have a MultiIndex; a DataFrame with a
        MultiIndex must have exactly two levels; arrays and DataArrays
        must be 2-d or 3-d.
    var_name : str
        Base name used for the variable(s) when ``x`` carries no variable
        names (non-MultiIndex DataFrame or ndarray input).
    convert_dummies : bool
        If True, pass the frame through ``expand_categoricals`` and cast
        it to float64.
    drop_first : bool
        Forwarded to ``expand_categoricals``.
    copy : bool
        If True, copy a MultiIndex DataFrame input before storing it.

    Raises
    ------
    ValueError
        On unsupported dimensionality, a Series without a MultiIndex, a
        MultiIndex without exactly two levels, or a time index that is
        neither numeric nor datetime-like.
    TypeError
        If ``x`` is none of the supported types.
    """
    self._var_name = var_name
    self._convert_dummies = convert_dummies
    self._drop_first = drop_first
    self._panel: Optional[_Panel] = None
    self._shape: Optional[Tuple[int, int, int]] = None
    # Default index level names; replaced below by the names found on a
    # pandas/PanelData input when those are set.
    index_names = ["entity", "time"]
    if isinstance(x, PanelData):
        x = x.dataframe
    self._original = x

    if not isinstance(x, (Series, DataFrame, np.ndarray)):
        # xarray is optional: if it cannot be imported, x is left as is and
        # falls through to the TypeError at the end of the type dispatch.
        try:
            from xarray import DataArray

            if isinstance(x, DataArray):
                if x.ndim not in (2, 3):
                    raise ValueError(
                        "Only 2-d or 3-d DataArrays are supported")
                if x.ndim == 2:
                    x = x.to_pandas()
                else:
                    # Coordinates of the three dims, in dimension order.
                    items: List[Hashable] = np.asarray(
                        x.coords[x.dims[0]]).tolist()
                    major: List[Hashable] = np.asarray(
                        x.coords[x.dims[1]]).tolist()
                    minor: List[Hashable] = np.asarray(
                        x.coords[x.dims[2]]).tolist()
                    values = x.values
                    x = panel_to_frame(values, items, major, minor, True)
        except ImportError:
            pass

    if isinstance(x, Series) and isinstance(x.index, MultiIndex):
        x = DataFrame(x)
    elif isinstance(x, Series):
        raise ValueError(
            "Series can only be used with a 2-level MultiIndex")

    if isinstance(x, DataFrame):
        if isinstance(x.index, MultiIndex):
            if len(x.index.levels) != 2:
                raise ValueError("DataFrame input must have a "
                                 "MultiIndex with 2 levels")
            # Keep the input's own level names only when the original input
            # was a pandas/PanelData object.
            if isinstance(self._original, (DataFrame, PanelData, Series)):
                for i in range(2):
                    index_names[
                        i] = x.index.levels[i].name or index_names[i]
            self._frame = x
            if copy:
                self._frame = self._frame.copy()
        else:
            # Non-MultiIndex frame: transpose and stack into one column
            # named var_name.
            self._frame = DataFrame({var_name: x.T.stack(dropna=False)})
    elif isinstance(x, np.ndarray):
        if x.ndim not in (2, 3):
            raise ValueError("2 or 3-d array required for numpy input")
        if x.ndim == 2:
            # Add a leading axis of length one.
            x = x[None, :, :]

        k, t, n = x.shape
        # Right-aligned, zero-filled labels; the width is derived from
        # log10 of the count.
        var_str = var_name + ".{0:0>" + str(int(np.log10(k) + 0.01)) + "}"
        variables = [var_name] if k == 1 else [
            var_str.format(i) for i in range(k)
        ]
        entity_str = "entity.{0:0>" + str(int(np.log10(n) + 0.01)) + "}"
        entities = [entity_str.format(i) for i in range(n)]
        time = list(range(t))
        assert isinstance(x, np.ndarray)
        x = x.astype(np.float64, copy=False)
        panel = _Panel.from_array(x,
                                  items=variables,
                                  major_axis=time,
                                  minor_axis=entities)
        self._fake_panel = panel
        self._frame = panel.to_frame()
    else:
        raise TypeError("Only ndarrays, DataFrames or DataArrays are "
                        "supported")
    if convert_dummies:
        self._frame = expand_categoricals(self._frame, drop_first)
        self._frame = self._frame.astype(np.float64, copy=False)

    # The second index level is the time dimension; it must be numeric or
    # datetime-like.
    time_index = Series(self.index.levels[1])
    if not (is_numeric_dtype(time_index.dtype)
            or is_datetime64_any_dtype(time_index.dtype)):
        raise ValueError("The index on the time dimension must be either "
                         "numeric or date-like")
    # self._k, self._t, self._n = self.panel.shape
    self._k, self._t, self._n = self.shape
    self._frame.index.set_names(index_names, inplace=True)
def make_summary(in_data, feature, outcome):
    """Summarise ``feature`` overall and per ``outcome`` value, then save as HTML.

    The summary has one row per statistic (observations, non missing,
    missing, unique, plus the rows produced by ``describe()``), a ``Total``
    column and one ``"<outcome> = <value>"`` column for every distinct
    outcome value. Any number of outcome values is supported.

    Parameters
    ----------
    in_data : pandas.DataFrame
        Data containing both ``feature`` and ``outcome`` columns.
    feature : str
        Name of the column to summarise (also used in the output file name).
    outcome : str
        Name of the grouping column.

    Returns
    -------
    None
        The table is written to ``'summary <feature>.html'`` in the current
        working directory; if writing fails the table is printed instead.
    """
    column = in_data[feature]
    numeric = is_numeric_dtype(column)

    # Statistics over all observations
    summary_total = pd.DataFrame(columns=['Statistic', 'Total'])
    summary_total.loc[0] = ['observations', len(in_data.index)]
    summary_total.loc[1] = ['non missing', column.count()]
    # isnull() and isna() are aliases, so one call covers every dtype.
    summary_total.loc[2] = ['missing', column.isna().sum()]
    summary_total.loc[3] = ['unique', column.nunique()]

    summary_total_desc = column.describe().to_frame()
    # Format dataframe
    summary_total_desc.insert(0, 'Statistic', summary_total_desc.index)
    summary_total_desc.rename({feature: 'Total'}, axis=1, inplace=True)
    # 'count' is already reported above as 'non missing'. For non-numeric
    # features (and numeric ones with at most two distinct values) the second
    # describe() row is skipped as well.
    skip_rows = 1 if numeric and column.nunique() > 2 else 2
    all_total = pd.concat(
        [summary_total, summary_total_desc.iloc[skip_rows:]],
        ignore_index=True)

    # Statistics by outcome value
    summary_outcome = in_data.groupby(outcome)[outcome].count().to_frame()
    summary_outcome.rename({outcome: 'observations'}, axis=1, inplace=True)

    nm = in_data.groupby([outcome]).agg({feature: ['count']})
    nm.columns = ["non missing"]
    summary_outcome = summary_outcome.merge(
        nm, how='outer', left_index=True, right_index=True)

    miss = column.isnull().groupby(in_data[outcome]).sum().to_frame()
    miss.columns = ["missing"]
    summary_outcome = summary_outcome.merge(
        miss, how='outer', left_index=True, right_index=True)

    nu = in_data.groupby([outcome]).agg({feature: ['nunique']})
    nu.columns = ["unique"]
    summary_outcome = summary_outcome.merge(
        nu, how='inner', left_index=True, right_index=True)

    # Format dataframe
    summary_outcome_desc = in_data.groupby(outcome)[feature].describe()
    # Drop the describe() columns that duplicate statistics computed above.
    duplicated = 'count' if numeric else ['count', 'unique']
    summary_outcome = summary_outcome.merge(
        summary_outcome_desc.drop(duplicated, axis=1),
        how='outer', left_index=True, right_index=True)

    summary_outcome_trans = summary_outcome.transpose()
    # One label per outcome value (previously hard-coded to exactly two).
    summary_outcome_trans.columns = [
        outcome + ' = ' + str(value) for value in summary_outcome_trans
    ]

    # Merge total and by outcome statistics
    summary = all_total.merge(summary_outcome_trans, how='outer',
                              left_on='Statistic', right_index=True)
    try:
        summary.to_html('summary ' + feature + '.html', index=False,
                        float_format=lambda x: '%10.2f' % x)
    except Exception:
        # Best effort: still show the table when the file cannot be written,
        # without swallowing KeyboardInterrupt/SystemExit.
        print(summary)
#data eda. profile = train.profile_report(title='Pandas Profiling Report') profile.to_file(output_file="report/data_eda_output.html") #train test split. train_labels = train['survived'] train = train.drop(columns=['survived']) X_train, X_test, y_train, y_test = train_test_split(train, train_labels, test_size=0.2) #data preprocess.transform maxunique = 1024 unique_stat = X_train.nunique() numeric_cols = [i for i in X_train.columns if is_numeric_dtype(X_train[i])] categorical_cols = [ i for i in X_train.columns if i not in numeric_cols and unique_stat[i] < maxunique ] mapper = DataFrameMapper([ (categorical_cols, [ SimpleImputer(strategy='constant', fill_value='missing'), OneHotEncoder(handle_unknown='ignore') ]), (numeric_cols, [SimpleImputer(strategy='median'), StandardScaler()]), ], df_out=True)
def make_chart(self, table: pd.DataFrame,
               input_columns: Dict[str, Any]) -> Chart:
    """Build the line ``Chart`` for ``table``, or raise GentleValueError.

    The X series and the row mask come from
    ``self._make_x_series_and_mask``. The configured Y columns are then
    validated in order and the first problem found is raised:

    * no Y column is configured
    * a Y column is the same column as the X axis
    * a Y column is not numeric
    * a Y column has no non-missing value once restricted to the X rows

    Defaults: the title falls back to "Line Chart", the X label to the X
    series name, and the Y label to the Y series name when exactly one Y
    series is plotted.
    """
    x_series, mask = self._make_x_series_and_mask(table, input_columns)

    if not self.y_columns:
        raise GentleValueError(
            i18n.trans("noYAxisError.message",
                       "Please choose a Y-axis column"))

    def build_y_series(ycolumn):
        # Validate one configured Y column and wrap it as a YSeries.
        column_name = ycolumn.column
        if column_name == self.x_column:
            raise GentleValueError(
                i18n.trans(
                    "sameAxesError.message",
                    "You cannot plot Y-axis column {column_name} because it is the X-axis column",
                    {"column_name": column_name},
                ))

        values = table[column_name]
        if not is_numeric_dtype(values.dtype):
            raise GentleValueError(
                i18n.trans(
                    "axisNotNumericError.message",
                    'Cannot plot Y-axis column "{column_name}" because it is not numeric. '
                    "Convert it to a number before plotting it.",
                    {"column_name": column_name},
                ))

        # Keep only the rows that survived X-axis filtering and renumber
        # them so the values line up with x_series.
        values = values[mask].reset_index(drop=True)

        # A column with nothing left to plot is an error.
        if values.count() == 0:
            raise GentleValueError(
                i18n.trans(
                    "emptyAxisError.message",
                    'Cannot plot Y-axis column "{column_name}" because it has no values',
                    {"column_name": column_name},
                ))

        return YSeries(values, ycolumn.color,
                       input_columns[column_name].format)

    y_serieses = [build_y_series(ycolumn) for ycolumn in self.y_columns]

    y_axis_label = self.y_axis_label
    if not y_axis_label and len(y_serieses) == 1:
        # A single series can lend its own name to the axis.
        y_axis_label = y_serieses[0].name

    return Chart(
        title=self.title or "Line Chart",
        x_axis_label=self.x_axis_label or x_series.name,
        x_axis_tick_format=x_series.d3_tick_format,
        y_axis_label=y_axis_label,
        x_series=x_series,
        y_serieses=y_serieses,
        y_axis_tick_format=y_serieses[0].d3_tick_format,
    )
def _prophet(request, context):
    r"""
    Provide a timeseries forecast using Facebook's Prophet library. Scalar function.
    :param request: an iterable sequence of RowData
    :param context: not used for now
    :return: the forecasted value for each row
    :
    :Qlik expression example:
    :<AAI Connection Name>.Prophet(MonthStartDate, sum(Value), 'return=yhat, freq=MS, debug=true')
    :The third argument in the Qlik expression is a string of parameters.
    :This should take the form of a comma separated string:
    :e.g 'return=yhat, freq=MS, debug=true' or 'return=yhat_upper, freq=MS'
    :
    :<AAI Connection Name>.Prophet_Holidays(ForecastDate, sum(Value), Holiday, 'return=yhat, freq=D, debug=true')
    :In the holidays variant the third argument is a field containing the holiday name or NULL for each row.
    :
    :Parameters accepted for the Prophet() function are: cap, floor, changepoint_prior_scale, interval_width,
    :lower_window, upper_window
    :
    :Parameters accepted for the make_future_dataframe() function are: freq
    :
    :For more information on these parameters go here: https://facebook.github.io/prophet/docs/quick_start.html
    :
    :Additional parameters used are: return, take_log, debug, load_script
    :
    :cap = 1000 : A logistic growth model can be defined using cap and floor. Values should be double or integer
    :changepoint_prior_scale = 0.05 : Decrease if the trend changes are being overfit, increase for underfit
    :interval_width = 0.08 : Set the width of the uncertainty intervals
    :lower_window = 1 : Only used with holidays. Extend the holiday by certain no. of days prior to the date.
    :upper_window = 1 : Only used with holidays. Extend the holiday by certain no. of days after the date.
    :freq = MS : The frequency of the time series. e.g. MS for Month Start. See the possible options here:
    :          : http://pandas.pydata.org/pandas-docs/stable/timeseries.html#offset-aliases
    :return = yhat : Any of the options in the forecast result. You can see these options with debug=true
    :              : yhat, yhat_upper, yhat_lower : Forecast, upper and lower limits
    :              : y_then_yhat, y_then_yhat_upper, y_then_yhat_lower : Return forecast only for forecast periods
    :              : trend, trend_upper, trend_lower : Trend component of the timeseries
    :              : seasonal, seasonal_upper, seasonal_lower: Seasonal component of the timeseries
    :take_log = false : Apply logarithm to the values before the forecast. Default is true
    :debug = true : Print execution information to the terminal and logs in ..\logs\Prophet Log <n>.txt

    The response rows are streamed back as ``SSE.BundledRows``. When the
    forecast produces no rows nothing is yielded.
    """
    # Materialise the request generator so that it can be iterated over
    # multiple times (it is consumed by the predictor and counted below).
    request_list = list(request)

    # Calculate timings for the components of the forecasting
    # The results will be stored in ..\logs\Prophet Performance Log.txt
    # The request_list line above is not timed as the generator can only be iterated once
    # ProphetForQlik.timeit(request_list)

    # Create an instance of the ProphetForQlik class
    # This will take the request data from Qlik and prepare it for forecasting
    predictor = ProphetForQlik(request_list, context)

    # Calculate the forecast and store in a Pandas series
    forecast = predictor.predict()

    # The forecast is a DataFrame when the load_script=true argument is passed
    # in the Qlik expression; in that case each column gets its own data type.
    if isinstance(forecast, pd.DataFrame):
        dtypes = [
            'num' if is_numeric_dtype(dt) else 'str'
            for dt in forecast.dtypes
        ]
    else:
        dtypes = ['num']

    # Get the response as SSE.Rows
    response_rows = utils.get_response_rows(forecast.values.tolist(), dtypes)

    num_request_bundles = len(request_list)
    num_rows = len(response_rows)

    # Nothing to stream. Returning here also avoids range() being called
    # with a step of zero further down.
    if num_rows == 0:
        return

    # Spread the rows over roughly as many bundles as were received. With
    # fewer rows than request bundles (or no request bundles at all) every
    # row goes into a single bundle.
    if num_request_bundles and num_rows >= num_request_bundles:
        rows_per_bundle = num_rows // num_request_bundles
    else:
        rows_per_bundle = num_rows

    # Stream response as BundledRows
    for i in range(0, num_rows, rows_per_bundle):
        yield SSE.BundledRows(rows=response_rows[i:i + rows_per_bundle])
def check_events(events):
    """Validate an events table and return its columns as arrays.

    The table must be a pandas DataFrame with 'onset' and 'duration'
    columns. Missing optional columns are filled in on a copy, and rows that
    share the same trial type, onset and duration are merged.

    Parameters
    ----------
    events : pandas DataFrame
        Events data that describes a functional experimental paradigm.

    Returns
    -------
    trial_type : array of shape (n_events,)
        Per-event experimental condition identifier. Set to 'dummy' for
        every event when no 'trial_type' column is given.

    onset : array of shape (n_events,)
        Per-event onset.

    duration : array of shape (n_events,)
        Per-event duration, cast to float when it is not already numeric.

    modulation : array of shape (n_events,)
        Per-event modulation; 1 for every event when no 'modulation' column
        is given. Merged duplicate events carry the sum of their modulations.

    Raises
    ------
    TypeError
        If ``events`` is not a pandas DataFrame.
    ValueError
        If 'onset' or 'duration' is missing, or 'duration' cannot be cast
        to float.
    """
    # Only DataFrames are accepted
    if not isinstance(events, pd.DataFrame):
        raise TypeError("Events should be a Pandas DataFrame. "
                        "A {} was provided instead.".format(type(events)))

    # Mandatory columns, checked in order
    for required in ('onset', 'duration'):
        if required not in events.columns:
            raise ValueError("The provided events data "
                             "has no {} column.".format(required))

    # Work on a copy so the caller's frame is left untouched
    frame = events.copy()

    # Default trial type
    if 'trial_type' not in frame.columns:
        warnings.warn("'trial_type' column not found "
                      "in the given events data.")
        frame['trial_type'] = 'dummy'

    # Default modulation
    if 'modulation' not in frame.columns:
        frame['modulation'] = 1
    else:
        print("A 'modulation' column was found in "
              "the given events data and is used.")

    # One warning per column that is not a recognised field
    for extra in set(frame.columns).difference(VALID_FIELDS):
        warnings.warn(("Unexpected column `{}` in events "
                       "data will be ignored.").format(extra))

    # Durations must be numeric
    if not is_numeric_dtype(frame['duration']):
        try:
            frame = frame.astype({'duration': float})
        except ValueError:
            raise ValueError("Could not cast duration to float "
                             "in events data.")

    # Events are identical when these columns all match; the modulation
    # values of identical events are summed.
    identity_columns = ['trial_type', 'onset', 'duration']
    merge_rule = {'modulation': np.sum}
    merged = frame.groupby(identity_columns, sort=False)
    merged = merged.agg(merge_rule).reset_index()

    # Fewer rows after merging means duplicates were present
    if len(merged) != len(frame):
        warnings.warn("Duplicated events were detected. "
                      "Amplitudes of these events will be summed. "
                      "You might want to verify your inputs.")

    return tuple(
        merged[name].values
        for name in ('trial_type', 'onset', 'duration', 'modulation'))
def isNumeric(colData):
    """Tell whether ``colData`` (a column or a dtype) has a numeric dtype."""
    numeric = is_numeric_dtype(colData)
    return numeric
def sample_row(
    X: pd.DataFrame,
    filter_rows_with_na: bool = False,
    random_state: int = 42,
    max_field_len: int = 50,
) -> pd.DataFrame:
    """Describe one randomly drawn row of ``X``, one output row per column.

    For every column of ``X`` the result holds the column's dtype, the value
    found in the drawn row and, for numeric columns, the minimum and maximum
    of the whole column. Laying the sample out vertically (unlike
    `df.sample(1)`) keeps it readable for wide datasets, e.g. in technical
    model documentation.

    Example:

    ```python
    from probatus.utils import sample_row
    from sklearn.datasets import load_iris

    iris = load_iris(as_frame=True).get('data')
    sample = sample_row(iris, filter_rows_with_na=False, random_state=12)
    print(sample.to_markdown())
    ```

    ??? info "Example output"

        | column            | dtype   |   sample |   range_low |   range_high |
        |:------------------|:--------|---------:|------------:|-------------:|
        | sepal length (cm) | float64 |      5   |         4.3 |          7.9 |
        | sepal width (cm)  | float64 |      3.5 |         2   |          4.4 |
        | petal length (cm) | float64 |      1.3 |         1   |          6.9 |
        | petal width (cm)  | float64 |      0.3 |         0.1 |          2.5 |

    Args:
        X (DataFrame): Pandas DataFrame to be sampled
        filter_rows_with_na (bool): if true, the row is drawn only from rows
            without missing values; when no such row exists the draw falls
            back to all rows
        random_state (int): Random state used for the draw
        max_field_len (int): Maximum number of characters for a sampled
            string; longer strings have their middle replaced by "..."

    Returns:
        (pd.DataFrame): A Pandas DataFrame indexed by column name
    """
    # Input validation
    assert type(X) is pd.DataFrame, "X should be pandas DataFrame"
    assert not X.empty, "X should not be an empty DataFrame"
    assert type(
        filter_rows_with_na) is bool, "filter_rows_with_na should be a boolean"
    assert type(random_state) is int, "random_state should be an integer"
    assert type(max_field_len) is int, "max_field_len should be an integer"

    # Draw the row, preferring complete rows when asked to
    picked = X.sample(1, random_state=random_state)
    if filter_rows_with_na:
        try:
            picked = X.dropna().sample(1, random_state=random_state)
        except ValueError:
            logging.info(
                "sample_row(): No rows without NaN found, sampling from all rows.."
            )

    # Inferred dtypes are used only to decide whether a column is numeric
    inferred = X.convert_dtypes()

    # Slice bounds used when a string has to be shortened
    head_end = (max_field_len // 2) - 1
    tail_start = (-max_field_len // 2) + 2

    result = pd.DataFrame()
    for name in picked.columns:
        value = picked[name].values[0]

        # Value range is reported for numeric columns only
        if is_numeric_dtype(inferred[name]):
            lowest = X[name].min(skipna=True)
            highest = X[name].max(skipna=True)
        else:
            lowest = highest = ""

        # Replace the middle of over-long strings
        if isinstance(value, str) and len(value) > max_field_len:
            value = value[:head_end] + "..." + value[tail_start:]

        entry = pd.DataFrame({
            "column": [name],
            "dtype": [X[name].dtype],
            "sample": [value],
            "range_low": [lowest],
            "range_high": [highest],
        })
        result = pd.concat([result, entry], ignore_index=True)

    return result.set_index(["column"])
def vars_from_df(df, role=None, force_nominal=False):
    """Derive variables and data arrays from the columns of ``df``.

    Each column is mapped to a variable object (a copy of an existing one
    when ``df.orange_variables`` provides it, otherwise a Time-, Discrete-,
    Continuous- or StringVariable chosen from the column's content) and
    assigned to one of three role slots. One array per role slot is then
    built from the matching columns.

    Parameters
    ----------
    df : pandas.DataFrame
    role : optional
        Role slot for every column; when None, ``df.orange_role`` is used if
        present, otherwise columns default to ``Role.Attribute`` and string
        columns to ``Role.Meta``.
    force_nominal : bool
        Forwarded to ``_is_discrete``.

    Returns
    -------
    (list, Domain)
        A list with one array (or None) per role slot, and the ``Domain``
        built from the three variable lists.

    Raises
    ------
    ValueError
        If a string column is requested with a role other than ``Role.Meta``.
    """
    if role is None and hasattr(df, 'orange_role'):
        role = df.orange_role
    df = _reset_index(df)

    # Three parallel slots, indexed by role.
    # NOTE(review): this relies on Role members being usable as tuple indices
    # 0..2 (presumably attribute / class / meta) - confirm against Role.
    cols = [], [], []
    exprs = [], [], []
    vars_ = [], [], []

    for column in df.columns:
        s = df[column]
        _role = Role.Attribute if role is None else role
        if hasattr(df, 'orange_variables') and column in df.orange_variables:
            # The frame already carries a variable for this column: reuse a
            # copy of it, with its compute_value cleared.
            original_var = df.orange_variables[column]
            var = original_var.copy(compute_value=None)
            expr = None
        elif _is_datetime(s):
            var = TimeVariable(str(column))
            expr = _convert_datetime
        elif _is_discrete(s, force_nominal):
            discrete = s.astype("category").cat
            var = DiscreteVariable(str(column),
                                   discrete.categories.astype(str).tolist())
            expr = to_categorical
        elif is_numeric_dtype(s):
            var = ContinuousVariable(
                # set number of decimals to 0 if int else keeps default behaviour
                str(column),
                number_of_decimals=(0 if is_integer_dtype(s) else None))
            expr = None
        else:
            # Anything else is treated as text, which is only allowed as a meta.
            if role is not None and role != Role.Meta:
                raise ValueError("String variable must be in metas.")
            _role = Role.Meta
            var = StringVariable(str(column))
            expr = lambda s, _: np.asarray(
                # Cast to object first so that fillna can insert
                # StringVariable.Unknown for missing entries, then make sure
                # every value is a string.
                s.astype(object).fillna(StringVariable.Unknown).astype(str),
                dtype=object)

        cols[_role].append(column)
        exprs[_role].append(expr)
        vars_[_role].append(var)

    xym = []
    for a_vars, a_cols, a_expr in zip(vars_, cols, exprs):
        if not a_cols:
            # No columns for this slot.
            # NOTE(review): the comparison is by value, so when the first slot
            # is empty too, every empty slot gets the (n_rows, 0) array rather
            # than None - confirm that this is intended.
            arr = None if a_cols != cols[0] else np.empty((df.shape[0], 0))
        elif not any(a_expr):
            # No conversion needed for any column in this slot.
            # if all c in columns table will share memory with dataframe
            a_df = df if all(c in a_cols for c in df.columns) else df[a_cols]
            if all(isinstance(a, SparseDtype) for a in a_df.dtypes):
                arr = csr_matrix(a_df.sparse.to_coo())
            else:
                arr = np.asarray(a_df)
        else:
            # we'll have to copy the table to resolve any expressions
            arr = np.array([
                expr(df[col], var) if expr else np.asarray(df[col])
                for var, col, expr in zip(a_vars, a_cols, a_expr)
            ]).T
        xym.append(arr)

    # Let the tables share memory with pandas frame
    # A second-slot array with a single column is flattened to 1-d.
    if xym[1] is not None and xym[1].ndim == 2 and xym[1].shape[1] == 1:
        xym[1] = xym[1][:, 0]

    return xym, Domain(*vars_)
def infer_variable_types(df, link_vars, variable_types, time_index,
                         secondary_time_index):
    '''Infer variable types from dataframe

    Columns already present in ``variable_types`` are skipped. Every other
    column starts out as ``vtypes.Unknown`` and is then classified from its
    role (time index, link variable) or its dtype.

    Args:
        df (DataFrame): Input DataFrame
        link_vars (list[]): Linked variables
        variable_types (dict[str -> dict[str -> type]]) : An entity's
            variable_types dict maps string variable ids
            to types (:class:`.Variable`) or (type, kwargs) to
            pass keyword arguments to the Variable.
        time_index (str or None): Name of time_index column
        secondary_time_index (dict[str: [str]]): Dictionary of secondary time
            columns that each map to a list of columns that depend on that
            secondary time

    Returns:
        dict: maps each inferred column name to a ``vtypes`` class.

    Raises:
        ValueError: if a column needs inferring and ``df`` is a Dask or
            Koalas DataFrame.
    '''
    # TODO: set pk and pk types here
    inferred_types = {}
    vids_to_assume_datetime = [time_index]
    if secondary_time_index:
        # Only the first secondary time index is treated as a time column.
        vids_to_assume_datetime.append(next(iter(secondary_time_index)))

    for variable in df.columns:
        if variable in variable_types:
            continue
        elif isinstance(df, dd.DataFrame):
            msg = 'Variable types cannot be inferred from Dask DataFrames, ' \
                  'use variable_types to provide type metadata for entity'
            raise ValueError(msg)
        elif is_instance(df, ks, 'DataFrame'):
            msg = 'Variable types cannot be inferred from Koalas DataFrames, ' \
                  'use variable_types to provide type metadata for entity'
            raise ValueError(msg)

        # Start every column from Unknown so that a column matching no branch
        # below cannot inherit the type inferred for the previous column.
        inferred_type = vtypes.Unknown
        column = df[variable]

        if variable in vids_to_assume_datetime:
            if col_is_datetime(column):
                inferred_type = vtypes.Datetime
            else:
                inferred_type = vtypes.Numeric

        elif variable in link_vars:
            inferred_type = vtypes.Categorical

        elif column.dtype == "object":
            if not len(column):
                inferred_type = vtypes.Categorical
            elif col_is_datetime(column):
                inferred_type = vtypes.Datetime
            else:
                inferred_type = vtypes.Categorical

                # Long strings on average are taken to be free text.
                sample = column.sample(min(10000, len(column)))

                # catch cases where object dtype cannot be interpreted as a string
                try:
                    avg_length = sample.str.len().mean()
                    if avg_length > 50:
                        inferred_type = vtypes.NaturalLanguage
                except AttributeError:
                    pass

        elif column.dtype == "bool":
            inferred_type = vtypes.Boolean

        elif pdtypes.is_categorical_dtype(column.dtype):
            inferred_type = vtypes.Categorical

        elif pdtypes.is_numeric_dtype(column.dtype):
            inferred_type = vtypes.Numeric

        elif col_is_datetime(column):
            inferred_type = vtypes.Datetime

        elif len(column):
            sample = column.sample(min(10000, column.nunique(dropna=False)))

            # Fraction of distinct values in the sample. The ratio used to be
            # computed upside down (size / n_unique, always >= 1), so the
            # Categorical outcome could never be selected.
            unique = sample.unique()
            percent_unique = len(unique) / sample.size

            if percent_unique < .05:
                inferred_type = vtypes.Categorical
            else:
                inferred_type = vtypes.Numeric

        inferred_types[variable] = inferred_type

    return inferred_types
def process_metadata_beta(data, metadata, drop_threshold=0.6, verbose=1):
    """Align ``metadata`` to ``data`` and turn it into an all-numeric frame.

    Numeric columns are kept when more than ``drop_threshold`` of their
    values are present, and their gaps are filled with the column median.
    String columns are kept when they have more than one distinct value and
    fewer than ``rows * drop_threshold`` distinct values, and are then
    one-hot encoded.

    :param data: object whose ``index`` selects the rows of ``metadata``
    :param pandas.DataFrame metadata: metadata table
    :param float drop_threshold: threshold used by both filters above
    :param verbose: forwarded to ``logger``
    :return: the processed DataFrame, or None when no row matched or no
        column survived the filters
    """
    # Select rows by label instead of reindexing, which would create
    # all-NaN rows for labels missing from metadata.
    metadata = metadata.loc[data.index, :]
    if metadata.shape[0] == 0:
        logger("Couldn't find corresponding index from data into metadata",
               verbose=1)
        return None

    # Split into numeric and categorical (string) columns
    numeric_cols = [
        col for col in metadata.columns
        if is_numeric_dtype(metadata.loc[:, col])
    ]
    str_cols = [
        col for col in metadata.columns
        if is_string_dtype(metadata.loc[:, col])
    ]
    sub_numeric = metadata.loc[:, numeric_cols]
    sub_str = metadata.loc[:, str_cols]

    if numeric_cols:
        # Fraction of non-missing values per column
        present_fraction = sub_numeric.count(0) / sub_numeric.shape[0]
        drop_cols = list(
            sub_numeric.columns[present_fraction <= drop_threshold])
        # Column labels are stringified so that non-string labels can be
        # logged too.
        logger('drop cols with nan values over %s percent : ' % drop_threshold,
               ','.join(map(str, drop_cols)),
               '\n\n',
               verbose=verbose)
        sub_numeric = sub_numeric.loc[:, present_fraction > drop_threshold]
        # Medians are computed once instead of once per column.
        medians = sub_numeric.median()
        sub_numeric = sub_numeric.fillna(
            {col: medians[col] for col in sub_numeric.columns})

    if str_cols:
        # Number of distinct values per categorical column
        num_cat = np.array(
            [len(set(sub_str.loc[:, col])) for col in sub_str.columns])
        # Constant columns carry no information
        drop_cols = list(sub_str.columns[num_cat == 1])
        # Columns with too many distinct values
        drop_cols += list(
            sub_str.columns[num_cat >= sub_str.shape[0] * drop_threshold])
        sub_str = sub_str.loc[:, sub_str.columns.difference(drop_cols)]
        if sub_str.shape[1] != 0:
            sub_str = pd.get_dummies(sub_str)
        logger('drop cols which is meanless or too much values',
               ','.join(map(str, drop_cols)),
               '\n\n',
               verbose=verbose)

    # Merge and output
    if sub_numeric.shape[1] == 0 and sub_str.shape[1] == 0:
        final_metadata = None
        logger("No columns survived.......", verbose=1)
    elif sub_str.shape[1] == 0:
        final_metadata = sub_numeric
    elif sub_numeric.shape[1] == 0:
        final_metadata = sub_str
    else:
        final_metadata = pd.concat([sub_numeric, sub_str], axis=1)
    return final_metadata
def isInteger(colData):
    """Tell whether ``colData`` (a column or a dtype) has an integer dtype.

    This used to delegate to ``is_numeric_dtype`` and therefore also
    answered True for floating point columns.
    """
    # Imported locally so the block does not depend on the file-level imports.
    from pandas.api.types import is_integer_dtype

    return is_integer_dtype(colData)
def table_from_frame(df, *, force_nominal=False):
    """
    Build an Orange.data.Table from a pandas.DataFrame.

    Columns become discrete, time or continuous attributes depending on
    their content; every remaining column becomes a string meta.

    Parameters
    ----------
    df : pandas.DataFrame
    force_nominal : boolean
        If True, interpret ALL string columns as nominal (DiscreteVariable).

    Returns
    -------
    Table
    """

    def _is_discrete(s):
        # Categoricals always; object columns when forced or when they have
        # few distinct values relative to their size.
        if is_categorical_dtype(s):
            return True
        if not is_object_dtype(s):
            return False
        return force_nominal or s.nunique() < s.size**.666

    def _is_datetime(s):
        if is_datetime64_any_dtype(s):
            return True
        # Object columns count as datetime when they can be parsed as such.
        try:
            parseable = is_object_dtype(s)
            if parseable:
                pd.to_datetime(s, infer_datetime_format=True)
        except Exception:  # pylint: disable=broad-except
            parseable = False
        return parseable

    # If df index is not a simple RangeIndex (or similar), put it into data
    if not df.index.is_integer() or not (
            df.index.is_monotonic_increasing
            or df.index.is_monotonic_decreasing):
        df = df.reset_index()

    attrs, metas = [], []
    X, M = [], []

    # Classify every column and collect its values
    for label, s in df.items():
        var_name = str(label)
        if _is_discrete(s):
            categorical = s.astype('category').cat
            categories = categorical.categories.astype(str).tolist()
            attrs.append(DiscreteVariable(var_name, categories))
            # Code -1 marks a missing value
            X.append(categorical.codes.replace(-1, np.nan).values)
        elif _is_datetime(s):
            tvar = TimeVariable(var_name)
            attrs.append(tvar)
            s = pd.to_datetime(s, infer_datetime_format=True)
            as_text = s.astype('str').replace('NaT', np.nan)
            X.append(as_text.map(tvar.parse).values)
        elif is_numeric_dtype(s):
            attrs.append(ContinuousVariable(var_name))
            X.append(s.values)
        else:
            metas.append(StringVariable(var_name))
            M.append(s.values.astype(object))

    domain = Domain(attrs, None, metas)
    if X:
        x_matrix = np.column_stack(X)
    else:
        x_matrix = np.empty((df.shape[0], 0))
    if M:
        m_matrix = np.column_stack(M)
    else:
        m_matrix = None
    return Table.from_numpy(domain, x_matrix, None, m_matrix)
def calculate(self, reference_data: pd.DataFrame,
              production_data: pd.DataFrame, column_mapping):
    """Build the per-class support bar chart and store it in ``self.wi``.

    Column roles are read from ``column_mapping`` when it is given;
    otherwise they are guessed from ``reference_data`` ('target' and
    'prediction' columns when present, numeric and object columns as
    features).

    When ``production_data``, the target column and the prediction column
    are all available, ``self.wi`` is set to a ``BaseWidgetInfo`` holding
    the chart; otherwise ``self.wi`` is set to None.

    Note: ``production_data`` is modified in place (infinite values are
    replaced by NaN and rows containing NaN are dropped).
    """
    if column_mapping:
        date_column = column_mapping.get('datetime')
        id_column = column_mapping.get('id')
        target_column = column_mapping.get('target')
        prediction_column = column_mapping.get('prediction')
        num_feature_names = column_mapping.get('numerical_features')
        target_names = column_mapping.get('target_names')
        if num_feature_names is None:
            num_feature_names = []
        else:
            num_feature_names = [
                name for name in num_feature_names
                if is_numeric_dtype(reference_data[name])
            ]

        cat_feature_names = column_mapping.get('categorical_features')
        if cat_feature_names is None:
            cat_feature_names = []
        else:
            # NOTE(review): categorical features are filtered by *numeric*
            # dtype as well - confirm that this is intended.
            cat_feature_names = [
                name for name in cat_feature_names
                if is_numeric_dtype(reference_data[name])
            ]
    else:
        date_column = 'datetime' if 'datetime' in reference_data.columns else None
        id_column = None
        target_column = 'target' if 'target' in reference_data.columns else None
        prediction_column = 'prediction' if 'prediction' in reference_data.columns else None

        utility_columns = [
            date_column, id_column, target_column, prediction_column
        ]

        num_feature_names = list(
            set(reference_data.select_dtypes([np.number]).columns) -
            set(utility_columns))
        # The builtin `object` replaces the `np.object` alias, which newer
        # NumPy releases no longer provide.
        cat_feature_names = list(
            set(reference_data.select_dtypes([object]).columns) -
            set(utility_columns))

        target_names = None

    if production_data is not None and target_column is not None and prediction_column is not None:
        production_data.replace([np.inf, -np.inf], np.nan, inplace=True)
        production_data.dropna(axis=0, how='any', inplace=True)

        # prediction_column is indexed by the argmax position below, so it is
        # presumably a list of per-class column names - TODO confirm.
        array_prediction = production_data[prediction_column].to_numpy()

        prediction_ids = np.argmax(array_prediction, axis=-1)
        prediction_labels = [prediction_column[x] for x in prediction_ids]

        # plot support bar
        metrics_matrix = metrics.classification_report(
            production_data[target_column],
            prediction_labels,
            output_dict=True)
        metrics_frame = pd.DataFrame(metrics_matrix)
        # Last row of the frame, without its last three columns.
        support = metrics_frame.iloc[-1:, :-3].values[0]

        fig = go.Figure()

        fig.add_trace(
            go.Bar(x=metrics_frame.columns.tolist()[:-3],
                   y=support,
                   marker_color=red,
                   name='Support'))

        fig.update_layout(
            xaxis_title="Class",
            yaxis_title="Number of Objects",
        )

        support_bar_json = json.loads(fig.to_json())

        self.wi = BaseWidgetInfo(
            title=self.title,
            type="big_graph",
            details="",
            alertStats=AlertStats(),
            alerts=[],
            alertsPosition="row",
            insights=[],
            size=1,
            params={
                "data": support_bar_json['data'],
                "layout": support_bar_json['layout']
            },
            additionalGraphs=[],
        )
    else:
        self.wi = None
def to_numeric_array(series):
    """Return the values of ``series`` as a numeric array.

    Numeric series are returned as their underlying values. Any other
    series is represented by its category codes, converting it to a
    categorical first when it is not one already.
    """
    if is_numeric_dtype(series):
        return series.values
    try:
        categorical = series.cat
    except AttributeError:
        # Not categorical yet: convert so that codes are available.
        categorical = series.astype('category').cat
    return categorical.codes.values
def classifier_pair_plot(self, df, num_features=4, regression_quantiles=10):
    """Draw a pair plot of the top features on a new pyplot figure.

    The grid is ``num_features`` x ``num_features``:

    * diagonal - distribution of each top feature (distplot for numeric
      features, bar chart of counts otherwise);
    * upper triangle - decision regions of the fitted pipeline for each
      feature pair, with the observations scattered on top;
    * lower triangle - empty grey panels sharing the axis limits.

    A numeric response is binned into ``regression_quantiles`` quantiles so
    that it can be coloured like classes.

    Nothing is returned; the plot is drawn on the figure created here.
    """
    # TODO: find replace
    df_default = self.typical_feature_values
    pipe = self.numeric_pipe
    transformer = self.transform_numeric
    features = self.features
    top_features = self.top_features[:num_features]
    response = self.response
    dim_features = len(features)

    # Reset matplotlib to its defaults, then shrink the font.
    mpl.rcParams.update(mpl.rcParamsDefault)
    mpl.rcParams.update({"font.size": 6})  # For readability

    # df_not_na = df[top_features + [response]].dropna()
    # Transformed feature values: observations and the "typical" row.
    vals_df = self.transform_numeric.transform(df[features])
    vals_default = pd.DataFrame(transformer.transform(df_default))

    # One colour per quantile (numeric response) or per class (otherwise);
    # pal0 colours the regions, pal1 the scattered points.
    is_numeric = is_numeric_dtype(df[response])
    if is_numeric:
        pal0 = sns.color_palette("GnBu_d", regression_quantiles)
        pal1 = sns.color_palette("GnBu_r", regression_quantiles)
    else:
        pal0 = sns.color_palette("muted", df[response].nunique())
        pal1 = sns.color_palette("bright", df[response].nunique())
    # g = sns.PairGrid(df_not_na, vars=top_features, hue=response, palette=pal)
    gs = gridspec.GridSpec(num_features, num_features)
    fig = plt.figure()

    # Dummy estimator that predicts categoricals even for regression
    class DummyEstimator:
        model = pipe
        is_num = is_numeric

        def predict(self, X):
            y = self.model.predict(X)
            if self.is_num:
                # Bin continuous predictions into quantile indices.
                return pd.qcut(y,
                               regression_quantiles,
                               labels=False,
                               duplicates="drop")
            else:
                # Map predicted labels to their position in pipe.classes_.
                return np.searchsorted(pipe.classes_, y)

    estimator = DummyEstimator()

    # Integer-coded response used for colouring: quantile index or
    # category code.
    if is_numeric:
        resp_plt = pd.qcut(df[response],
                           regression_quantiles,
                           labels=False,
                           duplicates="drop").values
    else:
        resp_plt = df[response].astype("category").cat.codes.values

    # Diagonals: Numeric - Dist Plot, Categorical - Bar Plot
    # The x limits of each diagonal panel are remembered and reused for the
    # off-diagonal panels.
    lims = []
    for row_i in range(0, num_features):
        ax = plt.subplot(gs[row_i, row_i])
        feature_name = top_features[row_i]
        f_ind_i = features.index(feature_name)
        # plt.hist(x=df.iloc[:, f_ind_i])
        if is_numeric_dtype(df.loc[:, feature_name]):
            ax = sns.distplot(df.loc[:, feature_name].dropna())
        else:
            # Truncate long category names to avoid cluttering the axis
            kwargs = {
                feature_name: lambda df: df[feature_name].str.slice(0, 5)
            }
            df.groupby(feature_name).size().reset_index(
                name="counts").assign(**kwargs).plot.bar(ax=ax,
                                                         x=feature_name,
                                                         y="counts",
                                                         legend=False)
            plt.ylabel("Count")
        plt.xlabel(feature_name)
        lims += [plt.xlim()]
        # plt.xlim((df.iloc[:, f_ind_i].min(), df.iloc[:, f_ind_i].max()))

    # Upper:
    indices = zip(*np.triu_indices(num_features, k=1))
    # indices = zip(*np.triu_indices_from(g.axes, 1))
    for row_i, col_j in indices:
        # ax = g.axes[row_i, col_j]
        ax = plt.subplot(gs[row_i, col_j])
        feature_name_i = top_features[row_i.item()]
        feature_name_j = top_features[col_j.item()]
        f_ind_i = features.index(feature_name_i)
        f_ind_j = features.index(feature_name_j)
        # Positions of every feature that is not on this panel's axes.
        filler_ind = [
            x for x in list(range(0, dim_features))
            if x not in [f_ind_i, f_ind_j]
        ]
        # mlxtend decision regions requires a dummy set of variables if there
        # are more than two features.
        if len(features) > 2:
            filler = vals_default[filler_ind].to_dict("records")[0]
        else:
            filler = None
        mpl.rcParams.update({"contour.negative_linestyle": "dotted"})
        plot_decision_regions(
            X=transformer.transform(df[features].values),
            y=resp_plt,
            feature_index=(f_ind_j, f_ind_i),
            filler_feature_values=filler,
            clf=estimator,
            colors=",".join(pal0.as_hex()),
            hide_spines=False,
        )
        # Overlay the observations when there are features besides the two
        # on this panel.
        if len(filler_ind) > 0:
            plt.scatter(
                x=vals_df[:, features.index(feature_name_j)],
                y=vals_df[:, features.index(feature_name_i)],
                s=3,
                c=resp_plt,
                cmap=ListedColormap(pal1.as_hex()),
            )
        plt.xlabel(features[f_ind_j])
        plt.ylabel(features[f_ind_i])
        plt.xlim(lims[col_j])
        plt.ylim(lims[row_i])

    # Lower Diag
    # Empty grey panels that only carry the axis limits and labels.
    indices = zip(*np.tril_indices(num_features, k=-1))
    for row_i, col_j in indices:
        ax = plt.subplot(gs[row_i, col_j])
        ax.set_facecolor("grey")
        f_ind_i = features.index(top_features[row_i.item()])
        f_ind_j = features.index(top_features[col_j.item()])
        plt.xlim(lims[col_j])
        plt.ylim(lims[row_i])
        plt.xlabel(features[f_ind_j])
        plt.ylabel(features[f_ind_i])
    fig.tight_layout()
def vis_progressX(graph, simple=False, mode='file', color=None, _color_SAFE=None, min_size=10, max_size=40, **kwargs):
    """
    Build an interactive plotly figure of a tmap graph.

    By default the figure shows the construction process: a slider moves
    the sample points step by step from their ordination position to the
    position of the node they belong to. The number of steps is fixed
    (``n_step = 5``) and cannot currently be overridden. Pass
    ``simple=True`` to draw only the final graph (edges and nodes).

    Colours are produced by ``Color``-like objects (the original notes
    refer to ``tmap.tda.plot.Color``):

    * ``color`` drives the colour and hover values of nodes and samples.
      It may represent **any** measurement of nodes or samples; it does not
      have to be a SAFE score. ``None`` or a plain string colour name is
      also accepted.
    * ``_color_SAFE``, when given, is used for the node colours and hover
      values of the second node panel in the non-simple layout.

    :param tmap.tda.Graph.Graph graph: graph to draw; this function reads
        ``nodePos``, ``data``, ``nodes``, ``size``, ``sample_names`` and
        ``edges`` from it.
    :param bool simple: draw a single panel with the graph only.
    :param str mode: [file|obj|web]; forwarded to ``write_figure``.
    :param color: ``None``, a colour string, or an object providing
        ``get_colors``, ``get_sample_colors`` and ``dtype``.
    :param _color_SAFE: optional object providing ``get_colors``.
    :param min_size: lower bound of the scaled node marker size.
    :param max_size: upper bound of the scaled node marker size.
    :param kwargs: forwarded to ``write_figure``.
    :return: whatever ``write_figure(fig, mode, **kwargs)`` returns.
    """
    node_pos = graph.nodePos  # shape is average projected_data (node x lens)
    sample_pos = graph.data  # shape is raw projected_data (sample x lens)
    nodes = graph.nodes
    sizes = graph.size
    sample_names = np.array(graph.sample_names.astype(str))
    minmax_scaler = MinMaxScaler(feature_range=(min_size, max_size))
    mms_color = MinMaxScaler(feature_range=[0, 1])
    # Node sizes rescaled into [min_size, max_size] for use as marker sizes.
    scaled_size = minmax_scaler.fit_transform(
        np.array([sizes[_] for _ in range(len(nodes))]).reshape(-1, 1))

    # init some empty values if color wasn't given
    target_v_raw = [0 for _ in nodes]
    target_v = [0 for _ in nodes]
    target_colors = ['blue' for _ in nodes]
    sample_colors = ['red' for _ in sample_names]
    cat2color = defaultdict(lambda: 'blue')
    legend_names = []
    if color is None or type(color) == str:
        # No Color object: every node gets the same flat colour.
        color = 'red' if color is None else color
        color_map = {node_id: color for node_id in graph.nodes}
        target2colors = (np.zeros(
            (len(graph.nodes), 1)), [color] * len(graph.nodes))
    else:
        color_map, target2colors = color.get_colors(graph.nodes)
        if types.is_numeric_dtype(target2colors[0]):
            # Normalised node values, used as colourscale break points below.
            target_v = mms_color.fit_transform(target2colors[0]).ravel()
        else:
            target_v = []
        target_v_raw = target2colors[0].ravel()
        target_colors = target2colors[1]
        sample_colors, cat2color = color.get_sample_colors()
        if color.dtype == 'categorical':
            legend_names = target2colors[0][:, 0]

    # For calculating the dynamic process, the samples need to be duplicated first:
    # a sample is repeated once per node that contains it.
    # reconstructing the ori_MDS into the samples_pos
    # reconstructing the node_pos into the center_pos
    sample_tmp = []
    center_tmp = []
    text_tmp = []
    duplicated_sample_colors = []
    for n in nodes:
        sample_tmp.append(sample_pos[nodes[n]['sample'], :])
        center_tmp.append(np.repeat(node_pos[[n], :], sizes[n], axis=0))
        text_tmp.append(sample_names[nodes[n]['sample']])
        # NOTE(review): `color` was replaced by 'red' above when it was None,
        # so the else branch looks unreachable - confirm before removing.
        if color is not None:
            duplicated_sample_colors += list(
                np.repeat(color_map.get(n, 'blue'), sizes[n]))
        else:
            duplicated_sample_colors += ["blue"] * sizes[n]
    duplicated_sample_pos = np.concatenate(sample_tmp, axis=0)
    duplicated_node_pos = np.concatenate(center_tmp, axis=0)
    duplicated_samples_text = np.concatenate(text_tmp, axis=0)
    assert duplicated_sample_pos.shape[0] == duplicated_node_pos.shape[
        0] == duplicated_samples_text.shape[0] == len(duplicated_sample_colors)
    # To visualize the movement of samples, one sample is expanded into several
    # copies, which is why positions and texts had to be rebuilt above.

    # prepare edge data
    # Each edge contributes (start, end, None); the None breaks the line.
    xs = []
    ys = []
    for edge in graph.edges:
        xs += [node_pos[edge[0], 0], node_pos[edge[1], 0], None]
        ys += [node_pos[edge[0], 1], node_pos[edge[1], 1], None]

    # If _color_SAFE is given (and simple is not True) two kinds of colour are shown:
    # one based on the original data, one based on the transformed SAFE data.
    if _color_SAFE is not None:
        safe_color, safe_t2c = _color_SAFE.get_colors(graph.nodes)
        # the former is a dict whose key is the node id and value the node color
        # the latter is a tuple (node values, node color)
        target_SAFE_color = [safe_color[_] for _ in graph.nodes]
        target_SAFE_raw_v = safe_t2c[0].ravel()  # raw node values
    else:
        target_SAFE_color = []
        target_SAFE_raw_v = []

    # prepare node & samples text
    node_text = c_node_text(nodes, sample_names, target_v_raw)
    ### samples text
    samples_text = ['sample ID:%s' % _ for _ in sample_names]

    node_line = go.Scatter(
        # ordination line
        visible=False,
        x=xs,
        y=ys,
        marker=dict(color="#8E9DA2", opacity=0.7),
        line=dict(width=1),
        hoverinfo='skip',
        showlegend=False,
        mode="lines")
    node_marker = go.Scatter(
        # node position
        visible=False,
        x=node_pos[:, 0],
        y=node_pos[:, 1],
        hovertext=node_text,
        hoverinfo="text",
        marker=dict(color=target_colors, size=scaled_size, opacity=1),
        showlegend=False,
        mode="markers")
    sample_marker = go.Scatter(visible=True,
                               x=sample_pos[:, 0],
                               y=sample_pos[:, 1],
                               marker=dict(color=sample_colors),
                               hovertext=samples_text,
                               hoverinfo="text",
                               showlegend=False,
                               mode="markers")

    # All preparation is done.
    # Append all trace instances to the figure.
    if simple:
        fig = plotly.tools.make_subplots(1, 1)
        node_line['visible'] = True
        node_marker['visible'] = True
        fig.append_trace(node_line, 1, 1)
        if color is not None and type(color) != str:
            if color.dtype == 'numerical':
                # with continuous legend bar
                # A dict which maps node values to colors,
                # used to build the continuous color legend
                nv2c = dict(zip(target_v, target_colors))
                colorscale = []
                for _ in sorted(set(target_v)):
                    colorscale.append([_, nv2c[_]])
                colorscale[-1][0] = 1  # the last value must be 1
                colorscale[0][0] = 0  # the first value must be 0
                node_marker['marker']['color'] = target2colors[0].ravel()
                # target_v is not needed here; the original target2colors data is used.
                # Otherwise the normalized values would be displayed, which would confuse readers.
                node_marker['marker']['colorscale'] = colorscale
                node_marker['marker']['cmin'] = target2colors[0].min()
                node_marker['marker']['cmax'] = target2colors[0].max()
                node_marker['marker']['showscale'] = True
                fig.append_trace(node_marker, 1, 1)
            else:
                # if color.dtype == 'categorical'
                # One trace per category so that each gets a legend entry.
                for cat in np.unique(legend_names):
                    # legend_names is always defined (initialised near the top of this function)
                    # np.unique returns the categories sorted
                    node_marker = go.Scatter(
                        # node position
                        visible=True,
                        x=node_pos[legend_names == cat, 0],
                        y=node_pos[legend_names == cat, 1],
                        text=np.array(node_text)[legend_names == cat],
                        hoverinfo="text",
                        marker=dict(color=cat2color[cat],
                                    size=scaled_size[legend_names == cat, 0],
                                    opacity=1),
                        name=str(cat),
                        showlegend=True,
                        mode="markers")
                    fig.append_trace(node_marker, 1, 1)
        elif type(color) == str:
            node_marker['marker']['color'] = color
            fig.append_trace(node_marker, 1, 1)
        else:
            fig.append_trace(node_marker, 1, 1)
        fig.layout.hovermode = "closest"
    else:
        fig = plotly.tools.make_subplots(
            rows=2,
            cols=2,
            specs=[[{
                'rowspan': 2
            }, {}], [None, {}]],
        )
        # original place or ordination place
        fig.append_trace(sample_marker, 1, 1)
        # dynamic process to generate 5 binning positions
        n_step = 5
        for s in range(1, n_step + 1):
            # s = 1: move 1/steps
            # s = steps: move to node position.
            fig.append_trace(
                go.Scatter(visible=False,
                           x=duplicated_sample_pos[:, 0] +
                           ((duplicated_node_pos - duplicated_sample_pos) /
                            n_step * s)[:, 0],
                           y=duplicated_sample_pos[:, 1] +
                           ((duplicated_node_pos - duplicated_sample_pos) /
                            n_step * s)[:, 1],
                           marker=dict(color=duplicated_sample_colors),
                           hoverinfo="text",
                           hovertext=duplicated_samples_text,
                           showlegend=False,
                           mode="markers"), 1, 1)
        # Order is important, do not change the order !!!
        # The trace order must match the visibility masks built for the slider below.
        fig.append_trace(node_line, 1, 1)
        fig.append_trace(node_marker, 1, 1)
        node_line['visible'] = True
        node_marker['visible'] = True
        sample_marker['visible'] = True
        fig.append_trace(node_line, 2, 2)
        if _color_SAFE is not None:
            node_text = c_node_text(nodes, sample_names, target_SAFE_raw_v)
            node_marker['hovertext'] = node_text
            node_marker['marker']['color'] = target_SAFE_color
        fig.append_trace(node_marker, 2, 2)
        fig.append_trace(sample_marker, 1, 2)
        ############################################################
        # One slider step per stage; each step is a visibility mask over the
        # n_step + 6 traces appended above.
        steps = []
        for i in range(n_step + 1):
            step = dict(
                method='restyle',
                args=['visible', [False] * (n_step + 3) + [True, True, True]],
            )
            if i >= n_step:
                step["args"][1][-5:] = [
                    True
                ] * 5  # In the final step the last 5 traces are all shown.
            else:
                step['args'][1][i] = True  # Toggle i'th trace to "visible"
            steps.append(step)
        sliders = [
            dict(active=0,
                 currentvalue={"prefix": "status: "},
                 pad={"t": 20},
                 steps=steps)
        ]
        ############################################################
        layout = dict(
            sliders=sliders,
            width=2000,
            height=1000,
            xaxis1={  # "range": [0, 1],
                "domain": [0, 0.5]
            },
            yaxis1={  # "range": [0, 1],
                "domain": [0, 1]
            },
            xaxis2={  # "range": [0, 1],
                "domain": [0.6, 0.9]
            },
            yaxis2={  # "range": [0, 1],
                "domain": [0.5, 1]
            },
            xaxis3={  # "range": [0, 1],
                "domain": [0.6, 0.9]
            },
            yaxis3={  # "range": [0, 1],
                "domain": [0, 0.5]
            },
            hovermode="closest")
        fig.layout.update(layout)
    return write_figure(fig, mode, **kwargs)
def fix_missing(df, col, name, na_dict):
    """ Fill missing values of a numeric column and flag where they were.

    Non-numeric columns are left untouched. For a numeric column, when it
    contains missing values or `name` is already a key of `na_dict`, a boolean
    column `{name}_na` is added to df and the missing entries are replaced by
    `na_dict[name]` if present, otherwise by the column median.

    Parameters:
    -----------
    df: The data frame that will be changed (in place).

    col: The column of data to fix by filling in missing data.

    name: The name under which the filled column is stored in df.

    na_dict: A dictionary mapping column names to the value used to fill
        their missing data. It is updated in place with the filler actually
        used for `name`.

    Returns:
    --------
    The (possibly updated) na_dict.

    Examples:
    ---------
    >>> df = pd.DataFrame({'col1' : [1, np.nan, 3], 'col2' : [5, 2, 2]})
    >>> fix_missing(df, df['col1'], 'col1', {})
    {'col1': 2.0}
    >>> df
       col1  col2  col1_na
    0   1.0     5    False
    1   2.0     2     True
    2   3.0     2    False

    A column without missing values and without an entry in na_dict is
    left alone, and no `col2_na` column is created:

    >>> df = pd.DataFrame({'col1' : [1, np.nan, 3], 'col2' : [5, 2, 2]})
    >>> fix_missing(df, df['col2'], 'col2', {})
    {}

    An entry in na_dict overrides the median:

    >>> df = pd.DataFrame({'col1' : [1, np.nan, 3], 'col2' : [5, 2, 2]})
    >>> fix_missing(df, df['col1'], 'col1', {'col1' : 500})
    {'col1': 500}
    >>> df
        col1  col2  col1_na
    0    1.0     5    False
    1  500.0     2     True
    2    3.0     2    False
    """
    if not is_numeric_dtype(col):
        return na_dict

    missing_mask = pd.isnull(col)
    if missing_mask.sum() or name in na_dict:
        df[name + '_na'] = missing_mask
        if name in na_dict:
            fill_value = na_dict[name]
        else:
            fill_value = col.median()
        df[name] = col.fillna(fill_value)
        na_dict[name] = fill_value
    return na_dict
def pandas_to_table(df):
    # type: (pd.DataFrame) -> Orange.data.Table
    """
    Convert a pandas.DataFrame to a Orange.data.Table instance.

    Each column is mapped by dtype:

    * categorical -> ``DiscreteVariable`` (codes as floats, missing -> NaN)
    * datetime64  -> ``TimeVariable`` (seconds as floats, NaT -> NaN)
    * object      -> ``StringVariable``
    * integer     -> ``ContinuousVariable`` with ``number_of_decimals = 0``
    * other numeric -> ``ContinuousVariable``

    Columns of any other dtype are skipped with a ``UserWarning``.
    Variables for which ``is_primitive()`` is true become attributes (X);
    the rest become metas. A non-``RangeIndex`` index is turned into
    regular column(s) first via ``reset_index``.
    """
    index = df.index
    if not isinstance(index, pd.RangeIndex):
        df = df.reset_index()

    columns = []  # type: List[Tuple[Orange.data.Variable, np.ndarray]]

    for header, series in df.items():  # type: (Any, pd.Series)
        # `pdtypes.is_categorical` was removed in pandas 2.0; checking the
        # dtype directly is the equivalent test for a Series.
        if isinstance(series.dtype, pd.CategoricalDtype):
            coldata = series.values  # type: pd.Categorical
            categories = [str(c) for c in coldata.categories]
            var = Orange.data.DiscreteVariable.make(
                str(header), values=categories, ordered=coldata.ordered
            )
            # Remap the coldata into the var.values order/set
            coldata = pd.Categorical(
                coldata, categories=var.values, ordered=coldata.ordered
            )
            codes = coldata.codes
            assert np.issubdtype(codes.dtype, np.integer)
            # `np.float` was removed in NumPy 1.24; it was an alias of the
            # builtin float, i.e. float64.
            orangecol = np.array(codes, dtype=np.float64)
            # pandas encodes a missing category as code -1.
            orangecol[codes < 0] = np.nan
        elif pdtypes.is_datetime64_any_dtype(series):
            # Check that this converts tz local to UTC
            series = series.astype(np.dtype("M8[ns]"))
            coldata = series.values  # type: np.ndarray
            assert coldata.dtype == "M8[ns]"
            mask = np.isnat(coldata)
            # nanoseconds -> seconds
            orangecol = coldata.astype(np.int64) / 10 ** 9
            orangecol[mask] = np.nan
            var = Orange.data.TimeVariable.make(str(header))
            var.have_date = var.have_time = 1
        elif pdtypes.is_object_dtype(series):
            coldata = series.values
            assert isinstance(coldata, np.ndarray)
            orangecol = coldata
            var = Orange.data.StringVariable.make(str(header))
        elif pdtypes.is_integer_dtype(series):
            coldata = series.values
            var = Orange.data.ContinuousVariable.make(str(header))
            var.number_of_decimals = 0
            orangecol = coldata.astype(np.float64)
        elif pdtypes.is_numeric_dtype(series):
            orangecol = series.values.astype(np.float64)
            var = Orange.data.ContinuousVariable.make(str(header))
            var._out_format = "%.15g"
        else:
            warnings.warn(
                "Column '{}' with dtype: {} skipped."
                .format(header, series.dtype),
                UserWarning
            )
            continue
        columns.append((var, orangecol))

    cols_x = [(var, col) for var, col in columns if var.is_primitive()]
    cols_m = [(var, col) for var, col in columns if not var.is_primitive()]

    variables = [v for v, _ in cols_x]
    if cols_x:
        X = np.column_stack([a for _, a in cols_x])
    else:
        # No primitive columns: keep the row count with zero attributes.
        X = np.empty((df.shape[0], 0), dtype=np.float64)
    metas = [v for v, _ in cols_m]
    if cols_m:
        M = np.column_stack([a for _, a in cols_m])
    else:
        M = None

    domain = Orange.data.Domain(variables, metas=metas)
    return Orange.data.Table.from_numpy(domain, X, None, M)
def proc_df(df, y_fld=None, skip_flds=None, ignore_flds=None, do_scale=False, na_dict=None,
            preproc_fn=None, max_n_cat=None, subset=None, mapper=None):
    """ proc_df takes a data frame df and splits off the response variable, and
    changes the df into an entirely numeric dataframe.

    The input data frame and the `skip_flds` / `na_dict` arguments are not
    modified; all work happens on copies.

    Parameters:
    -----------
    df: The data frame you wish to process.

    y_fld: The name of the response variable

    skip_flds: A list of fields that are dropped from df.

    ignore_flds: A list of fields that are ignored during processing; they are
        set aside untouched and prepended to the result.

    do_scale: Standardizes each column in df. Takes Boolean Values(True,False)

    na_dict: a dictionary of na columns to add. Na columns are also added if there
        are any missing values.

    preproc_fn: A function that gets applied to df.

    max_n_cat: The maximum number of categories to break into dummy values, instead
        of integer codes.

    subset: Takes a random subset of size subset from df.

    mapper: If do_scale is set as True, the mapper variable
        calculates the values used for scaling of variables during training time
        (mean and standard deviation).

    Returns:
    --------
    [x, y, nas, mapper(optional)]:

        x: x is the transformed version of df. x will not have the response variable
            and is entirely numeric.

        y: y is the response variable

        nas: returns a dictionary of which nas it created, and the associated median.

        mapper: A DataFrameMapper which stores the mean and standard deviation of the
            corresponding continuous variables which is then used for scaling during
            test-time. Only returned when do_scale is True.

    Examples:
    ---------
    >>> df = pd.DataFrame({'col1' : [1, 2, 3], 'col2' : ['a', 'b', 'a']})
    >>> df
       col1 col2
    0     1    a
    1     2    b
    2     3    a

    note the type of col2 is string

    >>> train_cats(df)
    >>> df
       col1 col2
    0     1    a
    1     2    b
    2     3    a

    now the type of col2 is category { a : 1, b : 2}

    >>> x, y, nas = proc_df(df, 'col1')
    >>> x
       col2
    0     1
    1     2
    2     1
    """
    if not ignore_flds:
        ignore_flds = []
    if not skip_flds:
        skip_flds = []

    if subset:
        df = get_sample(df, subset)
    else:
        df = df.copy()

    ignored_flds = df.loc[:, ignore_flds]
    df.drop(ignore_flds, axis=1, inplace=True)
    if preproc_fn:
        preproc_fn(df)

    if y_fld is None:
        y = None
    else:
        if not is_numeric_dtype(df[y_fld]):
            df[y_fld] = df[y_fld].cat.codes
        y = df[y_fld].values
        # Build a new list instead of `skip_flds += [y_fld]`, which would
        # append to the list object owned by the caller.
        skip_flds = list(skip_flds) + [y_fld]
    df.drop(skip_flds, axis=1, inplace=True)

    if na_dict is None:
        na_dict = {}
    else:
        na_dict = na_dict.copy()
    na_dict_initial = na_dict.copy()
    for n, c in df.items():
        na_dict = fix_missing(df, c, n, na_dict)
    if na_dict_initial:
        # A na_dict was supplied: drop the `_na` indicator columns for
        # columns that were not part of it.
        new_keys = set(na_dict.keys()) - set(na_dict_initial.keys())
        df.drop([a + '_na' for a in new_keys], axis=1, inplace=True)

    if do_scale:
        mapper = scale_vars(df, mapper)
    for n, c in df.items():
        numericalize(df, c, n, max_n_cat)
    df = pd.get_dummies(df, dummy_na=True)
    df = pd.concat([ignored_flds, df], axis=1)

    res = [df, y, na_dict]
    if do_scale:
        res = res + [mapper]
    return res
def dtype_detection(data, category_detection=True, StructureText_detection=True,
                    datetime_to_category=True, criterion='sqrt', min_mean_counts=5, fix=False):
    '''Detect the logical data type of a single variable.

    The variable is classified as one of:
    1. number: numeric
    2. category: categorical (factor)
    3. datetime: date/time
    4. text: free text
    5. text_st: structured text, e.g. an ID whose values all share a character

    parameter
    ---------
    data: pd.Series
        One-dimensional data. A copy is analysed; the input is not modified.
    category_detection: bool
        For numeric data, decide from the number of unique values whether
        the variable is categorical.
    StructureText_detection: bool
        Detect structured text, e.g. a column whose values all contain the
        separator "-".
    datetime_to_category: bool
        Whether string/datetime data with few unique values is converted to
        a categorical variable.
    criterion: string, int or float, optional (default="sqrt")
        Upper bound on the number of unique values of a categorical variable.
        'sqrt': square root of the sample count; int: absolute number;
        float in (0, 1): that fraction of the sample count. Any other value
        falls back to 'sqrt'.
    min_mean_counts: default 5
        A numeric variable is only treated as categorical when the median
        frequency of its values is at least min_mean_counts.
    fix: bool
        Also return the data converted to the detected type.

    return:
    result: dict{
        'name': column name,
        'vtype': variable type,
        'ordered': whether the categorical is ordered,
        'categories': all categories}
        or None when the dtype is not recognised.
    With fix=True the tuple (result, data) is returned instead.
    '''
    assert len(data.shape) == 1
    data = data.copy()
    data = pd.Series(data)
    dtype, name, n_sample = data.dtype, data.name, data.count()

    if criterion == 'sqrt':
        max_nuniques = np.sqrt(n_sample)
    elif isinstance(criterion, int):
        max_nuniques = criterion
    elif isinstance(criterion, float) and (0 < criterion < 1):
        # A fraction of the sample count, as documented.
        max_nuniques = criterion * n_sample
    else:
        max_nuniques = np.sqrt(n_sample)

    ordered = False
    categories = []
    if is_numeric_dtype(dtype):
        vtype = 'number'
        ordered = False
        categories = []
        # Correct mis-typed data, e.g. turn 1.0, 2.0, 3.0 into 1, 2, 3
        if data.dropna().astype(np.int64).sum() == data.dropna().sum():
            data[data.notnull()] = data[data.notnull()].astype(np.int64)
        if category_detection:
            nunique = len(data.dropna().unique())
            # NOTE: despite the name this is the median of the value counts.
            mean_counts = data.value_counts().median()
            if nunique < max_nuniques and mean_counts >= min_mean_counts:
                data = data.astype('category')
                ordered = data.cat.ordered
                vtype = 'category'
                categories = list(data.dropna().cat.categories)
        result = {'name': name, 'vtype': vtype, 'ordered': ordered, 'categories': categories}
    elif is_string_dtype(dtype):
        # Handle date/time values: tmp holds the string length of each value
        tmp = data.map(lambda x: np.nan if '%s' % x == 'nan' else len('%s' % x))
        tmp = tmp.dropna().astype(np.int64)
        if not(any(data.dropna().map(is_number))) and 7 < tmp.max() < 20 and tmp.std() < 0.1:
            try:
                data = pd.to_datetime(data)
            except Exception:
                # Best effort: values that do not parse stay as they are.
                pass
        # Handle possible categorical variables
        if datetime_to_category:
            if len(data.dropna().unique()) < np.sqrt(n_sample):
                data = data.astype('category')
        else:
            nunique = len(data.dropna().unique())
            if not(is_categorical_dtype(data.dtype)) and not(np.issubdtype(data.dtype, np.datetime64)) and nunique < max_nuniques:
                data = data.astype('category')
        # For non-categorical data, convert percentages to floats, e.g. 21.12% --> 0.2112
        if is_string_dtype(data.dtype) and not(is_categorical_dtype(data.dtype)) and all(data.str.contains('%')):
            data = data.str.strip('%').astype(np.float64) / 100
        if is_categorical_dtype(data.dtype):
            vtype = 'category'
            categories = list(data.cat.categories)
            ordered = data.cat.ordered
        # date/time
        elif np.issubdtype(data.dtype, np.datetime64):
            vtype = 'datetime'
        # structured text: every value has the same string length
        elif StructureText_detection and tmp.dropna().std() == 0:
            # not iterable, hence not a string
            if not(isinstance(data.dropna().iloc[0], Iterable)):
                vtype = 'text'
            else:
                # k: the characters common to every non-empty string value
                k = set(list(data.dropna().iloc[0]))
                for x in data:
                    if isinstance(x, str) and len(x) > 0:
                        k &= set(list(x))
                if len(k) > 0:
                    vtype = 'text_st'
                else:
                    vtype = 'text'
        elif is_numeric_dtype(data.dtype):
            vtype = 'number'
            ordered = False
            categories = []
        else:
            vtype = 'text'
        result = {'name': name, 'vtype': vtype, 'ordered': ordered, 'categories': categories}
    elif is_datetime64_any_dtype(dtype):
        vtype = 'datetime'
        result = {'name': name, 'vtype': vtype, 'ordered': ordered, 'categories': categories}
    else:
        print('unknown dtype!')
        result = None

    if fix:
        return result, data
    else:
        return result
def fit(self,
        train_df: pd.DataFrame,
        test_df: pd.DataFrame = None,
        ctx: mx.context = get_context(),
        learning_rate: float = 4e-3,
        num_epochs: int = 10,
        patience: int = 3,
        test_split: float = .1,
        weight_decay: float = 0.,
        batch_size: int = 16,
        final_fc_hidden_units: List[int] = None,
        calibrate: bool = True) -> Any:
    """
    Build the encoders/featurizers for the configured columns, train the
    underlying imputer and persist the result.

    :param train_df: training data as dataframe
    :param test_df: test data as dataframe; if not provided, a ratio of test_split of the
                        training data are used as test data
    :param ctx: mxnet context(s) to train on; defaults to the value of get_context()
                        evaluated when this function is defined
    :param learning_rate: learning rate for stochastic gradient descent (default 4e-3)
    :param num_epochs: maximal number of training epochs (default 10)
    :param patience: used for early stopping; after [patience] epochs with no improvement,
                        training is stopped. (default 3)
    :param test_split: if no test_df is provided this is the ratio of test data to be held
                        separate for determining model convergence
    :param weight_decay: regularizer (default 0)
    :param batch_size: batch size (default 16)
    :param final_fc_hidden_units: list dimensions for FC layers after the final concatenation
    :param calibrate: forwarded unchanged to the underlying imputer's fit
    :return: self
    """
    self.check_data_types(train_df)

    encoders = []
    featurizers = []

    # Text input columns: one shared n-gram feature column.
    if len(self.string_columns) > 0:
        text_field = "ngram_features-" + rand_string(10)
        # TfIdfEncoder when the model is flagged explainable, BowEncoder otherwise.
        text_encoder_cls = TfIdfEncoder if self.is_explainable else BowEncoder
        encoders.append(
            text_encoder_cls(input_columns=self.string_columns,
                             output_column=text_field,
                             max_tokens=self.num_hash_buckets,
                             tokens=self.tokens))
        featurizers.append(
            BowFeaturizer(field_name=text_field,
                          max_tokens=self.num_hash_buckets))

    # Numeric input columns: one shared numerical feature column.
    if len(self.numeric_columns) > 0:
        numeric_field = "numerical_features-" + rand_string(10)
        encoders.append(
            NumericalEncoder(input_columns=self.numeric_columns,
                             output_column=numeric_field))
        featurizers.append(
            NumericalFeaturizer(field_name=numeric_field,
                                numeric_latent_dim=self.numeric_latent_dim,
                                numeric_hidden_layers=self.numeric_hidden_layers))

    # The label encoder is chosen from the dtype of the output column.
    if is_numeric_dtype(train_df[self.output_column]):
        label_encoders = [NumericalEncoder(self.output_column, normalize=True)]
        logger.info("Assuming numeric output column: {}".format(
            self.output_column))
    else:
        label_encoders = [
            CategoricalEncoder(self.output_column, max_tokens=self.num_labels)
        ]
        logger.info("Assuming categorical output column: {}".format(
            self.output_column))

    # to make consecutive calls to .fit() continue where the previous call finished
    if self.imputer is None:
        self.imputer = Imputer(data_encoders=encoders,
                               data_featurizers=featurizers,
                               label_encoders=label_encoders,
                               output_path=self.output_path)

    self.output_path = self.imputer.output_path

    self.imputer = self.imputer.fit(train_df,
                                    test_df,
                                    ctx,
                                    learning_rate,
                                    num_epochs,
                                    patience,
                                    test_split,
                                    weight_decay,
                                    batch_size,
                                    final_fc_hidden_units=final_fc_hidden_units,
                                    calibrate=calibrate)
    self.save()
    return self
def groupCompare(variables, group, dataframe, number_groups):
    """Compare each variable between groups with a test chosen by normality.

    Every numeric column of ``dataframe`` is first checked with a
    Shapiro-Wilk test (p < 0.05 means "not normally distributed"). Each
    entry of ``variables`` is then tested between the groups defined by
    the ``group`` column:

    * more than two groups: ANOVA (normal) or Kruskal-Wallis (non-normal)
    * exactly two groups: t-test (normal) or Mann-Whitney (non-normal)

    Variables that were not classified (e.g. non-numeric or absent
    columns) get ``"NA"`` for statistic, p-value and test type.

    Parameters
    ----------
    variables : list
        Column names to test.
    group : str
        Name of the column holding the group codes. With two groups the
        codes 1 and 2 are compared; the Kruskal-Wallis branch compares the
        codes 0, 1 and 2.
    dataframe : pandas.DataFrame
        The data.
    number_groups : int
        Number of groups; selects the family of tests.

    Returns
    -------
    pandas.DataFrame
        Columns ``Variable``, ``Statistic``, ``Pvalue`` and ``Type``, one
        row per entry of ``variables``.
    """
    # NOTE(review): for number_groups < 2 nothing is appended per variable,
    # so building the result frame fails on a length mismatch when
    # `variables` is non-empty - confirm whether that case can occur.
    NormallyDistributed = []
    NonNormallyDistributed = []
    statistic = []
    p_value = []
    types = []

    # Classify every numeric column as normal / non-normal.
    for col in dataframe.columns:
        if not is_numeric_dtype(dataframe[col]):
            continue
        # Drop NAs (shapiro will not calculate the statistic if NAs are present)
        data = dataframe[np.isfinite(dataframe[col])]
        r, p = stats.shapiro(data[col])
        if p < 0.05:
            NonNormallyDistributed.append(col)
        else:
            NormallyDistributed.append(col)

    for var in variables:
        if number_groups > 2:
            if var in NormallyDistributed:
                # Normally distributed: ANOVA
                data = dataframe[np.isfinite(dataframe[var])]
                # `variable` and `comp` are referenced by name inside the
                # formula string below, so these locals must keep their names.
                variable = data[var].dropna()
                comp = data[group]  # comparison of interest
                anova = ols("variable ~ C(comp)", data=data).fit()
                r = anova.rsquared_adj  # overall model adjusted r statistic
                p = anova.f_pvalue  # overall model p-value
                statistic.append(r)
                p_value.append(p)
                types.append("ANOVA")
            elif var in NonNormallyDistributed:
                # Non normally distributed: Kruskal-Wallis
                data = dataframe[np.isfinite(dataframe[var])]
                groups = data[group]
                v1 = data[groups == 0][var]
                v2 = data[groups == 1][var]
                v3 = data[groups == 2][var]
                r, p = stats.kruskal(v1, v2, v3)
                statistic.append(r)
                p_value.append(p)
                types.append("Kruskal-Wallis")
            else:
                # In case any variables were labelled incorrectly
                statistic.append("NA")
                p_value.append("NA")
                types.append("NA")
        elif number_groups == 2:
            if var in NormallyDistributed:
                # Normally distributed: t-test
                data = dataframe[np.isfinite(dataframe[var])]
                # Use the requested group column (this used to be hard-coded
                # to a column named PD_VHAny).
                groups = data[group]
                v1 = data[groups == 1][var]
                v2 = data[groups == 2][var]
                r, p = stats.ttest_ind(v1, v2)
                statistic.append(r)
                p_value.append(p)
                types.append("t-test")
            elif var in NonNormallyDistributed:
                # Non normally distributed: Mann-Whitney
                data = dataframe[np.isfinite(dataframe[var])]
                groups = data[group]
                v1 = data[groups == 1][var]
                v2 = data[groups == 2][var]
                r, p = stats.mannwhitneyu(v1, v2)
                statistic.append(r)
                p_value.append(p)
                types.append("Mann-Whitney")
            else:
                # In case any variables were labelled incorrectly
                statistic.append("NA")
                p_value.append("NA")
                types.append("NA")

    # Combine results in a dataframe with one row per variable
    results = pd.DataFrame(data=np.zeros((len(variables), 0)))  # empty dataframe
    results["Variable"] = variables  # variable names
    results["Statistic"] = statistic  # statistic
    results["Pvalue"] = p_value  # p_value
    results["Type"] = types  # type of statistical test used
    return (results)
def Select_Variables(dfTrain, dfTest, variables=None, center_floats=True, scale_floats=True, max_pct_na=0.3):
    """
    Descr:
        To do machine learning, we need two data sets (train and test) with
        the same structure. Numeric variables can be centered and scaled; the
        test variables are always transformed with statistics computed on the
        train set (mean for filling/centering, standard deviation for
        scaling), so that no information from the test set leaks in.

    In:
        - dfTrain : dataframe for training
        - dfTest : dataframe for tests
        - variables : list of variables to select.
        - center_floats : subtract the train mean from numeric variables.
        - scale_floats : divide numeric variables by the train standard deviation.
        - max_pct_na : a variable is skipped when its share of missing values
          exceeds this threshold in dfTrain or in dfTest.

        Numeric variables whose train standard deviation is 0 are skipped.
        Categorical/string variables are one-hot encoded; the test set gets
        exactly the dummy columns of the train set (values unseen during
        training yield an all-zero row). Variables of any other type are
        skipped.

    Raises:
        ValueError if a name in 'variables' does not exist in dfTrain or
        dfTest, or if its dtype differs between the two.

    Out :
        Two dataframes with the same structure : resTrain, resTest
    """
    if variables is None:
        variables = []
    msgerr = "'{}' is not contained in {} !"

    ## Init
    resTrain = pd.DataFrame(index=dfTrain.index)
    resTest = pd.DataFrame(index=dfTest.index)

    for ivar in variables:
        ## Errors
        if ivar not in list(dfTrain):
            raise ValueError(msgerr.format(ivar, 'dfTrain'))
        if ivar not in list(dfTest):
            raise ValueError(msgerr.format(ivar, 'dfTest'))

        xtrain = dfTrain[ivar]
        xtest = dfTest[ivar]

        ## if differents types : error
        if xtrain.dtype != xtest.dtype:
            raise ValueError('''Variable {} is not of the same type in dfTrain and in dfTest !'''.format(ivar))

        ## skip variables with too many missing values
        pct_na_train = xtrain.isnull().mean()
        pct_na_test = xtest.isnull().mean()
        if pct_na_train > max_pct_na or pct_na_test > max_pct_na:
            continue

        ## Tests on types
        is_num = pdtypes.is_numeric_dtype(xtrain)
        is_str = isinstance(xtrain.dtype, pd.CategoricalDtype) or pdtypes.is_string_dtype(xtrain)

        if is_num:
            ## Train statistics, reused for the test set
            moy = xtrain.mean()
            stderr = xtrain.std()
            ## Dont take useless variables having no variation
            if stderr != 0:
                xtrain = xtrain.fillna(moy)
                # Fill with the TRAIN mean so the test set is not used to
                # derive its own preprocessing.
                xtest = xtest.fillna(moy)
                if center_floats:
                    xtrain = xtrain - moy
                    xtest = xtest - moy
                if scale_floats:
                    xtrain = xtrain / stderr
                    xtest = xtest / stderr
                ## Add to Data
                resTrain[ivar] = pd.Series(xtrain, index=dfTrain.index)
                resTest[ivar] = pd.Series(xtest, index=dfTest.index)
        elif is_str:
            # Encode the test set against the train categories so both
            # results carry identical dummy columns.
            train_categories = pd.Categorical(xtrain).categories
            xtest_aligned = pd.Series(
                pd.Categorical(xtest, categories=train_categories),
                index=xtest.index)
            iDummTrain = pd.get_dummies(xtrain, prefix=ivar)
            iDummTest = pd.get_dummies(xtest_aligned, prefix=ivar)
            resTrain = pd.concat([resTrain, iDummTrain], axis=1)
            resTest = pd.concat([resTest, iDummTest], axis=1)

    ## Results
    return resTrain, resTest
def __init__(
        self,
        x,
        chrm="CHR",
        bp="BP",
        p="P",
        snp="SNP",
        gene="GENE",
        annotation=None,
        logp=True
):
    """
    Validate the input columns and precompute plotting positions and ticks.

    Keyword arguments:
    - x (dataframe; required): A pandas dataframe which must contain at
        least the following three columns:
            - the chromosome number
            - genomic base-pair position
            - a numeric quantity to plot such as a p-value or zscore
    - chrm (string; default 'CHR'): A string denoting the column name for the
        chromosome. This column must be numeric. Minimum number of
        chromosomes required is 1. If you have X, Y, or MT chromosomes,
        be sure to renumber these 23, 24, 25, etc.
    - bp (string; default 'BP'): A string denoting the column name for the
        chromosomal position. This column must be numeric.
    - p (string; default 'P'): A string denoting the column name for the
        float quantity to be plotted on the y-axis. This column must be
        numeric. This does not have to be a p-value. It can be any
        numeric quantity such as peak heights, bayes factors, test
        statistics. If it is not a p-value, make sure to set logp = FALSE.
    - snp (string; default 'SNP'): A string denoting the column name for the
        SNP names (e.g. rs number). More generally, this column could be
        anything that identifies each point being plotted. For example, in
        an Epigenomewide association study (EWAS) this could be the probe
        name or cg number. This column should be a character. Pass None to
        omit it; it is necessary to specify if you want to highlight points
        on the plot using the highlight argument in the figure method.
    - gene (string; default 'GENE'): A string denoting the column name for the
        GENE names. This column could be a string or a float. More
        generally, it could be any annotation information that you want
        to include in the plot. Pass None to omit it.
    - annotation (string; optional): A string denoting the column name for
        an annotation. This column could be a string or a float.  This
        could be any annotation information that you want to include in
        the plot (e.g. zscore, effect size, minor allele frequency).
    - logp (bool; default True): If True, the -log10 of the p-value is
        plotted.  It isn't very useful to plot raw p-values; however,
        plotting the raw value could be useful for other genome-wide plots
        (e.g., peak heights, Bayes factors, test statistics, other
        "scores", etc.).

    Raises:
    - KeyError: if a requested column is not present in x.
    - TypeError: if the chrm, bp or p column is not numeric.

    Returns:
    - A ManhattanPlot object."""

    # checking the validity of the arguments

    # Make sure you have chrm, bp and p columns and that they are of
    # numeric type
    if chrm not in x.columns.values:
        raise KeyError("Column %s not found in 'x' data.frame" % chrm)
    else:
        if not is_numeric_dtype(x[chrm].dtype):
            raise TypeError("%s column should be numeric. Do you have "
                            "'X', 'Y', 'MT', etc? If so change to "
                            "numbers and try again." % chrm)

    if bp not in x.columns.values:
        raise KeyError("Column %s not found in 'x' data.frame" % bp)
    else:
        if not is_numeric_dtype(x[bp].dtype):
            raise TypeError("%s column should be numeric type" % bp)

    if p not in x.columns.values:
        raise KeyError("Column %s not found in 'x' data.frame" % p)
    else:
        if not is_numeric_dtype(x[p].dtype):
            raise TypeError("%s column should be numeric type" % p)

    # Create a new DataFrame with columns named after chrm, bp, and p.
    self.data = pd.DataFrame(data=x[[chrm, bp, p]])

    if snp is not None:
        if snp not in x.columns.values:
            # A requested snp column that is missing is an error
            raise KeyError(
                "snp argument specified as %s but column not found in "
                "'x' data.frame" % snp)
        else:
            # If the input DataFrame has a snp column, add it to the new
            # DataFrame
            self.data[snp] = x[snp]

    if gene is not None:
        if gene not in x.columns.values:
            # A requested gene column that is missing is an error
            raise KeyError(
                "gene argument specified as %s but column not found in "
                "'x' data.frame" % gene)
        else:
            # If the input DataFrame has a gene column, add it to the new
            # DataFrame
            self.data[gene] = x[gene]

    if annotation is not None:
        if annotation not in x.columns.values:
            # A requested annotation column that is missing is an error
            raise KeyError(
                "annotation argument specified as %s but column not "
                "found in 'x' data.frame" % annotation
            )
        else:
            # If the input DataFrame has an annotation column, add it to
            # the new DataFrame
            self.data[annotation] = x[annotation]

    self.xlabel = ""
    self.ticks = []
    self.ticksLabels = []
    self.nChr = len(x[chrm].unique())
    self.chrName = chrm
    self.pName = p
    self.snpName = snp
    self.geneName = gene
    self.annotationName = annotation
    self.logp = logp

    # Set positions, ticks, and labels for plotting

    # Names of the helper columns added to self.data
    self.index = 'INDEX'
    self.pos = 'POSITION'

    # Fixes the bug where one chromosome is missing by adding a sequential
    # index column (1, 2, 3, ... in order of first appearance of each
    # chromosome value).
    idx = 0
    for i in self.data[chrm].unique():
        idx = idx + 1
        self.data.loc[self.data[chrm] == i, self.index] = int(idx)
    # Set the type to be the same as provided for chrm column
    self.data[self.index] = \
        self.data[self.index].astype(self.data[chrm].dtype)

    # This section sets up positions and ticks. Ticks should be placed in
    # the middle of a chromosome. The new pos column is added that keeps
    # a running sum of the positions of each successive chromosome.
    # For example:
    # chrm bp pos
    # 1   1  1
    # 1   2  2
    # 2   1  3
    # 2   2  4
    # 3   1  5

    if self.nChr == 1:
        # For a single chromosome
        self.data[self.pos] = self.data[bp]
        self.ticks.append(int(len(self.data[self.pos]) / 2.) + 1)
        self.xlabel = "Chromosome %s position" % (self.data[chrm].unique())
        self.ticksLabels = self.ticks
    else:
        # For multiple chromosomes
        lastbase = 0
        for i in self.data[self.index].unique():
            if i == 1:
                self.data.loc[self.data[self.index] == i, self.pos] = \
                    self.data.loc[self.data[self.index] == i, bp].values
            else:
                prevbp = self.data.loc[self.data[self.index] == i - 1, bp]
                # Shift the basepair position by the last bp value of the
                # previous chromosome.
                # NOTE(review): `.iat[-1]` is the last row, which is the
                # largest bp only if rows are sorted by position within a
                # chromosome - confirm that callers guarantee this.
                lastbase = lastbase + prevbp.iat[-1]
                self.data.loc[self.data[self.index] == i, self.pos] = \
                    self.data.loc[self.data[self.index] == i, bp].values \
                    + lastbase

            # Tick in the middle of this chromosome's position range
            tmin = min(self.data.loc[self.data[self.index] == i, self.pos])
            tmax = max(self.data.loc[self.data[self.index] == i, self.pos])
            self.ticks.append(int((tmin + tmax) / 2.) + 1)

        self.xlabel = 'Chromosome'
        self.data[self.pos] = self.data[self.pos].astype(
            self.data[bp].dtype)

        if self.nChr > 10:  # To avoid crowded labels
            self.ticksLabels = [
                t if np.mod(int(t), 2)  # Only label odd chromosome values
                else ''
                for t in self.data[chrm].unique()
            ]
        else:
            self.ticksLabels = self.data[chrm].unique()  # All the ticks
def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping):
    """Build the per-label probability scatter widget and store it on ``self.wi``.

    Column names are taken from ``column_mapping`` when it is truthy;
    otherwise the default names ``'datetime'``, ``'target'`` and
    ``'prediction'`` are used if they exist in ``reference_data``.

    When ``production_data``, the target column and the prediction column
    are all available, one tab is produced per entry of the prediction
    column (which is iterated, so it is expected to be a sequence of column
    names): rows whose target equals the label are drawn in one trace, all
    remaining rows in a second trace, with the label's column on the y axis
    and random jitter on the x axis. Otherwise ``self.wi`` is set to ``None``.

    Note: ``production_data`` is cleaned in place (infinite values become
    NaN, then every row containing a NaN is dropped).

    :param reference_data: frame used to resolve and validate column names.
    :param production_data: frame that is plotted; may be ``None``.
    :param column_mapping: mapping with the optional keys ``'datetime'``,
        ``'id'``, ``'target'``, ``'prediction'``, ``'numerical_features'``
        and ``'categorical_features'``; may be ``None``/empty.
    """
    if column_mapping:
        date_column = column_mapping.get('datetime')
        id_column = column_mapping.get('id')
        target_column = column_mapping.get('target')
        prediction_column = column_mapping.get('prediction')

        # The feature lists are not used further below; the lookups still
        # raise KeyError for mapped features missing from reference_data.
        num_feature_names = [
            name
            for name in (column_mapping.get('numerical_features') or [])
            if is_numeric_dtype(reference_data[name])
        ]
        # NOTE(review): categorical features are also filtered with
        # is_numeric_dtype, so non-numeric categoricals are dropped here --
        # confirm this is intended.
        cat_feature_names = [
            name
            for name in (column_mapping.get('categorical_features') or [])
            if is_numeric_dtype(reference_data[name])
        ]
    else:
        known_columns = reference_data.columns
        date_column = 'datetime' if 'datetime' in known_columns else None
        id_column = None
        target_column = 'target' if 'target' in known_columns else None
        prediction_column = 'prediction' if 'prediction' in known_columns else None

        utility_columns = {date_column, id_column, target_column, prediction_column}

        num_feature_names = list(
            set(reference_data.select_dtypes([np.number]).columns) - utility_columns)
        # Builtin ``object`` replaces the ``np.object`` alias, which no longer
        # exists in current NumPy releases.
        cat_feature_names = list(
            set(reference_data.select_dtypes([object]).columns) - utility_columns)

    if production_data is None or target_column is None or prediction_column is None:
        self.wi = None
        return

    # In-place cleanup: callers see the same frame without inf/NaN rows.
    production_data.replace([np.inf, -np.inf], np.nan, inplace=True)
    production_data.dropna(axis=0, how='any', inplace=True)

    graphs = []
    for label in prediction_column:
        is_label = production_data[target_column] == label
        label_rows = production_data[is_label]
        other_rows = production_data[~is_label]

        fig = go.Figure()

        # Rows that truly belong to this label.
        fig.add_trace(
            go.Scatter(
                x=np.random.random(label_rows.shape[0]),
                y=label_rows[label],
                mode='markers',
                name=str(label),
                marker=dict(size=6, color=red)))

        # Every other row, plotted against the same label's column.
        fig.add_trace(
            go.Scatter(
                x=np.random.random(other_rows.shape[0]),
                y=other_rows[label],
                mode='markers',
                name='others',
                marker=dict(size=6, color=grey)))

        # The x jitter lies in [0, 1); the wider range keeps points centred.
        fig.update_layout(
            yaxis_title="Probability",
            xaxis=dict(range=(-2, 3), showticklabels=False))

        fig_json = json.loads(fig.to_json())

        graphs.append({
            "id": "tab_" + str(label),
            "title": str(label),
            "graph": {
                "data": fig_json["data"],
                "layout": fig_json["layout"],
            }
        })

    self.wi = BaseWidgetInfo(
        title=self.title,
        type="tabbed_graph",
        details="",
        alertStats=AlertStats(),
        alerts=[],
        alertsPosition="row",
        insights=[],
        size=1,
        params={"graphs": graphs},
        additionalGraphs=[],
    )
def fit_data(self, df, valid_percentage=0.3):
    """Fit the pipeline on ``df`` and derive importances, an explainer and typical values.

    Steps, in order:

    1. Select ``self.features`` / ``self.response`` from ``df`` and split them
       with ``train_test_split`` (``random_state=42``).
    2. Drop rows whose response is NaN from both partitions.
    3. Fit ``self.pipe`` on the training partition.
    4. Compute permutation importances on the training partition and store
       the features ordered from most to least important in
       ``self.top_features``.
    5. Build a LIME tabular explainer on the output of
       ``self.transform_numeric.transform(X_train)``; regression mode is used
       when the response has a numeric dtype.
    6. Store one "typical" value per feature in ``self.typical_feature_values``
       (a one-row DataFrame): the mean for numeric features, the mode for
       all others.

    :param df: frame holding the feature and response columns.
    :param valid_percentage: fraction of rows passed as ``test_size`` to the
        split.
    """
    X = df[self.features]
    y = df[self.response]
    self.response_is_numeric = is_numeric_dtype(df[self.response])

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=valid_percentage, random_state=42)

    # Drop NaN responses after the split, so the split itself is unaffected.
    train_known = ~pd.isnull(y_train)
    test_known = ~pd.isnull(y_test)
    X_train, y_train = X_train[train_known], y_train[train_known]
    X_test, y_test = X_test[test_known], y_test[test_known]

    self.X = X
    self.y = y
    self.X_train = X_train
    self.y_train = y_train
    self.X_test = X_test
    self.y_test = y_test

    # Train
    self.pipe.fit(X_train, y_train)

    # Importance
    self.perm_imp = permutation_importance(
        self.pipe,
        self.X_train,
        self.y_train,
        n_repeats=10,
        random_state=42,
        n_jobs=2,
    )
    # argsort is ascending: the labels stay in that order, while
    # top_features is reversed so the most important feature comes first.
    self.sorted_idx = self.perm_imp.importances_mean.argsort()
    self.perm_imp_labels = X_train.columns[self.sorted_idx]
    self.top_features = X_train.columns[self.sorted_idx].tolist()[::-1]

    # Explainer
    self.X_numeric = self.transform_numeric.transform(X_train)
    categorical_feature_indices = [
        self.features.index(name) for name in self.categorical_features
    ]
    explainer_options = dict(
        feature_names=self.features,
        discretize_continuous=True,
        categorical_features=categorical_feature_indices,
    )
    if self.response_is_numeric:
        explainer_options.update(class_names=[self.response], mode="regression")
    else:
        explainer_options.update(class_names=self.pipe.classes_.tolist())
    self.explainer = lime.lime_tabular.LimeTabularExplainer(
        self.X_numeric, **explainer_options)

    # Typical Feature Values
    typical_feature_values_list = []
    for col in self.features:
        if col in self.numeric_features:
            mn = df[col].min().item()
            mx = df[col].max().item()
            defv = df[col].mean()
            # Integer columns with a non-degenerate range get an integer
            # default (mean rounded up) instead of a fractional one.
            if mx > mn and df[col].dtype == np.int64:
                defv = np.ceil(defv).astype(int).item()
        else:
            # Series.mode returns every tied mode (sorted); take the first so
            # that multi-modal columns do not raise on ``.item()``.
            defv = df[col].mode().iloc[:1].item()
        typical_feature_values_list.append(defv)

    self.typical_feature_values = pd.DataFrame(
        [typical_feature_values_list], columns=self.features)
def f(df, column):
    """Validate that ``df[column]`` has a numeric dtype.

    :param df: frame containing the column to check.
    :param column: label of the column to check.
    :raises ValueError: if the column's dtype is not numeric; the message
        names the column and the dtype that was found.
    """
    if not is_numeric_dtype(df[column]):
        dtype = df[column].dtype
        msg = (f"Expected type of column {column} to be one of numeric"
               f" but found {dtype} instead!")
        raise ValueError(msg)