def train_cats(df):
    """Change any columns of strings in a pandas DataFrame to columns of
    categorical values. The changes are applied in place.

    Parameters:
    -----------
    df: A pandas DataFrame. Any columns of strings will be changed to
        categorical values.

    Examples:
    ---------
    >>> df = pd.DataFrame({'col1' : [1, 2, 3], 'col2' : ['a', 'b', 'a']})
    >>> df
       col1 col2
    0     1    a
    1     2    b
    2     3    a

    note the type of col2 is string

    >>> train_cats(df)
    >>> df
       col1 col2
    0     1    a
    1     2    b
    2     3    a

    now the type of col2 is category
    """
    for n, c in df.items():
        if is_string_dtype(c):
            df[n] = c.astype('category').cat.as_ordered()

def _check_fields_df(self, df):
    """Check format of the data collected for a certain location."""
    assert df.indicator.isin({
        "Daily hospital occupancy",
        "Daily ICU occupancy",
        "Weekly new hospital admissions",
        "Weekly new ICU admissions",
    }).all(), "One of the indicators for this country is not recognized!"
    assert is_string_dtype(df.date), "The date column is not a string!"

def filter_data(data):
    if is_string_dtype(data[0]):
        data = data.drop(data[data[0].str.contains('VOLTAGE')].index)
        data[0] = data[0].str.replace('LOAD ', '')
    data[0] = pd.to_numeric(data[0], errors='coerce')
    data = data.reset_index(drop=True)
    data[0] = data[0] - data.loc[0, 0]
    data = data.drop(data[data[0] > 200].index)
    return data

def convert_columns(s: Series, drop_first: bool) -> AnyPandas:
    if is_string_dtype(s.dtype) and s.map(lambda v: isinstance(v, str)).all():
        s = s.astype("category")
    if is_categorical_dtype(s):
        out = get_dummies(s, drop_first=drop_first)
        out.columns = [str(s.name) + "." + str(c) for c in out]
        return out
    return s

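# Minimal, self-contained sketch of what convert_columns produces for a string column,
# assuming its helpers come from pandas (pd.get_dummies and pandas.api.types).
import pandas as pd

s = pd.Series(["red", "green", "red"], name="color").astype("category")
dummies = pd.get_dummies(s, drop_first=True)
dummies.columns = [f"{s.name}.{c}" for c in dummies.columns]
print(dummies)  # a single indicator column "color.red"; the first level, "green", is dropped
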
def contains_op(series: pd.Series, state: dict) -> bool:
    is_valid_dtype = pdt.is_categorical_dtype(series) and not pdt.is_bool_dtype(series)
    if is_valid_dtype:
        return True
    elif not pdt.is_object_dtype(series):
        return pandas_has_string_dtype_flag and pdt.is_string_dtype(series)
    return series_is_string(series, state)

def astype(self, dtype, copy=True):
    if isinstance(dtype, type(self.dtype)) and dtype == self.dtype:
        if copy:
            return self.copy()
        return self
    elif is_string_dtype(dtype) and not is_object_dtype(dtype):
        # numpy has problems with astype(str) for nested elements
        return np.array([str(x) for x in self.data], dtype=dtype)
    return np.array(self.data, dtype=dtype, copy=copy)

def create_category_fields(df, is_train=True, train_df=None):
    if is_train:
        for col_name, data in df.items():
            if is_string_dtype(data):
                df[col_name] = data.astype('category').cat.as_ordered()
    else:
        for col_name, data in df.items():
            if (col_name in train_df.columns) and (train_df[col_name].dtype.name == 'category'):
                df[col_name] = pd.Categorical(data,
                                              categories=train_df[col_name].cat.categories,
                                              ordered=True)

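# Hypothetical usage sketch for create_category_fields (toy column names; assumes
# pandas and is_string_dtype from pandas.api.types are imported at module level).
import pandas as pd

train = pd.DataFrame({"fuel": ["gas", "diesel", "gas"], "price": [1, 2, 3]})
valid = pd.DataFrame({"fuel": ["diesel", "gas", "electric"], "price": [4, 5, 6]})
create_category_fields(train, is_train=True)
create_category_fields(valid, is_train=False, train_df=train)
# Labels unseen in training (here "electric") become NaN, because only the
# training categories are applied to the validation frame.
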
def prepare_sklearn_data(self, data_set, data_targets):
    """Encode string columns as integer labels, because sklearn's
    implementation can't handle categorical values."""
    encoder = LabelEncoder()
    for column in data_set:
        if is_string_dtype(data_set[column]):
            encoder.fit(data_set[column])
            data_set[column] = encoder.transform(data_set[column])
    self.sklearn_training_data, self.sklearn_testing_data, \
        self.sklearn_training_targets, self.sklearn_testing_targets = \
        train_test_split(data_set.values.tolist(), data_targets, test_size=0.3)

def format_missings(df):
    for column in df.columns:
        if is_numeric_dtype(df[column]):
            fill_value = df[column].mean()
            df[column] = df[column].fillna(fill_value, downcast=False)
        elif is_object_dtype(df[column]) or is_string_dtype(df[column]):
            df[column] = df[column].fillna('MISSING', downcast=False)
    print("Shape after format_missing:", df.shape)
    return df

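# Minimal sketch of format_missings on a toy frame (assumes numpy/pandas and the
# pandas.api.types dtype helpers used above are imported at module level).
import numpy as np
import pandas as pd

toy = pd.DataFrame({"age": [20.0, np.nan, 40.0], "city": ["Oslo", None, "Lima"]})
toy = format_missings(toy)
# The NaN in "age" is replaced by the column mean (30.0); the None in "city" becomes "MISSING".
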
def df_normalize_strings(df):
    for col in df.columns:
        if is_string_dtype(df[col]) or is_object_dtype(df[col]):
            df[col] = df[col].str.lower()
            df[col] = df[col].fillna(np.nan)  # make None -> np.nan
            df[col] = df[col].replace('none or unspecified', np.nan)
            df[col] = df[col].replace('none', np.nan)
            df[col] = df[col].replace('#name?', np.nan)
            df[col] = df[col].replace('', np.nan)

def FindColumns(dataframe):
    x, y = "", ""
    for column in dataframe.columns:
        if is_string_dtype(dataframe[column]):
            y = column
        if is_numeric_dtype(dataframe[column]):
            x = column
    return x, y

def is_string_dtype(df: pd.DataFrame) -> pd.Series:
    """
    Check whether each series in a DataFrame is of a string dtype.

    Wrapper function that allows the check to be applied to an entire
    DataFrame instead of at the series level. This is a workaround for dill,
    which fails to pickle local contexts in nested lambda statements.
    """
    return df.apply(lambda s: types.is_string_dtype(s), result_type="expand")

def _assign_feature_type(feature_type, unique_count=0):
    if is_string_dtype(feature_type) or (
        is_numeric_dtype(feature_type) and unique_count <= 2
    ):
        return "categorical"
    elif is_numeric_dtype(feature_type):
        return "continuous"
    else:  # pragma: no cover
        return "unknown"

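# Illustrative calls to _assign_feature_type (assumes the pandas.api.types helpers
# used above are imported; the dtypes below are just examples).
import pandas as pd

print(_assign_feature_type(pd.Series(["a", "b"]).dtype))                  # "categorical"
print(_assign_feature_type(pd.Series([0, 1, 0]).dtype, unique_count=2))   # "categorical"
print(_assign_feature_type(pd.Series([1.5, 2.5]).dtype, unique_count=9))  # "continuous"
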
def get_numeric_string_columns(self):
    numeric_columns = []
    string_columns = []
    for column in self.df.columns:
        if is_string_dtype(self.df[column]):
            string_columns.append(column)
        if is_numeric_dtype(self.df[column]):
            numeric_columns.append(column)
    return numeric_columns, string_columns

def __array__(self, dtype=None, copy=False):
    if dtype is None or is_object_dtype(dtype):
        return self._to_array_of_quantity(copy=copy)
    if (isinstance(dtype, str) and dtype == "string") or isinstance(dtype, pd.StringDtype):
        return pd.array([str(x) for x in self.quantity], dtype=pd.StringDtype())
    if is_string_dtype(dtype):
        return np.array([str(x) for x in self.quantity], dtype=str)
    return np.array(self._data, dtype=dtype, copy=copy)

def str_dtype_to_cats(df):
    """
    Convert, in place, all string-dtype columns of the DataFrame (df) to
    ordered categories.
    """
    for col_name, col_series in df.items():
        if is_string_dtype(col_series):
            df[col_name] = col_series.astype('category').cat.as_ordered()
    return df

def clean_data(data_frame):
    # Clean the data
    data_frame = data_frame.dropna()
    data_frame = data_frame.drop_duplicates(keep='first')  # Removing duplicates
    for column in data_frame:
        if is_string_dtype(data_frame[column]):
            data_frame[column] = data_frame[column].apply(remove_non_ascii)
    return data_frame

def reduce_df(self, df, num_cols):
    # add 1 column if first column has strings (i.e. filenames, not mfcc data)
    dfc = df.copy()
    if is_string_dtype(dfc[dfc.columns[0]]):
        num_cols += 1
    cols_red = [i for i in range(num_cols)]
    df_red = dfc[cols_red]
    df_var = dfc[dfc.columns[-1]]
    df_red = pd.concat([df_red, df_var], axis=1)
    return df_red

def filter_data(data, drop):
    keep = 'VOLTAGE ' if drop == 'LOAD' else 'LOAD '
    if is_string_dtype(data[0]):
        data = data.drop(data[data[0].str.contains(drop)].index)
        data[0] = data[0].str.replace(keep, '')
    data[0] = pd.to_numeric(data[0], errors='coerce')
    data = data.reset_index(drop=True)
    data[0] = data[0] - data.loc[0, 0]
    data = data.drop(data[data[0] > 200].index)
    return data

def string_conversion(df):
    """
    All the string columns in the dataframe are converted to lower case.

    :param df: dataframe
    :return: dataframe
    """
    for columns in df.columns:
        if is_string_dtype(df[columns]):
            df[columns] = df[columns].str.lower()
    return df

def train_cats(df, max_n_cat):
    """
    If the dtype is string, or if the dtype is numeric and the cardinality is
    at most max_n_cat (but not exactly 2), change the dtype to category.
    """
    for n, c in df.items():
        if is_string_dtype(c) or (is_numeric_dtype(c) and
                                  c.nunique() != 2 and c.nunique() <= max_n_cat):
            df[n] = c.astype('category').cat.as_ordered()

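# Hedged usage sketch for train_cats with max_n_cat (toy data): string columns and
# low-cardinality numeric columns become categories, binary numeric columns are left alone.
import pandas as pd

df = pd.DataFrame({"color": ["red", "blue", "red"],  # string            -> category
                   "rooms": [2, 3, 4],               # 3 uniques <= 5    -> category
                   "flag": [0, 1, 0]})               # exactly 2 uniques -> unchanged
train_cats(df, max_n_cat=5)
print(df.dtypes)
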
def object_contains(series: pd.Series, state: dict) -> bool:
    is_object = pdt.is_object_dtype(series)
    if is_object:
        ret = True
    elif pandas_has_string_dtype_flag:
        ret = pdt.is_string_dtype(series) and not pdt.is_categorical_dtype(series)
    else:
        ret = False
    return ret

def components(self, data, eval_env):
    # Returns components and whether they are categoric or numeric
    x = data[self.variable]
    if is_numeric_dtype(x):
        type_ = "numeric"
    elif is_string_dtype(x) or is_categorical_dtype(x):
        type_ = "categoric"
    else:
        raise NotImplementedError
    return {self.name: type_}

def makeFeatName(self, data):
    featName = {}
    for col in data:
        if is_categorical_dtype(data[col]) or \
                is_object_dtype(data[col]) or \
                is_string_dtype(data[col]):
            featName[col] = 'discrete'
        else:
            featName[col] = 'continue'
    return featName

def response_processing(df, response):
    # Check var type of response
    ########################################
    # Decision rules for categorical:
    #   - If string
    #   - If unique values make up less than 5% of total obs
    response_col = df[response]
    # Replace NAs with 0s
    response_col.fillna(0, inplace=True)
    resp_string_check = is_string_dtype(response_col)
    resp_unique_ratio = len(np.unique(response_col.values)) / len(response_col.values)
    if resp_string_check or resp_unique_ratio < 0.05:
        resp_type = "Categorical"
        # Plot histogram
        # resp_col_plot = response_col.to_frame()
        resp_plot = px.histogram(response_col)
        resp_plot.write_html(file="./midterm_plots/response.html",
                             include_plotlyjs="cdn")
        # Encode
        response_col = pd.Categorical(response_col, categories=response_col.unique())
        response_col, resp_labels = pd.factorize(response_col)
        response_col = pd.DataFrame(response_col, columns=[response])
        response_col_uncoded = df[response]
    else:
        resp_type = "Continuous"
        response_col_uncoded = []
        # Plot histogram
        resp_plot = px.histogram(response_col)
        resp_plot.write_html(file="./midterm_plots/response.html",
                             include_plotlyjs="cdn")
    # Get response mean
    resp_mean = response_col.mean()
    if resp_type == "Categorical":
        print(
            "\nThis script uses Plotly to generate plots, which does not support logistic regression trendlines."
        )
        print(
            "Plots will reflect linear probability models, not logit regressions.\n"
        )
    return response_col, resp_type, resp_mean, response_col_uncoded

def searchTable(self):
    """Searches the table and presents the search results as a view"""
    try:
        # search table and generates view
        self.view = self.view.iloc[0:0]
        searchQuery = str(self.search.text())
        # throws an error if there is no search value
        if searchQuery == "":
            raise ValueError("Please enter a search query!")
        # creates dictionary to hold the float, int and str values of the query
        queryDict = {'string': searchQuery}
        # if value can be an integer or float, it will convert accordingly,
        # else it will set a default value of 0.
        try:
            queryDict['int'] = int(searchQuery)
            queryDict['float'] = float(searchQuery)
        except:
            queryDict['int'] = 0
            queryDict['float'] = 0.0
        # initialises the view dataframe
        for i in self.table.columns:
            loopQ = ""
            # sets the query datatype according to the type in the column
            # (check float before the generic numeric check, which also matches floats)
            if is_float_dtype(self.table[i].dtype):
                loopQ = queryDict['float']
                queryBool = self.table[i] == loopQ
            elif is_numeric_dtype(self.table[i].dtype):
                loopQ = queryDict['int']
                queryBool = self.table[i] == loopQ
            elif is_string_dtype(self.table[i].dtype):
                loopQ = queryDict['string']
                queryBool = self.table[i].str.contains(loopQ, case=False, regex=True)
            else:
                continue
            # if there are matches, add the rows to the view, otherwise show no result
            if self.table[queryBool].empty is False:
                self.view = pd.concat([self.view, self.table.loc[queryBool]])
            else:
                continue
        # displays view
        if not self.view.empty:
            self.csvTable.setSortingEnabled(True)
            self.csvTable.setModel(PandasModel(self.view))
            self.csvTable.resizeColumnsToContents()
            self.csvTable.show()
        else:
            raise ValueError("No Value Found!")
    except Exception as e:
        mod.errorGUI(str(e))

def fit(self, X, y):
    """Fit the Imputer to the dataset and determine the right approach.

    Args:
        X (pd.Series): Dataset to fit the imputer, or predictors
        y (pd.Series): None, or dataset to fit predictors

    Returns:
        self. Instance of the class.
    """
    # start off with stats blank
    stats = {"param": None, "strategy": None}

    # if y is None, fitting simply X. univariate method.
    if y is None:
        if is_numeric_dtype(X):
            stats = {"param": self.num_imputer.fit(X, y),
                     "strategy": self.num_imputer.strategy}
        if is_string_dtype(X):
            stats = {"param": self.cat_imputer.fit(X, y),
                     "strategy": self.cat_imputer.strategy}

    # if y is not None, fitting X to y. predictive method.
    if y is not None:
        if is_numeric_dtype(y):
            stats = {"param": self.num_imputer.fit(X, y),
                     "strategy": self.num_imputer.strategy}
        if is_string_dtype(y):
            stats = {"param": self.cat_imputer.fit(X, y),
                     "strategy": self.cat_imputer.strategy}

    # return final stats
    self.statistics_ = stats
    return self

def __speculate_ordered_index(self, s):
    s = s.copy()
    if is_string_dtype(s):
        return (True, s.iloc[0])
    step = len(s) // 100 + 1
    # stop at len(s) - step so that i + step never indexes past the end
    for i in range(0, len(s) - step, step):
        if s.iloc[i + step] - s.iloc[i] != step:
            return (False, )
    return True, s.iloc[0]

def test_svdm_nan_row(self):
    """Tests that correct svdm is computed if NaNs occur in a row of a column"""
    df = pd.DataFrame({
        "A": ["high", np.nan, "high", "low", "low", "high"],
        "B": [3, 2, 1, 1, 1, 2],
        "C": [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
        "Class": ["apple", "apple", "banana", "banana", "banana", "banana"]
    })
    class_col_name = "Class"
    lookup = {
        "A": {
            'high': 3,
            'low': 2,
            CONDITIONAL: {
                'high': Counter({'banana': 2, 'apple': 1}),
                'low': Counter({'banana': 2})
            }
        }
    }
    rule = pd.Series({"A": "high", "B": (1, 1), "C": "bla", "Class": "banana"})
    classes = ["apple", "banana"]
    correct = [
        pd.Series([0.0, 1.0, 0.0, 2 / 3 * 2 / 3, 2 / 3 * 2 / 3, 0.0], name="A"),
        pd.Series([1.0, 1.0, 1.0, 1.0, 1.0, 1.0], name="A")
    ]
    j = 0
    for i, col_name in enumerate(df):
        if col_name == class_col_name:
            continue
        col = df[col_name]
        if is_string_dtype(col):
            dist = svdm(col, rule, lookup, classes)
            if j == 0:
                self.assertTrue(np.allclose(correct[0], dist))
            else:
                self.assertTrue(dist.equals(correct[j]))
            j += 1

def eval(self, data, eval_env, encoding, is_response=False):
    # Workaround: var names present in 'data' are taken from __DATA__['col'];
    # the rest are left as they are and looked up in the upper namespace
    data_cols = data.columns.tolist()
    x = eval_in_data_mask(self.get_eval_str(data_cols), data, eval_env)
    if is_categorical_dtype(x) or is_string_dtype(x):
        return self.eval_categoric(x, encoding, is_response)
    elif is_numeric_dtype(x):
        return self.eval_numeric(x)
    else:
        return NotImplemented

def components(self, data, eval_env):
    data_cols = data.columns.tolist()
    x = eval_in_data_mask(self.get_eval_str(data_cols), data, eval_env)
    if is_numeric_dtype(x):
        type_ = "numeric"
    elif is_string_dtype(x) or is_categorical_dtype(x) or isinstance(x, dict):
        type_ = "categoric"
    else:
        raise NotImplementedError
    return {self.name: type_}

def generate_plotly_dim_dict(df, field):
    dim_dict = {}
    dim_dict["label"] = field
    column = df[field]
    if is_numeric_dtype(column):
        dim_dict["values"] = column
    elif is_string_dtype(column):
        texts = column.unique()
        dim_dict["values"] = [
            np.argwhere(texts == x).flatten()[0] for x in column
        ]
        dim_dict["tickvals"] = list(range(len(texts)))
        dim_dict["ticktext"] = texts
    else:
        raise Exception("Unidentifiable Type")
    return dim_dict

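# Hypothetical usage: feed the dimension dicts into a Plotly parallel-coordinates plot
# (column names are invented; assumes plotly, numpy and the dtype helpers are imported).
import pandas as pd
import plotly.graph_objects as go

runs = pd.DataFrame({"lr": [0.1, 0.01, 0.001], "optimizer": ["sgd", "adam", "sgd"]})
dims = [generate_plotly_dim_dict(runs, field) for field in runs.columns]
fig = go.Figure(go.Parcoords(dimensions=dims))
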
def test__write_frame__read_frame():
    print("Start")
    from django.db import utils
    from econdata.models import Listing
    from libclair.dataframes import write_frame_create, read_frame, write_frame

    # Create a DataFrame and write it into the database
    fr1 = pd.DataFrame([{'id': 'foo-1', 'site': 'a', 'id_site': '1',
                         'title': 'The 1st record.'},
                        {'id': 'foo-2', 'site': 'a', 'id_site': '2',
                         'title': 'The 2nd record.'}])
    print('\nfr1:\n', fr1)
    write_frame_create(fr1, Listing, delete=True)

    # The records already exist. Creating them again, without deleting them,
    # must raise an exception.
    with pytest.raises(utils.IntegrityError):
        write_frame_create(fr1, Listing)

    # Read the records, that were just created, from the database.
    # Read a few additional empty columns.
    qset = Listing.objects.filter(id__in=['foo-1', 'foo-2'])
    fr2 = read_frame(qset, ['id', 'title', 'time', 'price'])
    print('\nfr2:\n', fr2)
    assert pd_types.is_string_dtype(fr2['title'])
    assert pd_types.is_datetime64_any_dtype(fr2['time'])
    assert pd_types.is_numeric_dtype(fr2['price'])
    assert fr2['id'][0] == 'foo-1'
    assert fr2['id'][1] == 'foo-2'
    assert fr2['title'][0] == 'The 1st record.'
    assert fr2['title'][1] == 'The 2nd record.'

    # Change the dataframe
    fr2['time'] = [pd.Timestamp('2017-01-01 12:00+0'),
                   pd.Timestamp('2017-01-02 12:00+0')]
    fr2['price'] = [101.0, 102.0]
    print('\nfr2:\n', fr2)
    # Update the records in the database
    write_frame(fr2, Listing)

    # Read the updated records from the database.
    qset = Listing.objects.filter(id__in=['foo-1', 'foo-2'])
    fr3 = read_frame(qset, ['id', 'title', 'time', 'price'])
    print('\nfr3:\n', fr3)
    assert_frames_equal(fr2, fr3)

def dtype_detection(data, category_detection=True, StructureText_detection=True,
                    datetime_to_category=True, criterion='sqrt', min_mean_counts=5,
                    fix=False):
    '''Detect the data type of a single variable.

    The variable is classified into one of the following types:
    1. number: numeric
    2. category: categorical (factor)
    3. datetime
    4. text
    5. text_st: structured text, e.g. IDs
    6. group_number: grouped/continuous numbers

    parameter
    ---------
    data: pd.Series, one-dimensional only
        # if data is passed, the function may change its original dtype
    category_detection: bool, use nunique to detect whether the variable is categorical
    StructureText_detection: bool, detect structured text, e.g. a column whose values
        all share a separator such as "-"
    datetime_to_category: whether to convert a datetime series to categorical when its
        nunique is small
    criterion: string or int, optional (default="sqrt", i.e. the square root of the
        sample size). Supported values: 'sqrt' (square root of the sample size),
        an int (absolute count), or a float in (0, 1) (fraction of the sample size).
        When detecting categorical variables, a feature is treated as categorical
        if its nunique is below criterion.
    min_mean_counts: default 5. For a numeric variable to be treated as categorical,
        the mean frequency per category must be at least min_mean_counts.
    fix: bool, whether to also return the data with the corrected dtype

    return
    ------
    result: dict{'name': column name, 'vtype': variable type,
                 'ordered': whether it is an ordered categorical,
                 'categories': all the categories}
    '''
    assert len(data.shape) == 1
    data = data.copy()
    data = pd.Series(data)
    dtype, name, n_sample = data.dtype, data.name, data.count()
    if criterion == 'sqrt':
        max_nuniques = np.sqrt(n_sample)
    elif isinstance(criterion, int):
        max_nuniques = criterion
    elif isinstance(criterion, float) and (0 < criterion < 1):
        max_nuniques = criterion
    else:
        max_nuniques = np.sqrt(n_sample)
    ordered = False
    categories = []
    if is_numeric_dtype(dtype):
        vtype = 'number'
        ordered = False
        categories = []
        # Fix mis-typed numbers, e.g. turn 1.0, 2.0, 3.0 into 1, 2, 3
        if data.dropna().astype(np.int64).sum() == data.dropna().sum():
            data[data.notnull()] = data[data.notnull()].astype(np.int64)
        if category_detection:
            nunique = len(data.dropna().unique())
            mean_counts = data.value_counts().median()
            if nunique < max_nuniques and mean_counts >= min_mean_counts:
                data = data.astype('category')
                ordered = data.cat.ordered
                vtype = 'category'
                categories = list(data.dropna().cat.categories)
        result = {'name': name, 'vtype': vtype, 'ordered': ordered, 'categories': categories}
    elif is_string_dtype(dtype):
        # Handle datetime-like strings
        tmp = data.map(lambda x: np.nan if '%s' % x == 'nan' else len('%s' % x))
        tmp = tmp.dropna().astype(np.int64)
        if not any(data.dropna().map(is_number)) and 7 < tmp.max() < 20 and tmp.std() < 0.1:
            try:
                data = pd.to_datetime(data)
            except Exception:
                pass
        # Handle possible categorical types
        # (if datetime-to-category conversion is enabled and nunique is small, convert)
        if datetime_to_category:
            if len(data.dropna().unique()) < np.sqrt(n_sample):
                data = data.astype('category')
        else:
            nunique = len(data.dropna().unique())
            # print(data.dtype)
            if not is_categorical_dtype(data.dtype) and \
                    not np.issubdtype(data.dtype, np.datetime64) and nunique < max_nuniques:
                data = data.astype('category')
        # For non-categorical columns, convert percentage strings to floats,
        # e.g. 21.12% -> 0.2112
        if is_string_dtype(data.dtype) and not is_categorical_dtype(data.dtype) \
                and all(data.str.contains('%')):
            data = data.str.strip('%').astype(np.float64) / 100
        if is_categorical_dtype(data.dtype):
            vtype = 'category'
            categories = list(data.cat.categories)
            ordered = data.cat.ordered
        # Datetime format
        elif np.issubdtype(data.dtype, np.datetime64):
            vtype = 'datetime'
        # Structured text?
        elif StructureText_detection and tmp.dropna().std() == 0:
            # Not iterable, so it cannot be structured text
            if not isinstance(data.dropna().iloc[0], Iterable):
                vtype = 'text'
            else:
                k = set(list(data.dropna().iloc[0]))
                for x in data:
                    if isinstance(x, str) and len(x) > 0:
                        k &= set(list(x))
                if len(k) > 0:
                    vtype = 'text_st'
                else:
                    vtype = 'text'
        elif is_numeric_dtype(data.dtype):
            vtype = 'number'
            ordered = False
            categories = []
        else:
            vtype = 'text'
        result = {'name': name, 'vtype': vtype, 'ordered': ordered, 'categories': categories}
    elif is_datetime64_any_dtype(dtype):
        vtype = 'datetime'
        result = {'name': name, 'vtype': vtype, 'ordered': ordered, 'categories': categories}
    else:
        print('unknown dtype!')
        result = None
    if fix:
        return result, data
    else:
        return result

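# Illustrative sketch of dtype_detection on a toy Series: a low-cardinality numeric
# column is reported as a categorical variable and, with fix=True, returned converted.
import pandas as pd

s = pd.Series([1.0, 2.0] * 10, name="segment")
result, fixed = dtype_detection(s, fix=True)
print(result["vtype"])  # "category"
print(fixed.dtype)      # category
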