Esempio n. 1
0
def train_cats(df):
    """Turn every string column of a pandas DataFrame into an ordered
    categorical column. The frame is modified in place; nothing is returned.

    Parameters:
    -----------
    df: A pandas DataFrame. Each column for which ``is_string_dtype`` is
        true is replaced by its ordered categorical equivalent; every other
        column is left as it is.

    Examples:
    ---------

    >>> df = pd.DataFrame({'col1' : [1, 2, 3], 'col2' : ['a', 'b', 'a']})
    >>> df
       col1 col2
    0     1    a
    1     2    b
    2     3    a

    at this point col2 holds plain strings

    >>> train_cats(df)
    >>> df

       col1 col2
    0     1    a
    1     2    b
    2     3    a

    the printed frame is unchanged, but col2 now has the category dtype
    """
    for name, column in df.items():
        if not is_string_dtype(column):
            continue
        as_category = column.astype('category')
        df[name] = as_category.cat.as_ordered()
Esempio n. 2
0
 def _check_fields_df(self, df):
     """Validate the layout of the data collected for one location.

     Asserts that every value of ``df.indicator`` is one of the four known
     indicator names and that ``df.date`` has a string dtype. Returns nothing.
     """
     known_indicators = {
         "Daily hospital occupancy",
         "Daily ICU occupancy",
         "Weekly new hospital admissions",
         "Weekly new ICU admissions",
     }
     indicators_ok = df.indicator.isin(known_indicators).all()
     assert indicators_ok, "One of the indicators for this country is not recognized!"
     date_is_string = is_string_dtype(df.date)
     assert date_is_string, "The date column is not a string!"
Esempio n. 3
0
def filter_data(data):
    """Normalise column ``0`` of ``data`` and drop rows above 200.

    When column ``0`` is a string column, rows containing ``'VOLTAGE'`` are
    dropped, the ``'LOAD '`` text is removed, the remainder is converted to
    numbers (unparseable entries become NaN) and the index is reset.
    Column ``0`` is then shifted so the row labelled ``0`` becomes zero, and
    rows whose shifted value exceeds 200 are removed.
    """
    if is_string_dtype(data[0]):
        voltage_rows = data[data[0].str.contains('VOLTAGE')].index
        data = data.drop(voltage_rows)
        without_prefix = data[0].str.replace('LOAD ', '')
        data[0] = pd.to_numeric(without_prefix, errors='coerce')
        data = data.reset_index(drop=True)
    origin = data.loc[0, 0]
    data[0] = data[0] - origin
    above_limit = data[data[0] > 200].index
    return data.drop(above_limit)
Esempio n. 4
0
def convert_columns(s: Series, drop_first: bool) -> AnyPandas:
    """Expand a string/categorical series into dummy columns.

    A series with a string dtype whose values are all ``str`` instances is
    first cast to ``category``. A categorical series is then passed to
    ``get_dummies`` and the resulting columns are renamed to
    ``"<series name>.<level>"``. Any other series is returned unchanged.
    """
    only_str_values = (
        is_string_dtype(s.dtype) and s.map(lambda v: isinstance(v, str)).all()
    )
    if only_str_values:
        s = s.astype("category")

    if not is_categorical_dtype(s):
        return s

    dummies = get_dummies(s, drop_first=drop_first)
    prefix = str(s.name)
    dummies.columns = [prefix + "." + str(level) for level in dummies]
    return dummies
Esempio n. 5
0
    def contains_op(series: pd.Series, state: dict) -> bool:
        """Decide whether ``series`` qualifies, based on its dtype.

        Categorical (but not boolean) series qualify immediately. Object
        series are delegated to ``series_is_string``. Everything else
        qualifies only when ``pandas_has_string_dtype_flag`` is truthy and
        pandas reports a string dtype.
        """
        if pdt.is_categorical_dtype(series) and not pdt.is_bool_dtype(series):
            return True
        if pdt.is_object_dtype(series):
            return series_is_string(series, state)
        return pandas_has_string_dtype_flag and pdt.is_string_dtype(series)
Esempio n. 6
0
 def astype(self, dtype, copy=True):
     """Convert the stored data to ``dtype``.

     When ``dtype`` equals this object's own dtype, return ``self.copy()``
     (or ``self`` when ``copy`` is false). For string dtypes other than
     object, each element of ``self.data`` is converted with ``str`` before
     the array is built. Otherwise ``self.data`` is handed to ``np.array``.
     """
     same_dtype = isinstance(dtype, type(self.dtype)) and dtype == self.dtype
     if same_dtype:
         return self.copy() if copy else self
     if is_string_dtype(dtype) and not is_object_dtype(dtype):
         # numpy has problems with astype(str) for nested elements
         as_text = [str(item) for item in self.data]
         return np.array(as_text, dtype=dtype)
     return np.array(self.data, dtype=dtype, copy=copy)
Esempio n. 7
0
def create_category_fields(df, is_train=True, train_df=None):
    """Convert columns of ``df`` to ordered categoricals, in place.

    With ``is_train`` true, every string column becomes an ordered
    categorical built from its own values. Otherwise, each column that also
    exists in ``train_df`` with the ``category`` dtype is re-encoded as an
    ordered categorical using ``train_df``'s categories for that column.
    """
    for col_name, data in df.items():
        if is_train:
            if is_string_dtype(data):
                df[col_name] = data.astype('category').cat.as_ordered()
            continue
        is_known_category = (col_name in train_df.columns) and (
            train_df[col_name].dtype.name == 'category')
        if is_known_category:
            train_levels = train_df[col_name].cat.categories
            df[col_name] = pd.Categorical(data, categories=train_levels, ordered=True)
Esempio n. 8
0
 def prepare_sklearn_data(self, data_set, data_targets):
     """Label-encode the string columns of ``data_set`` and split the data.

     Each string column is replaced in place by its ``LabelEncoder`` codes.
     The rows and ``data_targets`` are then split with ``test_size=0.3`` and
     the four parts are stored on ``self`` as the sklearn training/testing
     data and targets.
     """
     encoder = LabelEncoder()
     for column in data_set:
         values = data_set[column]
         if not is_string_dtype(values):
             continue
         encoder.fit(values)
         data_set[column] = encoder.transform(values)
     rows = data_set.values.tolist()
     split = train_test_split(rows, data_targets, test_size=0.3)
     (self.sklearn_training_data, self.sklearn_testing_data,
      self.sklearn_training_targets, self.sklearn_testing_targets) = split
Esempio n. 9
0
def format_missings(df):
    """Fill missing values column by column, in place, and return ``df``.

    Numeric columns are filled with the column mean; object and string
    columns are filled with the literal ``'MISSING'``. Columns of any other
    dtype are left untouched. The shape of the frame is printed before it is
    returned.

    The ``downcast`` keyword of ``fillna`` is deliberately not used: it is
    deprecated in recent pandas releases and rejected by newer ones.
    """
    for column in df.columns:
        values = df[column]
        if is_numeric_dtype(values):
            df[column] = values.fillna(values.mean())
        elif is_object_dtype(values) or is_string_dtype(values):
            # Only touch columns that really have gaps: a filled column always
            # contains the 'MISSING' string, so pandas versions that still
            # downcast object results cannot change its dtype.
            if values.isna().any():
                df[column] = values.fillna('MISSING')
    print("Shape after format_missing:", df.shape)
    return df
Esempio n. 10
0
def df_normalize_strings(df):
    """Lower-case the string/object columns of ``df`` in place and turn
    placeholder values into NaN.

    After lower-casing, the values ``'none or unspecified'``, ``'none'``,
    ``'#name?'`` and the empty string are replaced by ``np.nan``.
    """
    placeholders = ('none or unspecified', 'none', '#name?', '')
    for col in df.columns:
        column = df[col]
        if not (is_string_dtype(column) or is_object_dtype(column)):
            continue
        cleaned = column.str.lower()
        cleaned = cleaned.fillna(np.nan)  # make None -> np.nan
        for placeholder in placeholders:
            cleaned = cleaned.replace(placeholder, np.nan)
        df[col] = cleaned
Esempio n. 11
0
def FindColumns(dataframe):
    """Return ``(x, y)``: the last numeric column name and the last string
    column name of ``dataframe``.

    Either entry is the empty string when no column of that kind exists.
    """
    numeric_name = ""
    string_name = ""
    for name in dataframe.columns:
        series = dataframe[name]
        if is_string_dtype(series):
            string_name = name
        if is_numeric_dtype(series):
            numeric_name = name

    return numeric_name, string_name
Esempio n. 12
0
def is_string_dtype(df: pd.DataFrame) -> pd.Series:
    """
    Report, for each column of a DataFrame, whether it has a string dtype.

    This wraps the series-level ``types.is_string_dtype`` so the check can be
    run on a whole dataframe at once. It exists as a workaround for dill,
    which fails to pickle local contexts in nested lambda statements.
    """
    column_check = types.is_string_dtype
    return df.apply(column_check, result_type="expand")
Esempio n. 13
0
def _assign_feature_type(feature_type, unique_count=0):
    """Classify a feature as ``"categorical"``, ``"continuous"`` or
    ``"unknown"``.

    String dtypes are categorical. Numeric dtypes are categorical when
    ``unique_count`` is at most 2 and continuous otherwise. Anything else is
    unknown.
    """
    if is_string_dtype(feature_type):
        return "categorical"
    if not is_numeric_dtype(feature_type):
        return "unknown"  # pragma: no cover
    return "categorical" if unique_count <= 2 else "continuous"
Esempio n. 14
0
    def get_numeric_string_columns(self):
        """Return ``(numeric_columns, string_columns)`` for ``self.df``.

        Both are lists of column names in the frame's column order.
        """
        frame = self.df
        names = list(frame.columns)
        string_columns = [name for name in names if is_string_dtype(frame[name])]
        numeric_columns = [name for name in names if is_numeric_dtype(frame[name])]

        return numeric_columns, string_columns
Esempio n. 15
0
 def __array__(self, dtype=None, copy=False):
     """Convert this object to an array of the requested ``dtype``.

     With no dtype or an object dtype, ``self._to_array_of_quantity`` is
     used. For the pandas string dtype (``"string"`` or a
     ``pd.StringDtype`` instance) a pandas string array of ``str(x)`` values
     is returned. For other string dtypes a numpy ``str`` array of the same
     values is returned. Any other dtype converts ``self._data`` directly.
     """
     if dtype is None or is_object_dtype(dtype):
         return self._to_array_of_quantity(copy=copy)

     wants_pandas_string = isinstance(dtype, pd.StringDtype) or (
         isinstance(dtype, str) and dtype == "string"
     )
     if wants_pandas_string:
         as_text = [str(item) for item in self.quantity]
         return pd.array(as_text, dtype=pd.StringDtype())

     if is_string_dtype(dtype):
         as_text = [str(item) for item in self.quantity]
         return np.array(as_text, dtype=str)

     return np.array(self._data, dtype=dtype, copy=copy)
Esempio n. 16
0
def str_dtype_to_cats(df):
    """
    Replace, in place, every string-dtype column of the data-frame (df)
    with an ordered categorical version of itself, then return df.
    """

    for col_name, col_series in df.items():
        if not is_string_dtype(col_series):
            continue
        as_category = col_series.astype('category')
        df[col_name] = as_category.cat.as_ordered()
    return df
Esempio n. 17
0
def clean_data(data_frame):
    """Return a cleaned copy of ``data_frame``.

    Rows containing any missing value are dropped, duplicate rows are removed
    (the first occurrence is kept), and ``remove_non_ascii`` is applied
    element-wise to every string column. The frame passed in is not modified;
    callers must use the returned frame.
    """
    # dropna() and drop_duplicates() return new frames rather than modifying
    # the receiver, so their results have to be kept. The explicit copy makes
    # the result an independent frame that is safe to assign columns on.
    cleaned = data_frame.dropna().drop_duplicates(keep='first').copy()

    for column in cleaned:
        if is_string_dtype(cleaned[column]):
            cleaned[column] = cleaned[column].apply(remove_non_ascii)

    return cleaned
 def reduce_df(self, df, num_cols):
     """Return the columns labelled ``0 .. num_cols - 1`` of ``df`` followed
     by its last column.

     One extra leading column is kept when the first column holds strings
     (i.e. filenames, not mfcc data). ``df`` itself is not modified.
     """
     dfc = df.copy()
     first_column = dfc[dfc.columns[0]]
     if is_string_dtype(first_column):
         num_cols += 1
     leading = dfc[list(range(num_cols))]
     trailing = dfc[dfc.columns[-1]]
     return pd.concat([leading, trailing], axis=1)
Esempio n. 19
0
def filter_data(data, drop):
    """Normalise column ``0`` of ``data`` and drop rows above 200.

    When column ``0`` is a string column, rows containing ``drop`` are
    removed, the other label (``'VOLTAGE '`` when ``drop`` is ``'LOAD'``,
    otherwise ``'LOAD '``) is removed from the remaining values, which are
    converted to numbers (unparseable entries become NaN), and the index is
    reset. Column ``0`` is then shifted so the row labelled ``0`` becomes
    zero, and rows whose shifted value exceeds 200 are removed.
    """
    if drop == 'LOAD':
        keep = 'VOLTAGE '
    else:
        keep = 'LOAD '
    if is_string_dtype(data[0]):
        unwanted_rows = data[data[0].str.contains(drop)].index
        data = data.drop(unwanted_rows)
        without_label = data[0].str.replace(keep, '')
        data[0] = pd.to_numeric(without_label, errors='coerce')
        data = data.reset_index(drop=True)
    origin = data.loc[0, 0]
    data[0] = data[0] - origin
    above_limit = data[data[0] > 200].index
    return data.drop(above_limit)
Esempio n. 20
0
def string_conversion(df):
    """
    Lower-case every string column of the dataframe in place.
    :param df: dataframe
    :return: the same dataframe
    """
    for name in df.columns:
        column = df[name]
        if is_string_dtype(column):
            df[name] = column.str.lower()
    return df
def train_cats(df, max_n_cat):
    """
    Convert columns of df to ordered categoricals, in place.

    A column is converted when its dtype is string, or when its dtype is
    numeric and its number of distinct values is not 2 and is at most
    max_n_cat.
    """
    for name, column in df.items():
        convert = is_string_dtype(column)
        if not convert and is_numeric_dtype(column):
            cardinality = column.nunique()
            convert = cardinality != 2 and cardinality <= max_n_cat
        if convert:
            df[name] = column.astype('category').cat.as_ordered()
Esempio n. 22
0
def object_contains(series: pd.Series, state: dict) -> bool:
    """Decide whether ``series`` qualifies, based on its dtype.

    Object series always qualify. Otherwise the series qualifies only when
    ``pandas_has_string_dtype_flag`` is truthy and the series has a string
    dtype that is not categorical.
    """
    if pdt.is_object_dtype(series):
        return True
    if not pandas_has_string_dtype_flag:
        return False
    return pdt.is_string_dtype(series) and not pdt.is_categorical_dtype(series)
Esempio n. 23
0
 def components(self, data, eval_env):
     """Map ``self.name`` to ``"numeric"`` or ``"categoric"`` according to the
     dtype of ``data[self.variable]``.

     Raises ``NotImplementedError`` for any other dtype.
     """
     column = data[self.variable]
     if is_numeric_dtype(column):
         return {self.name: "numeric"}
     if is_string_dtype(column) or is_categorical_dtype(column):
         return {self.name: "categoric"}
     raise NotImplementedError
Esempio n. 24
0
 def makeFeatName(self, data):
     """Label each column of ``data`` as ``'discrete'`` or ``'continue'``.

     Categorical, object and string columns are ``'discrete'``; all other
     columns are ``'continue'``. Returns a dict keyed by column name.
     """
     def _is_discrete(column):
         return (is_categorical_dtype(column)
                 or is_object_dtype(column)
                 or is_string_dtype(column))

     return {
         col: 'discrete' if _is_discrete(data[col]) else 'continue'
         for col in data
     }
Esempio n. 25
0
def response_processing(df, response):
    """Classify the response column, plot it and encode it when categorical.

    The response is treated as categorical when it has a string dtype or
    when its distinct values make up less than 5% of all observations.
    A histogram of the response is written to
    ``./midterm_plots/response.html`` in both cases.

    Returns ``(response_col, resp_type, resp_mean, response_col_uncoded)``.
    For a categorical response, ``response_col`` is a one-column DataFrame of
    integer codes and ``response_col_uncoded`` is ``df[response]``. For a
    continuous response, ``response_col`` is the column itself and
    ``response_col_uncoded`` is an empty list.
    """
    response_col = df[response]

    # Missing responses are counted as 0.
    response_col.fillna(0, inplace=True)

    is_string = is_string_dtype(response_col)
    raw_values = response_col.values
    unique_ratio = len(np.unique(raw_values)) / len(response_col.values)
    is_categorical = is_string or unique_ratio < 0.05

    # The histogram is always drawn from the un-encoded response.
    resp_plot = px.histogram(response_col)
    resp_plot.write_html(file="./midterm_plots/response.html",
                         include_plotlyjs="cdn")

    if is_categorical:
        resp_type = "Categorical"

        # Encode levels as integer codes, in order of first appearance.
        levels = pd.Categorical(response_col,
                                categories=response_col.unique())
        codes, _ = pd.factorize(levels)

        response_col = pd.DataFrame(codes, columns=[response])
        response_col_uncoded = df[response]
    else:
        resp_type = "Continuous"
        response_col_uncoded = []

    resp_mean = response_col.mean()

    if is_categorical:
        print(
            "\nThis script uses Plotly to generate plots, which does not support logistic regression trendlines."
        )
        print(
            "Plots will reflect linear probability models, not logit regressions.\n"
        )

    return response_col, resp_type, resp_mean, response_col_uncoded
Esempio n. 26
0
    def searchTable(self):
        """Searches the table and presents the search results as a view.

        The text of ``self.search`` is matched against every column of
        ``self.table``: numeric columns are compared for equality with the
        numeric value of the query (and skipped when the query is not a
        number), string columns are searched case-insensitively with the
        query as a regular expression, and columns of any other dtype are
        skipped. Matching rows are collected in ``self.view`` and shown in
        ``self.csvTable``. Any error, including an empty query or an empty
        result, is reported through ``mod.errorGUI``.
        """
        try:
            self.view = self.view.iloc[0:0]
            searchQuery = str(self.search.text())

            # an empty query is reported as an error
            if searchQuery == "":
                raise ValueError("Please enter a search query!")

            # Numeric form of the query, or None when it is not a number.
            # int is tried first so large whole numbers keep full precision.
            try:
                numericQuery = int(searchQuery)
            except ValueError:
                try:
                    numericQuery = float(searchQuery)
                except ValueError:
                    numericQuery = None

            for i in self.table.columns:
                column = self.table[i]
                if is_numeric_dtype(column.dtype):
                    # A non-numeric query cannot match a numeric column.
                    if numericQuery is None:
                        continue
                    queryBool = column == numericQuery
                elif is_string_dtype(column.dtype):
                    # na=False keeps the mask strictly boolean when the
                    # column contains missing values.
                    queryBool = column.str.contains(searchQuery,
                                                    case=False,
                                                    regex=True,
                                                    na=False)
                else:
                    # Other dtypes are not searchable; never reuse the mask
                    # computed for a previous column.
                    continue

                # add the matching rows, if any, to the view
                if queryBool.any():
                    self.view = pd.concat(
                        [self.view, self.table.loc[queryBool]])

            # displays view
            if not self.view.empty:
                self.csvTable.setSortingEnabled(True)
                self.csvTable.setModel(PandasModel(self.view))
                self.csvTable.resizeColumnsToContents()
                self.csvTable.show()
            else:
                raise ValueError("No Value Found!")

        except Exception as e:
            mod.errorGUI(str(e))
Esempio n. 27
0
    def fit(self, X, y):
        """Fit the Imputer to the dataset and determine the right approach.

        The dtype of ``X`` (when ``y`` is None) or of ``y`` (otherwise)
        selects the imputer: numeric dtypes use ``self.num_imputer``, string
        dtypes use ``self.cat_imputer``. The chosen imputer is fitted with
        ``(X, y)`` and the outcome is stored in ``self.statistics_``.

        Args:
            X (pd.Series): Dataset to fit the imputer, or predictors
            y (pd.Series): None, or dataset to fit predictors

        Returns:
            self. Instance of the class.
        """
        # univariate when y is None, predictive otherwise
        reference = X if y is None else y

        # stays blank when the dtype is neither numeric nor string
        stats = {"param": None, "strategy": None}

        if is_numeric_dtype(reference):
            fitted = self.num_imputer.fit(X, y)
            stats = {"param": fitted, "strategy": self.num_imputer.strategy}
        if is_string_dtype(reference):
            fitted = self.cat_imputer.fit(X, y)
            stats = {"param": fitted, "strategy": self.cat_imputer.strategy}

        self.statistics_ = stats
        return self
Esempio n. 28
0
    def __speculate_ordered_index(self, s):
        """Guess whether ``s`` looks like an index that increases by one.

        String series are accepted as they are. For any other series,
        roughly 100 evenly spaced pairs of values ``step`` positions apart
        are sampled, and each pair must differ by exactly ``step``.

        Returns ``(True, first_value)`` when the series passes and
        ``(False,)`` otherwise. ``s`` is only read, never modified.
        """
        if is_string_dtype(s):
            return (True, s.iloc[0])

        step = len(s) // 100 + 1

        # Stop while i + step is still a valid position. The previous bound
        # of len(s) - 1 let the last sample read past the end whenever
        # step > 1 and len(s) - 1 was not a multiple of step.
        for i in range(0, len(s) - step, step):
            if s.iloc[i + step] - s.iloc[i] != step:
                return (False, )
        return True, s.iloc[0]
Esempio n. 29
0
 def test_svdm_nan_row(self):
     """Tests that correct svdm is computed if NaNs occur in a row of a column"""
     class_col_name = "Class"
     df = pd.DataFrame({
         "A": ["high", np.nan, "high", "low", "low", "high"],
         "B": [3, 2, 1, 1, 1, 2],
         "C": [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
         "Class":
         ["apple", "apple", "banana", "banana", "banana", "banana"]
     })
     conditional_counts = {
         'high': Counter({
             'banana': 2,
             'apple': 1
         }),
         'low': Counter({
             'banana': 2
         })
     }
     lookup = {
         "A": {
             'high': 3,
             'low': 2,
             CONDITIONAL: conditional_counts
         }
     }
     rule = pd.Series({
         "A": "high",
         "B": (1, 1),
         "C": "bla",
         "Class": "banana"
     })
     classes = ["apple", "banana"]
     correct = [
         pd.Series([0.0, 1.0, 0.0, 2 / 3 * 2 / 3, 2 / 3 * 2 / 3, 0.0],
                   name="A"),
         pd.Series([1.0, 1.0, 1.0, 1.0, 1.0, 1.0], name="A")
     ]
     # Counts the string columns checked so far; the first one is compared
     # approximately, any later one exactly.
     checked = 0
     for col_name in df:
         if col_name == class_col_name:
             continue
         col = df[col_name]
         if not is_string_dtype(col):
             continue
         dist = svdm(col, rule, lookup, classes)
         if checked == 0:
             self.assertTrue(np.allclose(correct[0], dist))
         else:
             self.assertTrue(dist.equals(correct[checked]))
         checked += 1
Esempio n. 30
0
 def eval(self, data, eval_env, encoding, is_response=False):
     """Evaluate this term against ``data`` and encode the result.

     Categorical and string results go through ``self.eval_categoric``,
     numeric results through ``self.eval_numeric``. For any other result
     ``NotImplemented`` is returned.
     """
     # Workaround: var names present in 'data' are taken from '__DATA__['col']
     # the rest are left as they are and looked up in the upper namespace
     column_names = data.columns.tolist()
     expression = self.get_eval_str(column_names)
     x = eval_in_data_mask(expression, data, eval_env)
     if is_categorical_dtype(x) or is_string_dtype(x):
         return self.eval_categoric(x, encoding, is_response)
     if is_numeric_dtype(x):
         return self.eval_numeric(x)
     return NotImplemented
Esempio n. 31
0
 def components(self, data, eval_env):
     """Map ``self.name`` to ``"numeric"`` or ``"categoric"`` according to the
     value this term evaluates to against ``data``.

     Numeric results are ``"numeric"``; string or categorical results, and
     ``dict`` results, are ``"categoric"``. Anything else raises
     ``NotImplementedError``.
     """
     column_names = data.columns.tolist()
     expression = self.get_eval_str(column_names)
     x = eval_in_data_mask(expression, data, eval_env)
     if is_numeric_dtype(x):
         return {self.name: "numeric"}
     if is_string_dtype(x) or is_categorical_dtype(x) or isinstance(x, dict):
         return {self.name: "categoric"}
     raise NotImplementedError
Esempio n. 32
0
def generate_plotly_dim_dict(df, field):
    """Build a dimension dict with ``"label"`` and ``"values"`` for one
    column of ``df``.

    Numeric columns are passed through as the values. String columns are
    mapped to the position of each value among the column's unique values,
    and ``"tickvals"`` / ``"ticktext"`` are added so those positions can be
    labelled. Any other dtype raises ``Exception("Unidentifiable Type")``.
    """
    column = df[field]
    if is_numeric_dtype(column):
        return {"label": field, "values": column}
    if not is_string_dtype(column):
        raise Exception("Unidentifiable Type")

    texts = column.unique()
    positions = [
        np.argwhere(texts == value).flatten()[0] for value in column
    ]
    return {
        "label": field,
        "values": positions,
        "tickvals": list(range(len(texts))),
        "ticktext": texts,
    }
Esempio n. 33
0
def test__write_frame__read_frame():
    """Round-trip a small DataFrame through the ``Listing`` model.

    Creates two records with ``write_frame_create``, checks that creating
    them a second time raises ``IntegrityError``, reads them back with
    ``read_frame``, updates them with ``write_frame`` and finally checks
    that a fresh read equals the updated frame.
    """
    print("Start")
    # Imported inside the test rather than at module level.
    from django.db import utils
    from econdata.models import Listing
    from libclair.dataframes import write_frame_create, read_frame, write_frame
    
    # Create a DataFrame and write it into the database.
    fr1 = pd.DataFrame([{'id':'foo-1', 'site':'a', 'id_site':'1', 'title':'The 1st record.'},
                        {'id':'foo-2', 'site':'a', 'id_site':'2', 'title':'The 2nd record.'}])
    print('\nfr1:\n', fr1)
    write_frame_create(fr1, Listing, delete=True)
    # The records already exist. Creating them again, without deleting them, 
    # must raise an exception.
    with pytest.raises(utils.IntegrityError):
        write_frame_create(fr1, Listing)
    
    # Read the records that were just created from the database.
    # Also read two columns ('time', 'price') that fr1 did not supply.
    qset = Listing.objects.filter(id__in=['foo-1', 'foo-2'])
    fr2 = read_frame(qset, ['id', 'title', 'time', 'price'])
    print('\nfr2:\n', fr2)
    
    # The columns must come back with the expected dtypes and values.
    assert pd_types.is_string_dtype(fr2['title'])
    assert pd_types.is_datetime64_any_dtype(fr2['time'])
    assert pd_types.is_numeric_dtype(fr2['price'])
    assert fr2['id'][0] == 'foo-1'
    assert fr2['id'][1] == 'foo-2'
    assert fr2['title'][0] == 'The 1st record.'
    assert fr2['title'][1] == 'The 2nd record.'
    
    # Change the dataframe: fill in the 'time' and 'price' columns.
    fr2['time'] = [pd.Timestamp('2017-01-01 12:00+0'), 
                   pd.Timestamp('2017-01-02 12:00+0'),]
    fr2['price'] = [101.0, 102.0,]
    print('\nfr2:\n', fr2)
    # Update the records in the database
    write_frame(fr2, Listing)
    
    # Read the updated records from the database; they must equal fr2.
    qset = Listing.objects.filter(id__in=['foo-1', 'foo-2'])
    fr3 = read_frame(qset, ['id', 'title', 'time', 'price'])
    print('\nfr3:\n', fr3)
    assert_frames_equal(fr2, fr3)
Esempio n. 34
0
def dtype_detection(data,category_detection=True,StructureText_detection=True,\
datetime_to_category=True,criterion='sqrt',min_mean_counts=5,fix=False):
    '''Detect the data type of a single variable.

    The variable is classified as one of:
    1. number, numeric values
    2. category, a factor
    3. datetime, date/time values
    4. text, free text
    5. text_st, structured text such as an ID
    (The original notes also list "group_number"; this function never
    returns it.)

    parameter
    ---------
    data: pd.Series, one-dimensional only. The function works on a copy, so
        the caller's series is not modified.
    category_detection: bool, whether a numeric variable may be classified as
        a category based on its number of unique values
    StructureText_detection: bool, whether to look for structured text, e.g.
        a column whose values all contain a separator such as "-"
    datetime_to_category: bool, whether a string variable (including one that
        parsed as datetime) is turned into a category when it has fewer
        unique values than the square root of the sample size
    criterion: string, int or float, optional (default="sqrt")
        'sqrt': square root of the sample size; int: an absolute number;
        float strictly between 0 and 1: that fraction of the sample size.
        A variable with fewer unique values than this limit is a category
        candidate. Any other value falls back to 'sqrt'.
    min_mean_counts: default 5. A numeric variable only becomes a category
        when the median of its per-value counts is at least this number.
    fix: bool, whether to also return the data with its type corrected

    return:
    result: dict{
        'name': the series name,
        'vtype': the detected type,
        'ordered': whether the category is ordered,
        'categories': list of all categories}
        or None (after printing 'unknown dtype!') for unsupported dtypes.
    With fix=True the tuple (result, data) is returned instead.

    '''

    assert len(data.shape)==1
    data=data.copy()
    data=pd.Series(data)
    dtype,name,n_sample=data.dtype,data.name,data.count()

    # Upper bound on the number of unique values for a category candidate.
    if criterion=='sqrt':
        max_nuniques=np.sqrt(n_sample)
    elif isinstance(criterion,int):
        max_nuniques=criterion
    elif isinstance(criterion,float) and (0<criterion<1):
        # A fraction is relative to the sample size, as documented.
        max_nuniques=criterion*n_sample
    else:
        max_nuniques=np.sqrt(n_sample)
    ordered=False
    categories=[]
    if is_numeric_dtype(dtype):
        vtype='number'
        ordered=False
        categories=[]
        # Normalise integral values stored as another numeric type, e.g.
        # 1.0, 2.0, 3.0 -> 1, 2, 3. The comparison is element-wise so values
        # such as 0.5 and -0.5 (whose truncated sum happens to match) are
        # never truncated.
        non_null=data.dropna()
        if (non_null.astype(np.int64)==non_null).all():
            data[data.notnull()]=data[data.notnull()].astype(np.int64)
        if category_detection:
            nunique=len(data.dropna().unique())
            # NOTE: despite the name this is the median of the value counts.
            mean_counts=data.value_counts().median()
            if nunique<max_nuniques and mean_counts>=min_mean_counts:
                data=data.astype('category')
                ordered=data.cat.ordered
                vtype='category'
                categories=list(data.dropna().cat.categories)
        result={'name':name,'vtype':vtype,'ordered':ordered,'categories':categories}
    elif is_string_dtype(dtype):
        # Handle possible datetime values: tmp holds the printed length of
        # every non-missing value.
        tmp=data.map(lambda x: np.nan if '%s'%x == 'nan' else len('%s'%x))
        tmp=tmp.dropna().astype(np.int64)
        if not(any(data.dropna().map(is_number))) and 7<tmp.max()<20 and tmp.std()<0.1:
            try:
                data=pd.to_datetime(data)
            except Exception:
                # Best effort: values that do not parse stay as they are.
                pass
        # Handle possible category values.
        if datetime_to_category:
            if len(data.dropna().unique())<np.sqrt(n_sample):
                data=data.astype('category')
        else:
            nunique=len(data.dropna().unique())
            if not(is_categorical_dtype(data.dtype)) and not(np.issubdtype(data.dtype,np.datetime64)) and nunique<max_nuniques:
                data=data.astype('category')

        # For non-category data, turn percentages into floats,
        # e.g. 21.12% --> 0.2112
        if is_string_dtype(data.dtype) and not(is_categorical_dtype(data.dtype)) and all(data.str.contains('%')):
            data=data.str.strip('%').astype(np.float64)/100

        if is_categorical_dtype(data.dtype):
            vtype='category'
            categories=list(data.cat.categories)
            ordered=data.cat.ordered
        # Datetime values
        elif np.issubdtype(data.dtype,np.datetime64):
            vtype='datetime'
        # Structured text: every value has the same printed length
        elif StructureText_detection and tmp.dropna().std()==0:
            # Not iterable, so not a string
            if not(isinstance(data.dropna().iloc[0],Iterable)):
                vtype='text'
            else:
                # Characters shared by every non-empty string value
                k=set(list(data.dropna().iloc[0]))
                for x in data:
                    if isinstance(x,str) and len(x)>0:
                        k&=set(list(x))
                if len(k)>0:
                    vtype='text_st'
                else:
                    vtype='text'
        elif is_numeric_dtype(data.dtype):
            vtype='number'
            ordered=False
            categories=[]
        else:
            vtype='text'
        result={'name':name,'vtype':vtype,'ordered':ordered,'categories':categories}
    elif is_datetime64_any_dtype(dtype):
        vtype='datetime'
        result={'name':name,'vtype':vtype,'ordered':ordered,'categories':categories}
    else:
        print('unknown dtype!')
        result=None

    if fix:
        return result,data
    else:
        return result