Example #1
    def _cleanse_data(self, data):
        """
        Excel will print np.nan as 65535.
        This function cleanses any representation of NULL so that it prints as expected in Excel.
        At this stage we also attempt to convert datetimes to the numeric value used by Excel.
        """
        if isinstance(data, pd.DataFrame):
            for column in data:

                _dtype = data[column].dtype

                if types.is_numeric_dtype(_dtype):
                    data.loc[:, column] = data[column].fillna(0)

                if types.is_string_dtype(_dtype):
                    data.loc[:, column] = data[column].fillna("")

                if types.is_datetime64_any_dtype(_dtype):
                    data.loc[:, column] = self.excel_date(data[column])

        elif isinstance(data, (pd.Series, list)):
            # Lists have no fillna(), so normalize to a Series first.
            if isinstance(data, list):
                data = pd.Series(data)
            _dtype = data.dtype

            if types.is_numeric_dtype(_dtype):
                data = data.fillna(0)

            elif types.is_string_dtype(_dtype):
                data = data.fillna("")

            elif types.is_datetime64_any_dtype(_dtype):
                data = self.excel_date(data)

        return data
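A minimal usage sketch for the method above, assuming `writer` is some hypothetical instance exposing _cleanse_data and the usual pandas/numpy imports:

import numpy as np
import pandas as pd

df = pd.DataFrame({
    "qty": [1.0, np.nan],                            # numeric NaN -> 0
    "note": pd.Series(["a", None], dtype="object"),  # string NaN -> ""
})
cleansed = writer._cleanse_data(df)  # `writer` is a hypothetical instance
print(cleansed)                      # qty: [1.0, 0.0], note: ["a", ""]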
Example #2
def series_to_numeric_sequence(
    series: typing.Union[typing.Iterable, pandas.Series, pandas.Index,
                         pandas.DataFrame]
) -> typing.Sequence[NUMBER]:
    """
    Takes a set of values and converts them into a set of numbers that can be fed into function that requires their
    values to be numeric. Useful for converting indices to series of numbers that are needed for functions.

    Args:
        series: The collection of values to convert

    Returns:

    """
    if isinstance(series, pandas.DataFrame):
        series = series.index.to_series()

    sequence = list()

    if len(series) == 0:
        return sequence

    if not isinstance(series, pandas.Series):
        series = pandas.Series(series)

    if pandas_types.is_numeric_dtype(series):
        return series.to_list()
    elif pandas_types.is_datetime64_any_dtype(series):
        return [date.timestamp() for date in series]
    else:
        sequence = list(range(len(series)))

    return sequence
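A quick usage sketch of the function above (with the isinstance fix), assuming the pandas/typing imports it relies on:

import pandas
ts = pandas.Series(pandas.to_datetime(["2021-01-01", "2021-01-02"]))
print(series_to_numeric_sequence(ts))          # [1609459200.0, 1609545600.0] (UTC timestamps)
print(series_to_numeric_sequence(["a", "b"]))  # [0, 1] -- positional fallback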
Example #3
def histogram(df, category, reindexer='auto'):
  '''
  generates histogram based on specified datetime category
  useful for making frequency plots

  :param df: confessions with timestamp index
  :type df: pandas DataFrame
  :param category: the datetime attribute to group by
  :type category: str name of attribute
  :param reindexer: function to reindex x labels (optional)
  :type reindexer: function
  :return: histogram in pandas Series
  '''
  if reindexer == 'auto':
    if category in auto_reindex_lut:
      reindexer = auto_reindex_lut[category]
    else:
      reindexer = None
  try:
    category = df[category]
  except KeyError:
    # ensure DataFrame has datetime index, and proper datetime category passed
    assert ptypes.is_datetime64_any_dtype(df.index)
    assert hasattr(df.index, category)
    # group by category and generate histogram
    category = getattr(df.index, category)
  dist = df.groupby(category).count().content
  if reindexer: dist.index = [reindexer(i) for i in dist.index]
  return dist
Example #4
    def __check_data(self):
        """
        Check input data type and frequency. Also checks the data prerequisites, defined in the models.
        """
        if not (ptypes.is_datetime64_any_dtype(self.data.index)
                or ptypes.is_period_dtype(self.data.index)):
            raise TypeError(
                'Input data index should be datetime or period object. Received: {} instead.'
                .format(self.data.index.dtype))

        if ptypes.is_period_dtype(self.data.index):
            self.data.index = self.data.index.to_timestamp()

        for prerequisite in self.Model.input_requirements:
            if prerequisite.name not in self.data:
                raise ValueError(
                    'Input data should contain "{}" data! Keyword "{}" not found.'
                    .format(prerequisite.name, prerequisite.name))
            elif not ptypes.is_float_dtype(self.data[prerequisite.name]):
                raise ValueError(
                    'Input data "{}" should be float! Currently: {}'.format(
                        prerequisite.name, self.data[prerequisite.name].dtype))
            self.__check_for_na_in_inputs(prerequisite.name)

            if prerequisite.positive:
                self.__check_for_negative_values_in_inputs(prerequisite.name)
Example #5
    def get_col_type(self, df, column):
        """Returns the type of the column - numeric, categoric or datetime.

        Identifies the column in the dataframe as either a
        numeric, categoric or datetime variable. Treats a
        column having integer values but fewer than 10
        distinct values as categoric.

        Args:
            df: the DataFrame containing the column.
            column: a str referring to the column whose type
                is to be identified.
        Returns:
            One of the strings 'numeric', 'categoric' or 'datetime'
        """

        try:
            if is_numeric_dtype(df[column]) and df[column].nunique() >= 10:
                return 'numeric'
            elif is_numeric_dtype(df[column]) and df[column].nunique() <= 10:
                return 'categoric'
            elif is_datetime64_any_dtype(df[column]):
                return 'datetime'
            elif is_object_dtype(df[column]):
                return 'categoric'
            else:
                return 'problem_in_detecting_dtype'

        except KeyError:
            print('Error in column name')
            return
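A usage sketch, assuming `profiler` is a hypothetical object exposing get_col_type and the pandas.api.types helpers are imported:

import pandas as pd

df = pd.DataFrame({
    "age": range(100),                                 # 100 distinct ints -> 'numeric'
    "grade": [1, 2, 3, 4] * 25,                        # 4 distinct ints -> 'categoric'
    "when": pd.date_range("2021-01-01", periods=100),  # -> 'datetime'
})
for col in df.columns:
    print(col, profiler.get_col_type(df, col))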
Example #6
 def column_treewidget_item_clicked(self, item, ncol):
     colname = item.text(0)
     if ncol == 1:
         key_columns = self.config_data.get('key_columns', [])
         if item.checkState(ncol) == qtc.Qt.Checked:
             if colname not in key_columns:
                 key_columns.append(colname)
                 self.config_data['key_columns'] = key_columns
         else:
             if colname in key_columns:
                 key_columns.remove(colname)
             self.config_data['key_columns'] = key_columns
     elif ncol == 0:
         keep_columns = self.config_data.get('keep_columns', [])
         if item.checkState(ncol) == qtc.Qt.Checked:
             if colname not in keep_columns:
                 keep_columns.append(colname)
                 self.config_data['keep_columns'] = keep_columns
         else:
             if colname in keep_columns:
                 keep_columns.remove(colname)
             self.config_data['keep_columns'] = keep_columns
     elif ncol == 2:
         date_columns = self.config_data.get('date_columns', [])
         if item.checkState(ncol) == qtc.Qt.Checked:
             if not is_datetime64_any_dtype(
                     self.data_table.model().data[colname]):
                 response = qtw.QMessageBox.warning(
                     self, 'Invalid Date Data',
                     'Dates in this column may not be valid.'
                     '\nYou can use Excel to create valid dates from data in this column.',
                     qtw.QMessageBox.Abort)
                 if response == qtw.QMessageBox.Abort:
                     item.setCheckState(2, qtc.Qt.Unchecked)
             else:
                 if colname not in date_columns:
                     date_columns.append(colname)
                 self.config_data['date_columns'] = date_columns
         else:
             if colname in date_columns:
                 date_columns.remove(colname)
             self.config_data['date_columns'] = date_columns
     elif ncol == 3:
         drug_columns = self.config_data.get('drug_columns', [])
         if item.checkState(ncol) == qtc.Qt.Checked:
             if colname not in drug_columns:
                 drug_columns.append(colname)
                 self.config_data['drug_columns'] = drug_columns
         else:
             if colname in drug_columns:
                 drug_columns.remove(colname)
             self.config_data['drug_columns'] = drug_columns
     elif ncol == 4:
         organism_column = self.config_data.get('organism_column', '')
         if item.checkState(ncol) == qtc.Qt.Checked:
             if colname != organism_column:
                 self.config_data['organism_column'] = colname
         else:
             if colname == organism_column:
                 self.config_data['organism_column'] = ''
Example #7
def _infer_task(df, x, y):
    "Returns str with the name of the inferred task based on the columns x and y"
    if x == y:
        return "predict_itself"

    category_count = df[y].value_counts().count()
    if category_count == 1:
        return "predict_constant"
    if category_count == 2:
        return "classification"
    if category_count == len(df[y]) and (
        is_string_dtype(df[y]) or is_categorical_dtype(df[y])
    ):
        return "predict_id"
    if category_count <= NUMERIC_AS_CATEGORIC_BREAKPOINT and is_numeric_dtype(df[y]):
        return "classification"

    if is_bool_dtype(df[y]) or is_string_dtype(df[y]) or is_categorical_dtype(df[y]):
        return "classification"

    if is_datetime64_any_dtype(df[y]) or is_timedelta64_dtype(df[y]):
        raise Exception(
            f"The target column {y} has the dtype {df[y].dtype} which is not supported. A possible solution might be to convert {y} to a string column"
        )

    # this check needs to be after is_bool_dtype because bool is considered numeric by pandas
    if is_numeric_dtype(df[y]):
        return "regression"

    raise Exception(
        f"Could not infer a valid task based on the target {y}. The dtype {df[y].dtype} is not yet supported"
    )  # pragma: no cover
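A usage sketch for _infer_task; NUMERIC_AS_CATEGORIC_BREAKPOINT is a module-level constant not shown here, so the value below is an assumption:

import pandas as pd
from pandas.api.types import (is_bool_dtype, is_categorical_dtype,
                              is_datetime64_any_dtype, is_numeric_dtype,
                              is_string_dtype, is_timedelta64_dtype)

NUMERIC_AS_CATEGORIC_BREAKPOINT = 15  # assumed threshold

df = pd.DataFrame({"x": [1.0, 2.0, 3.0, 4.0], "y": [0, 1, 0, 1]})
print(_infer_task(df, "x", "y"))  # 'classification' (two target categories)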
Example #8
def __calculate_ncp_attribute(original_series, anonymized_series):
    if must_be_flattened(original_series):
        original_flattened, original_indexes, is_category = flatten_set_valued_series(
            original_series)
        if is_categorical_dtype(original_series) or is_category:
            original_flattened_series = pd.Series(original_flattened,
                                                  index=original_indexes,
                                                  dtype="category",
                                                  name=original_series.name)
        else:
            original_flattened_series = pd.Series(original_flattened,
                                                  index=original_indexes,
                                                  name=original_series.name)
        ncp = __calculate_ncp_attribute(original_flattened_series,
                                        anonymized_series)
    elif is_node(anonymized_series):  # Has been anonymized using a hierarchy
        ncp = __ncp_numerical_hierarchy(original_series, anonymized_series)
    elif is_datetime64_any_dtype(original_series):
        ncp = __ncp_date(original_series, anonymized_series)
    elif is_categorical_dtype(original_series):
        ncp = __ncp_categorical(original_series, anonymized_series)
    elif is_numeric_dtype(original_series):
        ncp = __ncp_numerical(original_series, anonymized_series)
    elif is_token_list(original_series):
        ncp = __ncp_tokens(original_series, anonymized_series)
    else:
        ncp = __ncp_set_valued(original_series, anonymized_series)
    return ncp
Example #9
def is_datetime_series(obj):
    """
    Check if an object is a Datetime64 series
    """
    import pandas.api.types as ptypes

    return ptypes.is_datetime64_any_dtype(obj)
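Usage is straightforward:

import pandas as pd

print(is_datetime_series(pd.Series(pd.to_datetime(["2021-01-01"]))))  # True
print(is_datetime_series(pd.Series([1, 2, 3])))                       # False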
Example #10
def align_datetime_dtypes(*dfs):
    """
    Make all of the passed frames have DateTime dtype for the same columns.

    Cast a column of a frame to the DateTime type if any frame in
    the `dfs` sequence has DateTime type for that column.

    Parameters
    ----------
    *dfs : iterable of DataFrames
        DataFrames whose DateTime dtypes should be aligned.

    Notes
    -----
    Passed Modin frames may be cast to pandas in the result.
    """
    datetime_cols = {}
    for df in dfs:
        for col, dtype in df.dtypes.items():
            # If we already decided to cast this column to DateTime no more actions are needed
            if col not in datetime_cols and is_datetime64_any_dtype(dtype):
                datetime_cols[col] = dtype

    casted_dfs = (
        # OmniSci has difficulties with casting to certain dtypes (i.e. datetime64),
        # so casting it to pandas before doing 'astype'
        tuple(try_cast_to_pandas(df).astype(datetime_cols) for df in dfs)
        # This is required so we don't try to cast empty OmniSci frames to pandas:
        # https://github.com/modin-project/modin/issues/3428
        if len(datetime_cols)
        else dfs
    )
    return casted_dfs
Example #11
def check_if_series_has_internal_type(series, internal_type):
    """Check if data type of series fits to the internal type of gettsim.

    Parameters
    ----------
    series : pd.Series
        Some data series.
    internal_type : TypeVar
        One of the internal gettsim types.

    Returns
    -------
    out : bool
        Return check variable.
    """
    if internal_type == FloatSeries:
        out = is_float_dtype(series) or is_integer_dtype(series)
    elif internal_type == BoolSeries:
        out = is_bool_dtype(series)
    elif internal_type == IntSeries:
        out = is_integer_dtype(series)
    elif internal_type == DateTimeSeries:
        out = is_datetime64_any_dtype(series)
    else:
        raise ValueError(f"The internal type {internal_type} is not defined.")
    return out
Example #12
def checar_conversiones(df_reto_1):

    import pandas as pd
    import pandas.api.types as ptypes

    assert ptypes.is_float_dtype(
        df_reto_1['relative_velocity.kilometers_per_hour']
    ), 'Careful... The `relative_velocity.kilometers_per_hour` column is not of type `float64`'
    assert ptypes.is_datetime64_any_dtype(
        df_reto_1['close_approach_date']
    ), 'Careful... The `close_approach_date` column is not of type `datetime64[ns]`'
    assert ptypes.is_datetime64_any_dtype(
        df_reto_1['epoch_date_close_approach']
    ), 'Careful... The `epoch_date_close_approach` column is not of type `datetime64[ns]`'

    print('Success! All your conversions were performed correctly!')
Example #13
    def read(self, input_file):
        """
        Read the given input file and return a DataFrame.
        Parameters
        ----------
        input_file: (str, Path)
            Input file path.
        Returns
        -------
        DataFrame
            The data read, as a DataFrame.
        """
        date_attributes = self.__config.get_date_attributes()
        date_formats = self.__config.get_date_formats()
        textual_attributes = self.__config.get_textual_attributes()
        data_types = self.__config.get_data_types()
        ordinal_orders = self.__config.get_ordinal_orders()

        df = pd.read_csv(input_file)
        df.columns = map(str.lower, df.columns)
        df = df.dropna(subset=df.columns.difference(textual_attributes), axis='index', how='any')  # drop any rows with empty values except for the textual ones

        for date_attribute in date_attributes:
            if not is_datetime64_any_dtype(df[date_attribute]):
                df[date_attribute] = pd.to_datetime(df[date_attribute], format=date_formats[date_attribute], errors='coerce')
            df = df.dropna(subset=[date_attribute])  # drop rows if there are empty timestamps

        df = df.dropna(subset=df.columns.difference(textual_attributes), axis='index', how='any')  # drop any rows with empty values except for the textual ones

        df = df.astype(data_types)

        for ordinal_attribute in ordinal_orders:
            df[ordinal_attribute].cat.reorder_categories(ordinal_orders[ordinal_attribute], ordered=True, inplace=True)
        return df
Example #14
def get_formatter(dtype):
    if types.is_datetime64_any_dtype(dtype):
        return DateFormatter(format="%Y-%m-%d %H:%M:%S.%N")
    elif types.is_bool_dtype(dtype):
        return None
        # return BooleanFormatter()
    else:
        return None
Example #15
    def _check_time_index(self, time_index):

        self.time_index = time_index.get("retro_index", None)
        self.time_cal = time_index.get("cur_index", None)

        self._check_columns([self.time_index, self.time_cal])

        time_window = time_index.get('time_window', [None])
        time_type = time_index.get('type', None)
        logger.info('config time_window loaded:{}'.format(time_window))

        if time_type is None:
            time_type = 'd'

        time_type = time_type.lower()
        if time_type not in ['d', 'm', 'y']:
            raise ValueError("time_type must in ('d','m','y')")
        if time_type == 'd':
            self.time_filter_name = 'Days'
            self.time_filter_cn_name = '天'

        if None in time_window:
            time_window.remove(None)
            time_window.sort()
            time_window.append(None)
        else:
            time_window.sort()
        self.time_window = time_window

        if self.time_index is not None and not is_datetime64_any_dtype(
                self.df[self.time_index]):
            print("{} is not datetime64".format(self.time_index))
            self.df[self.time_index] = _To_datetime(self.df[self.time_index])
        if self.time_cal is not None and not is_datetime64_any_dtype(
                self.df[self.time_cal]):
            print("{} is not datetime64".format(self.time_cal))
            self.df[self.time_cal] = _To_datetime(self.df[self.time_cal])

        if 'deal_time' not in self.columns:
            if operator.eq(self.time_window, [None]):
                self.df["deal_time"] = 0
            else:
                self.df["deal_time"] = (self.df[self.time_index] -
                                        self.df[self.time_cal]).dt.days
                print("timeintel max: {} ;min  {}".format(
                    self.df["deal_time"].max(), self.df["deal_time"].min()))
Example #16
def agg_mean(series):
    """Aggregate series values by calculating the mean"""
    if is_numeric_dtype(series):
        return series.mean()
    elif is_datetime64_any_dtype(series):
        return pd.to_datetime(series.dropna().astype(np.int64).mean())
    else:
        raise Exception("Could not aggregate since no option for mean")
Example #17
def is_datetime_ordered(df) -> bool:
    """Returns True if the index of the DataFrame/Series is a datetime and ordered."""
    index_is_datetime = is_datetime64_any_dtype(df.index)
    ordered = False  # default, so empty or non-comparable indexes are handled
    try:
        ordered = df.index[0] < df.index[-1]
    except (IndexError, TypeError):
        pass
    return bool(index_is_datetime and ordered)
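A usage sketch for the fixed version above:

import pandas as pd

df = pd.DataFrame({"v": [1, 2]},
                  index=pd.to_datetime(["2021-01-01", "2021-01-02"]))
print(is_datetime_ordered(df))             # True
print(is_datetime_ordered(df.iloc[::-1]))  # False (index reversed)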
Example #18
 def get_empty_dataframe(self):
     """Create an empty dataframe with the correct columns and dtypes"""
     columns = []
     for col in self.columns:
         dtype = ('datetime64[ns]'
                  if is_datetime64_any_dtype(col.type) else col.type)
         columns.append((col.column_name, dtype))
     return pd.DataFrame({k: pd.Series(dtype=t) for k, t in columns})
Example #19
def process_date(df):
    for label, content in df.items():
        if is_datetime64_any_dtype(content):
            # fillna() is not in-place; assign the result back (NaT becomes the epoch)
            df[label] = df[label].fillna(pd.Timestamp(0))
            fld = df[label]
            targ_pre = re.sub('[Dd]ate$', '', label)
            attr = ['Year', 'Month', 'Day', 'Dayofweek', 'Dayofyear', 'Quarter']
            for n in attr: df[targ_pre + n] = getattr(fld.dt, n.lower())
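A usage sketch, assuming the re/pandas imports the function relies on; derived columns take the prefix left after stripping a trailing "date":

import pandas as pd

df = pd.DataFrame({"saleDate": pd.to_datetime(["2021-03-15", "2021-07-01"])})
process_date(df)
print(df.columns.tolist())
# ['saleDate', 'saleYear', 'saleMonth', 'saleDay', 'saleDayofweek', 'saleDayofyear', 'saleQuarter']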
Example #20
def coerce_column_type(col_data, col_type):

    if not is_datetime64_any_dtype(col_data) and col_type == "datetime":
        return pd.to_datetime(col_data)
    elif col_data.dtype.name == "category" and col_type == "text":
        return col_data.astype("string")
    else:
        return col_data
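A usage sketch:

import pandas as pd
from pandas.api.types import is_datetime64_any_dtype

raw = pd.Series(["2021-01-01", "2021-06-30"])
print(coerce_column_type(raw, "datetime").dtype)                 # datetime64[ns]
print(coerce_column_type(raw.astype("category"), "text").dtype)  # string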
Example #21
def save_as_spss(data_frame: pandas.DataFrame,
                 out_path: str,
                 labels: dict = None,
                 find=None,
                 repl=None) -> None:
    """
    caastools.utils.save_as_spss(data_frame: pandas.DataFrame, out_path: str) -> None
    saves data_frame as an SPSS dataset at out_path
    :param data_frame: the pandas DataFrame to save
    :param out_path: the path at which to save the file
    :param labels: a dictionary mapping column labels in the data frame to a variable label in the SPSS dataset
    :param find: a sequence of characters within variable names to be replaced with other values. Default None
    :param repl: a sequence of characters with which to replace corresponding entries in find, or a function
    which yields their replacements. Default None
    :return: None
    :raise ValueError: if either find/repl is None and the other is not
    :raise ValueError: if find and repl are sequences of unequal length
    """

    cols = data_frame.columns  # type: pandas.Index
    is_multi_index = isinstance(cols, pandas.MultiIndex)
    var_names = []
    var_types = {}
    var_formats = {}
    var_labels = {} if labels is None else labels

    # Construct the various information that the SPSS dictionary will contain about each variable
    for col in cols:
        var_name = sanitize_for_spss(".".join(
            str(i) for i in col) if is_multi_index else str(col),
                                     find=find,
                                     repl=repl)
        var_names.append(var_name)

        # Need to know the data type and format of each column so that the SPSS file can be written properly
        # 0 is a numeric type, any positive integer is a string type where the number represents the number
        # of bytes the string can hold.
        if pandas.api.types.is_string_dtype(data_frame[col]):
            lens = list(
                filter(lambda x: pandas.notna(x) and x is not None,
                       set(data_frame[col].str.len())))
            var_types[var_name] = int(max(lens)) * 2 if len(lens) > 0 else 255
        else:
            var_types[var_name] = 0
            var_formats[var_name] = "F10.2" if ptypes.is_float_dtype(data_frame[col].dtype) else \
                "ADATE8" if ptypes.is_datetime64_any_dtype(data_frame[col]) else \
                "F12.0"

    # Sometimes savReaderWriter has trouble writing a whole dataframe in at once,
    # Writing row by row seems to work without issue
    with SavWriter(out_path,
                   var_names,
                   var_types,
                   formats=var_formats,
                   varLabels=var_labels,
                   ioUtf8=True) as writer:
        for row in data_frame.index:
            writer.writerow(data_frame.loc[row, :].values)
Example #22
def df_to_mat(df, filename, convert_dates=True):
    data = {}
    for c in df.columns:
        if convert_dates and is_datetime64_any_dtype(df[c]):
            values = [datetime_to_datenum(dt) for dt in df[c]]
        else:
            values = df[c]
        data[c] = np.array(values)
    savemat(filename, data)
Example #23
    def testDateTimeCleanup(self):

        date = '2020-11-04'
        test = Tweets(userName='******', sinceTime=date, lang='pt')
        test.aquireTweets()
        test.getSentiment()
        test.cleanDateTime()
        self.assertTrue(
            ptypes.is_datetime64_any_dtype(test.tweetSentiment.date))
Example #24
def _map_pandas_val_to_arrow_dtypes(ser: pd.Series) -> pa.DataType:
    # save on storage w/ second-precision timestamps and float32
    dtype = ser.dtype  # type: ignore
    if pdtypes.is_datetime64_any_dtype(dtype):
        return pa.timestamp("s", tz=getattr(dtype, "tz", None))
    elif pdtypes.is_float_dtype(dtype):
        return pa.float32()
    else:
        return pa.array(ser, from_pandas=True).type
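A usage sketch, assuming `pdtypes` aliases pandas.api.types and `pa` aliases pyarrow:

import pandas as pd
import pandas.api.types as pdtypes
import pyarrow as pa

s = pd.Series(pd.to_datetime(["2021-01-01"]))
print(_map_pandas_val_to_arrow_dtypes(s))                 # timestamp[s]
print(_map_pandas_val_to_arrow_dtypes(pd.Series([1.5])))  # float (i.e. pa.float32())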
Example #25
    def contains_op(cls, series: pd.Series) -> bool:
        if not pdt.is_datetime64_any_dtype(series):
            return False

        temp_series = series.dropna().dt
        time_val_map = {"hour": 0, "minute": 0, "second": 0}
        return all(
            getattr(temp_series, time_part).eq(val).all()
            for time_part, val in time_val_map.items())
Example #26
    def _fill_na(self, array):

        for i in self.missing_value:
            array = array.replace(i, np.nan)
        if not is_datetime64_any_dtype(array):
            array = array.astype(str)
            array = series_parse_date(array)
        array = pd.to_datetime(array, errors='coerce')
        return array
Example #27
def typeset_datetime_column(dt_series: pd.Series,
                            dt_format: Optional[str]) -> pd.Series:
    dt_series = dt_series.copy()
    if not is_datetime64_any_dtype(dt_series):
        if dt_format is not None:
            dt_series = pd.to_datetime(dt_series, format=dt_format)
        else:
            dt_series = pd.to_datetime(dt_series)
    return dt_series
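A usage sketch, assuming the pandas imports the function relies on:

import pandas as pd
from pandas.api.types import is_datetime64_any_dtype

s = pd.Series(["01/02/2021", "15/06/2021"])
print(typeset_datetime_column(s, "%d/%m/%Y"))  # 2021-02-01 and 2021-06-15 as datetime64[ns]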
Example #28
 def _is_datetime(s):
     if is_datetime64_any_dtype(s):
         return True
     try:
         if is_object_dtype(s):
             pd.to_datetime(s, infer_datetime_format=True)
             return True
     except Exception:
         pass
     return False
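A usage sketch (note infer_datetime_format is deprecated in recent pandas, where it only emits a warning):

import pandas as pd
from pandas.api.types import is_datetime64_any_dtype, is_object_dtype

print(_is_datetime(pd.Series(pd.to_datetime(["2021-01-01"]))))  # True
print(_is_datetime(pd.Series(["2021-01-01", "2021-02-01"])))    # True (parseable strings)
print(_is_datetime(pd.Series([1, 2, 3])))                       # False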
Example #29
 def _is_datetime(s):
     if is_datetime64_any_dtype(s):
         return True
     try:
         if is_object_dtype(s):
             pd.to_datetime(s, infer_datetime_format=True)
             return True
     except Exception:  # pylint: disable=broad-except
         pass
     return False
Example #31
def test_date_cols_gen_simple():
    right = ["date", "DATE", "this_is_a_dAtE"]
    wrong = [
        "note a daate",
        "mandate",
        "datewrong",
        "innerdateinner",
        "1date1",
        "%date%",
    ]

    df = pd.DataFrame(columns=right + wrong)

    df = date_cols_gen(df)

    for c in right:
        assert ptypes.is_datetime64_any_dtype(df[c])
    for c in wrong:
        assert not ptypes.is_datetime64_any_dtype(df[c])
Example #33
def test__write_frame__read_frame():
    print("Start")
    from django.db import utils
    from econdata.models import Listing
    from libclair.dataframes import write_frame_create, read_frame, write_frame
    
    # Create a DataFrame and write it into the database
    fr1 = pd.DataFrame([{'id':'foo-1', 'site':'a', 'id_site':'1', 'title':'The 1st record.'},
                        {'id':'foo-2', 'site':'a', 'id_site':'2', 'title':'The 2nd record.'}])
    print('\nfr1:\n', fr1)
    write_frame_create(fr1, Listing, delete=True)
    # The records already exist. Creating them again, without deleting them, 
    # must raise an exception.
    with pytest.raises(utils.IntegrityError):
        write_frame_create(fr1, Listing)
    
    # Read the records, that were just created, from the database.
    # Read a few additional empty columns.
    qset = Listing.objects.filter(id__in=['foo-1', 'foo-2'])
    fr2 = read_frame(qset, ['id', 'title', 'time', 'price'])
    print('\nfr2:\n', fr2)
    
    assert pd_types.is_string_dtype(fr2['title'])
    assert pd_types.is_datetime64_any_dtype(fr2['time'])
    assert pd_types.is_numeric_dtype(fr2['price'])
    assert fr2['id'][0] == 'foo-1'
    assert fr2['id'][1] == 'foo-2'
    assert fr2['title'][0] == 'The 1st record.'
    assert fr2['title'][1] == 'The 2nd record.'
    
    # Change the dataframe
    fr2['time'] = [pd.Timestamp('2017-01-01 12:00+0'), 
                   pd.Timestamp('2017-01-02 12:00+0'),]
    fr2['price'] = [101.0, 102.0,]
    print('\nfr2:\n', fr2)
    # Update the records in the database
    write_frame(fr2, Listing)
    
    # Read the updated records from the database.
    qset = Listing.objects.filter(id__in=['foo-1', 'foo-2'])
    fr3 = read_frame(qset, ['id', 'title', 'time', 'price'])
    print('\nfr3:\n', fr3)
    assert_frames_equal(fr2, fr3)
Example #34
def coerce_dtypes(df, dtypes):
    """ Coerce dataframe to dtypes safely

    Operates in place

    Parameters
    ----------
    df: Pandas DataFrame
    dtypes: dict like {'x': float}
    """
    bad_dtypes = []
    bad_dates = []
    errors = []
    for c in df.columns:
        if c in dtypes and df.dtypes[c] != dtypes[c]:
            actual = df.dtypes[c]
            desired = dtypes[c]
            if is_float_dtype(actual) and is_integer_dtype(desired):
                bad_dtypes.append((c, actual, desired))
            elif is_object_dtype(actual) and is_datetime64_any_dtype(desired):
                # This can only occur when parse_dates is specified, but an
                # invalid date is encountered. Pandas then silently falls back
                # to object dtype. Since `object_array.astype(datetime)` will
                # silently overflow, error here and report.
                bad_dates.append(c)
            else:
                try:
                    df[c] = df[c].astype(dtypes[c])
                except Exception as e:
                    bad_dtypes.append((c, actual, desired))
                    errors.append((c, e))

    if bad_dtypes:
        if errors:
            ex = '\n'.join("- %s\n  %r" % (c, e) for c, e in
                           sorted(errors, key=lambda x: str(x[0])))
            exceptions = ("The following columns also raised exceptions on "
                          "conversion:\n\n%s\n\n") % ex
            extra = ""
        else:
            exceptions = ""
            # All mismatches are int->float, also suggest `assume_missing=True`
            extra = ("\n\nAlternatively, provide `assume_missing=True` "
                     "to interpret\n"
                     "all unspecified integer columns as floats.")

        bad_dtypes = sorted(bad_dtypes, key=lambda x: str(x[0]))
        table = asciitable(['Column', 'Found', 'Expected'], bad_dtypes)
        dtype_kw = ('dtype={%s}' % ',\n'
                    '       '.join("%r: '%s'" % (k, v)
                                   for (k, v, _) in bad_dtypes))

        dtype_msg = (
            "{table}\n\n"
            "{exceptions}"
            "Usually this is due to dask's dtype inference failing, and\n"
            "*may* be fixed by specifying dtypes manually by adding:\n\n"
            "{dtype_kw}\n\n"
            "to the call to `read_csv`/`read_table`."
            "{extra}").format(table=table, exceptions=exceptions,
                              dtype_kw=dtype_kw, extra=extra)
    else:
        dtype_msg = None

    if bad_dates:
        also = " also " if bad_dtypes else " "
        cols = '\n'.join("- %s" % c for c in bad_dates)
        date_msg = (
            "The following columns{also}failed to properly parse as dates:\n\n"
            "{cols}\n\n"
            "This is usually due to an invalid value in that column. To\n"
            "diagnose and fix it's recommended to drop these columns from the\n"
            "`parse_dates` keyword, and manually convert them to dates later\n"
            "using `dd.to_datetime`.").format(also=also, cols=cols)
    else:
        date_msg = None

    if bad_dtypes or bad_dates:
        rule = "\n\n%s\n\n" % ('-' * 61)
        msg = ("Mismatched dtypes found in `pd.read_csv`/`pd.read_table`.\n\n"
               "%s" % (rule.join(filter(None, [dtype_msg, date_msg]))))
        raise ValueError(msg)
Example #35
def convert_col_dtype(col, int_to_category=True, force_fp32=True):
    """Convert datatypes for columns according to "sensible" rules for the
    tasks in this module:

    * integer types are reduced to smallest integer type without losing
      information, or to a categorical if that uses less memory (roughly)
    * float types are all made the same: either the type of the first element,
      or all are reduced to single precision
    * object types that contain strings are converted to categoricals
    * object types that contain numbers are converted according to the rules
      above to either floats, shortest-possible ints, or a categorical
    * bool types are forced to ``numpy.dtype('bool')``

    Parameters
    ----------
    col : pandas.Series
        Column

    int_to_category : bool
        Whether to convert integer types to categoricals in the case that this
        will save memory.

    force_fp32 : bool
        Force all floating-point data types to be single precision (fp32). If
        False, the type of the first element is used instead (for all values in
        the column).

    Returns
    -------
    col : pandas.Series

    """
    from pisa.utils.fileio import fsort

    categorical_dtype = CategoricalDtype()

    recognized_dtype = False
    original_dtype = col.dtype
    col_name = col.name

    if len(col) == 0: #pylint: disable=len-as-condition
        return col

    first_item = col.iloc[0]

    # Default: keep current dtype
    new_dtype = original_dtype

    if (is_categorical_dtype(original_dtype)
            or is_datetime64_any_dtype(original_dtype)
            or is_timedelta64_dtype(original_dtype)
            or is_timedelta64_ns_dtype(original_dtype)):
        recognized_dtype = True
        new_dtype = original_dtype
    elif is_object_dtype(original_dtype):
        if isinstance(first_item, basestring):
            recognized_dtype = True
            new_dtype = categorical_dtype
        # NOTE: Must check bool before int since bools look like ints (but not
        # vice versa)
        elif isinstance(first_item, BOOL_TYPES):
            recognized_dtype = True
            new_dtype = np.dtype('bool')
        elif isinstance(first_item, INT_TYPES + UINT_TYPES):
            recognized_dtype = True
            new_dtype = np.dtype('int')
        elif isinstance(first_item, FLOAT_TYPES):
            recognized_dtype = True
            new_dtype = np.dtype(type(first_item))

    # Convert ints to either shortest int possible or categorical,
    # whichever is smaller (use int if same size)
    if new_dtype in INT_DTYPES + UINT_DTYPES:
        recognized_dtype = True
        # See how large an int would be necessary
        col_min, col_max = col.min(), col.max()
        found_int_dtype = False
        int_dtype = None
        for int_dtype in INT_DTYPES:
            exponent = 8*int_dtype.itemsize - 1
            min_representable = -2 ** exponent
            max_representable = (2 ** exponent) - 1
            if col_min >= min_representable and col_max <= max_representable:
                found_int_dtype = True
                break
        if not found_int_dtype:
            raise ValueError('Value(s) in column "%s" exceed %s bounds'
                             % (col_name, int_dtype))

        # Check if categorical is probably smaller than int dtype; note that
        # the below is not perfect (i.e. is not based on exact internal
        # representation of categoricals in Pandas...) but should get us pretty
        # close, so that at least order-of-magnitude efficiencies will be
        # found)
        if int_to_category:
            num_unique = len(col.unique())
            category_bytes = int(np.ceil(np.log2(num_unique) / 8))
            if category_bytes < int_dtype.itemsize:
                new_dtype = categorical_dtype
            else:
                new_dtype = int_dtype

    elif new_dtype in FLOAT_DTYPES:
        recognized_dtype = True
        if force_fp32:
            new_dtype = np.dtype('float32')
        else:
            new_dtype = np.dtype(type(first_item))

    elif new_dtype in BOOL_DTYPES:
        recognized_dtype = True
        new_dtype = np.dtype('bool')

    if not recognized_dtype:
        wstderr('WARNING: Not modifying column "%s" with unhandled dtype "%s"'
                ' and/or sub-type "%s"\n'
                % (col_name, original_dtype.name, type(first_item)))

    if is_dtype_equal(new_dtype, original_dtype):
        if isinstance(first_item, basestring):
            return col.cat.reorder_categories(fsort(col.cat.categories))
        return col

    if is_categorical_dtype(new_dtype):
        new_col = col.astype('category')
        if isinstance(first_item, basestring):
            new_col.cat.reorder_categories(fsort(new_col.cat.categories),
                                           inplace=True)
        return new_col

    try:
        return col.astype(new_dtype)
    except ValueError:
        wstderr('WARNING: Could not convert column "%s" to dtype "%s"; keeping'
                ' original dtype "%s"\n'
                % (col_name, new_dtype, original_dtype))
        return col
Example #36
def pandas_to_table(df):
    # type: (pd.DataFrame) -> Orange.data.Table
    """
    Convert a pandas.DataFrame to a Orange.data.Table instance.
    """
    index = df.index
    if not isinstance(index, pd.RangeIndex):
        df = df.reset_index()

    columns = []  # type: List[Tuple[Orange.data.Variable, np.ndarray]]

    for header, series in df.items():  # type: (Any, pd.Series)
        if pdtypes.is_categorical_dtype(series):  # is_categorical was removed from pandas
            coldata = series.values  # type: pd.Categorical
            categories = [str(c) for c in coldata.categories]
            var = Orange.data.DiscreteVariable.make(
                str(header), values=categories, ordered=coldata.ordered
            )
            # Remap the coldata into the var.values order/set
            coldata = pd.Categorical(
                coldata, categories=var.values, ordered=coldata.ordered
            )
            codes = coldata.codes
            assert np.issubdtype(codes.dtype, np.integer)
            orangecol = np.array(codes, dtype=float)  # np.float was removed from NumPy
            orangecol[codes < 0] = np.nan
        elif pdtypes.is_datetime64_any_dtype(series):
            # Check that this converts tz local to UTC
            series = series.astype(np.dtype("M8[ns]"))
            coldata = series.values  # type: np.ndarray
            assert coldata.dtype == "M8[ns]"
            mask = np.isnat(coldata)
            orangecol = coldata.astype(np.int64) / 10 ** 9
            orangecol[mask] = np.nan
            var = Orange.data.TimeVariable.make(str(header))
            var.have_date = var.have_time = 1
        elif pdtypes.is_object_dtype(series):
            coldata = series.values
            assert isinstance(coldata, np.ndarray)
            orangecol = coldata
            var = Orange.data.StringVariable.make(str(header))
        elif pdtypes.is_integer_dtype(series):
            coldata = series.values
            var = Orange.data.ContinuousVariable.make(str(header))
            var.number_of_decimals = 0
            orangecol = coldata.astype(np.float64)
        elif pdtypes.is_numeric_dtype(series):
            orangecol = series.values.astype(np.float64)
            var = Orange.data.ContinuousVariable.make(str(header))
            var._out_format = "%.15g"
        else:
            warnings.warn(
                "Column '{}' with dtype: {} skipped."
                .format(header, series.dtype),
                UserWarning
            )
            continue
        columns.append((var, orangecol))

    cols_x = [(var, col) for var, col in columns if var.is_primitive()]
    cols_m = [(var, col) for var, col in columns if not var.is_primitive()]

    variables = [v for v, _ in cols_x]
    if cols_x:
        X = np.column_stack([a for _, a in cols_x])
    else:
        X = np.empty((df.shape[0], 0), dtype=float)
    metas = [v for v, _ in cols_m]
    if cols_m:
        M = np.column_stack([a for _, a in cols_m])
    else:
        M = None

    domain = Orange.data.Domain(variables, metas=metas)
    return Orange.data.Table.from_numpy(domain, X, None, M)
Example #37
def dtype_detection(data,category_detection=True,StructureText_detection=True,\
datetime_to_category=True,criterion='sqrt',min_mean_counts=5,fix=False):
    '''Detect the data type of a single variable.
    The data types are divided into the following six kinds:
    1. number: numeric
    2. category: categorical/factor
    3. datetime: datetime type
    4. text: free text
    5. text_st: structured text, such as IDs
    6. group_number: continuous groups

    parameter
    ---------
    data: pd.Series, one-dimensional only
    # if data is given, the function may change the dtype of the original data
    category_detection: bool, detect categorical columns based on nunique
    StructureText_detection: bool, detect structured text, e.g. a column whose values all share a separator such as "-"
    datetime_to_category: whether a datetime series with a small nunique is converted to a categorical
    criterion: string or int, optional (default="sqrt", i.e. the square root of the sample size)
        supported: 'sqrt': square root of the sample size, int: an absolute count, float in (0, 1): a fraction of the sample size
        when detecting categoricals, a feature whose nunique is below criterion is treated as categorical
    min_mean_counts: default 5; a numeric column is only treated as categorical if the mean count per category is at least min_mean_counts
    fix: bool, whether to return the data with its dtype fixed


    return:
    result: dict{
        'name': column name,
        'vtype': variable type,
        'ordered': whether the factor is ordered,
        'categories': all the categories}

    '''

    assert len(data.shape)==1
    data=data.copy()
    data=pd.Series(data)
    dtype,name,n_sample=data.dtype,data.name,data.count()

    min_mean_counts=5
    if criterion=='sqrt':
        max_nuniques=np.sqrt(n_sample)
    elif isinstance(criterion,int):
        max_nuniques=criterion
    elif isinstance(criterion,float) and (0<criterion<1):
        max_nuniques=criterion
    else:
        max_nuniques=np.sqrt(n_sample)
    ordered=False
    categories=[]
    if is_numeric_dtype(dtype):
        vtype='number'
        ordered=False
        categories=[]
        # Correct misclassified dtypes, e.g. turn 1.0, 2.0, 3.0 into 1, 2, 3
        if data.dropna().astype(np.int64).sum()==data.dropna().sum():
            data[data.notnull()]=data[data.notnull()].astype(np.int64)
        if category_detection:
            nunique=len(data.dropna().unique())
            mean_counts=data.value_counts().median()
            if nunique<max_nuniques and mean_counts>=min_mean_counts:
                data=data.astype('category')
                ordered=data.cat.ordered
                vtype='category'
                categories=list(data.dropna().cat.categories)
        result={'name':name,'vtype':vtype,'ordered':ordered,'categories':categories}
    elif is_string_dtype(dtype):
        # Handle datetime-like strings
        tmp=data.map(lambda x: np.nan if '%s'%x == 'nan' else len('%s'%x))
        tmp=tmp.dropna().astype(np.int64)
        if not(any(data.dropna().map(is_number))) and 7<tmp.max()<20 and tmp.std()<0.1:
            try:
                data=pd.to_datetime(data)
            except Exception:
                pass
        # Handle possible categorical types
        # (whether or not the datetime parsing above succeeded)
        if datetime_to_category:
            if len(data.dropna().unique())<np.sqrt(n_sample):
                data=data.astype('category')
        else:
            nunique=len(data.dropna().unique())
            #print(data.dtype)
            if not(is_categorical_dtype(data.dtype)) and not(np.issubdtype(data.dtype,np.datetime64)) and nunique<max_nuniques:
                data=data.astype('category')

        # For non-categorical strings, convert percentages to floats, e.g. 21.12% --> 0.2112
        if is_string_dtype(data.dtype) and not(is_categorical_dtype(data.dtype)) and all(data.str.contains('%')):
            data=data.str.strip('%').astype(np.float64)/100

        if is_categorical_dtype(data.dtype):
            vtype='category'
            categories=list(data.cat.categories)
            ordered=data.cat.ordered
        # Datetime format
        elif np.issubdtype(data.dtype,np.datetime64):
            vtype='datetime'
        # Is it structured text?
        elif StructureText_detection and tmp.dropna().std()==0:
            # Not iterable and not a string
            if not(isinstance(data.dropna().iloc[0],Iterable)):
                vtype='text'
            else:
                k=set(list(data.dropna().iloc[0]))
                for x in data:
                    if isinstance(x,str) and len(x)>0:
                        k&=set(list(x))
                if len(k)>0:
                    vtype='text_st'
                else:
                    vtype='text'
        elif is_numeric_dtype(data.dtype):
            vtype='number'
            ordered=False
            categories=[]
        else:
            vtype='text'
        result={'name':name,'vtype':vtype,'ordered':ordered,'categories':categories}
    elif is_datetime64_any_dtype(dtype):
        vtype='datetime'
        result={'name':name,'vtype':vtype,'ordered':ordered,'categories':categories}
    else:
        print('unknown dtype!')
        result=None

    if fix:
        return result,data
    else:
        return result
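A usage sketch, assuming the numpy/pandas type helpers the function relies on are imported:

import numpy as np
import pandas as pd
from pandas.api.types import (is_numeric_dtype, is_string_dtype,
                              is_categorical_dtype, is_datetime64_any_dtype)

s = pd.Series([1, 2, 3] * 20, name="grade")
print(dtype_detection(s))
# {'name': 'grade', 'vtype': 'category', 'ordered': False, 'categories': [1, 2, 3]}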