Code example #1
File: test_cli.py Project: xtmgah/cooler
def test_cload_field():
    runner = CliRunner()
    with runner.isolated_filesystem():
        extra_args = ['--field', 'score=7']
        result = _run_cload_pairs(runner, 2, extra_args)
        assert result.exit_code == 0
        pixels = cooler.Cooler('toy.2.cool').pixels()[:]
        assert 'count' in pixels.columns and types.is_integer_dtype(
            pixels.dtypes['count'])
        assert 'score' in pixels.columns and types.is_float_dtype(
            pixels.dtypes['score'])

        extra_args = ['--field', 'count=7']
        result = _run_cload_pairs(runner, 2, extra_args)
        assert result.exit_code == 0
        pixels = cooler.Cooler('toy.2.cool').pixels()[:]
        assert 'count' in pixels.columns and types.is_integer_dtype(
            pixels.dtypes['count'])
        assert np.allclose(pixels['count'][:], 0)

        extra_args = ['--field', 'count=7:dtype=float']
        result = _run_cload_pairs(runner, 2, extra_args)
        assert result.exit_code == 0
        pixels = cooler.Cooler('toy.2.cool').pixels()[:]
        assert 'count' in pixels.columns and types.is_float_dtype(
            pixels.dtypes['count'])
        assert np.allclose(pixels['count'][:], 0.2)

        extra_args = ['--field', 'count=7:agg=min,dtype=float']
        result = _run_cload_pairs(runner, 2, extra_args)
        assert result.exit_code == 0
        pixels = cooler.Cooler('toy.2.cool').pixels()[:]
        assert 'count' in pixels.columns and types.is_float_dtype(
            pixels.dtypes['count'])
        assert np.allclose(pixels['count'][:], 0.1)
Code example #2
def test_dataframe():
    d2 = DirectAccessV2(
        api_key=DIRECTACCESS_API_KEY,
        client_id=DIRECTACCESS_CLIENT_ID,
        client_secret=DIRECTACCESS_CLIENT_SECRET,
        access_token=DIRECTACCESS_TOKEN,
    )
    df = d2.to_dataframe("rigs", pagesize=10000, deleteddate="null")

    # Check index is set to API endpoint "primary key"
    assert df.index.name == "RigID"

    # Check datetime64 dtypes
    assert is_datetime64_ns_dtype(df.CreatedDate)
    assert is_datetime64_ns_dtype(df.DeletedDate)
    assert is_datetime64_ns_dtype(df.SpudDate)
    assert is_datetime64_ns_dtype(df.UpdatedDate)

    # Check Int64 dtypes
    assert is_int64_dtype(df.PermitDepth)
    assert is_int64_dtype(df.FormationDepth)

    # Check float dtypes
    assert is_float_dtype(df.RigLatitudeWGS84)
    assert is_float_dtype(df.RigLongitudeWGS84)

Code example #3
File: test_cli_ingest.py Project: zhuakexi/cooler
def test_cload_field():
    runner = CliRunner()
    with runner.isolated_filesystem():
        extra_args = ["--field", "score=8"]
        result = _run_cload_pairs(runner, 2, extra_args)
        assert result.exit_code == 0
        pixels = cooler.Cooler("toy.2.cool").pixels()[:]
        assert "count" in pixels.columns and types.is_integer_dtype(
            pixels.dtypes["count"])
        assert "score" in pixels.columns and types.is_float_dtype(
            pixels.dtypes["score"])

        extra_args = ["--field", "count=8"]
        result = _run_cload_pairs(runner, 2, extra_args)
        assert result.exit_code == 0
        pixels = cooler.Cooler("toy.2.cool").pixels()[:]
        assert "count" in pixels.columns and types.is_integer_dtype(
            pixels.dtypes["count"])
        assert np.allclose(pixels["count"][:], 0)

        extra_args = ["--field", "count=8:dtype=float"]
        result = _run_cload_pairs(runner, 2, extra_args)
        assert result.exit_code == 0
        pixels = cooler.Cooler("toy.2.cool").pixels()[:]
        assert "count" in pixels.columns and types.is_float_dtype(
            pixels.dtypes["count"])
        assert np.allclose(pixels["count"][:], 0.2)

        extra_args = ["--field", "count=8:agg=min,dtype=float"]
        result = _run_cload_pairs(runner, 2, extra_args)
        assert result.exit_code == 0
        pixels = cooler.Cooler("toy.2.cool").pixels()[:]
        assert "count" in pixels.columns and types.is_float_dtype(
            pixels.dtypes["count"])
        assert np.allclose(pixels["count"][:], 0.1)
Code example #4
File: gps.py Project: hildakosorus/causal-curve
    def _validate_fit_data(self):
        """Verifies that T, X, and y are formatted the right way"""
        # Checks for T column
        if not is_float_dtype(self.T):
            raise TypeError(f"Treatment data must be of type float")

        # Make sure all X columns are float or int
        if isinstance(self.X, pd.Series):
            if not is_numeric_dtype(self.X):
                raise TypeError(
                    f"All covariate (X) columns must be int or float type (i.e. must be numeric)"
                )

        elif isinstance(self.X, pd.DataFrame):
            for column in self.X:
                if not is_numeric_dtype(self.X[column]):
                    raise TypeError(
                        f"All covariate (X) columns must be int or float type "
                        f"(i.e. must be numeric)")

        # Checks for Y column
        if not (is_float_dtype(self.y) or is_integer_dtype(self.y)):
            raise TypeError(f"Outcome data must be of type float or integer")

        if is_integer_dtype(self.y) and (not np.array_equal(
                np.sort(self.y.unique()), np.array([0, 1]))):
            raise TypeError(
                f"If your outcome data is of type integer (binary outcome),"
                f"it should only contain 1's and 0's.")
Code example #5
File: csv.py Project: sethah/dask
def coerce_dtypes(df, dtypes):
    """ Coerce dataframe to dtypes safely

    Operates in place

    Parameters
    ----------
    df: Pandas DataFrame
    dtypes: dict like {'x': float}
    """
    for c in df.columns:
        if c in dtypes and df.dtypes[c] != dtypes[c]:
            if is_float_dtype(df.dtypes[c]) and is_integer_dtype(dtypes[c]):
                # There is a mismatch between floating and integer columns.
                # Determine all mismatched and error.
                mismatched = sorted(c for c in df.columns
                                    if is_float_dtype(df.dtypes[c])
                                    and is_integer_dtype(dtypes[c]))

                msg = ("Mismatched dtypes found.\n"
                       "Expected integers, but found floats for columns:\n"
                       "%s\n\n"
                       "To fix, specify dtypes manually by adding:\n\n"
                       "%s\n\n"
                       "to the call to `read_csv`/`read_table`.\n\n"
                       "Alternatively, provide `assume_missing=True` to "
                       "interpret all unspecified integer columns as floats.")

                missing_list = '\n'.join('- %r' % c for c in mismatched)
                dtype_list = ('%r: float' % c for c in mismatched)
                missing_dict = 'dtype={%s}' % ',\n       '.join(dtype_list)
                raise ValueError(msg % (missing_list, missing_dict))

            df[c] = df[c].astype(dtypes[c])
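A minimal sketch of how the mismatch branch fires, assuming `coerce_dtypes` above is in scope along with `is_float_dtype`/`is_integer_dtype` from `pandas.api.types` (the data is illustrative):

import pandas as pd

# The None forces 'x' to float64, so requesting int hits the error path.
df = pd.DataFrame({'x': [1.0, 2.0, None]})
try:
    coerce_dtypes(df, {'x': int})
except ValueError as e:
    print(e)  # suggests dtype={'x': float} or assume_missing=True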
Code example #7
File: test_read.py Project: kinverarity1/lasio
def test_data_characters_types():
    from pandas.api.types import is_object_dtype
    from pandas.api.types import is_float_dtype
    las = lasio.read(egfn('data_characters.las'))
    assert is_object_dtype(las.df().index.dtype)
    assert is_object_dtype(las.df()['DATE'].dtype)
    assert is_float_dtype(las.df()['DEPT'].dtype)
    assert is_float_dtype(las.df()['ARC_GR_UNC_RT'].dtype)
Code example #8
File: test_read.py Project: wassemalward/lasio
def test_data_characters_types():
    from pandas.api.types import is_object_dtype
    from pandas.api.types import is_float_dtype

    las = lasio.read(egfn("data_characters.las"))
    assert is_object_dtype(las.df().index.dtype)
    assert is_object_dtype(las.df()["DATE"].dtype)
    assert is_float_dtype(las.df()["DEPT"].dtype)
    assert is_float_dtype(las.df()["ARC_GR_UNC_RT"].dtype)
Code example #9
File: mediation.py Project: vargeus/causal-curve
    def _validate_fit_data(self):
        """Verifies that T, M, and y are formatted the right way"""
        # Checks for T column
        if not is_float_dtype(self.T):
            raise TypeError(f"Treatment data must be of type float")

        # Checks for M column
        if not is_float_dtype(self.M):
            raise TypeError(f"Mediation data must be of type float")

        # Checks for Y column
        if not is_float_dtype(self.y):
            raise TypeError(f"Outcome data must be of type float")
Code example #10
    def guess_natsort_alg(cls, dtype: Type[Any]) -> NatsortFlagsAndValue:
        """
        Guesses a good natsorted flag for the dtype.

        Here are some specifics:
            - integers       ⇒ INT and SIGNED
            - floating-point ⇒ FLOAT and SIGNED
            - strings        ⇒ COMPATIBILITYNORMALIZE and GROUPLETTERS
            - datetime       ⇒ GROUPLETTERS (only affects 'Z' vs. 'z'; shouldn't matter)

        Args:
            dtype: Probably from ``pd.Series.dtype``

        Returns:
            A tuple of (set of flags, int) -- see :meth:`exact_natsort_alg`
        """
        st, x = set(), 0
        if is_string_dtype(dtype):
            st.update(["COMPATIBILITYNORMALIZE", "GROUPLETTERS"])
            x |= ns_enum.COMPATIBILITYNORMALIZE | ns_enum.GROUPLETTERS
        elif is_categorical_dtype(dtype):
            pass
        elif is_integer_dtype(dtype) or is_bool_dtype(dtype):
            st.update(["INT", "SIGNED"])
            x |= ns_enum.INT | ns_enum.SIGNED
        elif is_float_dtype(dtype):
            st.update(["FLOAT", "SIGNED"])
            x |= ns_enum.FLOAT | ns_enum.SIGNED  # same as ns_enum.REAL
        return NatsortFlagsAndValue(st, x)
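To see what INT and SIGNED buy in practice, here is a rough illustration using natsort directly, assuming `ns_enum` in the snippet is natsort's `ns` flag enum (the strings are made up):

from natsort import natsorted, ns

# INT | SIGNED is what guess_natsort_alg picks for integer dtypes:
# embedded numbers are parsed as signed integers, not character runs.
print(natsorted(['a10', 'a-1', 'a3'], alg=ns.INT | ns.SIGNED))
# ['a-1', 'a3', 'a10']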
Code example #11
File: helpers.py Project: notsambeck/pandabase
def series_is_boolean(col: pd.Series or pd.Index):
    """
    returns:
        None if column is all None;
        True if a pd.Series contains only True/False values (None allowed);
        False otherwise

    caveat: does not interpret all-zero or all-one columns as boolean"""
    if len(col.unique()) == 1 and col.unique()[0] is None:
        # return None for all-None columns
        return None
    elif col.isna().all():
        return None
    elif is_bool_dtype(col):
        return True
    elif is_object_dtype(col):
        for val in col.unique():
            if val not in [True, False, None]:
                return False
        return False in col.unique() and True in col.unique()
    elif is_integer_dtype(col) or is_float_dtype(col):
        for val in col.unique():
            if pd.isna(val):
                continue
            if val not in [1, 0, None]:
                return False
            if 0 not in col.unique() or 1 not in col.unique():
                return False
        return True
    return False
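A few hypothetical calls showing how the branches above behave, assuming the `is_*_dtype` helpers from `pandas.api.types` are imported as in the source module:

import pandas as pd

print(series_is_boolean(pd.Series([True, False, None])))  # True (object branch)
print(series_is_boolean(pd.Series([1, 0, 1])))            # True (0/1 integers)
print(series_is_boolean(pd.Series([1, 2, 3])))            # False (2 is not 0/1)
print(series_is_boolean(pd.Series([None, None])))         # None (all missing)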
Code example #12
def metric():
    try:
        file = request.files["file"]
        extension = file.filename.split(".")[-1]

        if not file or extension not in app.config["ALLOWED_FILE_EXTENSIONS"]:
            return 'Invalid'

        data = io.BytesIO()
        file.save(data)
        data = data.getvalue().decode('utf-8')
        data = io.StringIO(data)
        df_pred = pd.read_csv(data)

        for col in ['name', *FIELDS]:
            if col not in df_pred.columns:
                return 'Invalid: Lack of column {}'.format(col)
        if not (df_pred.name == df_true.name).all():
            return 'Invalid: names are not correct'
        if not all(is_float_dtype(df_pred[field]) for field in FIELDS):
            return 'Invalid: Expect data to be float'

        y_pred = df_pred[FIELDS].values.reshape(-1, 4, 2)
        rmse = np.sqrt(((y_true - y_pred) ** 2).sum(axis=2).mean()).item()

        print(rmse)

        return jsonify({
            'rmse': round(rmse, 3),
        })
    except Exception:
        return jsonify({
            "error_msg": traceback.format_exc()
        })
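The RMSE line reshapes eight coordinate columns into (rows, 4 points, 2 coordinates) before averaging; the arithmetic checks out standalone (the shapes are an assumption inferred from the reshape above):

import numpy as np

# Hypothetical 2 rows x 4 points x 2 coordinates, matching reshape(-1, 4, 2).
y_true = np.zeros((2, 4, 2))
y_pred = np.ones((2, 4, 2))
# Per-point squared distance summed over x/y, averaged, then rooted.
rmse = np.sqrt(((y_true - y_pred) ** 2).sum(axis=2).mean()).item()
print(round(rmse, 3))  # 1.414, i.e. sqrt(2)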
Code example #13
File: test_creation_ingest.py Project: Phlya/cooler
def test_cload_field(bins_path, pairs_path):
    kwargs = dict(
        metadata=None,
        assembly="toy",
        chunksize=10,
        zero_based=False,
        comment_char="#",
        input_copy_status="unique",
        no_symmetric_upper=False,
        temp_dir=None,
        no_delete_temp=False,
        storage_options=None,
        no_count=True,
        max_merge=200,
        chrom1=2,
        pos1=3,
        chrom2=4,
        pos2=5,
    )
    cload_pairs.callback(bins_path,
                         pairs_path,
                         testcool_path,
                         field=("score=8:dtype=float", ),
                         **kwargs)
    pixels = cooler.Cooler(testcool_path).pixels()[:]
    assert "count" in pixels.columns and types.is_integer_dtype(
        pixels.dtypes["count"])
    assert "score" in pixels.columns and types.is_float_dtype(
        pixels.dtypes["score"])
Code example #14
def prepare_data(
    dataset_df: pd.DataFrame,
    drop_na: bool = False,
    mean_int: bool = True,
    mean_float: bool = True,
    rescale_float: bool = True,
    standardize_float: bool = True,
) -> None:
    """Fill missing values and standardize float columns.

    :author: Robin Courant
    :param dataset_df: dataset to process.
    :param drop_na: whether to drop every row with at least one `NaN` cell.
    :param mean_int: whether to use mean or the median for missing integers.
    :param mean_float: whether to use mean or the median for missing floats.
    :param rescale_float: whether to rescale floats (standardize or normalize).
    :param standardize_float: whether to apply standardization or normalization.
    """
    if drop_na:
        # dropna() returns a copy, so inplace=True is required to mutate the input
        dataset_df.dropna(inplace=True)
        return

    for column_name, column_series in dataset_df.iteritems():
        if is_integer_dtype(column_series):
            if set(column_series.unique()) == {0, 1}:
                dataset_df[column_name] = _prepare_bool(column_series)
            else:
                dataset_df[column_name] = _prepare_int(column_series, mean_int)
        elif is_float_dtype(column_series):
            dataset_df[column_name] = _prepare_float(column_series, mean_float,
                                                     rescale_float,
                                                     standardize_float)
        # Raise an error if the column's type is not boolean, integer or float
        else:
            raise TypeError(f"Unrecognized type, column: {column_name}")
Code example #15
	def test_time_formater_is_float(self):
		"""test that the expected time format is created on a given dataframe as a float variable"""
		result1 = time_formater(self.df_gr)
		result_1 = self.df_gr_time
		self.assertIsInstance(result1, pd.DataFrame)
		self.assertTrue(ptypes.is_float_dtype(result1["time_hours"]))
		pd.testing.assert_frame_equal(result1, result_1)  # assertEqual on DataFrames raises "truth value is ambiguous"
Code example #16
def check_if_series_has_internal_type(series, internal_type):
    """Check if data type of series fits to the internal type of gettsim.

    Parameters
    ----------
    series : pd.Series
        Some data series.
    internal_type : TypeVar
        One of the internal gettsim types.

    Returns
    -------
    out : bool
        Return check variable.
    """
    if internal_type == FloatSeries:
        out = is_float_dtype(series) or is_integer_dtype(series)
    elif internal_type == BoolSeries:
        out = is_bool_dtype(series)
    elif internal_type == IntSeries:
        out = is_integer_dtype(series)
    elif internal_type == DateTimeSeries:
        out = is_datetime64_any_dtype(series)
    else:
        raise ValueError(f"The internal type {internal_type} is not defined.")
    return out
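A hedged sketch exercising the branches; the real type markers live in gettsim, so plain placeholder classes stand in for them here purely to make the snippet run:

import pandas as pd
from pandas.api.types import (is_bool_dtype, is_datetime64_any_dtype,
                              is_float_dtype, is_integer_dtype)

# Hypothetical stand-ins for gettsim's internal type markers.
class FloatSeries: pass
class BoolSeries: pass
class IntSeries: pass
class DateTimeSeries: pass

print(check_if_series_has_internal_type(pd.Series([1, 2]), FloatSeries))  # True: ints pass as floats
print(check_if_series_has_internal_type(pd.Series([1.5]), IntSeries))     # False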
Code example #17
    def __check_data(self):
        """
        Check input data type and frequency. Also checks the data prerequisites, defined in the models.
        """
        if not (ptypes.is_datetime64_any_dtype(self.data.index)
                or ptypes.is_period_dtype(self.data.index)):
            raise TypeError(
                'Input data index should be datetime or period object. Received : {} instead.'
                .format(self.data.index.dtype))

        if ptypes.is_period_dtype(self.data.index):
            self.data.index = self.data.index.to_timestamp()

        for prerequisite in self.Model.input_requirements:
            if prerequisite.name not in self.data:
                raise ValueError(
                    'Input data should contain "{}" data! Keyword "{}" not found.'
                    .format(prerequisite.name, prerequisite.name))
            elif not ptypes.is_float_dtype(self.data[prerequisite.name]):
                raise ValueError(
                    'Input data "{}" should be float! Currently : {}'.format(
                        prerequisite.name, self.data[prerequisite.name].dtype))
            self.__check_for_na_in_inputs(prerequisite.name)

            if prerequisite.positive:
                self.__check_for_negative_values_in_inputs(prerequisite.name)
Code example #18
def _default_transformer(col, train_df):
    if is_integer_dtype(train_df[col]):
        return int
    if is_float_dtype(train_df[col]):
        return float
    if is_string_dtype(train_df[col]):
        return LabelEncoder
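Hypothetical usage, assuming `LabelEncoder` comes from scikit-learn and the `is_*_dtype` helpers from `pandas.api.types` are imported as in the source module:

import pandas as pd
from sklearn.preprocessing import LabelEncoder  # assumed origin

train_df = pd.DataFrame({'a': [1, 2], 'b': [0.5, 1.5], 'c': ['x', 'y']})
print(_default_transformer('a', train_df))  # <class 'int'>
print(_default_transformer('b', train_df))  # <class 'float'>
print(_default_transformer('c', train_df))  # the LabelEncoder class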
Code example #19
File: file.py Project: qingyun-wu/automlbenchmark
    def load_metadata(self):
        self._ensure_loaded()
        dtypes = self.dataset._dtypes
        to_feature_type = lambda dt: (
            'int' if pat.is_integer_dtype(dt) else 'float'
            if pat.is_float_dtype(dt) else 'number'
            if pat.is_numeric_dtype(dt) else 'category'
            if pat.is_categorical_dtype(dt) else 'string'
            if pat.is_string_dtype(dt)
            # else 'datetime' if pat.is_datetime64_dtype(dt)
            else 'object')
        features = [
            Feature(i, col, to_feature_type(dtypes[i]))
            for i, col in enumerate(self._ds.columns)
        ]

        for f in features:
            col = self._ds.iloc[:, f.index]
            f.has_missing_values = col.hasnans
            if f.is_categorical():
                f.values = sorted(self._ds.dtypes[f.name].categories.values)

        target = self._find_target_feature(features)
        self._set_feature_as_target(target)

        meta = dict(features=features, target=target)
        log.debug("Metadata for dataset %s: %s", self.path, meta)
        return meta
Code example #20
File: test_integer.py Project: zkw03/pandas
    def _check_op(self, s, op_name, other, exc=None):
        op = self.get_op_from_name(op_name)
        result = op(s, other)

        # compute expected
        mask = s.isna()

        # other array is an Integer
        if isinstance(other, IntegerArray):
            omask = getattr(other, 'mask', None)
            mask = getattr(other, 'data', other)
            if omask is not None:
                mask |= omask

        # float result type or float op
        if ((is_float_dtype(other) or is_float(other) or
             op_name in ['__rtruediv__', '__truediv__',
                         '__rdiv__', '__div__'])):
            rs = s.astype('float')
            expected = op(rs, other)
            self._check_op_float(result, expected, mask, s, op_name, other)

        # integer result type
        else:
            rs = pd.Series(s.values._data)
            expected = op(rs, other)
            self._check_op_integer(result, expected, mask, s, op_name, other)
Code example #21
def categorical_func(series):
    natural_language_threshold = ww.config.get_option(
        'natural_language_threshold')
    numeric_categorical_threshold = ww.config.get_option(
        'numeric_categorical_threshold')

    if pdtypes.is_string_dtype(series.dtype) and not col_is_datetime(series):
        # heuristics to predict this is something other than categorical
        sample = series.sample(min(10000, len(series)))
        # catch cases where object dtype cannot be interpreted as a string
        try:
            avg_length = sample.str.len().mean()
            if avg_length > natural_language_threshold:
                return False
        except AttributeError:
            pass
        return True

    if pdtypes.is_categorical_dtype(series.dtype):
        return True
    if ((pdtypes.is_float_dtype(series.dtype)
         or pdtypes.is_integer_dtype(series.dtype)) and
            _is_numeric_categorical(series, numeric_categorical_threshold)):
        return True
    return False
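The natural-language heuristic reduces to comparing average string length against a threshold; a self-contained illustration (the threshold value is an assumption, not woodwork's actual default):

import pandas as pd

natural_language_threshold = 10  # assumed for illustration
s = pd.Series(['short', 'tags', 'here'])
avg_length = s.sample(min(10000, len(s))).str.len().mean()
print(avg_length > natural_language_threshold)  # False -> treated as categorical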
Code example #22
File: test_creation_ingest.py Project: xtmgah/cooler
def test_cload_field(bins_path, pairs_path):
    kwargs = dict(
        metadata=None,
        assembly='toy',
        chunksize=10,
        zero_based=False,
        comment_char='#',
        input_copy_status='unique',
        no_symmetric_upper=False,
        temp_dir=None,
        no_delete_temp=False,
        storage_options=None,
        no_count=True,
        max_merge=200,
        chrom1=1,
        pos1=2,
        chrom2=3,
        pos2=4,
    )
    cload_pairs.callback(bins_path,
                         pairs_path,
                         testcool_path,
                         field=('score=7:dtype=float', ),
                         **kwargs)
    pixels = cooler.Cooler(testcool_path).pixels()[:]
    assert 'count' in pixels.columns and types.is_integer_dtype(
        pixels.dtypes['count'])
    assert 'score' in pixels.columns and types.is_float_dtype(
        pixels.dtypes['score'])
Code example #23
def double_func(series):
    numeric_categorical_threshold = ww.config.get_option(
        'numeric_categorical_threshold')
    if (pdtypes.is_float_dtype(series.dtype) and not _is_numeric_categorical(
            series, numeric_categorical_threshold)):
        return True
    return False
Code example #24
File: types.py Project: ieaves/tenzing
 def contains_op(self, series):
     if not pdt.is_float_dtype(series):
         return False
     elif series in tenzing_integer:
         return False
     else:
         return True
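The `series in tenzing_integer` test relies on tenzing types implementing containment; a rough pandas-only stand-in for "float dtype but not integral-valued" might look like:

import pandas as pd
from pandas.api.types import is_float_dtype

def looks_like_strict_float(series):
    # Hypothetical approximation of the tenzing membership test above.
    if not is_float_dtype(series):
        return False
    return not series.dropna().apply(float.is_integer).all()

print(looks_like_strict_float(pd.Series([1.5, 2.5])))  # True
print(looks_like_strict_float(pd.Series([1.0, 2.0])))  # False (integral floats)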
Code example #25
File: design.py Project: neerajprad/brmp
 def dispatch(col):
     dfcol = df[col.factor]
     if type(col) == IndicatorCol:
         assert is_categorical_dtype(dfcol)
         return (dfcol == col.level).to_numpy()
     elif type(col) == NumericCol:
         assert is_float_dtype(dfcol) or is_integer_dtype(dfcol)
         return dfcol.to_numpy()
     elif type(col) == CustomCol:
         assert col.factor in contrasts
         mat = contrasts[col.factor]
         levels = metadata.column(col.factor).levels
         # TODO: This can be triggered in normal use, so turn into
         # friendly error. It probably makes sense to check for
         # this earlier. This could possibly happen in `defm` after
         # creating the metadata, though this would also require a
         # separate check for SequentialOED. Is there anywhere
         # sensible to put this that catches both?
         assert len(levels) == mat.shape[0]
         assert col.index < mat.shape[1]
         # TODO: Better asymptotics than using `.index()`
         out = mat[[levels.index(val) for val in dfcol], col.index]
         return out
     else:
         raise Exception('unknown column type')
Code example #26
    def categorical_bar_graph(self, params=None):
        df_parameters = self._dataframe_parameters()
        if not params:
            params = list(df_parameters)

        for item in params:
            if is_float_dtype(df_parameters[item]) or is_integer_dtype(
                    df_parameters[item]):
                continue
            else:
                # ToDO: fix
                # this is broken
                for i in range(len(df_parameters[item].index)):
                    if type(df_parameters[item][i]) == str or type(
                            df_parameters[item][i]) == bool:
                        continue
                    else:
                        # ToDO: enhancement
                        #this is where I could potentially make it so that an additional bar said numerical
                        df_parameters[item] = df_parameters[item].drop(i)
            try:
                plt.figure(figsize=(20, 8))
                plt.rcParams['font.size'] = 18
                sns.countplot(x=item, data=df_parameters,
                              palette="husl").set_title(
                                  '{} bar graph'.format(item))
                path = self._save_to_folder('/bar_parameters',
                                            '{}_bar_graph.pdf'.format(item))
                plt.savefig(path)
                plt.close('all')
            except Exception:
                continue
Code example #27
    def categorical_evolution(self, params=None):
        df_parameters = self._dataframe_parameters()
        if not params:
            params = list(df_parameters)

        df_parameters['iteration'] = self.results['iteration']
        for item in params:
            if is_float_dtype(df_parameters[item]) or is_integer_dtype(
                    df_parameters[item]):
                continue
            else:
                for i in range(len(df_parameters[item].index)):
                    if type(df_parameters[item][i]) == str or type(
                            df_parameters[item][i]) == bool:
                        continue
                    else:
                        df_parameters[item] = df_parameters[item].drop(i)
            try:
                plt.figure(figsize=(20, 8))
                plt.rcParams['font.size'] = 18
                sns.catplot(data=df_parameters, x='iteration',
                            y=item).fig.suptitle(
                                '{} over iterations'.format(item))
                path = self._save_to_folder(
                    '/category_evolution',
                    '{}_category_iter_graph.pdf'.format(item))
                plt.savefig(path)
                plt.close('all')
            except Exception:
                continue
Code example #28
def load_external_csv_dataset(filename, split_percent, algo_type):
    try:
        filepath = os.path.join(media_root, filename)
        df = pd.read_csv(filepath)
        cols = list(df.columns)
        last = cols[-1]
    except Exception as e:
        return f'{e}', None, None, None, None, None

    if df.isnull().values.any():
        return "NAN VALUES IN DATASET", None, None, None, None, None

    #check whether all input data is numeric or not
    for column in cols[:-1]:
        if not is_numeric_dtype(df[column]):
            return "NON NUMERIC VALUE FOUND IN DATASET (EXCLUDING OUTPUT COLUMN)", None, None, None, None, None
    if is_float_dtype(df[last]) and algo_type == "Classification":
        return "OUTPUT COLUMN DOES NOT CONTAIN CATEGORICAL VALUES", None, None, None, None, None
    if not is_float_dtype(df[last]) and algo_type == "Regression":
        return "OUTPUT COLUMN DOES NOT CONTAIN CONTINUOUS NUMERIC VALUES", None, None, None, None, None

    status = "VALID DATASET"
    if split_percent < 0:
        train_output = df[last]
        train_input = df.drop([last], axis=1)
        test_input = train_input.copy()
        test_output = train_output.copy()
        return status, train_input, train_output, test_input, test_output, last
    #randomly shuffle the data first
    np.random.seed(2)
    df = df.sample(frac=1).reset_index(drop=True)

    #splitting the training data and testing data
    test_data_size = (df.shape[0] * split_percent) // 100
    r_no = np.random.randint(df.shape[0] - test_data_size)

    test_data = df[r_no:r_no + test_data_size]
    train_data = df.drop(range(r_no, r_no + test_data_size))

    #splitting output and input
    train_output = train_data[last]
    test_output = test_data[last]

    train_input = train_data.drop([last], axis=1)
    test_input = test_data.drop([last], axis=1)

    return status, train_input, train_output, test_input, test_output, last
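A hedged usage sketch; the file name is hypothetical and would need to exist under `media_root`, and the return order follows the snippet above:

status, train_in, train_out, test_in, test_out, target = load_external_csv_dataset(
    'iris.csv',                  # hypothetical file under media_root
    split_percent=20,            # hold out 20% of rows for testing
    algo_type='Classification',  # the last column must then be non-float
)
print(status)  # 'VALID DATASET' when all checks pass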
Code example #29
 def handle_categorical(df: pd.DataFrame, columns: frozenset) -> None:
     cols = [col for col in df.columns if col in columns]  # list keeps column order; pandas rejects set indexers
     cols = [
         col for col, dtype in zip(cols, df[cols].dtypes)
         if not is_float_dtype(dtype)
     ]
     if cols:
         df[cols] = df[cols].astype("category")
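Hypothetical usage, treating the method fragment above as a free function and assuming `is_float_dtype` is imported as in the source module:

import pandas as pd

df = pd.DataFrame({'city': ['a', 'b'], 'price': [1.5, 2.5], 'rooms': [2, 3]})
handle_categorical(df, frozenset({'city', 'price', 'rooms'}))
print(df.dtypes)  # city and rooms become category; price stays float64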
Code example #30
def save_as_spss(data_frame: pandas.DataFrame,
                 out_path: str,
                 labels: dict = None,
                 find=None,
                 repl=None) -> None:
    """
    caastools.utils.save_as_spss(data_frame: pandas.DataFrame, out_path: str) -> None
    saves data_frame as an SPSS dataset at out_path
    :param data_frame: the pandas DataFrame to save
    :param out_path: the path at which to save the file
    :param labels: a dictionary mapping column labels in the data frame to a variable label in the SPSS dataset
    :param find: a sequence of characters within variable names to be replaced with other values. Default None
    :param repl: a sequence of characters with which to replace corresponding entries in find, or a function
    which yields their replacements. Default None
    :return: None
    :raise ValueError: if either find/repl is None and the other is not
    :raise ValueError: if find and repl are sequences of unequal length
    """

    cols = data_frame.columns  # type: pandas.Index
    is_multi_index = isinstance(cols, pandas.MultiIndex)
    var_names = []
    var_types = {}
    var_formats = {}
    var_labels = {} if labels is None else labels

    # Construct the various information that the SPSS dictionary will contain about each variable
    for col in cols:
        var_name = sanitize_for_spss(".".join(
            str(i) for i in col) if is_multi_index else str(col),
                                     find=find,
                                     repl=repl)
        var_names.append(var_name)

        # Need to know the data type and format of each column so that the SPSS file can be written properly
        # 0 is a numeric type, any positive integer is a string type where the number represents the number
        # of bytes the string can hold.
        if pandas.api.types.is_string_dtype(data_frame[col]):
            lens = list(
                filter(lambda x: pandas.notna(x) and x is not None,
                       set(data_frame[col].str.len())))
            var_types[var_name] = int(max(lens)) * 2 if len(lens) > 0 else 255
        else:
            var_types[var_name] = 0
            var_formats[var_name] = "F10.2" if ptypes.is_float_dtype(data_frame[col].dtype) else \
                "ADATE8" if ptypes.is_datetime64_any_dtype(data_frame[col]) else \
                "F12.0"

    # Sometimes savReaderWriter has trouble writing a whole dataframe in at once,
    # Writing row by row seems to work without issue
    with SavWriter(out_path,
                   var_names,
                   var_types,
                   formats=var_formats,
                   varLabels=var_labels,
                   ioUtf8=True) as writer:
        for row in data_frame.index:
            writer.writerow(data_frame.loc[row, :].values)
Code example #31
    def test_search(self, mp_wfs, mp_remote_describefeaturetype, mp_remote_md,
                    mp_remote_fc, mp_remote_wfs_feature, mp_dov_xml):
        """Test the search method with only the query parameter.

        Test whether the result is correct.

        Parameters
        ----------
        mp_wfs : pytest.fixture
            Monkeypatch the call to the remote GetCapabilities request.
        mp_remote_describefeaturetype : pytest.fixture
            Monkeypatch the call to a remote DescribeFeatureType.
        mp_remote_md : pytest.fixture
            Monkeypatch the call to get the remote metadata.
        mp_remote_fc : pytest.fixture
            Monkeypatch the call to get the remote feature catalogue.
        mp_remote_wfs_feature : pytest.fixture
            Monkeypatch the call to get WFS features.
        mp_dov_xml : pytest.fixture
            Monkeypatch the call to get the remote XML data.

        """
        df = self.get_search_object().search(
            query=self.get_valid_query_single())

        assert type(df) is DataFrame

        assert list(df) == self.get_df_default_columns()

        datatype = self.get_type()
        allfields = datatype.get_field_names()
        ownfields = datatype.get_field_names(include_subtypes=False)
        subfields = [f for f in allfields if f not in ownfields]

        assert len(df) >= 1

        for field in list(df):
            if field in ownfields:
                assert len(df[field].unique()) == 1
            elif field in subfields:
                assert len(df[field].unique()) >= 1

        # dtype checks of the resulting df columns
        fields = self.get_type().get_fields(source=('wfs', 'xml', 'custom'))

        for field in list(df):
            datatype = fields[field]['type']
            if datatype == 'string':
                assert (is_object_dtype(df[field])
                        or df[field].isnull().values.all())  # all Nan/None
            elif datatype == 'float':
                assert is_float_dtype(df[field])
            elif datatype == 'integer':
                assert is_int64_dtype(df[field])
            elif datatype == 'date':
                assert is_object_dtype(df[field])
            elif datatype == 'boolean':
                assert is_bool_dtype(df[field])
Code example #32
File: test_integer.py Project: brianholland/pandas
    def _check_op(self, s, op_name, other, exc=None):
        op = self.get_op_from_name(op_name)
        result = op(s, other)

        # compute expected
        mask = s.isna()

        # if s is a DataFrame, squeeze to a Series
        # for comparison
        if isinstance(s, pd.DataFrame):
            result = result.squeeze()
            s = s.squeeze()
            mask = mask.squeeze()

        # other array is an Integer
        if isinstance(other, IntegerArray):
            omask = getattr(other, 'mask', None)
            mask = getattr(other, 'data', other)
            if omask is not None:
                mask |= omask

        # 1 ** na is na, so need to unmask those
        if op_name == '__pow__':
            mask = np.where(s == 1, False, mask)

        elif op_name == '__rpow__':
            mask = np.where(other == 1, False, mask)

        # float result type or float op
        if ((is_float_dtype(other) or is_float(other) or
             op_name in ['__rtruediv__', '__truediv__',
                         '__rdiv__', '__div__'])):
            rs = s.astype('float')
            expected = op(rs, other)
            self._check_op_float(result, expected, mask, s, op_name, other)

        # integer result type
        else:
            rs = pd.Series(s.values._data)
            expected = op(rs, other)
            self._check_op_integer(result, expected, mask, s, op_name, other)
Code example #33
File: csv.py Project: caseyclements/dask
def coerce_dtypes(df, dtypes):
    """ Coerce dataframe to dtypes safely

    Operates in place

    Parameters
    ----------
    df: Pandas DataFrame
    dtypes: dict like {'x': float}
    """
    bad_dtypes = []
    bad_dates = []
    errors = []
    for c in df.columns:
        if c in dtypes and df.dtypes[c] != dtypes[c]:
            actual = df.dtypes[c]
            desired = dtypes[c]
            if is_float_dtype(actual) and is_integer_dtype(desired):
                bad_dtypes.append((c, actual, desired))
            elif is_object_dtype(actual) and is_datetime64_any_dtype(desired):
                # This can only occur when parse_dates is specified, but an
                # invalid date is encountered. Pandas then silently falls back
                # to object dtype. Since `object_array.astype(datetime)` will
                # silently overflow, error here and report.
                bad_dates.append(c)
            else:
                try:
                    df[c] = df[c].astype(dtypes[c])
                except Exception as e:
                    bad_dtypes.append((c, actual, desired))
                    errors.append((c, e))

    if bad_dtypes:
        if errors:
            ex = '\n'.join("- %s\n  %r" % (c, e) for c, e in
                           sorted(errors, key=lambda x: str(x[0])))
            exceptions = ("The following columns also raised exceptions on "
                          "conversion:\n\n%s\n\n") % ex
            extra = ""
        else:
            exceptions = ""
            # All mismatches are int->float, also suggest `assume_missing=True`
            extra = ("\n\nAlternatively, provide `assume_missing=True` "
                     "to interpret\n"
                     "all unspecified integer columns as floats.")

        bad_dtypes = sorted(bad_dtypes, key=lambda x: str(x[0]))
        table = asciitable(['Column', 'Found', 'Expected'], bad_dtypes)
        dtype_kw = ('dtype={%s}' % ',\n'
                    '       '.join("%r: '%s'" % (k, v)
                                   for (k, v, _) in bad_dtypes))

        dtype_msg = (
            "{table}\n\n"
            "{exceptions}"
            "Usually this is due to dask's dtype inference failing, and\n"
            "*may* be fixed by specifying dtypes manually by adding:\n\n"
            "{dtype_kw}\n\n"
            "to the call to `read_csv`/`read_table`."
            "{extra}").format(table=table, exceptions=exceptions,
                              dtype_kw=dtype_kw, extra=extra)
    else:
        dtype_msg = None

    if bad_dates:
        also = " also " if bad_dtypes else " "
        cols = '\n'.join("- %s" % c for c in bad_dates)
        date_msg = (
            "The following columns{also}failed to properly parse as dates:\n\n"
            "{cols}\n\n"
            "This is usually due to an invalid value in that column. To\n"
            "diagnose and fix it's recommended to drop these columns from the\n"
            "`parse_dates` keyword, and manually convert them to dates later\n"
            "using `dd.to_datetime`.").format(also=also, cols=cols)
    else:
        date_msg = None

    if bad_dtypes or bad_dates:
        rule = "\n\n%s\n\n" % ('-' * 61)
        msg = ("Mismatched dtypes found in `pd.read_csv`/`pd.read_table`.\n\n"
               "%s" % (rule.join(filter(None, [dtype_msg, date_msg]))))
        raise ValueError(msg)
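The `bad_dates` branch above depends on pandas silently falling back to object dtype when `parse_dates` meets an invalid value; a minimal reproduction of that precondition (pandas only, data made up; exact behavior can vary across pandas versions):

import pandas as pd
from io import StringIO

csv = 'when\n2021-01-01\nnot-a-date\n'
df = pd.read_csv(StringIO(csv), parse_dates=['when'])
print(df['when'].dtype)  # object, not datetime64[ns], so coerce_dtypes reports it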