Example #1
0
def test_cast_str_to_str(source_stype):
    """Casting a string column to str32 and str64 must leave its values unchanged.

    `source_stype` is the stype the source column is created with.
    """
    values = ["Right", "middle", None, "", "A" * 1000]
    src_frame = dt.Frame(S=values, stype=source_stype)
    assert src_frame.stypes == (source_stype, )
    # Cast the same column to both string stypes in a single selection.
    casts = [dt.str32(f.S), dt.str64(f.S)]
    result = src_frame[:, casts]
    frame_integrity_check(result)
    # Both output columns are expected to equal the single source column.
    expected = src_frame.to_list() * 2
    assert result.to_list() == expected
Example #2
0
def test_cast_bool_to_str():
    """Casting a bool8 column to str32/str64 yields "True"/"False" strings.

    Missing values (None) are expected to stay None in both outputs.
    """
    flags = [False, None, True, True, False, True]
    src_frame = dt.Frame(P=flags)
    assert src_frame.stypes == (dt.bool8, )
    casts = [dt.str32(f.P), dt.str64(f.P)]
    result = src_frame[:, casts]
    assert result.stypes == (dt.str32, dt.str64)
    expected = ["False", None, "True", "True", "False", "True"]
    assert result.to_list() == [expected, expected]
Example #3
0
def test_cast_obj_to_str():
    """Casting an obj64 column to str32/str64 must match Python's str().

    The trailing None in the source is expected to remain None rather than
    becoming the string "None".
    """
    objects = [noop, "Hello!", ..., {}, dt, print, None]
    src_frame = dt.Frame(objects)
    assert src_frame.stypes == (dt.obj64, )
    first_col = f[0]
    result = src_frame[:, [dt.str32(first_col), dt.str64(first_col)]]
    frame_integrity_check(result)
    # Every element but the last is stringified; the last one (None) stays None.
    expected = [str(obj) for obj in objects[:-1]] + [None]
    assert result.to_list() == [expected, expected]
Example #4
0
def test_cast_float_to_str(source_stype):
    """Casting a float column to str32/str64 must produce the expected text.

    `source_stype` is the stype the source column is created with. The
    expected strings below spell out how infinities, NaN (-> None) and
    large magnitudes are rendered.
    """
    numbers = [3.5, 7.049, -3.18, math.inf, math.nan, 1.0, -math.inf,
               1e16, 0]
    src_frame = dt.Frame(J=numbers, stype=source_stype)
    assert src_frame.stypes == (source_stype,)
    casts = [dt.str32(f.J), dt.str64(f.J)]
    result = src_frame[:, casts]
    frame_integrity_check(result)
    expected = ["3.5", "7.049", "-3.18", "inf", None, "1.0", "-inf", "1.0e+16", "0.0"]
    assert result.to_list() == [expected, expected]
Example #5
0
def test_cast_int_to_str(source_stype):
    """Casting an integer column to str32 and str64 must stringify each value.

    `source_stype` is the stype the source column is created with. Missing
    values (None) are expected to stay None in both output columns.
    """
    DT = dt.Frame([None, 0, -3, 189, 77, 14, None, 394831, -52939047130424957],
                  stype=source_stype)
    assert DT.stypes == (source_stype, )
    RES = DT[:, [dt.str32(f.C0), dt.str64(f.C0)]]
    frame_integrity_check(RES)
    assert RES.stypes == (dt.str32, dt.str64)
    assert RES.shape == (DT.nrows, 2)
    # The expectation is derived from the values actually stored in DT
    # (read back via to_list) rather than from the literals above.
    ans = [None if v is None else str(v) for v in DT.to_list()[0]]
    # Compare both output columns so the str64 cast is verified as well,
    # not just the str32 one.
    assert RES.to_list() == [ans, ans]
Example #6
0
def test_cast_int_to_str(stype0):
    """Casting an integer column to str32 and str64 must stringify each value.

    `stype0` is the stype the source column is created with. Missing values
    (None) are expected to stay None in both output columns.
    """
    dt0 = dt.Frame(
        [None, 0, -3, 189, 77, 14, None, 394831, -52939047130424957],
        stype=stype0)
    dt1 = dt0[:, [dt.str32(f.C0), dt.str64(f.C0)]]
    dt1.internal.check()
    assert dt1.stypes == (dt.str32, dt.str64)
    assert dt1.shape == (dt0.nrows, 2)
    # The expectation is derived from the values actually stored in dt0
    # (read back via topython) rather than from the literals above.
    ans = [None if v is None else str(v) for v in dt0.topython()[0]]
    # Compare both output columns so the str64 cast is verified as well,
    # not just the str32 one.
    assert dt1.topython() == [ans, ans]
Example #7
0
def test_cast_to_str(src):
    """Casting every column of Frame(src) to str32 / str64 must stringify it.

    `src` is the data the source frame is built from; the value comparison
    below treats it as the contents of the first column.
    """
    def to_str(x):
        # Expected string form of one source value: None stays None and
        # booleans are rendered as "0"/"1" rather than "False"/"True".
        if x is None: return None
        if isinstance(x, bool): return str(int(x))
        # NOTE(review): a NaN -> None mapping was left disabled here; confirm
        # whether float NaNs are ever part of `src`.
        # if isinstance(x, float) and math.isnan(x): return None
        return str(x)

    dt0 = dt.Frame(src)
    dt1 = dt0[:, [dt.str32(f[i]) for i in range(dt0.ncols)]]
    dt2 = dt0[:, [dt.str64(f[i]) for i in range(dt0.ncols)]]
    dt1.internal.check()
    dt2.internal.check()
    assert dt1.stypes == (dt.str32, ) * dt0.ncols
    assert dt2.stypes == (dt.str64, ) * dt0.ncols
    expected = [to_str(x) for x in src]
    assert dt1.topython()[0] == expected
    # Verify the str64 values too, not only their stype.
    assert dt2.topython()[0] == expected
Example #8
0
 def _write_csv(self, df, file):
     """Write ``df`` to ``file`` as a header-less CSV of str64 columns.

     Returns None; nothing is written when ``df`` has no rows.
     """
     # Three things are taken care of before writing:
     # - Any obj64 columns are converted to str64, because Frame.to_csv
     #   can't process them.
     # - None is replaced with NULL to tell MySQL that we have actual NULL
     #   values. An empty cell is sometimes, but not always, a NULL value.
     #   See #30
     # - An empty frame is skipped entirely to circumvent a bug in
     #   datatable: see #36
     row_count = df.shape[0]
     if row_count == 0:
         return None
     # Drop all columns, append the str64 cast of every column, then select
     # the columns by their original names.
     all_as_str = f[:].remove(f[:]).extend(str64(f[:]))
     stringified = df[:, all_as_str][:, df.names]
     stringified.replace(None, "NULL")
     stringified.to_csv(path=file, header=False)
Example #9
0
    def transform(self, X: dt.Frame):
        """Join zipcode-derived features onto the first column of ``X``.

        The first column is cast to str64 and used as the lookup key;
        ``self.get_zipcode_features`` is called once per distinct non-NA key.
        As a side effect, ``self._output_feature_names`` and
        ``self._feature_desc`` are set to describe the returned columns.

        Returns:
            The joined frame without the key column. If a ValueError occurs
            during the lookup/join, the result of
            ``self.get_zipcode_null_result`` is returned instead.

        Raises:
            TypeError: logged as a warning and re-raised.
        """
        logger = None
        if self.context and self.context.experiment_id:
            logger = make_experiment_logger(
                experiment_id=self.context.experiment_id,
                tmp_dir=self.context.tmp_dir,
                experiment_tmp_dir=self.context.experiment_tmp_dir)

        X = dt.Frame(X)
        original_zip_column_name = X.names[0]
        X = X[:, dt.str64(dt.f[0])]
        X.names = ['zip_key']
        try:
            zip_list = dt.unique(X[~dt.isna(dt.f.zip_key), 0]).to_list()[0]
            # '79936' is always included in the lookup (presumably so that at
            # least one zipcode is resolved -- TODO confirm). Add it only when
            # missing: zip_key becomes the key of X_g below, and a duplicate
            # entry would make setting the key fail, causing a valid column
            # to be reported as "not a zipcode".
            if '79936' not in zip_list:
                zip_list.append('79936')
            zip_features = [self.get_zipcode_features(x) for x in zip_list]
            X_g = dt.Frame({"zip_key": zip_list})
            X_g.cbind(dt.Frame(zip_features))
            X_g.key = 'zip_key'
            X_result = X[:, :, dt.join(X_g)]
            # Everything after the first (key) column is a zipcode feature.
            features = X_result[:, 1:]
            feature_names = list(features.names)
            self._output_feature_names = [
                "{}:{}.{}".format(self.transformer_name,
                                  original_zip_column_name,
                                  self.replaceBannedCharacters(name))
                for name in feature_names
            ]
            self._feature_desc = [
                "Property '{}' of zipcode column ['{}'] from US zipcode database (recipe '{}')"
                .format(name, original_zip_column_name, self.transformer_name)
                for name in feature_names
            ]
            return features
        except ValueError as ve:
            loggerinfo(
                logger, "Column '{}' is not a zipcode: {}".format(
                    original_zip_column_name, str(ve)))
            return self.get_zipcode_null_result(X, original_zip_column_name)
        except TypeError as te:
            loggerwarning(
                logger, "Column '{}' triggered TypeError: {}".format(
                    original_zip_column_name, str(te)))
            # Bare raise re-raises the active exception unchanged.
            raise