Exemple #1
0
def get_combined_frame(input_frame, result_frame, include_on_out_fields):
    """Column-bind selected input columns followed by the result frame.

    Starting from an empty frame, ``input_frame[field]`` is bound on the
    right for every ``field`` in ``include_on_out_fields`` (in iteration
    order); ``result_frame`` is bound last.

    :param input_frame: object indexed with each entry of
        ``include_on_out_fields`` to obtain the columns to carry over
    :param result_frame: frame appended after the carried-over columns
    :param include_on_out_fields: iterable of selectors for ``input_frame``
    :return: the frame produced by the final ``dt.cbind`` call
    """
    merged = dt.Frame()
    for field in include_on_out_fields:
        selected = input_frame[field]
        merged = dt.cbind(merged, selected)
    return dt.cbind(merged, result_frame)
Exemple #2
0
def test_cbind_issue2024():
    """Check the column names produced when a frame is cbound to itself.

    The frame has columns "A.1" and "A.5"; binding 2, 3, 4 and 5 copies is
    expected to yield successive prefixes of ``all_names`` (two names per
    copy), with a DatatableWarning raised inside the ``pytest.warns`` block.
    """
    DT = dt.Frame([[]] * 2, names=["A.1", "A.5"])
    all_names = ("A.1", "A.5", "A.2", "A.6", "A.3", "A.7", "A.4", "A.8",
                 "A.9", "A.10")
    with pytest.warns(DatatableWarning):
        for copies in range(2, 6):
            RZ = dt.cbind(*([DT] * copies))
            # Each bound copy contributes two column names.
            assert RZ.names == all_names[:2 * copies]
Exemple #3
0
def test_issue1905():
    # The frames handed to cbind() are produced by a generator, so every
    # one of them is a temporary object. cbind() therefore has to hold on
    # to each frame for as long as it is working; otherwise the frames
    # could be garbage-collected before the generator is exhausted.
    n_rows, n_cols = 50, 30
    frames = (dt.Frame(range(n_rows), names=[str(i)]) for i in range(n_cols))
    DT = dt.cbind(frames)
    assert DT.shape == (n_rows, n_cols)
Exemple #4
0
def test_issue2055(numpy):
    """Regression test: shrink and copy a frame cbound from two temporaries.

    An integer frame is cbound with a frame built from a numpy masked
    boolean array (no element masked). The result is truncated to a single
    row, copied, integrity-checked and compared against the expected data.
    """
    # Both inputs are temporaries: nothing but the cbind result refers to them.
    DT = dt.cbind(dt.Frame(A=[1, 2]),
                  dt.Frame(numpy.ma.array([True, True], mask=[False, False])))
    # Shrink the frame to one row, then take a copy of the shrunken frame.
    DT.nrows = 1
    DT = DT.copy()
    frame_integrity_check(DT)
    assert DT.to_list() == [[1], [True]]
Exemple #5
0
def test_issue1921():
    """Write a cbound int/str64 frame of 1921 rows to CSV and check the text."""
    n = 1921
    numbers = dt.Frame(A=range(n))
    words = dt.repeat(dt.Frame(B=["hey"], stype=dt.str64), n)
    csv_text = dt.cbind(numbers, words).to_csv()

    # Header, one "<i>,hey" line per row, and a trailing empty element so
    # that the joined text ends with a newline.
    expected_lines = ["A,B"]
    expected_lines += ["%d,hey" % i for i in range(n)]
    expected_lines.append("")
    assert csv_text == "\n".join(expected_lines)
Exemple #6
0
def test_columnset_sum(DT):
    """Check that ``.extend()`` on f-selectors matches explicit selections."""
    # Selection by type.
    by_type = f[int].extend(f[float])
    assert_equals(DT[:, by_type], DT[:, [int, float]])

    # First three and last three columns, by slice and by name.
    by_slice = f[:3].extend(f[-3:])
    assert_equals(DT[:, by_slice], DT[:, [0, 1, 2, -3, -2, -1]])
    by_name = f['A','B','C'].extend(f['E','F', 'G'])
    assert_equals(DT[:, by_name], DT[:, [0, 1, 2, -3, -2, -1]])

    # Two single columns.
    assert_equals(DT[:, f.A.extend(f.B)], DT[:, ['A', 'B']])

    # Extending all columns with a computed one equals cbinding it.
    with_extra = DT[:, f[:].extend({"extra": f.A + f.C})]
    bound = dt.cbind(DT, DT[:, {"extra": f.A + f.C}])
    assert_equals(with_extra, bound)
def py_tidy_descriptive_stats(DT):
    """Build a frame of descriptive statistics for every column of ``DT``.

    ``DT.to_dict()`` supplies the values of each column. For each column
    nine statistics are computed and rounded to 3 decimals: mean, median,
    min, max and std (NaN-aware numpy variants), Q1 and Q3
    (``np.percentile`` with midpoint interpolation), IQR (Q3 - Q1) and SE
    (std divided by the square root of the number of values).

    Returns a frame whose first column, ``descriptive_stats``, holds the
    statistic labels, followed by one column of values per input column.
    """
    def _summarise(values):
        # Statistics of a single column, in the same order as the labels.
        mean = np.nanmean(values)
        median = np.nanmedian(values)
        lowest = np.nanmin(values)
        highest = np.nanmax(values)
        std = np.nanstd(values)
        q1 = np.percentile(values, 25, interpolation='midpoint')
        q3 = np.percentile(values, 75, interpolation='midpoint')
        std_err = std / np.sqrt(np.shape(values)[0])
        stats = (mean, median, lowest, highest, std, q1, q3, q3 - q1, std_err)
        return [np.round(stat, 3) for stat in stats]

    rounded_stats = {
        column: _summarise(values)
        for column, values in DT.to_dict().items()
    }
    labels = dt.Frame({
        'descriptive_stats':
        ['Mean', 'Median', 'Min', 'Max', 'Std', 'Q1', 'Q3', 'IQR', 'SE']
    })
    return dt.cbind(labels, dt.Frame(rounded_stats))
Exemple #8
0
def test_materialize():
    """All three cbound columns are virtual until materialize() is called."""
    sliced = dt.Frame(A=range(12))[::2, :]
    repeated = dt.repeat(dt.Frame(B=["red", "green", "blue"]), 2)
    plain = dt.Frame(C=[4, 2, 9.1, 12, 0])
    # force=True is passed because the inputs do not all have the same
    # number of rows.
    DT = dt.cbind(sliced, repeated, plain, force=True)

    virtual_before = frame_columns_virtual(DT)
    assert virtual_before == (True, True, True)
    DT.materialize()
    virtual_after = frame_columns_virtual(DT)
    assert virtual_after == (False, False, False)
Exemple #9
0
def test_aggregate_3d_fixed_small_radius():
    """Aggregate a 3-column frame with fixed_radius=0.1 and check both outputs.

    The expected exemplars frame is the input frame plus a
    ``members_count`` column of ones; the expected members frame holds
    ``exemplar_id`` values 0..9.
    """
    DT = dt.Frame([range(10)] * 3)
    exemplars, members = aggregate(DT, min_rows=0, nd_max_bins=1,
                                   fixed_radius=0.1)

    counts = dt.Frame([1] * 10 / dt.stype.int32, names=["members_count"])
    assert_equals(exemplars, cbind(DT, counts))
    assert_equals(members, dt.Frame(range(10), names=["exemplar_id"]))
Exemple #10
0
def test_cbind_method():
    """cbind() of int, str and float frames keeps names and data in order."""
    ints = dt.Frame({"A": [1, 2, 3]})
    letters = dt.Frame({"B": list('abc')})
    reals = dt.Frame({"C": [5.6, 7.1, -3.3]})

    bound = dt.cbind(ints, letters, reals)
    assert bound.names == ("A", "B", "C")

    expected = dt.Frame(
        [[1, 2, 3], ["a", "b", "c"], [5.6, 7.1, -3.3]],
        names=("A", "B", "C"),
    )
    assert_equals(bound, expected)
Exemple #11
0
def test_cbind_notinplace():
    """cbind() returns a new frame and leaves both inputs as they were."""
    left = dt.Frame({"A": [1, 2, 3]})
    right = dt.Frame({"B": [4, 5, 6]})
    dt_compute_stats(left, right)

    combined = dt.cbind(left, right)
    assert_equals(combined, dt.Frame({"A": [1, 2, 3], "B": [4, 5, 6]}))

    # The inputs must still compare equal to freshly built frames.
    assert_equals(left, dt.Frame({"A": [1, 2, 3]}))
    assert_equals(right, dt.Frame({"B": [4, 5, 6]}))
Exemple #12
0
def test_debug_logger_default_with_report_args(capsys):
    """Check the captured debug output when report_args=True is enabled.

    Inside a debug context with ``enabled=True, report_args=True`` the test
    builds a frame and makes a failing ``dt.cbind(3)`` call, and checks the
    text captured by ``capsys`` after each step. The order of the
    ``capsys.readouterr()`` calls matters: each call returns only what was
    captured since the previous one.
    """
    assert dt.options.debug.logger is None
    with dt.options.debug.context(enabled=True, report_args=True):
        # Entering the context leaves the logger unset but turns debug on.
        assert dt.options.debug.logger is None
        assert dt.options.debug.enabled is True
        DT = dt.Frame(range(100000))
        out, err = capsys.readouterr()
        # Re-emit the captured text so it shows up in pytest's report.
        print(out)
        assert not err
        # The constructor call is expected with its argument, plus a timing line.
        assert re.search(r"<Frame#[\da-fA-F]+>.__init__\(range\(0, 100000\)\)", out)
        assert re.search(r"# \d+(?:\.\d+)?(?:[eE][+-]?\d+)? s", out)

        # A call that raises TypeError is expected to be reported as failed.
        with pytest.raises(TypeError):
            dt.cbind(3)
        out, err = capsys.readouterr()
        assert not err
        assert "datatable.cbind(3) {" in out
        assert re.search(r"} # \d+(?:\.\d+)?(?:[eE][+-]?\d+)? s \(failed\)", out)
Exemple #13
0
def test_materialize():
    """Row indices of cbound columns are dropped by materialize()."""
    sliced = dt.Frame(A=range(12))[::2, :]
    repeated = dt.repeat(dt.Frame(B=["red", "green", "blue"]), 2)
    plain = dt.Frame(C=[4, 2, 9.1, 12, 0])
    DT = dt.cbind(sliced, repeated, plain, force=True)

    # Before materializing: the first two columns carry a row index of the
    # given type, the third has none.
    for position, kind in enumerate(("slice", "arr32")):
        assert frame_column_rowindex(DT, position).type == kind
    assert frame_column_rowindex(DT, 2) is None

    DT.materialize()

    # Afterwards no column has a row index.
    for position in range(3):
        assert frame_column_rowindex(DT, position) is None
Exemple #14
0
def test_topandas_view_mixed():
    """to_pandas() on a cbind of a row slice, a string frame and a padded frame."""
    base = dt.Frame(A=range(100))
    view = base[7:17, :]
    words = dt.Frame(B=['foo', 'bar', 'buzz'] * 3 + ['finale'])
    padded = dt.Frame(V=[2.2222])
    padded.nrows = 10  # grow the single-row frame to 10 rows

    combined = dt.cbind(view, words, padded)
    pdf = combined.to_pandas()

    assert pdf.columns.tolist() == ["A", "B", "V"]
    assert pdf["A"].tolist() == list(range(7, 17))
    assert pdf["B"].tolist() == words.to_list()[0]

    v_values = pdf["V"].tolist()
    assert v_values[0] == 2.2222
    # Every row added by the resize is expected to come out as NaN.
    assert all(math.isnan(x) for x in v_values[1:])
Exemple #15
0
def pd_dt_concat(frames, axis=0):
    """
    Concatenate the sequence `frames` of datatable Frames or pandas DataFrames along `axis`
    (0 means rows, 1 means columns).

    When `USE_DT` is false, the work is delegated to `pd.concat`. Otherwise `dt.rbind` is used
    for axis 0 and `dt.cbind` for axis 1; any other axis raises a `ValueError`.
    """

    if not USE_DT:
        return pd.concat(frames, axis=axis)

    if axis == 0:
        return dt.rbind(*frames)
    if axis == 1:
        return dt.cbind(*frames)
    raise ValueError('invalid axis:', axis)
Exemple #16
0
def write_table(df, name, output_dir, add_index=True):
    """
    Optionally prepend a 1-based 'id' column to df, then write df to
    `<output_dir>/<name>.csv`.

    @param df: [`datatable.Frame`] A PharmacoDB table
    @param name: [`string`] The name of the table
    @param output_dir: [`string`] The directory to write the table to
    @param add_index: [`bool`] When true (the default), bind an 'id' column
        holding 1..df.nrows in front of the existing columns
    @return: [`datatable.Frame`] The table that was written (indexed when
        add_index is true)
    """
    print(f'Writing {name} table to {output_dir}...')
    if add_index:
        # Primary key: consecutive integers starting at 1
        row_ids = dt.Frame(id=np.arange(df.nrows) + 1)
        df = cbind(row_ids, df)
    csv_path = os.path.join(output_dir, f'{name}.csv')
    df.to_csv(csv_path)
    return df
Exemple #17
0
def pydt_reshape_wide_to_long(DT,*measure_vars,var_name=None,val_name=None):
    """Reshape a datatable Frame from wide to long format.

    Every column named in ``measure_vars`` is stacked into two columns:
    ``variable`` (the source column name) and ``value`` (the cell value).
    All remaining columns of ``DT`` are repeated once per measure column so
    that they line up with the stacked rows, and are placed first in the
    result.

    :param DT: the wide datatable Frame
    :param measure_vars: names of the columns to stack
    :param var_name: optional replacement name for the ``variable`` column
    :param val_name: optional replacement name for the ``value`` column
    :return: Frame with the non-measure columns followed by the
        variable/value columns

    ``var_name`` and ``val_name`` are applied independently: passing only
    one of them renames only that column. (The previous condition,
    ``var_name and val_name is not None``, silently ignored the request
    unless both were supplied.)
    """
    dt_cols = list(measure_vars)
    n_measures = len(dt_cols)

    # Stack the measure columns: one (variable, value) pair per cell.
    measure_col_dict = DT[:, dt_cols].to_dict()
    variables_dict = {'variable': [], 'value': []}
    for col_name, col_values in measure_col_dict.items():
        variables_dict['variable'].extend(repeat(col_name, len(col_values)))
        variables_dict['value'].extend(col_values)
    wide_to_long_dt = dt.Frame(variables_dict)

    # Repeat every non-measure column once per measure column so that its
    # rows line up with the stacked values.
    removed_cols_dt = DT[:, f[:].remove([f[col] for col in dt_cols])].to_dict()
    non_measures_dt = dt.Frame({
        col_name: list(chain.from_iterable(repeat(col_values, n_measures)))
        for col_name, col_values in removed_cols_dt.items()
    })

    # Rename only the columns for which a custom name was given.
    new_names = {}
    if var_name is not None:
        new_names['variable'] = var_name
    if val_name is not None:
        new_names['value'] = val_name
    if new_names:
        wide_to_long_dt.names = new_names

    return dt.cbind(non_measures_dt, wide_to_long_dt)
Exemple #18
0
def test_cbind_api():
    """cbind() gives the same result however the frames are packaged."""
    DT1 = dt.Frame(A=[1, 2, 3])
    DT2 = dt.Frame(B=[-4, -5, None])
    DT3 = dt.Frame(X=["makes", "gonna", "make"])

    reference = dt.cbind(DT1, DT2, DT3)
    variants = [
        dt.cbind([DT1, DT2, DT3]),                       # single list
        dt.cbind((DT1, DT2, DT3)),                       # tuple
        dt.cbind([DT1], [DT2, DT3]),                     # several lists
        dt.cbind(DT1, [DT2], DT3),                       # frames and lists
        dt.cbind((frame for frame in [DT1, DT2, DT3])),  # generator
    ]
    for variant in variants:
        assert_equals(reference, variant)
Exemple #19
0
def dtm_to_datatable(dtm, doc_labels, vocab, colname_rowindex='_doc'):
    """
    Convert a (sparse) DTM to a datatable Frame using document labels `doc_labels` as row identifier (with column name
    `colname_rowindex`) and `vocab` as column names.

    .. seealso:: :func:`~tmtoolkit.bow.dtm.dtm_to_dataframe` for generating a pandas DataFrame.

    :param dtm: (sparse) document-term-matrix of size NxM (N docs, M is vocab size) with raw terms counts
    :param doc_labels: document labels used as row index (row names); size must equal number of rows in `dtm`
    :param vocab: list or array of vocabulary used as column names; size must equal number of columns in `dtm`
    :param colname_rowindex: column name for row identifier (i.e. column where the document labels are put)
    :return: datatable Frame
    :raises RuntimeError: if `USE_DT` is false, i.e. datatable is not available
    :raises ValueError: if `dtm` is not 2D or its shape does not match `doc_labels` / `vocab`
    """

    if not USE_DT:
        raise RuntimeError('this function requires the package "datatable" to be installed')

    # imported lazily because datatable is an optional dependency
    import datatable as dt

    if dtm.ndim != 2:
        raise ValueError('`dtm` must be a 2D array/matrix')

    if dtm.shape[0] != len(doc_labels):
        raise ValueError('number of rows must be equal to `len(doc_labels)`')

    # this check concerns the columns; the message previously said "rows"
    if dtm.shape[1] != len(vocab):
        raise ValueError('number of columns must be equal to `len(vocab)`')

    # np.matrix -> plain ndarray
    if isinstance(dtm, np.matrix):
        dtm = dtm.A

    # anything that is still not an ndarray is densified via `toarray()`
    if not isinstance(dtm, np.ndarray):
        dtm = dtm.toarray()

    return dt.cbind(dt.Frame({colname_rowindex: doc_labels}),
                    dt.Frame(dtm, names=list(vocab)))
Exemple #20
0
def test_cbind_nones():
    """None arguments passed to cbind() are skipped."""
    first = dt.Frame(A=range(5))
    second = dt.Frame(B=[0] * 5)
    bound = dt.cbind(None, first, None, second)
    assert_equals(bound, dt.Frame(A=range(5), B=[0] * 5))
Exemple #21
0
    
})

##### Notes:
* We now have a pandas DataFrame **seatle_dates_df** with the columns year, month, day, hour and week_day; it needs to be converted to a datatable Frame (DT) for further analysis.
* To do this, pass the DataFrame to **dt.Frame()** and assign the result to a new DT, as illustrated below.

# Converting the dates DataFrame to a datatable Frame
seatle_dates_dt = dt.Frame(seatle_dates_df)

* The original seattle DT no longer needs the date column in string format, so it is better to remove it from the DT.

# Deleting the string date column from the DT
del seattle_bikes_dt['date']

* The DTs seatle_dates_dt and seattle_bikes_dt are concatenated column-wise with the **dt.cbind()** function.

# Concatenating the two DTs column-wise to get a tidy DT
seatle_bikes_dt_tidy = dt.cbind(seatle_dates_dt,seattle_bikes_dt)

# Tidy DT: first 4 and last 4 observations
seatle_bikes_dt_tidy

##### Notes:
* The crossing column has several categorical levels, and three of them have long names; to shorten them, their values are replaced with shorter strings.
* In datatable this can be achieved as shown in the 4 code chunks below.
* The general syntax is: DT[condition on column values, column to be updated] = 'new value'

# Modifying observations of the crossing column - set 1
seatle_bikes_dt_tidy[f.crossing=="39th Ave NE Greenway at NE 62nd St",f.crossing]='Greenwayway-NE-62Strt'
Exemple #22
0
 def join(names1, names2):
     # Column names obtained by cbinding two frames built from the given
     # name lists; a DatatableWarning must be raised along the way.
     with pytest.warns(DatatableWarning):
         left = dt.Frame(names=names1)
         right = dt.Frame(names=names2)
         combined = dt.cbind(left, right)
         return combined.names
Exemple #23
0
def test_cbind_0rows_3():
    """Binding an empty Frame on either side of a 0-row frame changes nothing."""
    DT0 = dt.Frame(A=[], B=[], C=[])
    results = [
        dt.cbind(dt.Frame(), DT0),  # empty frame on the left
        dt.cbind(DT0, dt.Frame()),  # empty frame on the right
    ]
    for result in results:
        assert_equals(result, DT0)
Exemple #24
0
def test_cbind_0rows_1():
    """Issue #1604: cbind two 0-row, single-column frames."""
    left, right = dt.Frame(A=[]), dt.Frame(B=[])
    res = dt.cbind(left, right)
    assert res.names == ("A", "B")
    assert res.shape == (0, 2)
Exemple #25
0
    def create_data(X: dt.Frame = None):
        """Convert transactional data to i.i.d. data by making time-based aggregations.

        Relies on names that are not defined in this function and must come
        from an enclosing scope: col_date, col_group, col_row_id, target,
        target_labels, leaky_choices, window_length_days, operators,
        split_date, shuffle and make_features_from_scratch.

        :param X: frame of transactions; when None, one is generated with
            TransactionalToIID.make_transactions()
        :return: dict mapping a split name ('train_iid' / 'test_iid', with
            optional '.shuf' and '.leaky' suffixes) to a frame; or
            {'raw_transactions_non_iid': X} when the data was generated here
            and make_features_from_scratch is false; or [] when no data is
            available

        Note: the grouping column is deleted from X itself (``del X[col_group]``).
        """

        if X is None:
            X = TransactionalToIID.make_transactions()
            if not make_features_from_scratch:
                return {'raw_transactions_non_iid': X}

        # Still None here only if make_transactions() returned None
        if X is None:
            return []

        X_pd = X[:, [col_date, col_group, target]].to_pandas()  # faster, since only working on a few cols
        # Remember the original row position of every transaction
        X_pd[col_row_id] = np.arange(X_pd.shape[0])

        y = X_pd[target]
        y_enc = target + ".enc"

        # Create boolean target
        X_pd[y_enc] = (y == target_labels[1]).astype(int)

        # Make sure time is datetime64, not string
        X_pd[col_date] = pd.to_datetime(X_pd[col_date])

        for leak in leaky_choices:
            # Create the groups
            groups = X_pd.groupby(col_group)

            shift_amount = 0 if leak else 1  # this is critical to avoid leaks!  DO NOT SET IT TO 0 IN PRODUCTION!

            # Compute aggregation over time
            for t in window_length_days:
                t_days = str(t) + "d"  # pandas will do rolling window over this many days ('5d' etc.)
                for op in operators:
                    lag_feature = []
                    for _, df in groups:
                        df = df.sort_values(col_date)
                        time_window = df.set_index(col_date)[y_enc].shift(shift_amount). \
                            dropna().rolling(t_days, min_periods=1)  # get time window. if leaky, includes self
                        res = getattr(time_window, op)()  # apply operator on time window
                        # Re-label the result with the group's original row labels
                        # (skipping the first shift_amount rows)
                        res.index = df.index[shift_amount:]
                        lag_feature.append(res)
                    # Index is set on both side so equal works and reorders rows automatically
                    X_pd["%s%s_%s_past_%d_days_grouped_by_%s" %
                         ("leaky_" if leak else "", op, target, t, col_group)] = pd.concat(lag_feature, axis=0)

        del X_pd[y_enc]  # delete temporary binary response column

        # delete grouping column, since have all aggregations already in iid form
        del X_pd[col_group]
        del X[col_group]

        # create datatable frame of new features (only)
        X_features = dt.Frame(X_pd.loc[:, [x for x in X_pd.columns if x not in [col_date, target, col_row_id]]])

        # add new features to original frame
        X_new = dt.cbind(X, X_features)

        out = {}
        for name, time_range in {
            # 2-way split: ideal for iid, let Driverless do internal validation splits on training split
            'train_iid': X_pd[col_date] <= split_date,
            'test_iid': X_pd[col_date] > split_date
        }.items():
            # X_pd is pandas - easier to deal with time slices, and keep row_id to index into datatable below
            which_rows = X_pd.loc[time_range, col_row_id].reset_index(drop=True).values
            if shuffle:
                np.random.shuffle(which_rows)  # shuffle data for generality - no impact on iid modeling
                name += ".shuf"
            for leak in leaky_choices:
                X_out = X_new.copy()  # shallow copy
                # Keep only the feature columns that match the current leak setting:
                # drop non-"leaky" features for the leaky variant, and vice versa
                if leak:
                    cols_to_del = [x for x in X_features.names if "leaky" != x[:5]]
                else:
                    cols_to_del = [x for x in X_features.names if "leaky" == x[:5]]
                del X_out[:, cols_to_del]
                out[name + (".leaky" if leak else "")] = X_out[which_rows, :]

        return out
Exemple #26
0
# Summary for driver_race vs is_arrested (helper defined outside this snippet)
py_dt_two_group_proportions_summary(policia_tidy_dt,'driver_race','is_arrested')

# Pull the stop_time column into a pandas DataFrame
stop_time_df = policia_tidy_dt[:,(f.stop_time)].to_pandas()

# Extract the first two consecutive digits of each stop_time string (used as the hour)
stop_time_hour = stop_time_df.stop_time.str.extract(r'([\d]{2})')

# Wrap the extracted hours in a new datatable Frame
stop_time_hour_dt = dt.Frame(stop_time_hour)

# Rename the column '0' to 'stop_hour'
stop_time_hour_dt.names={'0':'stop_hour'}

# Binding the two DTs column-wise
policia_tidy_dt_v1 = dt.cbind(policia_tidy_dt,stop_time_hour_dt)

# Hour-wise arrests: summary for stop_hour vs is_arrested
hour_wise_arrests_dt = py_dt_two_group_proportions_summary(policia_tidy_dt_v1,'stop_hour','is_arrested')

# Visualization: bar chart of count per stop_hour, coloured by is_arrested
alt.Chart(hour_wise_arrests_dt.to_pandas()).mark_bar().encode(
    alt.X('stop_hour:N'),
    alt.Y('count'),
    alt.Color('is_arrested')
).properties(

    title= 'Hour wise arrest trends'
)

# Add a temporary column flagging rows where directed_by equals written_by
amigos_info_dt[:,dt.update(temp=f.directed_by==f.written_by)]

# Are the director and the writer the same for a title? (rows where temp is True)
amigos_info_dt[f.temp==True,:]

# Remove the temp column
del amigos_info_dt["temp"]

# Split each written_by value on '&' into a list of writers
writers_list = [ elemento.split('&') for elemento in amigos_info_dt[:,f.written_by].to_list()[0] ]

# Create a new DT with the number of writers per row
writers_dt = dt.Frame({'no_of_writers':[len(elem) for elem in writers_list]})

# Binding the two DTs column-wise
amigos_info_dt_v1 = dt.cbind(amigos_info_dt,writers_dt)

# Number of writers: bar chart of row counts grouped by no_of_writers
alt.Chart(amigos_info_dt_v1[:,count(),by(f.no_of_writers)].to_pandas()).mark_bar().encode(

    alt.X('count'),
    alt.Y('no_of_writers:O')
).properties(
    
    title='Number of writers in titles'
)

# Year of each title: the first run of four digits found in air_date
amigos_year = dt.Frame({'year':[re.findall(r'[\d]{4}',fecha)[0] for fecha in amigos_info_dt_v1[:,f.air_date].to_list()[0] ]})

# Line chart of the number of rows per year
alt.Chart(amigos_year[:,count(),by(f.year)].to_pandas()).mark_line().encode(alt.X('year'),alt.Y('count'))
Exemple #28
0
def test_cbind_empty3():
    """cbind() called without arguments yields an empty Frame."""
    bound = dt.cbind()
    empty = dt.Frame()
    assert_equals(bound, empty)
Exemple #29
0
def test_create_from_doublestar_expansion():
    """Frame(**DT0, **DT1) is equivalent to cbind(DT0, DT1)."""
    left = dt.Frame(A=range(3), B=["df", "qe;r", None])
    right = dt.Frame(D=[7.99, -12.5, 0.1], E=[None]*3)
    expanded = dt.Frame(**left, **right)
    bound = dt.cbind(left, right)
    assert_equals(expanded, bound)
Exemple #30
0
def test_cbind_expanded_frame():
    """Star-expanding a frame into cbind() rebuilds the same frame."""
    DT = dt.Frame(A=[1, 2], B=['a', "E"], C=[7, 1000], D=[-3.14, 159265])
    rebuilt = dt.cbind(*DT)
    assert_equals(DT, rebuilt)