def __init__(self, parent, controller, config, dbvar=None):
    """Assemble the analysis page widgets.

    Layout built here:
      * row 0, column 0 -- a notebook holding a "Graph" tab and a "Table" tab;
      * row 1, column 0 -- the "Controls" label frame, filled by b_menu_pane();
      * rows 0-1, column 1 -- the query panel.

    The query panel is also appended to ``self.config_chain``.
    """
    # Assigned ahead of the base-class initialiser; presumably the base
    # class reads it during construction -- verify before reordering.
    self.widget_name = "analysispage"
    super().__init__(parent, controller, config, dbvar)

    self.engine = AnalysisPageEngine()

    # Notebook with the graph and table views as tabs.
    notebook = ttk.Notebook(self)
    self.graph_table_notebook = notebook
    notebook.grid(row=0, rowspan=1, column=0, sticky="news")

    self.graph_frame = graphframe.GraphFrame(notebook, self, config)
    self.datatable = datatable.DataTable(notebook, self, config)
    for tab, caption in ((self.graph_frame, "Graph"),
                         (self.datatable, "Table")):
        notebook.add(tab, text=caption)

    # Query panel occupies the right-hand column across both rows.
    self.query_panel = querypanel.QueryPanel(self, self, config)
    self.query_panel.grid(row=0, column=1, sticky="nw", rowspan=2)

    # Controls frame sits under the notebook.
    self.menu_pane = ttk.Labelframe(self, text="Controls")
    self.menu_pane.grid(row=1, column=0, sticky="NW")
    self.b_menu_pane()

    # Column 0 keeps its natural width; column 1 takes the extra space.
    for column_index, weight in ((0, 0), (1, 1)):
        self.columnconfigure(column_index, weight=weight)

    self.config_chain.append(self.query_panel)
def fit_transform(self, X: dt.Frame, y: np.array = None):
    """Fit group means and return an expanding-mean encoding of the rows.

    Fitting stores two things on ``self`` for later use:
      * ``_group_means`` -- mean of the target per combination of
        ``self.input_feature_names``, keyed by those columns;
      * ``dataset_mean`` -- mean of the target over all rows.

    The returned encoding is computed on a randomly shuffled copy of the
    rows: each row receives the mean of the target over the rows of the
    same group that precede it in the shuffled order. Rows with no
    predecessor in their group fall back to ``dataset_mean``. The result
    is restored to the original row order.

    NOTE: a helper column named ``__target__`` is written into ``X``
    itself, so the caller's frame is modified.

    Args:
        X: frame containing the columns in ``self.input_feature_names``.
        y: target values; non-numeric targets are label-encoded first.

    Returns:
        A single-column ``dt.Frame`` with one encoded value per row of X.
    """
    target = '__target__'
    X[:, target] = dt.Frame(y)

    # Selecting by numeric types yields zero columns for a non-numeric target.
    target_is_numeric = X[:, target][:, [bool, int, float]].shape[1] > 0
    if not target_is_numeric:
        raw_labels = X[:, target].to_pandas().iloc[:, 0].values
        X[:, target] = dt.Frame(
            LabelEncoder().fit_transform(raw_labels).ravel())

    self._group_means = X[:, dt.mean(dt.f[target]),
                          dt.by(*self.input_feature_names)]
    self._group_means.key = self.input_feature_names
    self.dataset_mean = X[target].mean().to_numpy().ravel()[0]

    # Expanding mean transform
    X_ = X.to_pandas()[self.input_feature_names + [target]]
    X_["index"] = X_.index  # remembered so the shuffle can be undone
    X_shuffled = X_.sample(n=len(X_), replace=False)
    X_shuffled["cnt"] = 1

    def _sum_of_preceding(values):
        # shift() drops the current row so only earlier rows contribute.
        return values.shift().cumsum()

    # transform() always returns a result aligned to the original index;
    # apply() prepends the group keys to the index on pandas >= 2.0, which
    # would break the column assignments below.
    X_shuffled["cumsum"] = X_shuffled.groupby(
        self.input_feature_names, sort=False)[target].transform(
            _sum_of_preceding)
    X_shuffled["cumcnt"] = X_shuffled.groupby(
        self.input_feature_names, sort=False)['cnt'].transform(
            _sum_of_preceding)

    X_shuffled["encoded"] = X_shuffled["cumsum"] / X_shuffled["cumcnt"]
    X_shuffled["encoded"] = X_shuffled["encoded"].fillna(self.dataset_mean)
    X_transformed = X_shuffled.sort_values("index")["encoded"].values

    # dt.Frame rather than the legacy dt.DataTable alias, matching the
    # rest of this function.
    return dt.Frame(X_transformed)
def parse_table(self, table):
    '''
    Takes HTML for a single table and returns the result of
    tableparser.parse_table() applied to the populated grid.

    Returns False when `table` is None. Errors raised while processing a
    row are logged unless config.SILENT_ERRORS is set, and re-raised
    unless config.IGNORE_BAD_ROWS is set.
    '''
    # Formatting issues sometimes prevent table extraction, so just return
    if table is None:
        return False

    def span_of(cell, attr):
        # Tag.get() replaces the deprecated Tag.has_key() lookup; a
        # missing attribute means the cell spans exactly one row/column.
        return int(cell.get(attr, 1))

    # Count columns. Check either just one row, or all of them.
    def n_cols_in_row(row):
        return sum(span_of(td, 'colspan') for td in row.find_all('td'))

    if config.CAREFUL_PARSING:
        n_cols = max(
            n_cols_in_row(row)
            for row in table.find('tbody').find_all('tr'))
    else:
        n_cols = n_cols_in_row(table.find('tbody').find('tr'))

    # Initialize grid and populate
    data = datatable.DataTable(0, n_cols)
    rows = table.find_all('tr')
    for r in rows:
        try:
            cols = r.find_all(['td', 'th'])
            cols_found_in_row = 0
            n_cells = len(cols)
            # Assign number of rows and columns this cell fills. We use
            # these rules:
            # * If a rowspan/colspan is explicitly provided, use it
            # * If not, initially assume span == 1 for both rows and
            #   columns.
            # * Check to make sure that we don't have unaccounted-for
            #   columns in the row after including the current cell. If
            #   we do, adjust the colspan to take up all of the remaining
            #   columns. This is necessary because some tables have
            #   malformed HTML, and BeautifulSoup can also cause problems
            #   in its efforts to fix bad tables. The most common problem
            #   is deletion or omission of enough <td> tags to fill all
            #   columns, hence our adjustment.
            for (i, c) in enumerate(cols):
                r_num = span_of(c, 'rowspan')
                c_num = span_of(c, 'colspan')
                cols_found_in_row += c_num
                if i + 1 == n_cells and cols_found_in_row < n_cols:
                    c_num += n_cols - cols_found_in_row
                data.add_val(c.get_text(), r_num, c_num)
        except Exception as e:
            if not config.SILENT_ERRORS:
                # Exceptions have no `.message` attribute on Python 3;
                # str(e) works everywhere.
                logger.error(str(e))
            if not config.IGNORE_BAD_ROWS:
                raise
    return tableparser.parse_table(data)
def test_groups_internal2():
    """Group a two-column frame by each column and check groups and order."""
    column_a = [1, 5, 3, 2, 1, 3, 1, 1, None]
    column_b = ["a", "b", "c", "a", None, "f", "b", "h", "d"]
    source = dt.DataTable([column_a, column_b], names=["A", "B"])

    # Grouping on the integer column.
    by_a = source(groupby="A")
    by_a.internal.check()
    groups_a = by_a.internal.groupby
    expected_by_a = [
        [None, 1, 1, 1, 1, 2, 3, 3, 5],
        ["d", "a", None, "b", "h", "a", "c", "f", "b"],
    ]
    assert groups_a.ngroups == 5
    assert groups_a.group_sizes == [1, 4, 1, 2, 1]
    assert by_a.to_list() == expected_by_a

    # Grouping on the string column.
    by_b = source(groupby="B")
    by_b.internal.check()
    groups_b = by_b.internal.groupby
    expected_by_b = [
        [1, 1, 2, 5, 1, 3, None, 3, 1],
        [None, "a", "a", "b", "b", "c", "d", "f", "h"],
    ]
    assert groups_b.ngroups == 7
    assert groups_b.group_sizes == [1, 2, 2, 1, 1, 1, 1]
    assert by_b.to_list() == expected_by_b
def test_groups_internal2():
    """Group a two-column frame by each column; inspect the row index."""
    column_a = [1, 5, 3, 2, 1, 3, 1, 1, None]
    column_b = ["a", "b", "c", "a", None, "f", "b", "h", "d"]
    source = dt.DataTable([column_a, column_b], names=["A", "B"])

    # Grouping on the integer column.
    by_a = source(groupby="A")
    assert by_a.internal.check()
    rowindex_a = by_a.internal.rowindex
    expected_by_a = [
        [None, 1, 1, 1, 1, 2, 3, 3, 5],
        ["d", "a", None, "b", "h", "a", "c", "f", "b"],
    ]
    assert rowindex_a.ngroups == 5
    assert rowindex_a.group_sizes == [1, 4, 1, 2, 1]
    assert by_a.topython() == expected_by_a

    # Grouping on the string column.
    by_b = source(groupby="B")
    assert by_b.internal.check()
    rowindex_b = by_b.internal.rowindex
    expected_by_b = [
        [1, 1, 2, 5, 1, 3, None, 3, 1],
        [None, "a", "a", "b", "b", "c", "d", "f", "h"],
    ]
    assert rowindex_b.ngroups == 7
    assert rowindex_b.group_sizes == [1, 2, 2, 1, 1, 1, 1]
    assert by_b.topython() == expected_by_b
def test_groups_internal2():
    """Group a two-column frame by each column and compare whole frames.

    The checks on the internal groupby object (ngroups 5 / 7 and group
    sizes [1, 4, 1, 2, 1] / [1, 2, 2, 1, 1, 1, 1]) are disabled here;
    only the resulting frames are compared.
    """
    column_a = [1, 5, 3, 2, 1, 3, 1, 1, None]
    column_b = ["a", "b", "c", "a", None, "f", "b", "h", "d"]
    source = dt.DataTable([column_a, column_b], names=["A", "B"])

    # Grouping on "A": the expected frame lists A first.
    by_a = source[:, :, by("A")]
    expected_by_a = dt.Frame(
        A=[None, 1, 1, 1, 1, 2, 3, 3, 5],
        B=["d", "a", None, "b", "h", "a", "c", "f", "b"])
    assert_equals(by_a, expected_by_a)

    # Grouping on "B": the expected frame lists B first.
    by_b = source[:, :, by("B")]
    expected_by_b = dt.Frame(
        B=[None, "a", "a", "b", "b", "c", "d", "f", "h"],
        A=[1, 1, 2, 5, 1, 3, None, 3, 1])
    assert_equals(by_b, expected_by_b)
def test_create_datatable():
    """The legacy ``DataTable`` name must still construct a ``Frame``."""
    values = [1, 2, 3]
    frame = dt.DataTable(values)
    frame.internal.check()
    class_name = frame.__class__.__name__
    assert class_name == "Frame"
    # A flat list becomes a single column.
    assert frame.topython() == [[1, 2, 3]]
def transform(self, X: dt.Frame):
    """Encode rows of X with the group means learned during fitting.

    X is joined against ``self._group_means`` and the last column of the
    joined frame is kept. Missing values in that column are replaced by
    ``self.dataset_mean``.

    Args:
        X: frame to encode; presumably it must contain the key columns
            of ``self._group_means`` -- verify against the fitting code.

    Returns:
        A ``dt.Frame`` holding the encoded column.
    """
    joined = X[:, :, dt.join(self._group_means)]
    transformed_X = joined[:, -1]
    filled = transformed_X.to_pandas().fillna(self.dataset_mean)
    # dt.Frame rather than the legacy dt.DataTable alias, matching the
    # type already used in this method's signature.
    return dt.Frame(filled)
def _time_dmatrix_training(tag, param, iterations, make_dtrain, make_dtest):
    """Build train/test DMatrix objects, train, and print timings.

    `make_dtrain` and `make_dtest` are zero-argument callables so that any
    conversion they perform is included in the reported DMatrix timings.
    Returns the per-iteration training error list recorded by xgb.train.
    """
    print("%s->DMatrix Start" % tag)
    # omp way
    tmp = time.time()
    dtrain = make_dtrain()
    print("%s->DMatrix1 Time: %s seconds" % (tag, str(time.time() - tmp)))
    tmp = time.time()
    dtest = make_dtest()
    print("%s->DMatrix2 Time: %s seconds" % (tag, str(time.time() - tmp)))

    print("Training with '%s'" % param['tree_method'])
    tmp = time.time()
    res_tmp = {}
    xgb.train(param, dtrain, iterations,
              evals=[(dtrain, "train"), (dtest, "test")],
              evals_result=res_tmp)
    train_error = res_tmp['train']['error']
    print("Train Time: %s seconds" % (str(time.time() - tmp)))
    return train_error


def run_benchmark(algorithm='gpu_hist', rows=1000000, columns=50,
                  iterations=5, test_size=0.25):
    """Time DMatrix construction and training for numpy vs. datatable input.

    A synthetic classification dataset is generated, a fraction of its
    entries is replaced by NaN, and it is split into train and test parts.
    Up to three pipelines are then timed:
      1. numpy arrays -> DMatrix;
      2. datatable frames -> DMatrix;
      3. datatable frames converted back to numpy -> DMatrix.
    When datatable is available, the training errors of pipelines 2 and 3
    are compared against pipeline 1 with assert_accuracy.

    Args:
        algorithm: value for the xgboost ``tree_method`` parameter.
        rows: number of generated samples.
        columns: number of generated features.
        iterations: number of boosting rounds.
        test_size: fraction of the data held out for testing.

    Raises:
        ValueError: if the converted arrays are not Fortran-contiguous.
    """
    print("Generating dataset: {} rows * {} columns".format(rows, columns))
    print("{}/{} test/train split".format(test_size, 1.0 - test_size))
    tmp = time.time()
    X, y = make_classification(rows, n_features=columns, random_state=7)
    aa = np.random.rand(X.shape[0], X.shape[1])
    fraction_missing = 0.1
    # np.nan instead of np.NaN: the latter was removed in NumPy 2.0.
    X[aa < fraction_missing] = np.nan
    print("Number of Nans: %d" % (np.isnan(X).sum()))
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=7)
    print("Generate Time: %s seconds" % (str(time.time() - tmp)))

    param = {'objective': 'binary:logistic',
             'max_depth': 6,
             'silent': 0,
             'n_gpus': 1,
             'gpu_id': 0,
             'eval_metric': 'error',
             'debug_verbose': 0,
             }
    param['tree_method'] = algorithm

    do_dt = True
    do_dt_likeDAI = True
    do_ccont = False
    do_nondt = True
    do_check_accuracy = True

    tmp = time.time()
    if do_ccont:
        X_train_cc = X_train
        X_test_cc = X_test
        y_train_cc = y_train
        y_test_cc = y_test
    else:
        # convert to dt as test
        X_train_cc = np.asfortranarray(X_train)
        X_test_cc = np.asfortranarray(X_test)
        y_train_cc = np.asfortranarray(y_train)
        y_test_cc = np.asfortranarray(y_test)

        converted = (X_train_cc, X_test_cc, y_train_cc, y_test_cc)
        if not all(arr.flags['F_CONTIGUOUS'] for arr in converted):
            # Previously the exception was constructed but never raised,
            # which made this check a no-op.
            raise ValueError(
                "Need data to be Fortran (i.e. column-major) contiguous")
    print("dt prepare1 Time: %s seconds" % (str(time.time() - tmp)))

    res = {}
    if do_nondt:
        # DMatrix1 takes about 2.826s if do_ccont=False
        # DMatrix1 takes about 0.248s if do_ccont=True
        res['1'] = _time_dmatrix_training(
            "np", param, iterations,
            lambda: xgb.DMatrix(X_train_cc, y_train_cc, nthread=-1),
            lambda: xgb.DMatrix(X_test_cc, y_test_cc, nthread=-1))

    if HAVE_DT and do_dt:
        # convert to column-major contiguous in memory to mimic persistent
        # column-major state
        # do_cccont = True leads to prepare2 time of about 1.4s for
        #   1000000 rows * 50 columns
        # do_cccont = False leads to prepare2 time of about 0.000548 for
        #   1000000 rows * 50 columns
        tmp = time.time()
        dtdata_X_train = dt.DataTable(X_train_cc)
        dtdata_X_test = dt.DataTable(X_test_cc)
        dtdata_y_train = dt.DataTable(y_train_cc)
        dtdata_y_test = dt.DataTable(y_test_cc)
        print("dt prepare2 Time: %s seconds" % (str(time.time() - tmp)))

        # DMatrix1 takes about 0.47s - 0.53s independent of do_ccont
        res['2'] = _time_dmatrix_training(
            "dt", param, iterations,
            lambda: xgb.DMatrix(dtdata_X_train, dtdata_y_train, nthread=-1),
            lambda: xgb.DMatrix(dtdata_X_test, dtdata_y_test, nthread=-1))

    if HAVE_DT and do_dt_likeDAI:
        # Same preparation as above, but the frames are converted back to
        # numpy inside the timed DMatrix construction.
        tmp = time.time()
        dtdata_X_train = dt.DataTable(X_train_cc)
        dtdata_X_test = dt.DataTable(X_test_cc)
        dtdata_y_train = dt.DataTable(y_train_cc)
        dtdata_y_test = dt.DataTable(y_test_cc)
        print("dt prepare2 Time: %s seconds" % (str(time.time() - tmp)))

        res['3'] = _time_dmatrix_training(
            "dt", param, iterations,
            lambda: xgb.DMatrix(dtdata_X_train.tonumpy(),
                                dtdata_y_train.tonumpy(), nthread=-1),
            lambda: xgb.DMatrix(dtdata_X_test.tonumpy(),
                                dtdata_y_test.tonumpy(), nthread=-1))

    if HAVE_DT and do_check_accuracy:
        assert_accuracy(res['1'], res['2'])
        assert_accuracy(res['1'], res['3'])
def test_create_datatable():
    """The legacy ``DataTable`` name must still construct a ``Frame``."""
    values = [1, 2, 3]
    frame = dt.DataTable(values)
    frame_integrity_check(frame)
    class_name = frame.__class__.__name__
    assert class_name == "Frame"
    # A flat list becomes a single column.
    assert frame.to_list() == [[1, 2, 3]]
def new_table(self, name, cols=None):
    """Create a DataTable with a unique, id-suffixed name.

    ``self.table_id`` is incremented on every call and appended to
    ``name`` as a zero-padded six-digit suffix (e.g. ``name_000001``).

    Args:
        name: base name for the new table.
        cols: column specification forwarded to ``datatable.DataTable``.
            Defaults to a fresh empty list on each call, so tables
            created without ``cols`` never share one list object.

    Returns:
        The newly constructed ``datatable.DataTable``.
    """
    if cols is None:
        cols = []
    self.table_id += 1
    name_with_id = '%s_%06d' % (name, self.table_id)
    return datatable.DataTable(self, self.con, name_with_id, cols)