def test_table_to_frame_metas(self):
    """Meta columns appear in the frame only when include_metas=True."""
    from Orange.data.pandas_compat import table_to_frame

    table = Table("zoo")
    domain = table.domain

    # Default conversion: columns are exactly the domain variables.
    expected = pd.Index([var.name for var in domain.variables])
    pd.testing.assert_index_equal(table_to_frame(table).columns, expected)

    # With metas included, meta attributes are appended after the variables.
    expected = pd.Index([var.name for var in domain.variables + domain.metas])
    pd.testing.assert_index_equal(
        table_to_frame(table, include_metas=True).columns, expected
    )
def test_table_to_frame_on_all_orange_dataset(self):
    """table_to_frame() must succeed on every bundled Orange dataset."""
    from os import listdir
    from Orange.data.pandas_compat import table_to_frame

    dataset_directory = "Orange/datasets/"

    def _filename_to_dataset_name(f):
        # "iris.tab" -> "iris"
        return f.split('.')[0]

    def _get_orange_demo_datasets():
        # endswith() instead of the original `'.tab' in f` substring test,
        # which also matched names like "foo.tab.bak" or "my.table.txt".
        return [
            _filename_to_dataset_name(f)
            for f in listdir(dataset_directory)
            if f.endswith('.tab')
        ]

    for name in _get_orange_demo_datasets():
        table = Table(name)
        df = table_to_frame(table)
        assert_message = "Failed to process Table('{}')".format(name)

        self.assertEqual(type(df), pd.DataFrame, assert_message)
        self.assertEqual(len(df), len(table), assert_message)
        self.assertEqual(
            len(df.columns), len(table.domain.variables), assert_message
        )
def test_load_data(self):
    """Searching content yields a corpus whose frame matches the fixture."""
    corpus = self.client.search_content(["orange"])
    self.assertEqual(4, len(corpus))

    expected_metas = tuple(meta[0] for meta in twitter.METAS)
    self.assertTupleEqual(expected_metas, corpus.domain.metas)

    frame = table_to_frame(corpus, include_metas=True)
    pd.testing.assert_frame_equal(
        frame.reset_index(drop=True),
        ER,
        check_dtype=False,
        check_categorical=False,
    )
def test_table_to_frame_object_dtype(self):
    """Metas stored with object dtype must survive conversion to a frame."""
    from Orange.data.pandas_compat import table_to_frame

    domain = Domain([], metas=[ContinuousVariable("a", number_of_decimals=0)])
    table = Table.from_numpy(
        domain, np.empty((10, 0)), metas=np.ones((10, 1), dtype=object)
    )

    df = table_to_frame(table, include_metas=True)

    # Compare against a plain list: `["a"] == df.columns` produces an
    # element-wise ndarray, which assertEqual only handled by accident
    # while the frame has a single column.
    self.assertEqual(["a"], list(df.columns))
    np.testing.assert_array_equal(df["a"].values, np.ones((10,)))
def test_table_to_frame(self):
    """Basic Table -> DataFrame conversion on the iris dataset."""
    from Orange.data.pandas_compat import table_to_frame

    table = Table("iris")
    df = table_to_frame(table)

    # Frame columns are exactly the domain variables (order not asserted).
    expected_names = sorted(var.name for var in table.domain.variables)
    self.assertEqual(expected_names, sorted(df.columns))

    # The class column becomes categorical; values round-trip unchanged.
    self.assertEqual(type(df['iris'].dtype), pd.api.types.CategoricalDtype)
    self.assertEqual(list(df['sepal length'])[0:4], [5.1, 4.9, 4.7, 4.6])
    self.assertEqual(list(df['iris'])[0:2], ['Iris-setosa', 'Iris-setosa'])
def fit(self, data):
    """Fit a Cox proportional-hazards model on `data`.

    Raises ValueError when the domain lacks survival endpoints; returns
    a CoxRegressionModel wrapping the fitted lifelines CoxPHFitter.
    """
    if not contains_survival_endpoints(data.domain):
        raise ValueError(self.learner_adequacy_err_msg)

    time_var, event_var = get_survival_endpoints(data.domain)

    # Rows with missing values cannot be used by the fitter; both endpoint
    # columns must be numeric for lifelines.
    frame = table_to_frame(data, include_metas=False).dropna(axis=0)
    frame = frame.astype({time_var.name: float, event_var.name: float})

    fitter = CoxPHFitter(**self.params['kwargs'])
    fitted = fitter.fit(
        frame, duration_col=time_var.name, event_col=event_var.name
    )
    return CoxRegressionModel(fitted)
def test_load_authors(self, user_mock):
    """Searching authors yields a corpus whose frame matches the fixture."""
    user_mock.return_value = MagicMock(data=MagicMock(id=1))

    corpus = self.client.search_authors(["orange"])
    self.assertEqual(4, len(corpus))

    expected_metas = tuple(meta[0] for meta in twitter.METAS)
    self.assertTupleEqual(expected_metas, corpus.domain.metas)

    frame = table_to_frame(corpus, include_metas=True)
    pd.testing.assert_frame_equal(
        frame.reset_index(drop=True),
        ER,
        check_dtype=False,
        check_categorical=False,
    )
def set_data(self, data, id):
    """Set the input data for channel id.

    A `None` input removes the channel's stored table (if any); otherwise
    the table is converted to a DataFrame and stored under the channel id.
    The info labels and table-list UI are refreshed either way.
    """
    # The original duplicated the frame-building code across an
    # `if id in self.tables` / `else` pair; a single None-check covers
    # both known and new channel ids with identical behavior.
    if data is None:
        self.tables.pop(id, None)
    else:
        df = table_to_frame(data, include_metas=True)
        df.name = data.name
        self.tables[id] = df

    if len(self.tables):
        self.infoa.setText("%d tables on input." % len(self.tables))
        self.infob.setText("Please click the button below to run Glueviz.")
        self.tablesbox.setDisabled(False)
    else:
        self.infoa.setText("No data on input.")
        self.infob.setText("Please connect to other widgets.")
        self.tablesbox.setDisabled(True)

    self._update_tablelist()
def table_to_binary_df(table, target_class='hit'):
    """Discretize `table` if needed and one-hot encode its attributes.

    Parameters
    ----------
    table : Orange.data.Table
        Input data; continuous attributes are discretized with EntropyMDL.
    target_class : str
        Kept for API compatibility; not referenced in this function.

    Returns
    -------
    (disc_data_table, df, Y)
        The (possibly discretized) table, a one-hot DataFrame of its
        attributes (class column removed), and the raw class column.
    """
    import Orange
    from Orange.data.pandas_compat import table_to_frame
    import pandas as pd

    # Only discretize when some attribute is still continuous.
    # (Original used `all([...]) == True`; the generator form is
    # equivalent and idiomatic.)
    if all(a.is_discrete for a in table.domain.attributes):
        disc_data_table = table
    else:
        disc = Orange.preprocess.Discretize()
        # disc.method = Orange.preprocess.discretize.EqualFreq(n=5)
        disc.method = Orange.preprocess.discretize.EntropyMDL(force=True)
        disc_data_table = disc(table)

    df = table_to_frame(disc_data_table)
    Y = disc_data_table.Y
    # Drop the class column (last frame column) before one-hot encoding.
    df.drop(df.columns[-1], axis=1, inplace=True)
    df = pd.get_dummies(df)
    return disc_data_table, df, Y
def test_table_to_frame_nans(self):
    """NaNs force a float column; NaN-free zero-decimal columns become int."""
    from Orange.data.pandas_compat import table_to_frame

    domain = Domain(
        [ContinuousVariable("a", number_of_decimals=0), ContinuousVariable("b")]
    )
    col_a = np.ones(10)
    col_b = np.hstack((np.ones(9), [np.nan]))
    table = Table(domain, np.column_stack((col_a, col_b)))

    df = table_to_frame(table)

    self.assertEqual(
        sorted(var.name for var in table.domain.variables),
        sorted(df.columns),
    )
    self.assertEqual(df["a"].dtype, int)
    self.assertEqual(df["b"].dtype, float)
    self.assertEqual([1, 1, 1], list(df["a"].iloc[-3:]))
    self.assertTrue(np.isnan(df["b"].iloc[-1]))
def worker(table: Table, covariates: List, time_var: str, event_var: str, state: TaskState):
    """Fit per-covariate models in parallel and FDR-correct the p-values.

    Converts `table` to a DataFrame, splits the covariates into batches,
    fans them out to a process pool via `batch_to_process`, and reports
    progress through `state` as workers signal completion on a shared
    queue. Returns (covariate_names, results) where the last result
    column is the FDR-corrected p-value.
    """
    with multiprocessing.Manager() as _manager:
        # Manager-backed queue so pool workers can signal progress
        # across process boundaries.
        _queue = _manager.Queue()
        _cpu_count = cpu_count()

        df = table_to_frame(table, include_metas=False)
        # lifelines-style fitters need a numeric event column.
        df = df.astype({event_var: np.float64})

        if len(covariates) > 50:
            # Many covariates: strided split into one batch per CPU.
            # NOTE(review): progress below still assumes one queue message
            # per covariate; with per-CPU batches the two counts differ —
            # confirm batch_to_process posts per covariate, not per batch.
            batches = (df[[time_var, event_var] + batch] for batch in [covariates[i::_cpu_count] for i in range(_cpu_count)])
        else:
            # Few covariates: one single-covariate frame per task.
            batches = (df[[time_var, event_var] + [cov]] for cov in covariates)

        progress_steps = iter(np.linspace(0, 100, len(covariates)))

        with multiprocessing.Pool(processes=_cpu_count) as pool:
            results = pool.map_async(
                partial(
                    batch_to_process,
                    _queue,
                    time_var,
                    event_var,
                ),
                batches,
            )
            # Advance the progress bar once per queue message; a 3-second
            # silence (queue.Empty) or exhausted steps (StopIteration)
            # ends the loop.
            while True:
                try:
                    state.set_progress_value(next(progress_steps))
                    _queue.get(timeout=3)
                except (queue.Empty, StopIteration):
                    break

            # Blocks until all batches are done; rows are stacked as
            # [name, stats..., p-value].
            stacked_result = np.vstack(results.get())
            covariate_names = stacked_result[:, 0]
            results = stacked_result[:, 1:].astype(float)
            # Benjamini–Hochberg correction on the raw p-values
            # (last column), appended as an extra column.
            _, pvals_corrected = fdrcorrection(results[:, -1], is_sorted=False)
            results = np.hstack(
                (results, pvals_corrected.reshape(pvals_corrected.shape[0], -1)))
            return covariate_names, results
def test_table_to_frame_on_all_orange_dataset(self):
    """table_to_frame() must succeed on every bundled Orange dataset."""
    from os import listdir
    from Orange.data.pandas_compat import table_to_frame
    import pandas as pd

    dataset_directory = "Orange/datasets/"

    def _filename_to_dataset_name(f):
        # "iris.tab" -> "iris"
        return f.split('.')[0]

    def _get_orange_demo_datasets():
        # endswith() instead of the original `'.tab' in f` substring test,
        # which also matched names like "foo.tab.bak" or "my.table.txt".
        return [
            _filename_to_dataset_name(f)
            for f in listdir(dataset_directory)
            if f.endswith('.tab')
        ]

    for name in _get_orange_demo_datasets():
        table = Table(name)
        df = table_to_frame(table)
        assert_message = "Failed to process Table('{}')".format(name)

        self.assertEqual(type(df), pd.DataFrame, assert_message)
        self.assertEqual(len(df), len(table), assert_message)
        self.assertEqual(len(df.columns), len(table.domain), assert_message)
def explain_tabular(dataset, blackbox, target_class_idx=1, pre_label=True, random_seed=42):
    '''
    Explain a black-box classifier on an Orange data table with a decision set.

    Input Params:
    1. dataset: a Orange data table
    3. blackbox: a blackbox predict function, such as `c.predict` where c is a scikit-classifier
    4. target_class_idx: index of the class the returned rules should target
    5. pre_label: if False, re-label the data with the black-box predictions
    6. random_seed: seed for numpy's RNG (the search below is stochastic)
    ---
    Output: A decision set.
    '''
    np.random.seed(random_seed)
    if pre_label == False:
        # re-labelled the data using the blackbox, otherwise assuming the labels provided is labeled by the classifier, (instead of the groundtruth label)
        labels = blackbox(dataset.X)
        dataset = Orange.data.Table(dataset.domain, dataset.X, labels)
    # fit the explainer to the data
    # explainer = IDS(dataset, blackbox)
    # rule_set = explainer.fit(dataset.domain,dataset.X,dataset.Y,target_class=target_class)
    # df = pd.read_csv('titanic_train.tab',' ', header=None, names=['Passenger_Cat', 'Age_Cat', 'Gender'])
    # df1 = pd.read_csv('titanic_train.Y', ' ', header=None, names=['Died', 'Survived'])
    # Y = list(df1['Died'].values)
    # df1.head()
    import Orange
    from Orange.data.pandas_compat import table_to_frame
    import pandas as pd
    # Discretize only when some attribute is still continuous.
    if all([ a.is_discrete for a in dataset.domain.attributes]) == True:
        disc_data_table = dataset
    else:
        print("discre")
        disc = Orange.preprocess.Discretize()
        disc.method = Orange.preprocess.discretize.EqualFreq(n=5)
        # disc.method = Orange.preprocess.discretize.EntropyMDL(force=True)
        disc_data_table = disc(dataset)
    # df = table_to_frame(disc_data_table)
    # Y = pd.DataFrame(disc_data_table.Y,columns=[disc_data_table.domain.class_var.name],dtype='int32')
    assert all([ a.is_discrete for a in disc_data_table.domain.attributes]), " is not pre-discretized!"
    # disc_data_table = dataset
    Y = disc_data_table.Y
    df = table_to_frame(disc_data_table)
    # drop the class column (last frame column) before mining itemsets
    df.drop(df.columns[-1],axis = 1, inplace = True)
    print("start Apriori")
    # frequent itemsets over the discretized attributes (min support 0.05)
    itemsets = run_apriori(df, 0.05)
    # itemsets = run_apriori(df, 0.5)
    print("finish Apriori. Converting itemset")
    list_of_rules = createrules(itemsets, list(set(Y)))
    print("Pre-mined okay. \n all pre-mined rules of",len(list_of_rules))
    # print("----------------------")
    # for r in list_of_rules:
    #     r.print_rule()
    # lambda_array = [1.0]*7 # use separate hyperparamter search routine
    lambda_array = [0.5,1.0,1.0,1.0,1.0,1.5,1.0] # use separate hyperparamter search routine
    # Two runs of the local search with different last parameters;
    # keep whichever solution set scores higher under func_evaluation.
    # NOTE(review): semantics of the 0.33 / -1.0 arguments live in
    # smooth_local_search's module — confirm there.
    s1 = smooth_local_search(list_of_rules, df, Y, lambda_array, 0.33, 0.33)
    s2 = smooth_local_search(list_of_rules, df, Y, lambda_array, 0.33, -1.0)
    f1 = func_evaluation(s1, list_of_rules, df, Y, lambda_array)
    f2 = func_evaluation(s2, list_of_rules, df, Y, lambda_array)
    if f1 > f2:
        print("The Solution Set is: "+str(s1))
        rule_set = [ list_of_rules[idx] for idx in s1]
    else:
        print("The Solution Set is: "+str(s2))
        rule_set = [ list_of_rules[idx] for idx in s2]
    print(rule_set)
    # convert the rule representation
    rule_set = rules_convert(rule_set,dataset, target_class_idx=target_class_idx)
    return rule_set