Example #1
    def test_table_to_frame_metas(self):
        from Orange.data.pandas_compat import table_to_frame

        table = Table("zoo")
        domain = table.domain

        df = table_to_frame(table)
        cols = pd.Index([var.name for var in domain.variables])
        pd.testing.assert_index_equal(df.columns, cols)

        df = table_to_frame(table, include_metas=True)
        cols = pd.Index([var.name for var in domain.variables + domain.metas])
        pd.testing.assert_index_equal(df.columns, cols)
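
For orientation, here is a minimal round-trip sketch; it assumes the companion function table_from_frame, which Orange.data.pandas_compat also provides in recent Orange releases, and it is not part of the test above:

from Orange.data import Table
from Orange.data.pandas_compat import table_to_frame, table_from_frame

zoo = Table("zoo")
df = table_to_frame(zoo, include_metas=True)   # meta columns such as "name" are included
print(df.dtypes)                               # discrete variables map to pandas Categorical

zoo_again = table_from_frame(df)               # convert back; the domain is inferred from dtypes
print(zoo_again.domain)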
Example #2
    def test_table_to_frame_on_all_orange_dataset(self):
        from os import listdir
        from Orange.data.pandas_compat import table_to_frame

        dataset_directory = "Orange/datasets/"

        def _filename_to_dataset_name(f):
            return f.split('.')[0]

        def _get_orange_demo_datasets():
            x = [
                _filename_to_dataset_name(f)
                for f in listdir(dataset_directory) if '.tab' in f
            ]
            return x

        for name in _get_orange_demo_datasets():
            table = Table(name)
            df = table_to_frame(table)
            assert_message = "Failed to process Table('{}')".format(name)

            self.assertEqual(type(df), pd.DataFrame, assert_message)
            self.assertEqual(len(df), len(table), assert_message)
            self.assertEqual(len(df.columns), len(table.domain.variables),
                             assert_message)
Example #3
    def test_load_data(self):
        corpus = self.client.search_content(["orange"])
        self.assertEqual(4, len(corpus))
        self.assertTupleEqual(tuple(m[0] for m in twitter.METAS),
                              corpus.domain.metas)

        df = table_to_frame(corpus, include_metas=True)
        pd.testing.assert_frame_equal(df.reset_index(drop=True),
                                      ER,
                                      check_dtype=False,
                                      check_categorical=False)
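
The comparison above relaxes dtype and categorical checks because the frame built by table_to_frame need not match the expected fixture ER exactly in dtypes. A minimal, self-contained sketch of the same assert_frame_equal pattern:

import pandas as pd

left = pd.DataFrame({"x": [1, 2, 3]})
right = pd.DataFrame({"x": [1.0, 2.0, 3.0]})                    # same values, int vs. float
pd.testing.assert_frame_equal(left, right, check_dtype=False)   # passes: dtypes are ignored
try:
    pd.testing.assert_frame_equal(left, right)                  # strict comparison fails on dtype
except AssertionError as err:
    print(err)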
Example #4
    def test_table_to_frame_object_dtype(self):
        from Orange.data.pandas_compat import table_to_frame

        domain = Domain([], metas=[ContinuousVariable("a", number_of_decimals=0)])
        table = Table.from_numpy(
            domain, np.empty((10, 0)), metas=np.ones((10, 1), dtype=object)
        )

        df = table_to_frame(table, include_metas=True)
        self.assertEqual(["a"], df.columns)
        np.testing.assert_array_equal(df["a"].values, np.ones((10,)))
Example #5
    def test_table_to_frame(self):
        from Orange.data.pandas_compat import table_to_frame
        table = Table("iris")
        df = table_to_frame(table)
        table_column_names = [var.name for var in table.domain.variables]
        frame_column_names = df.columns

        self.assertEqual(sorted(table_column_names), sorted(frame_column_names))
        self.assertEqual(type(df['iris'].dtype), pd.api.types.CategoricalDtype)
        self.assertEqual(list(df['sepal length'])[0:4], [5.1, 4.9, 4.7, 4.6])
        self.assertEqual(list(df['iris'])[0:2], ['Iris-setosa', 'Iris-setosa'])
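
As the assertions suggest, table_to_frame maps Orange variable types onto pandas dtypes: discrete variables become Categorical columns, continuous ones numeric. A quick way to inspect the mapping outside the test suite:

from Orange.data import Table
from Orange.data.pandas_compat import table_to_frame

df = table_to_frame(Table("iris"))
print(df.dtypes)                    # the four measurements are numeric, "iris" is category
print(df["iris"].cat.categories)    # category labels come from the discrete variable's values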
Example #6
    def test_table_to_frame(self):
        from Orange.data.pandas_compat import table_to_frame
        table = Table("iris")
        df = table_to_frame(table)
        table_column_names = [var.name for var in table.domain.variables]
        frame_column_names = df.columns

        self.assertEqual(sorted(table_column_names), sorted(frame_column_names))
        self.assertEqual(type(df['iris'].dtype), pd.api.types.CategoricalDtype)
        self.assertEqual(list(df['sepal length'])[0:4], [5.1, 4.9, 4.7, 4.6])
        self.assertEqual(list(df['iris'])[0:2], ['Iris-setosa', 'Iris-setosa'])
Example #7
    def fit(self, data):
        if not contains_survival_endpoints(data.domain):
            raise ValueError(self.learner_adequacy_err_msg)
        time_var, event_var = get_survival_endpoints(data.domain)

        df = table_to_frame(data, include_metas=False)
        df = df.dropna(axis=0)
        df[time_var.name] = df[time_var.name].astype(float)
        df[event_var.name] = df[event_var.name].astype(float)
        cph = CoxPHFitter(**self.params['kwargs'])
        cph = cph.fit(df, duration_col=time_var.name, event_col=event_var.name)
        return CoxRegressionModel(cph)
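
The fit above depends on Orange-specific survival helpers (contains_survival_endpoints, get_survival_endpoints). The underlying lifelines call can be tried on its own, for example with the Rossi recidivism dataset that ships with lifelines:

from lifelines import CoxPHFitter
from lifelines.datasets import load_rossi

rossi = load_rossi()                       # DataFrame with "week" (duration) and "arrest" (event)
cph = CoxPHFitter()
cph.fit(rossi, duration_col="week", event_col="arrest")
cph.print_summary()                        # hazard ratios, confidence intervals, concordance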
Example #8
    def test_load_authors(self, user_mock):
        user_mock.return_value = MagicMock(data=MagicMock(id=1))

        corpus = self.client.search_authors(["orange"])
        self.assertEqual(4, len(corpus))
        self.assertTupleEqual(tuple(m[0] for m in twitter.METAS),
                              corpus.domain.metas)

        df = table_to_frame(corpus, include_metas=True)
        pd.testing.assert_frame_equal(df.reset_index(drop=True),
                                      ER,
                                      check_dtype=False,
                                      check_categorical=False)
Example #9
    def set_data(self, data, id):
        """Set the input data for channel id."""
        if data is None:
            self.tables.pop(id, None)
        else:
            df = table_to_frame(data, include_metas=True)
            df.name = data.name
            self.tables[id] = df
        if self.tables:
            self.infoa.setText("%d tables on input." % len(self.tables))
            self.infob.setText("Please click the button below to run Glueviz.")
            self.tablesbox.setDisabled(False)
        else:
            self.infoa.setText("No data on input.")
            self.infob.setText("Please connect to other widgets.")
            self.tablesbox.setDisabled(True)
        self._update_tablelist()
Example #10
def table_to_binary_df(table, target_class='hit'):
    import Orange
    from Orange.data.pandas_compat import table_to_frame
    import pandas as pd
    if all(a.is_discrete for a in table.domain.attributes):
        disc_data_table = table
    else:
        disc = Orange.preprocess.Discretize()
        # disc.method = Orange.preprocess.discretize.EqualFreq(n=5)
        disc.method = Orange.preprocess.discretize.EntropyMDL(force=True)
        disc_data_table = disc(table)
    df = table_to_frame(disc_data_table)
    # Y = pd.DataFrame(disc_data_table.Y,columns=[disc_data_table.domain.class_var.name],dtype='int32')
    Y = disc_data_table.Y
    df.drop(df.columns[-1], axis=1, inplace=True)
    df = pd.get_dummies(df)
    return disc_data_table, df, Y
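
table_to_binary_df discretizes continuous attributes and then one-hot encodes them. The two steps can be reproduced directly on a bundled dataset; this sketch uses EqualFreq binning rather than the EntropyMDL method chosen above:

import Orange
import pandas as pd
from Orange.data import Table
from Orange.data.pandas_compat import table_to_frame

iris = Table("iris")
disc = Orange.preprocess.Discretize()
disc.method = Orange.preprocess.discretize.EqualFreq(n=3)   # three equal-frequency bins per attribute
binned = disc(iris)

df = table_to_frame(binned)
df = df.drop(columns=[iris.domain.class_var.name])          # keep attributes only, as above
binary_df = pd.get_dummies(df)                              # one indicator column per bin
print(binary_df.columns.tolist())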
Example #11
    def test_table_to_frame_nans(self):
        from Orange.data.pandas_compat import table_to_frame
        domain = Domain(
            [ContinuousVariable("a", number_of_decimals=0), ContinuousVariable("b")]
        )
        table = Table(
            domain, np.column_stack((np.ones(10), np.hstack((np.ones(9), [np.nan]))))
        )

        df = table_to_frame(table)
        table_column_names = [var.name for var in table.domain.variables]
        frame_column_names = df.columns

        self.assertEqual(sorted(table_column_names), sorted(frame_column_names))
        self.assertEqual(df["a"].dtype, int)
        self.assertEqual(df["b"].dtype, float)
        self.assertEqual([1, 1, 1], list(df["a"].iloc[-3:]))
        self.assertTrue(np.isnan(df["b"].iloc[-1]))
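
The expected dtypes follow a general pandas rule: a column containing NaN cannot use a plain integer dtype, so it stays float, while the fully observed column can be downcast to int. A minimal illustration (the nullable Int64 dtype shown last is a pandas feature, not something table_to_frame produces):

import numpy as np
import pandas as pd

s = pd.Series([1, 1, np.nan])
print(s.dtype)                    # float64: the NaN forces an upcast from int
print(s.astype("Int64").dtype)    # Int64: pandas' nullable integer type can hold the missing value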
Example #12
def worker(table: Table, covariates: List, time_var: str, event_var: str,
           state: TaskState):
    with multiprocessing.Manager() as _manager:
        _queue = _manager.Queue()
        _cpu_count = cpu_count()

        df = table_to_frame(table, include_metas=False)
        df = df.astype({event_var: np.float64})
        if len(covariates) > 50:
            batches = (df[[time_var, event_var] + batch] for batch in
                       [covariates[i::_cpu_count] for i in range(_cpu_count)])
        else:
            batches = (df[[time_var, event_var] + [cov]] for cov in covariates)
        progress_steps = iter(np.linspace(0, 100, len(covariates)))

        with multiprocessing.Pool(processes=_cpu_count) as pool:
            results = pool.map_async(
                partial(
                    batch_to_process,
                    _queue,
                    time_var,
                    event_var,
                ),
                batches,
            )

            while True:
                try:
                    state.set_progress_value(next(progress_steps))
                    _queue.get(timeout=3)
                except (queue.Empty, StopIteration):
                    break

            stacked_result = np.vstack(results.get())
            covariate_names = stacked_result[:, 0]
            results = stacked_result[:, 1:].astype(float)
            _, pvals_corrected = fdrcorrection(results[:, -1], is_sorted=False)
            results = np.hstack(
                (results, pvals_corrected.reshape(pvals_corrected.shape[0],
                                                  -1)))
            return covariate_names, results
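
The worker finishes by correcting the per-covariate p-values for multiple testing. The statsmodels call it uses can be exercised in isolation; the p-values below are purely illustrative:

import numpy as np
from statsmodels.stats.multitest import fdrcorrection

pvals = np.array([0.001, 0.02, 0.04, 0.30, 0.75])              # illustrative values only
rejected, pvals_corrected = fdrcorrection(pvals, alpha=0.05, is_sorted=False)
print(rejected)            # boolean mask: hypotheses rejected after Benjamini-Hochberg control
print(pvals_corrected)     # FDR-adjusted p-values, in the same order as the input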
Example #13
    def test_table_to_frame_on_all_orange_dataset(self):
        from os import listdir
        from Orange.data.pandas_compat import table_to_frame
        import pandas as pd

        dataset_directory = "Orange/datasets/"

        def _filename_to_dataset_name(f):
            return f.split('.')[0]

        def _get_orange_demo_datasets():
            x = [_filename_to_dataset_name(f) for f in listdir(dataset_directory) if '.tab' in f]
            return x

        for name in _get_orange_demo_datasets():
            table = Table(name)
            df = table_to_frame(table)
            assert_message = "Failed to process Table('{}')".format(name)

            self.assertEqual(type(df), pd.DataFrame, assert_message)
            self.assertEqual(len(df), len(table), assert_message)
            self.assertEqual(len(df.columns), len(table.domain), assert_message)
Example #14
def explain_tabular(dataset, blackbox, target_class_idx=1, pre_label=True, random_seed=42):
    '''
    Input Params:
    1. dataset: an Orange data table
    2. blackbox: a blackbox predict function, such as `c.predict` where c is a scikit-learn classifier
    3. target_class_idx: index of the target class
    ---
    Output:
    A decision set.
    '''
    np.random.seed(random_seed)

    if not pre_label:
        # Re-label the data using the blackbox; otherwise the provided labels are assumed to
        # come from the classifier (rather than being ground-truth labels).
        labels = blackbox(dataset.X)
        dataset = Orange.data.Table(dataset.domain, dataset.X, labels)

    # fit the explainer to the data
    # explainer = IDS(dataset, blackbox)
    # rule_set = explainer.fit(dataset.domain,dataset.X,dataset.Y,target_class=target_class)


    # df = pd.read_csv('titanic_train.tab',' ', header=None, names=['Passenger_Cat', 'Age_Cat', 'Gender'])
    # df1 = pd.read_csv('titanic_train.Y', ' ', header=None, names=['Died', 'Survived'])
    # Y = list(df1['Died'].values)
    # df1.head()

    import Orange
    from Orange.data.pandas_compat import table_to_frame
    import pandas as pd
    if all(a.is_discrete for a in dataset.domain.attributes):
        disc_data_table = dataset
    else:
        print("discretizing continuous attributes")
        disc = Orange.preprocess.Discretize()
        disc.method = Orange.preprocess.discretize.EqualFreq(n=5)
        # disc.method = Orange.preprocess.discretize.EntropyMDL(force=True)
        disc_data_table = disc(dataset)
        # df = table_to_frame(disc_data_table)
        # Y = pd.DataFrame(disc_data_table.Y,columns=[disc_data_table.domain.class_var.name],dtype='int32')

    assert all(a.is_discrete for a in disc_data_table.domain.attributes), "data is not pre-discretized!"
    # disc_data_table = dataset
    Y = disc_data_table.Y
    df = table_to_frame(disc_data_table)
    df.drop(df.columns[-1], axis=1, inplace=True)


    print("start Apriori")
    itemsets = run_apriori(df, 0.05)
    # itemsets = run_apriori(df, 0.5)
    print("finish Apriori. Converting itemset")
    list_of_rules = createrules(itemsets, list(set(Y)))
    print("Pre-mined okay. all pre-mined rules of",len(list_of_rules))
    # print("----------------------")
    # for r in list_of_rules:
    #     r.print_rule()

    # lambda_array = [1.0]*7     # use separate hyperparameter search routine
    lambda_array = [0.5, 1.0, 1.0, 1.0, 1.0, 1.5, 1.0]     # use separate hyperparameter search routine
    s1 = smooth_local_search(list_of_rules, df, Y, lambda_array, 0.33, 0.33)
    s2 = smooth_local_search(list_of_rules, df, Y, lambda_array, 0.33, -1.0)
    f1 = func_evaluation(s1, list_of_rules, df, Y, lambda_array)
    f2 = func_evaluation(s2, list_of_rules, df, Y, lambda_array)
    if f1 > f2:
        print("The Solution Set is: "+str(s1))
        rule_set = [ list_of_rules[idx] for idx in s1]
    else:
        print("The Solution Set is: "+str(s2))
        rule_set = [ list_of_rules[idx] for idx in s2]
        print(rule_set)

    # convert the rule representation
    rule_set = rules_convert(rule_set, dataset, target_class_idx=target_class_idx)
    return rule_set
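
The pre-mining step relies on a project-local run_apriori helper that is not shown here. For experimentation, a comparable frequent-itemset search can be run with mlxtend's apriori on the same kind of one-hot frame; this is a substitute sketch, not the helper used above:

import pandas as pd
from mlxtend.frequent_patterns import apriori

# Toy one-hot frame; in explain_tabular the frame comes from pd.get_dummies(df).
transactions = pd.DataFrame({
    "age=young": [True, True, False, False, True],
    "age=old":   [False, False, True, True, False],
    "sex=male":  [True, False, True, False, True],
})
itemsets = apriori(transactions, min_support=0.4, use_colnames=True)
print(itemsets)            # columns: "support" and "itemsets" (frozensets of column names)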