def get_dataset(self):
    """
    Form a dataframe with the descriptions from all openml datasets.

    Iterates over every active OpenML dataset (metadata only, no data
    download) and collects id, name, and a combined description+name
    text for every dataset that has both a description and a name.

    :return: unique dataset descriptions (duplicates removed by
        ``self._remove_duplicates``), each text of length min=50
    """
    dataset_list = datasets.list_datasets(output_format='dataframe', status='active')
    data_dict = defaultdict(list)
    for did in dataset_list['did']:
        try:
            data = datasets.get_dataset(did, download_data=False)
            if data.description is not None and data.name is not None:
                data_dict['id'].append(did)
                data_dict['name'].append(data.name)
                data_dict['text'].append(data.description + " " + data.name + " ")
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # still propagate. TODO: narrow further to the concrete types —
        # multiple exceptions apart from FileNotFound are seen here
        # (best-effort skip is intentional).
        except Exception:
            pass
    self.df = pd.DataFrame(data_dict)
    self.df.sort_values(by='id', inplace=True)
    self.df_unique = self._remove_duplicates()
    return self.df_unique
def _get_compatible_rand_dataset(self) -> List:
    """
    Return a shuffled list of active dataset ids compatible with the
    current task type (``self.task_type_id``).
    """
    active_datasets = list_datasets(status='active')
    if self.task_type_id == 2:
        # regression task: only datasets without symbolic features
        compatible_datasets = [
            did for did, info in active_datasets.items()
            if 'NumberOfSymbolicFeatures' in info
            and info['NumberOfSymbolicFeatures'] == 0
        ]
    elif self.task_type_id == 5:
        # clustering task: every active dataset qualifies
        compatible_datasets = list(active_datasets.keys())
    else:
        # other tasks: only datasets without numerical features.
        # The key-presence check is an extra guard because of:
        # https://github.com/openml/OpenML/issues/959
        compatible_datasets = [
            did for did, info in active_datasets.items()
            if 'NumberOfNumericFeatures' in info
            and info['NumberOfNumericFeatures'] == 0
        ]
    # in-place shuffling
    shuffle(compatible_datasets)
    return compatible_datasets
def test_all_datasets(dash_br):
    """
    Smoke-test the per-dataset dashboard pages.

    Loads the first 30 dataset pages in the dash browser fixture and
    records the ids of pages whose browser console log is non-empty;
    the failing ids are saved to ``ids.npy`` for later inspection.

    :param dash_br: dash browser test fixture (provides server_url/get_logs)
    """
    df = datasets.list_datasets(output_format='dataframe')
    ids = []
    # renamed loop variable from `id`, which shadowed the builtin
    for data_id in df['did'].values[:30]:
        dash_br.server_url = BASE_URL + 'data/' + str(data_id)
        time.sleep(5)  # give the page (and its async callbacks) time to render
        if dash_br.get_logs() != []:
            ids.append(data_id)
            print(data_id)
    np.save('ids.npy', np.asarray(ids))
def get_datasets():
    """
    Fetch OpenML dataset metadata and wrap it into DatasetInfo records.

    Datasets missing instance/class counts, or with more than 20000
    instances, are skipped.

    :return: list of DatasetInfo(id, n_instances, n_features, n_classes)
    """
    result = []
    for ds_id, meta in open_ml_dataset.list_datasets().items():
        # skip entries with incomplete metadata or too many instances
        if 'NumberOfInstances' not in meta or 'NumberOfClasses' not in meta:
            continue
        if meta['NumberOfInstances'] > 20000.0:
            continue
        info = DatasetInfo(
            ds_id,
            meta['NumberOfInstances'],
            meta['NumberOfFeatures'],
            meta['NumberOfClasses'],
        )
        result.append(info)
    return result
def _get_compatible_rand_dataset(self) -> int:
    """
    Pick one random active dataset id compatible with the current task
    type (``self.task_type_id``).
    """
    active_datasets = list_datasets(status='active')
    if self.task_type_id != 2:
        # non-regression tasks: only datasets without numerical features.
        # The key-presence check is an extra guard because of:
        # https://github.com/openml/OpenML/issues/959
        feature_key = 'NumberOfNumericFeatures'
    else:
        # regression task: only datasets without symbolic features
        feature_key = 'NumberOfSymbolicFeatures'
    compatible_datasets = [
        did for did, info in active_datasets.items()
        if feature_key in info and info[feature_key] == 0
    ]
    chosen = randint(0, len(compatible_datasets) - 1)
    return compatible_datasets[chosen]
def dataset_overview(radio):
    """
    Build the dataset-overview page.

    :param radio: "active" shows only active datasets; any other value shows all
    :return: html.Div with four figures — attribute-type pie, class-count
        violin, instance-count histogram, feature-count histogram
    """
    if radio == "active":
        df = datasets.list_datasets(output_format="dataframe")
    else:
        df = datasets.list_datasets(output_format="dataframe", status="all")
    df.dropna(inplace=True)
    # Binning
    bins_1 = [
        1,
        500,
        1000,
        5000,
        10000,
        50000,
        100000,
        500000,
        max(df["NumberOfInstances"]),
    ]
    bins_2 = [1, 500, 1000, 5000, 10000, 50000, 100000, 500000, 1000000]
    df["Number of instances"] = pd.cut(
        df["NumberOfInstances"], bins=bins_1, precision=0
    ).astype(str)
    df["Number of features"] = pd.cut(
        df["NumberOfFeatures"], bins=bins_2, precision=0
    ).astype(str)
    for col in ["Number of instances", "Number of features"]:
        # regex=False on every replace: "(" and "." are regex metacharacters,
        # and older pandas defaults to regex=True, where a lone "(" raises
        # re.error. The original only passed regex=False for ".0".
        df[col] = df[col].str.replace(",", " -", regex=False)
        df[col] = df[col].str.replace("(", "", regex=False)
        df[col] = df[col].str.replace("]", "", regex=False)
        df[col] = df[col].str.replace(".0", " ", regex=False)
    title = [
        "Attribute Types",
        "Number of classes",
        "Number of instances across datasets",
        "Number of features across datasets",
    ]
    # Attribute types — .loc instead of chained indexing, which triggers
    # SettingWithCopyWarning and may silently fail to write through.
    df["Attribute Type"] = "mixed"
    df.loc[df["NumberOfSymbolicFeatures"] <= 1, "Attribute Type"] = "numeric"
    df.loc[df["NumberOfNumericFeatures"] == 0, "Attribute Type"] = "categorical"
    grouped = df.groupby("Attribute Type").size().reset_index(name="counts")
    colors = ["darkblue", "steelblue", "lightsteelblue"]
    types_chart = go.Pie(
        labels=grouped["Attribute Type"],
        values=grouped["counts"],
        marker=dict(colors=colors),
        showlegend=True,
    )
    fig1 = go.Figure(data=[types_chart])
    fig1.update_layout(height=400)
    # No of classes
    showlegend = False
    classes_plot = go.Violin(
        y=df["NumberOfClasses"],
        showlegend=showlegend,
        box_visible=True,
        fillcolor="mediumpurple",
        meanline_visible=True,
        name=" ",
    )
    fig2 = go.Figure(data=[classes_plot])
    fig2.update_xaxes(tickfont=dict(size=10))
    fig2.update_layout(height=400)
    # Instances plot
    df.sort_values(by="NumberOfInstances", inplace=True)
    instances_plot = go.Histogram(
        x=df["Number of instances"], marker_color="#EB89B5", showlegend=showlegend
    )
    fig3 = go.Figure(
        data=[instances_plot],
    )
    fig3.update_layout(bargap=0.4, width=900, height=400)
    fig3.update_xaxes(tickfont=dict(size=10))
    # Features plot
    df.sort_values(by="NumberOfFeatures", inplace=True)
    features_plot = go.Histogram(x=df["Number of features"], showlegend=showlegend)
    fig4 = go.Figure(data=[features_plot])
    fig4.update_layout(bargap=0.4, width=900, height=400)
    fig4.update_xaxes(tickfont=dict(size=10))
    return html.Div(
        [
            html.Div(
                [html.P(title[0]), dcc.Graph(figure=fig1, id="fig1")],
                className="row metric-row",
                style={
                    "width": "48%",
                    "text-align": "center",
                    "display": "inline-block",
                },
            ),
            html.Div(
                [html.P(title[1]), dcc.Graph(figure=fig2, id="fig2")],
                className="row metric-row",
                style={
                    "width": "48%",
                    "text-align": "center",
                    "display": "inline-block",
                },
            ),
            html.P(title[2]),
            dcc.Graph(figure=fig3, id="fig3"),
            html.P(title[3]),
            dcc.Graph(figure=fig4, id="fig4"),
        ],
    )
def get_dataset_overview():
    """
    :return: overview of datasets page — one figure with four stacked
        subplots (instance histogram, feature histogram, attribute-type
        percentage histogram, class-count violin)
    """
    df = datasets.list_datasets(output_format='dataframe')
    df.dropna(inplace=True)
    bins_1 = [
        1, 500, 1000, 5000, 10000, 50000, 100000, 500000, 1000000,
        max(df["NumberOfInstances"])
    ]
    bins_2 = [1, 500, 1000, 5000, 10000, 50000, 100000, 500000, 1000000]
    df["Number of instances"] = pd.cut(df["NumberOfInstances"], bins=bins_1).astype(str)
    df["Number of features"] = pd.cut(df["NumberOfFeatures"], bins=bins_2).astype(str)
    title = [
        "Number of instances across datasets",
        "Number of features across datasets",
        "Attribute Type percentage distribution",
        "Number of classes"
    ]
    fig = plotly.subplots.make_subplots(rows=4, cols=1, subplot_titles=tuple(title))
    for col in ["Number of instances", "Number of features"]:
        # regex=False: "(" is a regex metacharacter and older pandas defaults
        # to regex=True, where a lone "(" raises re.error.
        df[col] = df[col].str.replace(',', ' -', regex=False)
        df[col] = df[col].str.replace('(', "", regex=False)
        df[col] = df[col].str.replace(']', "", regex=False)
    df.sort_values(by="NumberOfInstances", inplace=True)
    showlegend = False
    fig.add_trace(go.Histogram(x=df["Number of instances"], showlegend=showlegend),
                  row=1, col=1)
    df.sort_values(by="NumberOfFeatures", inplace=True)
    fig.add_trace(go.Histogram(x=df["Number of features"], showlegend=showlegend),
                  row=2, col=1)
    # .loc instead of chained indexing, which triggers SettingWithCopyWarning
    # and may silently fail to write through.
    df["Attribute Type"] = "mixed"
    df.loc[df['NumberOfSymbolicFeatures'] <= 1, "Attribute Type"] = 'numeric'
    df.loc[df['NumberOfNumericFeatures'] == 0, "Attribute Type"] = 'categorical'
    fig.add_trace(go.Histogram(x=df["Attribute Type"], histnorm="percent",
                               showlegend=showlegend),
                  row=3, col=1)
    fig.add_trace(go.Violin(x=df["NumberOfClasses"], showlegend=showlegend,
                            name="NumberOfClasses"),
                  row=4, col=1)
    fig.update_layout(height=1000)
    fig.update_xaxes(tickfont=dict(size=10))
    return html.Div(dcc.Graph(figure=fig), style={"fontsize": 10})
# flake8: noqa import os import openml from openml import datasets from openml.datasets.functions import DATASETS_CACHE_DIR_NAME # get all datasets before running app, so that datasets are loaded faster from openml.utils import _create_cache_directory_for_id root_dir = os.path.abspath(os.sep) openml.config.cache_directory = os.path.join(root_dir, "public", "python-cache", ".openml", "cache") df = datasets.list_datasets(output_format="dataframe") for idx, row in df.iterrows(): data_id = row["did"] instances = row["NumberOfInstances"] cols = row["NumberOfFeatures"] print(data_id) # delete existing cache directory did_cache_dir = _create_cache_directory_for_id( DATASETS_CACHE_DIR_NAME, data_id, ) # print(did_cache_dir) if os.path.exists(did_cache_dir): openml.utils._remove_cache_dir_for_id(DATASETS_CACHE_DIR_NAME, did_cache_dir) # download dataset and cache again, # (FileNotFoundError, openml.exceptions.OpenMLServerException)
from openml import datasets

# Warm the local OpenML cache before the app starts, so dataset pages
# load faster at runtime.
df = datasets.list_datasets(output_format='dataframe')
for did in df['did']:
    try:
        datasets.get_dataset(did)
    except FileNotFoundError:
        # dataset files unavailable — skip and continue warming the rest
        pass
def dataset_overview(radio):
    """
    Build the dataset-overview page.

    :param radio: 'active' shows only active datasets; any other value shows all
    :return: html.Div with four figures — attribute-type pie, class-count
        violin, instance-count histogram, feature-count histogram
    """
    if radio == 'active':
        df = datasets.list_datasets(output_format='dataframe')
    else:
        df = datasets.list_datasets(output_format='dataframe', status='all')
    df.dropna(inplace=True)
    # Binning
    bins_1 = [
        1, 500, 1000, 5000, 10000, 50000, 100000, 500000,
        max(df["NumberOfInstances"])
    ]
    bins_2 = [1, 500, 1000, 5000, 10000, 50000, 100000, 500000, 1000000]
    df["Number of instances"] = pd.cut(df["NumberOfInstances"],
                                       bins=bins_1,
                                       precision=0).astype(str)
    df["Number of features"] = pd.cut(df["NumberOfFeatures"],
                                      bins=bins_2,
                                      precision=0).astype(str)
    for col in ["Number of instances", "Number of features"]:
        # regex=False on every replace: "(" and "." are regex metacharacters,
        # and older pandas defaults to regex=True, where a lone "(" raises
        # re.error. The original only passed regex=False for ".0".
        df[col] = df[col].str.replace(',', ' -', regex=False)
        df[col] = df[col].str.replace('(', "", regex=False)
        df[col] = df[col].str.replace(']', "", regex=False)
        df[col] = df[col].str.replace('.0', " ", regex=False)
    title = [
        "Attribute Types",
        "Number of classes",
        "Number of instances across datasets",
        "Number of features across datasets",
    ]
    # Attribute types — .loc instead of chained indexing, which triggers
    # SettingWithCopyWarning and may silently fail to write through.
    df["Attribute Type"] = "mixed"
    df.loc[df['NumberOfSymbolicFeatures'] <= 1, "Attribute Type"] = 'numeric'
    df.loc[df['NumberOfNumericFeatures'] == 0, "Attribute Type"] = 'categorical'
    grouped = (df.groupby("Attribute Type").size().reset_index(name='counts'))
    colors = ['darkblue', 'steelblue', 'lightsteelblue']
    types_chart = go.Pie(labels=grouped["Attribute Type"],
                         values=grouped['counts'],
                         marker=dict(colors=colors),
                         showlegend=True)
    fig1 = go.Figure(data=[types_chart])
    fig1.update_layout(height=400)
    # No of classes
    showlegend = False
    classes_plot = go.Violin(y=df["NumberOfClasses"],
                             showlegend=showlegend,
                             box_visible=True,
                             fillcolor='mediumpurple',
                             meanline_visible=True,
                             name=' ')
    fig2 = go.Figure(data=[classes_plot])
    fig2.update_xaxes(tickfont=dict(size=10))
    fig2.update_layout(height=400)
    # Instances plot
    df.sort_values(by="NumberOfInstances", inplace=True)
    instances_plot = go.Histogram(x=df["Number of instances"],
                                  marker_color='#EB89B5',
                                  showlegend=showlegend)
    fig3 = go.Figure(data=[instances_plot],
                     )
    fig3.update_layout(bargap=0.4, width=900, height=400)
    fig3.update_xaxes(tickfont=dict(size=10))
    # Features plot
    df.sort_values(by="NumberOfFeatures", inplace=True)
    features_plot = go.Histogram(x=df["Number of features"],
                                 showlegend=showlegend)
    fig4 = go.Figure(data=[features_plot])
    fig4.update_layout(bargap=0.4, width=900, height=400)
    fig4.update_xaxes(tickfont=dict(size=10))
    return html.Div([
        html.Div(
            [html.P(title[0]), dcc.Graph(figure=fig1, id='fig1')],
            className="row metric-row",
            style={
                'width': '48%',
                'text-align': 'center',
                'display': 'inline-block',
            }),
        html.Div([html.P(title[1]), dcc.Graph(figure=fig2, id='fig2')],
                 className="row metric-row",
                 style={
                     'width': '48%',
                     'text-align': 'center',
                     'display': 'inline-block'
                 }),
        html.P(title[2]),
        dcc.Graph(figure=fig3, id='fig3'),
        html.P(title[3]),
        dcc.Graph(figure=fig4, id='fig4')
    ], )