def __init__(self):
    """Prepare all tables used by the instance.

    Opens a SQLite connection and downloads the player roster and player
    attribute datasets from OpenML, then pre-computes the mean overall
    rating per player for recent attribute rows.
    """
    # `filename` is presumably a module-level constant — confirm it is defined.
    self.conn = sqlite3.connect(filename)
    # Player roster as a DataFrame (first element of get_data's tuple).
    self.pl_players = get_dataset(self.all_players_name_id).get_data(dataset_format='dataframe')[0]
    # Per-date player attribute rows.
    self.pl_players_attributes = get_dataset(self.all_players_stats_id).get_data(dataset_format='dataframe')[0]
    # Parse the date column so the comparison below works on datetimes.
    self.pl_players_attributes["date"] = pd.to_datetime(self.pl_players_attributes["date"])
    # Mean overall rating per player, restricted to rows newer than
    # self.date_time_str (defined outside this view — TODO confirm).
    self.pl_players_attributes_small = self.pl_players_attributes[
        self.pl_players_attributes['date'] > self.date_time_str].groupby(['player_fifa_api_id']).agg(
        {'overall_rating': ['mean']})
def soil():
    """Load the Soil dataset (OpenML id 688) with float64 features.

    Returns (X, y, name): feature DataFrame, target array, display name.
    """
    frame = datasets.get_dataset(688).get_data()[0]
    features = frame.iloc[:, :-1].astype("float64")
    target = frame.iloc[:, -1].to_numpy()
    return features, target, "Soil Data"
def get_metadata(data_id: int):
    """Build a per-attribute metadata table for an OpenML dataset.

    :param data_id: ID of the OpenML dataset.
    :return: (meta_features, data, name) — a DataFrame describing up to
        1000 attributes, the dataset object, and the dataset name.
    """
    data = datasets.get_dataset(data_id, download_data=False)
    features = pd.DataFrame(
        [vars(data.features[i]) for i in range(0, len(data.features))])
    features["Target"] = [
        "true" if name == data.default_target_attribute else "false"
        for name in features["name"]
    ]
    # Extract #categories (computed before the None entries are blanked).
    features["# categories"] = [
        str(len(value)) if value is not None else " "
        for value in features["nominal_values"]
    ]
    # Fix: assign instead of chained `Series.replace(..., inplace=True)`,
    # which operates on an intermediate and triggers pandas
    # chained-assignment warnings (and silently no-ops under copy-on-write).
    features["nominal_values"] = features["nominal_values"].replace({None: " "})
    # Fix: `.copy()` — the original called rename/sort_values(inplace=True)
    # on a column slice (a view of `features`), the classic
    # SettingWithCopyWarning pitfall.
    meta_features = features[[
        "name", "data_type", "number_missing_values", "# categories", "Target"
    ]].copy()
    meta_features.rename(
        columns={
            "name": "Attribute",
            "data_type": "DataType",
            "number_missing_values": "Missing values",
        },
        inplace=True,
    )
    # Target rows first, then cap the table at 1000 attributes.
    meta_features.sort_values(by="Target", ascending=False, inplace=True)
    if meta_features.shape[0] > 1000:
        meta_features = meta_features[:1000]
    return meta_features, data, (vars(data)["name"])
def get_dataset(self):
    """ Form a dataframe with the descriptions from all openml datasets

    :return: unique dataset descriptions with length min=50
    """
    dataset_list = datasets.list_datasets(output_format='dataframe', status='active')
    data_dict = defaultdict(list)
    for did in dataset_list['did']:
        try:
            data = datasets.get_dataset(did, download_data=False)
        except Exception:
            # Fix: narrowed from a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit. Metadata fetching fails for a
            # handful of ids (not only FileNotFound); skip them best-effort.
            continue
        # Only keep datasets that actually have a name and description.
        if data.description is not None and data.name is not None:
            data_dict['id'].append(did)
            data_dict['name'].append(data.name)
            data_dict['text'].append(data.description + " " + data.name + " ")
    self.df = pd.DataFrame(data_dict)
    self.df.sort_values(by='id', inplace=True)
    self.df_unique = self._remove_duplicates()
    return self.df_unique
def download_openml(ID=0, api_key='', tmp='tmp', dataset_path='datasets'):
    """Download an OpenML dataset and pickle it under `dataset_path`.

    :param ID: OpenML dataset id.
    :param api_key: OpenML API key.
    :param tmp: cache directory for the OpenML client.
    :param dataset_path: directory the pickled dataset is written to.
    :return: 1 on successful download, 2 if the pickle already exists,
        0 if the download or conversion failed.
    """
    config.apikey = api_key
    os.makedirs(tmp, exist_ok=True)
    os.makedirs(dataset_path, exist_ok=True)
    # Fix: honour the `tmp` parameter (the cache dir was hard-coded 'tmp').
    config.set_cache_directory(os.path.abspath(tmp))
    target_file = '{}/{}.p'.format(dataset_path, ID)
    # Fix: the original tested `datasets_path` — an undefined name (the
    # parameter is `dataset_path`, where the pickle is actually written).
    if os.path.exists(target_file):
        return 2
    try:
        odata = datasets.get_dataset(int(ID))
        X, y, categorical, attribute_names = odata.get_data(
            target=odata.default_target_attribute)
        y = y.astype(str)
        if not isinstance(y, np.ndarray):
            y = y.values
        # Append the target as the last column of the frame.
        X = np.hstack((X, y.reshape(-1, 1)))
        columns = attribute_names + [odata.default_target_attribute]
        df = pd.DataFrame(X, columns=columns)
        # Fix: use a context manager — the original leaked the file handle
        # opened inline in the pickle.dump call.
        with open(target_file, 'wb') as handle:
            pickle.dump(dict(cat=categorical, names=columns, X=df), handle)
        return 1
    except Exception:
        # Best-effort: callers distinguish success via the return code.
        return 0
def download_data():
    """Fetch every dataset listed in IDS and cache it locally.

    For each id, writes the data matrix (target column swapped to the last
    position) to _Data/<id>.txt and a boolean mask of numerical columns to
    _Data/idx/<id>.npy. Ids already present on disk are skipped.
    """
    data_folder = "_Data"
    idx_folder = os.path.join(data_folder, "idx")
    for folder in (data_folder, idx_folder):
        if not os.path.isdir(folder):
            os.makedirs(folder)
    for idx in IDS:
        print("Downloading {}".format(idx))
        data_file = os.path.join(data_folder, "{}.txt".format(idx))
        idx_file = os.path.join(idx_folder, "{}.npy".format(idx))
        if os.path.isfile(data_file) and os.path.isfile(idx_file):
            continue
        dataset = datasets.get_dataset(idx)
        data, categorical_idx, names = dataset.get_data(
            return_categorical_indicator=True, return_attribute_names=True)
        if not isinstance(data, np.ndarray):
            data = data.toarray()  # densify sparse results
        target_idx = names.index(dataset.default_target_attribute)
        numerical_idx = ~np.array(categorical_idx)
        # Move the target column (and its mask entry) to the end.
        swap(numerical_idx, target_idx, -1)
        swap(data, target_idx, -1)
        rows = (" ".join(str(value) for value in row) for row in data)
        with open(data_file, "w") as handle:
            handle.write("\n".join(rows))
        np.save(idx_file, numerical_idx)
def download_data():
    """Download all datasets in IDS into the _Data cache.

    Each dataset is stored as a whitespace-separated text matrix with the
    target column moved last, plus a numpy mask marking numerical columns.
    Datasets already on disk are not re-downloaded.
    """
    data_folder = "_Data"
    idx_folder = os.path.join(data_folder, "idx")
    if not os.path.isdir(data_folder):
        os.makedirs(data_folder)
    if not os.path.isdir(idx_folder):
        os.makedirs(idx_folder)
    for dataset_id in IDS:
        print("Downloading {}".format(dataset_id))
        data_file = os.path.join(data_folder, "{}.txt".format(dataset_id))
        idx_file = os.path.join(idx_folder, "{}.npy".format(dataset_id))
        if os.path.isfile(data_file) and os.path.isfile(idx_file):
            continue  # cached already
        dataset = datasets.get_dataset(dataset_id)
        data, categorical_idx, names = dataset.get_data(
            return_categorical_indicator=True,
            return_attribute_names=True,
        )
        if not isinstance(data, np.ndarray):
            data = data.toarray()
        target_idx = names.index(dataset.default_target_attribute)
        numerical_idx = ~np.array(categorical_idx)
        swap(numerical_idx, target_idx, -1)  # keep mask aligned with columns
        swap(data, target_idx, -1)           # target column goes last
        lines = [" ".join(map(str, line)) for line in data]
        with open(data_file, "w") as out:
            out.write("\n".join(lines))
        np.save(idx_file, numerical_idx)
def download_data():
    """Download every dataset in IDS via the array-format OpenML API.

    Stores features plus target (as the final column) in _Data/<id>.txt and
    a numerical-column mask in _Data/idx/<id>.npy; skips cached ids.
    """
    data_folder = "_Data"
    idx_folder = os.path.join(data_folder, "idx")
    for folder in (data_folder, idx_folder):
        if not os.path.isdir(folder):
            os.makedirs(folder)
    for idx in IDS:
        print("Downloading {}".format(idx))
        data_file = os.path.join(data_folder, "{}.txt".format(idx))
        idx_file = os.path.join(idx_folder, "{}.npy".format(idx))
        if os.path.isfile(data_file) and os.path.isfile(idx_file):
            continue
        dataset = datasets.get_dataset(idx)
        x, y, categorical_idx, names = dataset.get_data(
            target=dataset.default_target_attribute, dataset_format="array")
        # The appended target column is marked categorical in the mask.
        categorical_idx.append(True)

        def densify(arr):
            # Some datasets come back as sparse matrices.
            return arr if isinstance(arr, np.ndarray) else arr.toarray()

        data = np.hstack([densify(x), densify(y.reshape([-1, 1]))])
        numerical_idx = ~np.array(categorical_idx)
        rows = [" ".join(str(value) for value in row) for row in data]
        with open(data_file, "w") as handle:
            handle.write("\n".join(rows))
        np.save(idx_file, numerical_idx)
def download_arff_files(datasets, processed_datasets):
    """Copy each unprocessed dataset's ARFF file and log its metadata.

    Appends one CSV row per dataset (id, #instances, #features, #classes,
    target feature index, url) to ./datasets/datasets.csv and copies the
    raw ARFF file into ./datasets/arff/.

    :param datasets: iterable of dataset descriptors (id, instances_number,
        features_number, class_number attributes).
    :param processed_datasets: ids already handled; these are skipped.
    """
    with open('./datasets/datasets.csv', 'a', newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter=',')
        for dataset in datasets:
            if dataset.id in processed_datasets:
                continue
            try:
                print("processing dataset %d" % dataset.id)
                ds: OpenMLDataset = open_ml_dataset.get_dataset(dataset.id)
                file = ds.data_file
                _, file_extension = os.path.splitext(file)
                cur_dir = os.path.abspath(os.getcwd())
                shutil.copyfile(file, f'{cur_dir}/datasets/arff/{dataset.id}{file_extension}')
                # Fix: initialise target_id unconditionally — the original
                # only set it when a default target existed, so writerow
                # raised NameError otherwise (silently eaten by bare except).
                target_id = (-1, '')
                target_name = ds.default_target_attribute
                if target_name is not None:
                    for (_, f) in ds.features.items():
                        if f.name == target_name:
                            target_id = (f.index, f.data_type)
                writer.writerow(
                    [dataset.id, dataset.instances_number, dataset.features_number,
                     dataset.class_number, target_id[0], ds.url])
                csvfile.flush()
            except Exception as exc:
                # Fix: narrowed from bare `except:` and report what failed
                # instead of an anonymous "error".
                print("error processing dataset %s: %s" % (dataset.id, exc))
def _load_players():
    """Load the DataFrame containing the players' attributes

    Returns
    -------
    players_df: pd.DataFrame
        The DataFrame containing the players
    """
    frame, *_ = get_dataset(all_players_stats_id).get_data(dataset_format='dataframe')
    # Parse dates so callers can filter/compare on the column.
    frame["date"] = pd.to_datetime(frame["date"])
    return frame
def _get_random_feature(self, dataset_id: int) -> str:
    """Draw a random usable feature name from the given dataset.

    Keeps sampling until the drawn feature's data type matches the task:
    numeric when task_type_id == 2, nominal otherwise — this skips the
    string and date typed features.
    """
    dataset = get_dataset(dataset_id)
    wanted = 'numeric' if self.task_type_id == 2 else 'nominal'
    while True:
        candidate = dataset.features[randint(0, len(dataset.features) - 1)]
        if candidate.data_type == wanted:
            return candidate.name
def _get_random_feature(self, dataset_id: int) -> str:
    """Draw a random usable feature name from the given dataset.

    Re-samples until the feature's data type fits the task (numeric for
    supervised regression, nominal otherwise), which skips string/date
    typed features.
    """
    dataset = get_dataset(dataset_id)
    if self.task_type == TaskType.SUPERVISED_REGRESSION:
        wanted = "numeric"
    else:
        wanted = "nominal"
    while True:
        position = randint(0, len(dataset.features) - 1)
        candidate = dataset.features[position]
        if candidate.data_type == wanted:
            return candidate.name
def get_data_metadata(data_id: int):
    """
    Download the dataset and get metadata

    :param data_id: ID of the OpenML dataset
    :return: (df, meta_features, numerical_features, nominal_features, name)
    """
    # Get data in pandas df format (timed — the download can be slow).
    import time
    start = time.time()
    data = datasets.get_dataset(data_id)
    x, y, categorical, attribute_names = data.get_data()
    end = time.time()
    print("time taken get data", end - start)
    df = pd.DataFrame(x, columns=attribute_names)
    df.to_pickle('cache/df' + str(data_id) + '.pkl')

    # Get meta-features and add target flag
    features = pd.DataFrame([vars(data.features[i]) for i in range(0, len(data.features))])
    features["Target"] = ["true" if name == data.default_target_attribute else "false"
                          for name in features["name"]]
    # Extract #categories (before the None entries are blanked)
    features['# categories'] = [str(len(value)) if value is not None else ' '
                                for value in features['nominal_values']]
    # Fix: assign instead of chained `replace(..., inplace=True)`, which
    # targets an intermediate Series (chained-assignment pitfall).
    features['nominal_values'] = features['nominal_values'].replace({None: ' '})
    # Fix: `.copy()` — rename/sort/column-assignment below previously acted
    # on a view of `features` (SettingWithCopyWarning).
    meta_features = features[["name", "data_type", "number_missing_values",
                              '# categories', "Target"]].copy()
    meta_features.rename(columns={"name": "Attribute", "data_type": "DataType",
                                  "number_missing_values": "Missing values"},
                         inplace=True)
    meta_features.sort_values(by='Target', ascending=False, inplace=True)
    # Keep only attributes that survived into the data frame.
    meta_features = meta_features[meta_features["Attribute"].isin(pd.Series(df.columns))]
    # Entropy only makes sense for nominal columns; blank otherwise.
    numerical_features = list(meta_features["Attribute"][meta_features["DataType"] == "numeric"])
    nominal_features = list(meta_features["Attribute"][meta_features["DataType"] == "nominal"])
    entropy = []
    for column in meta_features['Attribute']:
        if column in nominal_features:
            count = df[column].value_counts()
            entropy.append(round(scipy.stats.entropy(count), 2))
        else:
            entropy.append(' ')
    meta_features['Entropy'] = entropy
    # Fix: same chained-inplace-replace issue as above.
    meta_features['Target'] = meta_features['Target'].replace({'false': ' '})
    return df, meta_features, numerical_features, nominal_features, (vars(data)['name'])
def _load_matches(league):
    """Load the DataFrame according to the league

    Parameters
    ----------
    league: str
        The name of the league

    Returns
    -------
    matches_df: pd.DataFrame
        The requested matches
    """
    if league == "England Premier League":
        dataset_id = premier_league_matches_id
    else:
        dataset_id = serie_a_matches_id
    frame = get_dataset(dataset_id).get_data(dataset_format='dataframe')[0]
    # Parse dates so callers can filter/compare on the column.
    frame["date"] = pd.to_datetime(frame["date"])
    return frame
def coil():
    """Load the Coil2000 dataset (OpenML id 298) as (X, y, name)."""
    frame = datasets.get_dataset(298).get_data()[0]
    features = frame.iloc[:, :-1]
    target = frame.iloc[:, -1].to_numpy()
    return features, target, "Coil2000"
def liver_disorder():
    """Load the Liver Disorder dataset (OpenML id 8) as (X, y, name)."""
    frame = datasets.get_dataset(8).get_data()[0]
    features = frame.iloc[:, :-1]
    target = frame.iloc[:, -1].to_numpy()
    return features, target, "Liver_Disorder"
def employee_accept():
    """Load the Employee Accept dataset (OpenML id 1030) as (X, y, name)."""
    frame = datasets.get_dataset(1030).get_data()[0]
    features = frame.iloc[:, :-1]
    target = frame.iloc[:, -1].to_numpy()
    return features, target, "Employee Accept"
def music_origin():
    """Load the Music Origin dataset (OpenML id 4544) as (X, y, name)."""
    frame = datasets.get_dataset(4544).get_data()[0]
    features = frame.iloc[:, :-1]
    target = frame.iloc[:, -1].to_numpy()
    return features, target, "Music Origin"
def titanic_price():
    """Load the Titanic dataset (OpenML id 41265) with fare as the target."""
    frame = datasets.get_dataset(41265).get_data()[0]
    # pop removes "Fare" from the frame, leaving only the features.
    target = frame.pop("Fare").to_numpy()
    features = frame.copy()
    return features, target, "Titanic Price"
def cpu_act():
    """Load the CPU Activity dataset (OpenML id 227) as (X, y, name)."""
    frame = datasets.get_dataset(227).get_data()[0]
    features = frame.iloc[:, :-1]
    target = frame.iloc[:, -1].to_numpy()
    return features, target, "CPU Activity"
def online_news():
    """Load the Online News dataset (OpenML id 4545) as (X, y, name).

    The first two columns are dropped from the feature matrix.
    """
    frame = datasets.get_dataset(4545).get_data()[0]
    features = frame.iloc[:, 2:-1]
    target = frame.iloc[:, -1].to_numpy()
    return features, target, "Online News"
def bank32():
    """Load the bank32 dataset (OpenML id 558) as (X, y, name)."""
    frame = datasets.get_dataset(558).get_data()[0]
    features = frame.iloc[:, :-1]
    target = frame.iloc[:, -1].to_numpy()
    return features, target, "Banking Data"
def higgs():
    """Load the Higgs dataset (OpenML id 4532) as (X, y, name)."""
    frame = datasets.get_dataset(4532).get_data()[0]
    features = frame.iloc[:, :-1]
    target = frame.iloc[:, -1].to_numpy()
    return features, target, "Higgs"
def friedman():
    """Load the Friedman dataset (OpenML id 564) as (X, y, name)."""
    frame = datasets.get_dataset(564).get_data()[0]
    features = frame.iloc[:, :-1]
    target = frame.iloc[:, -1].to_numpy()
    return features, target, "Friedman"
def satellite():
    """Load the Satellite dataset (OpenML id 294) as (X, y, name)."""
    frame = datasets.get_dataset(294).get_data()[0]
    features = frame.iloc[:, :-1]
    target = frame.iloc[:, -1].to_numpy()
    return features, target, "Satellite"
def quake():
    """Load the Quake dataset (OpenML id 209) as (X, y, name)."""
    frame = datasets.get_dataset(209).get_data()[0]
    features = frame.iloc[:, :-1]
    target = frame.iloc[:, -1].to_numpy()
    return features, target, "Quake"
def balloon():
    """Load the Balloon dataset (OpenML id 512) as (X, y, name)."""
    frame = datasets.get_dataset(512).get_data()[0]
    features = frame.iloc[:, :-1]
    target = frame.iloc[:, -1].to_numpy()
    return features, target, "Balloon"
def sales():
    """Load the Sales dataset (OpenML id 42183) as (X, y, name)."""
    frame = datasets.get_dataset(42183).get_data()[0]
    features = frame.iloc[:, :-1]
    target = frame.iloc[:, -1].to_numpy()
    return features, target, "Sales"
def telecom():
    """Load the Telecom dataset (OpenML id 201) as (X, y, name)."""
    frame = datasets.get_dataset(201).get_data()[0]
    features = frame.iloc[:, :-1]
    target = frame.iloc[:, -1].to_numpy()
    return features, target, "Telecom"
def auto_price():
    """Load the Auto Price dataset (OpenML id 42224) as (X, y, name)."""
    frame = datasets.get_dataset(42224).get_data()[0]
    features = frame.iloc[:, :-1]
    target = frame.iloc[:, -1].to_numpy()
    return features, target, "Auto Price"
def wine_quality():
    """Load the Wine Quality dataset (OpenML id 287) as (X, y, name)."""
    frame = datasets.get_dataset(287).get_data()[0]
    features = frame.iloc[:, :-1]
    target = frame.iloc[:, -1].to_numpy()
    return features, target, "Wine Quality"