# Example #1
 def __init__(self):
     """Prepare all tables.

     Opens a SQLite connection and loads the player-name and
     player-attribute tables from OpenML into DataFrames, then builds a
     per-player mean ``overall_rating`` restricted to rows newer than
     ``self.date_time_str``.

     NOTE(review): ``filename``, ``self.all_players_name_id``,
     ``self.all_players_stats_id`` and ``self.date_time_str`` are not
     defined in this block — confirm they exist in the enclosing scope.
     """
     self.conn = sqlite3.connect(filename)
     # get_data(...) returns a tuple; [0] is the DataFrame itself.
     self.pl_players = get_dataset(self.all_players_name_id).get_data(dataset_format='dataframe')[0]
     self.pl_players_attributes = get_dataset(self.all_players_stats_id).get_data(dataset_format='dataframe')[0]
     # Parse raw date strings so they can be compared against the cutoff below.
     self.pl_players_attributes["date"] = pd.to_datetime(self.pl_players_attributes["date"])
     # Mean overall_rating per player, using only rows after the cutoff date.
     self.pl_players_attributes_small = self.pl_players_attributes[
         self.pl_players_attributes['date'] > self.date_time_str].groupby(['player_fifa_api_id']).agg(
         {'overall_rating': ['mean']})
def soil():
    """Return (X, y, name) for the OpenML Soil dataset (id 688)."""
    frame, _, _, _ = datasets.get_dataset(688).get_data()
    features = frame.iloc[:, :-1].astype("float64")
    target = np.array(frame.iloc[:, -1])
    return features, target, "Soil Data"
# Example #3
def get_metadata(data_id: int):
    """Fetch an OpenML dataset's per-feature metadata without downloading data.

    :param data_id: ID of the OpenML dataset
    :return: (meta_features DataFrame capped at 1000 rows, dataset object,
              dataset name)
    """
    data = datasets.get_dataset(data_id, download_data=False)
    features = pd.DataFrame(
        [vars(data.features[i]) for i in range(0, len(data.features))])
    features["Target"] = [
        "true" if name == data.default_target_attribute else "false"
        for name in features["name"]
    ]

    # Extract #categories (blank for non-nominal features).
    features["# categories"] = [
        str(len(value)) if value is not None else " "
        for value in features["nominal_values"]
    ]
    # Assign instead of inplace-replace to avoid chained-assignment issues.
    features["nominal_values"] = features["nominal_values"].replace({None: " "})

    # Choose features to be displayed.  Take an explicit copy so the
    # in-place rename/sort below do not operate on a view of `features`
    # (avoids pandas SettingWithCopyWarning).
    meta_features = features[[
        "name", "data_type", "number_missing_values", "# categories", "Target"
    ]].copy()
    meta_features.rename(
        columns={
            "name": "Attribute",
            "data_type": "DataType",
            "number_missing_values": "Missing values",
        },
        inplace=True,
    )
    meta_features.sort_values(by="Target", ascending=False, inplace=True)
    # Cap the display table at 1000 rows.
    meta_features = meta_features.head(1000)
    return meta_features, data, (vars(data)["name"])
# Example #4
    def get_dataset(self):
        """
        Form a dataframe with the descriptions from all openml datasets
        :return: unique dataset descriptions with length min=50
        """
        dataset_list = datasets.list_datasets(output_format='dataframe',
                                              status='active')
        data_dict = defaultdict(list)
        for did in dataset_list['did']:
            try:
                data = datasets.get_dataset(did, download_data=False)
                if data.description is not None and data.name is not None:
                    data_dict['id'].append(did)
                    data_dict['name'].append(data.name)
                    data_dict['text'].append(data.description + " " +
                                             data.name + " ")
            except Exception:
                # The OpenML server raises several exception types besides
                # FileNotFound; skip any dataset that fails.  Narrowed from
                # a bare `except:` so KeyboardInterrupt/SystemExit still
                # propagate.
                pass

        self.df = pd.DataFrame(data_dict)
        self.df.sort_values(by='id', inplace=True)
        self.df_unique = self._remove_duplicates()
        return self.df_unique
def download_openml(ID=0, api_key='', tmp='tmp', dataset_path='datasets'):
    """Download OpenML dataset *ID* and pickle it under *dataset_path*.

    :param ID: OpenML dataset id
    :param api_key: OpenML API key
    :param tmp: cache directory for the openml client
    :param dataset_path: directory where the pickle is written
    :return: 1 on success, 0 on failure, 2 if the pickle already exists
    """
    config.apikey = api_key
    os.makedirs(tmp, exist_ok=True)
    os.makedirs(dataset_path, exist_ok=True)
    # Fixed: was hard-coded 'tmp', ignoring the `tmp` parameter.
    config.set_cache_directory(os.path.abspath(tmp))
    # Fixed NameError: was 'datasets_path' (undefined).
    if not os.path.exists('{}/{}.p'.format(dataset_path, ID)):
        try:
            odata = datasets.get_dataset(int(ID))
            X, y, categorical, attribute_names = odata.get_data(
                target=odata.default_target_attribute)
            y = y.astype(str)
            if not isinstance(y, np.ndarray):
                y = y.values
            # Append the target as the last column of the feature matrix.
            X = np.hstack((X, y.reshape(-1, 1)))
            df = pd.DataFrame(X,
                              columns=attribute_names +
                              [odata.default_target_attribute])
            # Use a context manager so the file handle is always closed.
            with open('{}/{}.p'.format(dataset_path, ID), 'wb') as fh:
                pickle.dump(
                    dict(cat=categorical,
                         names=attribute_names + [odata.default_target_attribute],
                         X=df), fh)
            return 1
        except Exception:
            return 0
    # Moved out of the try block: the original `return 2` was unreachable.
    return 2
# Example #6
def download_data():
    """Download each OpenML dataset in IDS (old openml API) and store it as
    whitespace-separated text in _Data/<id>.txt, plus a boolean mask of
    numerical columns in _Data/idx/<id>.npy with the target column last.
    """
    data_folder = "_Data"
    idx_folder = os.path.join(data_folder, "idx")
    # exist_ok replaces the manual isdir checks.
    os.makedirs(data_folder, exist_ok=True)
    os.makedirs(idx_folder, exist_ok=True)
    for idx in IDS:
        print("Downloading {}".format(idx))
        data_file = os.path.join(data_folder, "{}.txt".format(idx))
        idx_file = os.path.join(idx_folder, "{}.npy".format(idx))
        # Skip datasets that were already fetched.
        if os.path.isfile(data_file) and os.path.isfile(idx_file):
            continue
        dataset = datasets.get_dataset(idx)
        data, categorical_idx, names = dataset.get_data(
            return_categorical_indicator=True, return_attribute_names=True)
        # Densify sparse matrices.
        data = data.toarray() if not isinstance(data, np.ndarray) else data
        target_idx = names.index(dataset.default_target_attribute)
        numerical_idx = ~np.array(categorical_idx)
        # Move the target column (and its mask entry) to the last position.
        swap(numerical_idx, target_idx, -1)
        swap(data, target_idx, -1)
        with open(data_file, "w") as file:
            # map(str, ...) replaces the redundant lambda.
            file.write("\n".join(" ".join(map(str, line)) for line in data))
        np.save(idx_file, numerical_idx)
def download_data():
    """Fetch every dataset listed in IDS and persist it as text, together
    with a boolean mask marking which columns are numerical.  The target
    column (and its mask entry) is swapped into the last position.
    """
    root = "_Data"
    mask_dir = os.path.join(root, "idx")
    for folder in (root, mask_dir):
        if not os.path.isdir(folder):
            os.makedirs(folder)
    for did in IDS:
        print("Downloading {}".format(did))
        txt_path = os.path.join(root, "{}.txt".format(did))
        mask_path = os.path.join(mask_dir, "{}.npy".format(did))
        if os.path.isfile(txt_path) and os.path.isfile(mask_path):
            continue  # already on disk
        ds = datasets.get_dataset(did)
        matrix, is_categorical, attr_names = ds.get_data(
            return_categorical_indicator=True,
            return_attribute_names=True
        )
        if not isinstance(matrix, np.ndarray):
            matrix = matrix.toarray()
        target_col = attr_names.index(ds.default_target_attribute)
        numeric_mask = ~np.array(is_categorical)
        swap(numeric_mask, target_col, -1)
        swap(matrix, target_col, -1)
        rows = [" ".join(map(str, row)) for row in matrix]
        with open(txt_path, "w") as out:
            out.write("\n".join(rows))
        np.save(mask_path, numeric_mask)
# Example #8
def download_data():
    """Download the IDS datasets via the array-format openml API, append
    the target as the final (categorical) column, and save both the data
    as text and a numerical-column mask as .npy.
    """
    base = "_Data"
    mask_folder = os.path.join(base, "idx")
    if not os.path.isdir(base):
        os.makedirs(base)
    if not os.path.isdir(mask_folder):
        os.makedirs(mask_folder)
    for did in IDS:
        print("Downloading {}".format(did))
        txt_file = os.path.join(base, "{}.txt".format(did))
        mask_file = os.path.join(mask_folder, "{}.npy".format(did))
        if os.path.isfile(txt_file) and os.path.isfile(mask_file):
            continue
        ds = datasets.get_dataset(did)
        x, y, cat_flags, names = ds.get_data(
            target=ds.default_target_attribute, dataset_format="array")
        # The target we append below counts as categorical.
        cat_flags.append(True)

        def densify(arr):
            # Sparse matrices come back from openml; make them dense.
            return arr.toarray() if not isinstance(arr, np.ndarray) else arr

        combined = np.hstack([densify(x), densify(y.reshape([-1, 1]))])
        numeric_mask = ~np.array(cat_flags)
        with open(txt_file, "w") as fh:
            fh.write("\n".join(" ".join(str(v) for v in row) for row in combined))
        np.save(mask_file, numeric_mask)
# Example #9
def download_arff_files(datasets, processed_datasets):
    """Copy each unprocessed dataset's ARFF file into ./datasets/arff and
    append a metadata row to ./datasets/datasets.csv.

    :param datasets: iterable of dataset descriptors (note: this parameter
        shadows the module-level ``datasets`` import used elsewhere)
    :param processed_datasets: collection of ids already handled (skipped)
    """
    with open('./datasets/datasets.csv', 'a', newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter=',')
        for dataset in datasets:
            try:
                # Guard clause instead of nesting the whole body in an if.
                if dataset.id in processed_datasets:
                    continue
                print("processing dataset %d" % dataset.id)
                ds: OpenMLDataset = open_ml_dataset.get_dataset(dataset.id)
                file = ds.data_file
                filename, file_extension = os.path.splitext(file)
                cur_dir = os.path.abspath(os.getcwd())
                shutil.copyfile(file, f'{cur_dir}/datasets/arff/{dataset.id}{file_extension}')
                features = ds.features
                target_name = ds.default_target_attribute
                if target_name is not None:
                    # Locate the target feature's index and data type.
                    target_id = (-1, '')
                    for (_, f) in features.items():
                        if f.name == target_name:
                            target_id = (f.index, f.data_type)
                    writer.writerow(
                        [dataset.id, dataset.instances_number, dataset.features_number,
                         dataset.class_number, target_id[0], ds.url])
                    csvfile.flush()
            except Exception:
                # Narrowed from a bare `except:` so KeyboardInterrupt and
                # SystemExit propagate; keeps the original best-effort skip.
                print("error")
def _load_players():
    """Fetch the players'-attributes table from OpenML.

    Returns
    -------
    pd.DataFrame
        Player-attribute rows with the ``date`` column parsed to datetime.
    """
    attrs = get_dataset(all_players_stats_id).get_data(dataset_format='dataframe')[0]
    attrs["date"] = pd.to_datetime(attrs["date"])
    return attrs
# Example #11
    def _get_random_feature(self, dataset_id: int) -> str:
        """Pick a random feature name compatible with the task type.

        Task type id 2 requires a numeric feature; anything else a
        nominal one.  Retrying in a loop skips string- and date-typed
        features until a suitable one is drawn.
        """
        ds = get_dataset(dataset_id)
        wanted = 'numeric' if self.task_type_id == 2 else 'nominal'
        while True:
            candidate = ds.features[randint(0, len(ds.features) - 1)]
            if candidate.data_type == wanted:
                return candidate.name
# Example #12
    def _get_random_feature(self, dataset_id: int) -> str:
        """Return a randomly chosen feature name matching the task:
        numeric for supervised regression, nominal otherwise.  The retry
        loop skips string- and date-typed features.
        """
        dataset = get_dataset(dataset_id)
        is_regression = self.task_type == TaskType.SUPERVISED_REGRESSION
        required_type = "numeric" if is_regression else "nominal"
        n_features = len(dataset.features)
        while True:
            feature = dataset.features[randint(0, n_features - 1)]
            if feature.data_type == required_type:
                return feature.name
# Example #13
def get_data_metadata(data_id: int):
    """ Download the dataset and get metadata

    :param data_id: ID of the OpenML dataset
    :return: (df, meta_features, numerical_features, nominal_features, name)
    """
    # Get data in pandas df format
    import time
    start = time.time()
    data = datasets.get_dataset(data_id)
    x, y, categorical, attribute_names = data.get_data()
    end = time.time()
    print("time taken get data", end - start)
    df = pd.DataFrame(x, columns=attribute_names)
    df.to_pickle('cache/df' + str(data_id) + '.pkl')

    # Get meta-features and add target
    features = pd.DataFrame([vars(data.features[i]) for i in range(0, len(data.features))])
    features["Target"] = ["true" if name == data.default_target_attribute else "false"
                          for name in features["name"]]

    # Extract #categories (blank for non-nominal features)
    size = [str(len(value)) if value is not None else ' ' for value in features['nominal_values']]
    # Assign instead of inplace-replace to avoid chained-assignment issues.
    features['nominal_values'] = features['nominal_values'].replace({None: ' '})
    features['# categories'] = size

    # choose features to be displayed -- take an explicit copy so the
    # in-place rename/sort below don't hit a view of `features`
    # (avoids pandas SettingWithCopyWarning)
    meta_features = features[["name", "data_type", "number_missing_values", '# categories', "Target"]].copy()
    meta_features.rename(columns={"name": "Attribute", "data_type": "DataType",
                                  "number_missing_values": "Missing values"}, inplace=True)
    meta_features.sort_values(by='Target', ascending=False, inplace=True)
    # Keep only attributes that actually appear in the downloaded data;
    # copy again because boolean indexing yields a new (possibly viewed) frame.
    meta_features = meta_features[meta_features["Attribute"].isin(pd.Series(df.columns))].copy()

    # Add entropy for nominal columns (blank entry for numeric ones)
    numerical_features = list(meta_features["Attribute"][meta_features["DataType"] == "numeric"])
    nominal_features = list(meta_features["Attribute"][meta_features["DataType"] == "nominal"])
    entropy = []
    for column in meta_features['Attribute']:
        if column in nominal_features:
            count = df[column].value_counts()
            entropy.append(round(scipy.stats.entropy(count), 2))
        else:
            entropy.append(' ')
    meta_features['Entropy'] = entropy
    meta_features['Target'] = meta_features['Target'].replace({'false': ' '})
    return df, meta_features, numerical_features, nominal_features, (vars(data)['name'])
def _load_matches(league):
    """Fetch match data for the given league from OpenML.

    Parameters
    ----------
    league: str
        The name of the league

    Returns
    -------
    pd.DataFrame
        Matches of the requested league with ``date`` parsed to datetime.
    """
    is_premier = league == "England Premier League"
    dataset_id = premier_league_matches_id if is_premier else serie_a_matches_id
    frame = get_dataset(dataset_id).get_data(dataset_format='dataframe')[0]
    frame["date"] = pd.to_datetime(frame["date"])
    return frame
def coil():
    """Return (X, y, name) for the OpenML Coil2000 dataset (id 298)."""
    frame, _, _, _ = datasets.get_dataset(298).get_data()
    return frame.iloc[:, :-1], np.array(frame.iloc[:, -1]), "Coil2000"
def liver_disorder():
    """Return (X, y, name) for the OpenML Liver Disorder dataset (id 8)."""
    frame, _, _, _ = datasets.get_dataset(8).get_data()
    return frame.iloc[:, :-1], np.array(frame.iloc[:, -1]), "Liver_Disorder"
def employee_accept():
    """Return (X, y, name) for the OpenML Employee Accept dataset (id 1030)."""
    frame, _, _, _ = datasets.get_dataset(1030).get_data()
    return frame.iloc[:, :-1], np.array(frame.iloc[:, -1]), "Employee Accept"
def music_origin():
    """Return (X, y, name) for the OpenML Music Origin dataset (id 4544)."""
    frame, _, _, _ = datasets.get_dataset(4544).get_data()
    return frame.iloc[:, :-1], np.array(frame.iloc[:, -1]), "Music Origin"
def titanic_price():
    """Return (X, y, name) for the OpenML Titanic dataset (id 41265),
    predicting the "Fare" column."""
    frame, _, _, _ = datasets.get_dataset(41265).get_data()
    # pop removes "Fare" from frame, so the copy holds only the features.
    target = np.array(frame.pop("Fare"))
    return frame.copy(), target, "Titanic Price"
def cpu_act():
    """Return (X, y, name) for the OpenML CPU Activity dataset (id 227)."""
    frame, _, _, _ = datasets.get_dataset(227).get_data()
    return frame.iloc[:, :-1], np.array(frame.iloc[:, -1]), "CPU Activity"
def online_news():
    """Return (X, y, name) for the OpenML Online News dataset (id 4545).

    The first two columns are dropped from the feature matrix.
    """
    frame, _, _, _ = datasets.get_dataset(4545).get_data()
    return frame.iloc[:, 2:-1], np.array(frame.iloc[:, -1]), "Online News"
def bank32():
    """Return (X, y, name) for the OpenML Banking dataset (id 558)."""
    frame, _, _, _ = datasets.get_dataset(558).get_data()
    return frame.iloc[:, :-1], np.array(frame.iloc[:, -1]), "Banking Data"
def higgs():
    """Return (X, y, name) for the OpenML Higgs dataset (id 4532)."""
    frame, _, _, _ = datasets.get_dataset(4532).get_data()
    return frame.iloc[:, :-1], np.array(frame.iloc[:, -1]), "Higgs"
def friedman():
    """Return (X, y, name) for the OpenML Friedman dataset (id 564)."""
    frame, _, _, _ = datasets.get_dataset(564).get_data()
    return frame.iloc[:, :-1], np.array(frame.iloc[:, -1]), "Friedman"
def satellite():
    """Return (X, y, name) for the OpenML Satellite dataset (id 294)."""
    frame, _, _, _ = datasets.get_dataset(294).get_data()
    return frame.iloc[:, :-1], np.array(frame.iloc[:, -1]), "Satellite"
def quake():
    """Return (X, y, name) for the OpenML Quake dataset (id 209)."""
    frame, _, _, _ = datasets.get_dataset(209).get_data()
    return frame.iloc[:, :-1], np.array(frame.iloc[:, -1]), "Quake"
def balloon():
    """Return (X, y, name) for the OpenML Balloon dataset (id 512)."""
    frame, _, _, _ = datasets.get_dataset(512).get_data()
    return frame.iloc[:, :-1], np.array(frame.iloc[:, -1]), "Balloon"
def sales():
    """Return (X, y, name) for the OpenML Sales dataset (id 42183)."""
    frame, _, _, _ = datasets.get_dataset(42183).get_data()
    return frame.iloc[:, :-1], np.array(frame.iloc[:, -1]), "Sales"
def telecom():
    """Return (X, y, name) for the OpenML Telecom dataset (id 201)."""
    frame, _, _, _ = datasets.get_dataset(201).get_data()
    return frame.iloc[:, :-1], np.array(frame.iloc[:, -1]), "Telecom"
def auto_price():
    """Return (X, y, name) for the OpenML Auto Price dataset (id 42224)."""
    frame, _, _, _ = datasets.get_dataset(42224).get_data()
    return frame.iloc[:, :-1], np.array(frame.iloc[:, -1]), "Auto Price"
def wine_quality():
    """Return (X, y, name) for the OpenML Wine Quality dataset (id 287)."""
    frame, _, _, _ = datasets.get_dataset(287).get_data()
    return frame.iloc[:, :-1], np.array(frame.iloc[:, -1]), "Wine Quality"