Example no. 1
0
 def __init__(self, root, split=TRAIN, validation_size=0.2):
     """Fetch the superconductivity dataset and prepare the requested split.

     root: directory under which the dataset folder (self.name) is created.
     split: which partition to load (defaults to TRAIN).
     validation_size: fraction of the data reserved for validation.
     """
     data_dir = os.path.join(root, self.name)
     archive_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00464/superconduct.zip'
     download_unzip(archive_url, data_dir)
     # The archive contains a single CSV with the features and target.
     frame = pd.read_csv(os.path.join(data_dir, 'train.csv'))
     target_cols = ['critical_temp']
     self.x, self.y = split_normalize_sequence(frame, target_cols, validation_size, split, self.type_)
Example no. 2
0
 def __init__(self, root, split=TRAIN, validation_size=0.2):
     """Fetch the Combined Cycle Power Plant dataset and prepare the split.

     root: directory under which the dataset folder (self.name) is created.
     split: which partition to load (defaults to TRAIN).
     validation_size: fraction of the data reserved for validation.
     """
     data_dir = os.path.join(root, self.name)
     archive_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00294/CCPP.zip'
     download_unzip(archive_url, data_dir)
     workbook = os.path.join(data_dir, 'CCPP', 'Folds5x2_pp.xlsx')
     frame = pd.read_excel(workbook)
     target_cols = ['PE']  # Not clear if this is the aim of the dataset
     self.x, self.y = split_normalize_sequence(frame, target_cols, validation_size, split, self.type_)
Example no. 3
0
 def __init__(self, root, split=TRAIN, validation_size=0.2):
     """Fetch the white wine quality dataset and prepare the requested split.

     root: directory under which the dataset folder (self.name) is created.
     split: which partition to load (defaults to TRAIN).
     validation_size: fraction of the data reserved for validation.
     """
     data_dir = os.path.join(root, self.name)
     local_name = 'data.csv'
     source_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv'
     download_file(source_url, data_dir, local_name)
     # The UCI file is semicolon-delimited.
     frame = pd.read_csv(os.path.join(data_dir, local_name), sep=';')
     target_cols = ['quality']
     self.x, self.y = split_normalize_sequence(frame, target_cols, validation_size, split, self.type_)
 def __init__(self, root, split=TRAIN, validation_size=0.2):
     """Fetch the sensorless drive diagnosis dataset and prepare the split.

     root: directory under which the dataset folder (self.name) is created.
     split: which partition to load (defaults to TRAIN).
     validation_size: fraction of the data reserved for validation.
     """
     data_dir = os.path.join(root, self.name)
     local_name = 'data.csv'
     source_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00325/Sensorless_drive_diagnosis.txt'
     download_file(source_url, data_dir, local_name)
     # Space-separated file without a header row; columns are positional.
     frame = pd.read_csv(os.path.join(data_dir, local_name), header=None, sep=' ')
     target_cols = [48]
     # The class labels are categorical; encode them as integers in place.
     label_encode_df_(frame, target_cols[0])
     self.x, self.y = split_normalize_sequence(frame, target_cols, validation_size, split, self.type_)
 def __init__(self, root, split=TRAIN, validation_size=0.2):
     """Fetch the letter recognition dataset and prepare the requested split.

     root: directory under which the dataset folder (self.name) is created.
     split: which partition to load (defaults to TRAIN).
     validation_size: fraction of the data reserved for validation.
     """
     data_dir = os.path.join(root, self.name)
     local_name = 'data.csv'
     source_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/letter-recognition/letter-recognition.data'
     download_file(source_url, data_dir, local_name)
     # No header row; the first column holds the letter label.
     frame = pd.read_csv(os.path.join(data_dir, local_name), header=None)
     target_cols = [0]
     # Letters are categorical; encode them as integers in place.
     label_encode_df_(frame, target_cols[0])
     self.x, self.y = split_normalize_sequence(frame, target_cols, validation_size, split, self.type_)
 def __init__(self, root, split=TRAIN, validation_size=0.2):
     """Fetch the credit card default dataset and prepare the requested split.

     root: directory under which the dataset folder (self.name) is created.
     split: which partition to load (defaults to TRAIN).
     validation_size: fraction of the data reserved for validation.
     """
     data_dir = os.path.join(root, self.name)
     local_name = 'data.xls'
     source_url = ('http://archive.ics.uci.edu/ml/machine-learning-databases/00350/'
                   'default%20of%20credit%20card%20clients.xls')
     download_file(source_url, data_dir, local_name)
     # The spreadsheet has an extra title row; skip it and index by 'ID'.
     frame = pd.read_excel(os.path.join(data_dir, local_name), skiprows=1, index_col='ID')
     target_cols = ['default payment next month']
     self.x, self.y = split_normalize_sequence(frame, target_cols, validation_size, split, self.type_)
 def __init__(self, root, split=TRAIN, validation_size=0.2):
     """Fetch the bank marketing dataset and prepare the requested split.

     root: directory under which the dataset folder (self.name) is created.
     split: which partition to load (defaults to TRAIN).
     validation_size: fraction of the data reserved for validation.
     """
     data_dir = os.path.join(root, self.name)
     archive_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank-additional.zip'
     download_unzip(archive_url, data_dir)
     csv_path = os.path.join(data_dir, 'bank-additional', 'bank-additional-full.csv')
     frame = pd.read_csv(csv_path, sep=';')
     target_cols = ['y']
     # One-hot the categorical features, then integer-encode the target.
     one_hot_encode_df_(frame, skip_columns=target_cols)
     label_encode_df_(frame, target_cols[0])
     self.x, self.y = split_normalize_sequence(frame, target_cols, validation_size, split, self.type_)
Example no. 8
0
    def __init__(self, root, split=TRAIN, validation_size=0.2):
        """Fetch the airfoil self-noise dataset and prepare the requested split.

        root: directory under which the dataset folder (self.name) is created.
        split: which partition to load (defaults to TRAIN).
        validation_size: fraction of the data reserved for validation.
        """
        data_dir = os.path.join(root, self.name)
        local_name = 'airfoil_self_noise.dat'
        source_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00291/airfoil_self_noise.dat'
        download_file(source_url, data_dir, local_name)

        # Tab-separated file without a header; the last (6th) column is the target.
        frame = pd.read_csv(os.path.join(data_dir, local_name), sep='\t', header=None)
        target_cols = [5]
        self.x, self.y = split_normalize_sequence(frame, target_cols, validation_size, split, self.type_)
Example no. 9
0
    def __init__(self, root, split=TRAIN, validation_size=0.2):
        """Fetch the real estate valuation dataset and prepare the requested split.

        root: directory under which the dataset folder (self.name) is created.
        split: which partition to load (defaults to TRAIN).
        validation_size: fraction of the data reserved for validation.
        """
        data_dir = os.path.join(root, self.name)
        local_name = 'Real estate valuation data set.xlsx'
        source_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00477/Real%20estate%20valuation%20data%20set.xlsx'
        download_file(source_url, data_dir, local_name)

        # The 'No' column is a row counter; use it as the index.
        frame = pd.read_excel(os.path.join(data_dir, local_name), index_col='No')
        target_cols = ['Y house price of unit area']
        self.x, self.y = split_normalize_sequence(frame, target_cols, validation_size, split, self.type_)
Example no. 10
0
 def __init__(self, root, split=TRAIN, validation_size=0.2):
     """Fetch the Online News Popularity dataset and prepare the requested split.

     root: directory under which the dataset folder (self.name) is created.
     split: which partition to load (defaults to TRAIN).
     validation_size: fraction of the data reserved for validation.
     """
     dataset_path = os.path.join(root, self.name)
     url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00332/OnlineNewsPopularity.zip'
     download_unzip(url, dataset_path)
     file_path = os.path.join(dataset_path, 'OnlineNewsPopularity', 'OnlineNewsPopularity.csv')
     df = pd.read_csv(file_path)
     # 'url' and ' timedelta' are identifiers, not predictive features.
     # NOTE: the leading space in ' timedelta'/' shares' matches the CSV header as published.
     df.drop(columns=['url', ' timedelta'], inplace=True)
     y_columns = [' shares']
     # Share counts are heavy-tailed; regress on the log instead.
     df[y_columns[0]] = np.log(df[y_columns[0]])
     # Fixed: was written as "self. y" (stray space inside the attribute access).
     self.x, self.y = split_normalize_sequence(df, y_columns, validation_size, split, self.type_)
Example no. 11
0
    def __init__(self, root, split=TRAIN, validation_size=0.2):
        """Fetch the UCI Air Quality dataset, clean it, and prepare the split.

        root: directory under which the dataset folder (self.name) is created.
        split: which partition to load (defaults to TRAIN).
        validation_size: fraction of the data reserved for validation.
        """
        dataset_path = os.path.join(root, self.name)
        filename = 'AirQualityUCI.csv'
        url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00360/AirQualityUCI.zip'
        download_unzip(url, dataset_path)
        file_path = os.path.join(dataset_path, filename)

        df = pd.read_csv(file_path, sep=';', parse_dates=[0, 1])
        # The file ends with empty rows and separators that parse as empty columns.
        df.dropna(axis=0, how='all', inplace=True)
        df.dropna(axis=1, how='all', inplace=True)

        # Fixed: Series.astype('timedelta64[D]') conversion was deprecated and
        # removed in pandas 2.x; .dt.days gives the elapsed days as int directly.
        df.Date = (df.Date - df.Date.min()).dt.days  # Days as int
        df.Time = df.Time.apply(lambda x: int(x.split('.')[0]))  # Hours as int
        df['C6H6(GT)'] = df['C6H6(GT)'].apply(lambda x: float(x.replace(',', '.')))  # Target as float

        # Some floats are given with ',' instead of '.'
        # NOTE(review): applymap is deprecated in pandas 2.1 in favor of
        # DataFrame.map; kept for compatibility with older pandas.
        df = df.applymap(lambda x: float(x.replace(',', '.')) if type(x) is str else x)  # Target as float

        # Copy after the boolean-mask slice so the .loc assignment below writes
        # to an owned frame instead of a view (avoids SettingWithCopyWarning).
        df = df[df['C6H6(GT)'] != -200].copy()  # Drop all rows with missing target values
        df.loc[df['CO(GT)'] == -200, 'CO(GT)'] = -10  # -200 means missing value, shifting this to be closer to
        # the other values for this column

        y_columns = ['C6H6(GT)']
        self.x, self.y = split_normalize_sequence(df, y_columns, validation_size, split, self.type_)