Python build_data_set Examples, utils.datasets.build_data_set Python Examples

Example #1

0

Show file

File: __init__.py Project: ryuzakyl/data-bloodhound

def load_ms_glycol():
    """Load both MS Glycol MATLAB files and wrap each one as a data set.

    The arrays of the first file are converted to plain Python lists before
    being handed to ``utils.build_data_set``; the arrays of the second file
    are passed through exactly as ``sio.loadmat`` returned them.

    Returns:
        A dictionary with the keys ``'glycol1'`` and ``'glycol2'``, each one
        holding the object returned by ``utils.build_data_set``.

    Raises:
        Exception: If either loaded MATLAB object is ``None``.

    Examples:
        >>> ds = load_ms_glycol()
        >>> ds['glycol1'].shape
        (162, 254)
        >>> ds['glycol2'].shape
        (126, 256)

    """
    # entries read from each .mat file: samples labels, features labels, data
    mat_keys = ('obj_labels_all', 'var_labels_all', 'data_all')

    first_mat = sio.loadmat(__glycol1_data_set_path)
    second_mat = sio.loadmat(__glycol2_data_set_path)

    # refuse to continue when any of the two files yielded nothing
    if any(mat is None for mat in (first_mat, second_mat)):
        raise Exception('Error while loading Glycol data set.')

    # glycol1: every entry is turned into nested Python lists first
    rows_1, cols_1, values_1 = [first_mat[key].tolist() for key in mat_keys]
    first_ds = utils.build_data_set(values_1, rows_1, cols_1)

    # glycol2: entries are forwarded untouched
    rows_2, cols_2, values_2 = [second_mat[key] for key in mat_keys]
    second_ds = utils.build_data_set(values_2, rows_2, cols_2)

    return {
        'glycol1': first_ds,
        'glycol2': second_ds,
    }

Example #2

0

Show file

def load_mvda_soil():
    """Parse the tab-separated MVDA soil file into a data set.

    The first line of the file supplies the features labels. Every following
    line supplies a sample name (first field) and its numeric values (the
    remaining fields). The soil type of a sample is the first match of
    ``soil_type_regex`` in its name; a type equal to ``'v'`` is stored as
    ``'?'``.

    Returns:
        The object returned by ``utils.build_data_set``, built with an extra
        ``'type'`` column holding the soil types.
    """
    names = []
    rows = []
    raw_types = []

    with open(__data_set_path, 'r') as handle:
        # header line -> features labels
        features_labels = [col.strip() for col in handle.readline().split('\t')]

        # remaining lines -> one sample each
        for record in handle:
            name, *values = (field.strip() for field in record.split('\t'))

            names.append(name)
            rows.append([float(v) for v in values])

            # first regex match in the sample name is taken as its soil type
            raw_types.append(re.findall(soil_type_regex, name)[0])

    # replacing the 'v' marker so it can be inserted in the data frame
    soil_types = ['?' if t == 'v' else t for t in raw_types]

    return utils.build_data_set(rows,
                                names,
                                features_labels,
                                extra_cols={'type': soil_types})

Example #3

0

Show file

def load_raman_tablets():
    """Load the Raman tablets MATLAB file into a data set.

    The first two columns of ``'Matrix'`` are stored as the extra columns
    ``'active (% w/w)'`` and ``'Type'`` (the latter cast to ``int``); the
    remaining columns are the data. The first two entries of ``'VarLabels'``
    are skipped accordingly and the rest are converted to ``float``.

    Returns:
        The object returned by ``utils.build_data_set``.
    """
    mat = sio.loadmat(__data_set_path)

    # labels for samples (rows)
    samples_labels = mat['ObjLabels'].tolist()

    # labels for features (columns), without the two leading property names
    all_var_labels = mat['VarLabels'].tolist()
    features_labels = [float(label) for label in all_var_labels[2:]]

    # full matrix: two property columns followed by the data columns
    matrix = mat['Matrix']
    spectra = matrix[:, 2:]

    # the two leading columns become extra columns of the data set
    active_col = matrix[:, 0].tolist()
    type_col = matrix[:, 1].astype(int).tolist()
    other_cols = {
        'active (% w/w)': active_col,
        'Type': type_col,
    }

    return utils.build_data_set(spectra,
                                samples_labels,
                                features_labels,
                                extra_cols=other_cols)

Example #4

0

Show file

File: __init__.py Project: ryuzakyl/data-bloodhound

def load_mvda_sucos():
    """Parse the space-separated MVDA sucos file into a data set.

    The first line supplies the features labels; each following line holds a
    sample name followed by its numeric values. The class of a sample is its
    name with the last character removed.

    Returns:
        The object returned by ``utils.build_data_set``, built with an extra
        ``'class'`` column.
    """
    names = []
    rows = []

    with open(__data_set_path, 'r') as handle:
        # header line -> features labels
        features_labels = [col.strip() for col in handle.readline().split(' ')]

        # remaining lines -> one sample each
        for record in handle:
            name, *values = (field.strip() for field in record.split(' '))
            names.append(name)
            rows.append([float(v) for v in values])

    # class label = sample name minus its trailing character
    classes = [name[:-1] for name in names]

    return utils.build_data_set(
        rows,
        names,
        features_labels,
        extra_cols={'class': classes},
    )

Example #5

0

Show file

File: __init__.py Project: ryuzakyl/data-bloodhound

def load_gc_wines():
    """Load the GC-MS Wines data set from a MATLAB v7.3 (HDF5) file.

    Everything needed is read into memory while the file is open, so the
    HDF5 handle can be closed before the data set is built.

    Returns:
        The object returned by ``utils.build_data_set``, built from the
        transposed ``'Elution_profiles'`` matrix, the decoded sample names,
        the elution time labels and an extra ``'origin'`` column taken from
        ``'Class'``.

    Raises:
        OSError: Presumably raised by ``h5py.File`` when the file cannot be
            opened (the previous ``is None`` check could never trigger).
    """
    # Opening read-only inside a context manager: the handle is always
    # released, and the file is never opened in a writable mode by accident.
    with h5py.File(__data_set_path, 'r') as raw_data:
        # samples labels are stored as object references to character arrays
        # https://groups.google.com/forum/#!topic/h5py/FT7nbKnU24s
        hdf5_samples_labels = raw_data['Label_Wine_samples']
        samples_labels = [
            ''.join(
                # flatten + int() so chr() always receives a plain scalar
                chr(int(c))
                for c in np.ravel(raw_data[hdf5_samples_labels[0][i]][()])
            )
            for i in range(hdf5_samples_labels.size)
        ]

        # getting class labels (``[()]`` replaces the removed ``.value``)
        wine_origin = np.squeeze(raw_data['Class'][()]).tolist()

        # loading GC elution times (also stored as object references)
        hdf5_gc_labels = raw_data['Label_Elution_time']
        gc_labels = [
            raw_data[hdf5_gc_labels[i][0]][()][0][0]
            for i in range(hdf5_gc_labels.size)
        ]

        # loading the GC spectral data as an in-memory array
        gc_data = raw_data['Elution_profiles'][()].T

    # returning the built data set
    return utils.build_data_set(gc_data,
                                samples_labels,
                                gc_labels,
                                extra_cols={'origin': wine_origin})

Example #6

0

Show file

File: __init__.py Project: ryuzakyl/data-bloodhound

def load_nir_tecator():
    """Load the NIR Tecator X/Y MATLAB files into a data set.

    Samples and features are labelled with consecutive integers starting at
    1. The flattened ``'TecatorY'`` values are stored as the ``'fat'`` extra
    column, and a binary ``'class'`` column flags the samples whose fat value
    is at least 20.0.

    Returns:
        The object returned by ``utils.build_data_set``.
    """
    # spectra matrix and its dimensions
    spectra = sio.loadmat(__x_data_path)['TecatorX']
    n_samples, n_features = spectra.shape

    # response values, flattened to one dimension
    fat = np.ravel(sio.loadmat(__y_data_path)['TecatorY'])

    # two classes: below the threshold (0) / at or above the threshold (1)
    threshold = 20.0
    is_high_fat = fat >= threshold

    extra = {
        'fat': fat,
        'class': is_high_fat.astype(int),
    }

    return utils.build_data_set(
        data=spectra,
        samples_labels=range(1, n_samples + 1),
        features_labels=range(1, n_features + 1),
        extra_cols=extra,
    )

Example #7

0

Show file

File: __init__.py Project: ryuzakyl/data-bloodhound

def load_nir_fuel():
    """Load the NIR fuel (diesel) MATLAB file into a data set.

    Spectra come from the ``'diesel_spec'`` object and the properties from
    the ``'diesel_prop'`` object. Each property becomes an extra column named
    after its stripped label.

    Returns:
        The object returned by ``utils.build_data_set``.
    """
    mat = sio.loadmat(__data_set_path)

    # the spectra object; its fields are addressed by position below
    spec = mat['diesel_spec'][0][0]

    # samples labels, converted to int
    samples_labels = list(map(int, spec[8][0][0]))

    # features labels, kept as stored
    features_labels = list(spec[12][1][0][0])

    # the properties object
    prop = mat['diesel_prop'][0][0]

    # names of the properties, without surrounding whitespace
    props_labels = [label.strip() for label in prop[8][1][0]]

    # spectra matrix
    spectra = spec[7]

    # properties matrix, transposed so that iterating yields one property
    # at a time
    props_by_name = prop[7].T

    # one extra column per property
    other_cols = dict(zip(props_labels, props_by_name))

    return utils.build_data_set(spectra,
                                samples_labels,
                                features_labels,
                                extra_cols=other_cols)

Example #8

0

Show file

File: __init__.py Project: ryuzakyl/data-bloodhound

def load_nir_sugarcane():
    """Load the NIR sugarcane MATLAB file into a data set.

    The spectra come from the ``'X'`` object. The ``'Brix'`` and ``'pol'``
    objects supply the ``'brix'`` and ``'pol'`` extra columns. Three class
    labelings stored inside ``'X'`` are translated through their id maps and
    added as extra columns named after the class headers.

    Returns:
        The object returned by ``utils.build_data_set``.
    """
    mat = sio.loadmat(__data_set_path)

    # the 'X' object; its fields are addressed by position below
    x_struct = mat['X'][0][0]

    # spectra matrix and number of samples
    x_data = x_struct[7]
    samples_count, _ = x_data.shape

    # samples are numbered starting at 1
    samples_labels = range(1, samples_count + 1)

    # features labels, converted to int
    features_labels = [int(nm) for nm in x_struct[8][1][0]]

    # first element of every row of the 'Brix' and 'pol' data
    brix_data = [row[0] for row in mat['Brix'][0][0][7]]
    pol_data = [row[0] for row in mat['pol'][0][0][7]]

    # names of the class labelings
    classes_headers = [header[0] for header in x_struct[12][0][1]]

    # class ids of every sample; entry k corresponds to classes_headers[k]
    classes_data = [x_struct[12][0][0][k][0].tolist() for k in range(3)]

    # id maps (class id -> semantic label); entry k corresponds to
    # classes_headers[k]
    classesid_map = [
        {entry[0][0][0]: entry[1][0] for entry in x_struct[14][0][k]}
        for k in range(3)
    ]

    # regression columns first, then one translated column per labeling
    extra = {'brix': brix_data, 'pol': pol_data}
    extra.update({
        header: [classesid_map[i][class_id] for class_id in classes_data[i]]
        for i, header in enumerate(classes_headers)
    })

    return utils.build_data_set(x_data, samples_labels, features_labels, extra_cols=extra)

Example #9

0

Show file

File: __init__.py Project: ryuzakyl/data-bloodhound

def load_nmr_wines():
    """Load the 1H-NMR Wines MATLAB file.

    The ``'X'`` and ``'Y'`` matrices are stacked side by side, so the
    properties appear as additional columns after the ``'ppm'`` features.
    Samples are labelled with consecutive integers starting at 0.

    Returns:
        A dictionary with two entries: ``'wine_data'`` (the object returned
        by ``utils.build_data_set``) and ``'wine_ints'`` (a Pandas DataFrame
        built from the first row of the ``'wine_ints'`` entry).

    Raises:
        Exception: If the loaded MATLAB object is ``None``.

    Examples:
        >>> ds = load_nmr_wines()
        >>> ds['wine_data'].shape
        (40, 8729)
        >>> ds['wine_ints'].shape
        (22, 1)

    """
    mat = sio.loadmat(__data_path)

    if mat is None:
        raise Exception('Error while loading 1H-NMR Wines data.')

    # column names: ppm values followed by the properties names
    ppm_labels = mat['ppm'][0].tolist()
    property_names = [entry[0] for entry in mat['Label'][0]]
    column_labels = ppm_labels + property_names

    # samples data and properties data, joined column-wise
    joined = np.hstack((mat['X'], mat['Y']))
    row_labels = range(joined.shape[0])

    wine_ds = utils.build_data_set(joined.tolist(), row_labels, column_labels)

    # second table, wrapped directly in a DataFrame
    wine_ints_ds = pd.DataFrame(mat['wine_ints'][0])

    return {
        'wine_data': wine_ds,
        'wine_ints': wine_ints_ds,
    }

Example #10

0

Show file

File: __init__.py Project: ryuzakyl/data-bloodhound

def load_mvda_peas_raw():
    """Load the raw MVDA peas MATLAB file into a data set.

    Features labels and data are passed on exactly as loaded; samples are
    labelled with consecutive integers starting at 1.

    Returns:
        The object returned by ``utils.build_data_set``.
    """
    mat = sio.loadmat(__data_set_path)

    column_labels = mat['var_labels_all']
    values = mat['data_all']

    # one label per row of the data matrix: 1, 2, ..., n
    n_samples = values.shape[0]
    row_labels = list(range(1, n_samples + 1))

    return utils.build_data_set(values, row_labels, column_labels)

Example #11

0

Show file

def load_mvda_tea():
    """Parse the space-separated MVDA tea file into a data set.

    The first line supplies the features labels; each following line holds a
    sample name followed by its numeric values. The variety of a sample is
    the first character of its name, and the tea type is looked up from the
    variety.

    Returns:
        The object returned by ``utils.build_data_set``, built with the
        extra columns ``'variety'`` and ``'type'``.

    Raises:
        KeyError: If a variety letter has no associated tea type.
    """
    names = []
    rows = []

    with open(__data_set_path, 'r') as handle:
        # header line -> features labels
        features_labels = [col.strip() for col in handle.readline().split(' ')]

        # remaining lines -> one sample each
        for record in handle:
            name, *values = (field.strip() for field in record.split(' '))
            names.append(name)
            rows.append([float(v) for v in values])

    # variety = leading letter of the sample name
    varieties = [name[0] for name in names]

    # tea type of every variety letter
    letters_by_type = (
        ('Black', ('K', 'F')),
        ('Green', ('C', 'H')),
        ('Oolong', ('S', 'T')),
    )
    type_of_variety = {
        letter: tea_type
        for tea_type, letters in letters_by_type
        for letter in letters
    }
    types = [type_of_variety[variety] for variety in varieties]

    extra = {
        'variety': varieties,
        'type': types,
    }
    return utils.build_data_set(rows, names, features_labels, extra_cols=extra)

Example #12

0

Show file

File: __init__.py Project: ryuzakyl/data-bloodhound

def load_raman_porkfat():
    """Load the Raman Pork Fat MATLAB file into a data set.

    Spectra, labels and three class labelings come from the ``'X'`` object;
    the properties come from the ``'Y'`` object. Extra columns are added in
    this order: ``'classes1'``, ``'classes2'``, ``'classes3'``, then one
    column per property label.

    Returns:
        The object returned by ``utils.build_data_set``.

    Raises:
        Exception: If the loaded MATLAB object is ``None``.
    """
    mat = sio.loadmat(__data_set_path)

    if mat is None:
        raise Exception('Error while loading Raman Pork Fat data.')

    spectra_obj = mat['X']
    props_obj = mat['Y']

    # samples labels
    samples_labels = list(spectra_obj['label'][0][0][0][0])

    # features labels, as strings
    features_labels = [str(v) for v in spectra_obj['axisscale'][0][0][1][0][0]]

    # samples data as a list of rows
    rows = [list(row) for row in spectra_obj['data'][0][0]]

    # extra columns, kept in insertion order
    other_cols = OrderedDict()

    # the three labelings stored in the 'class' field
    labelings = spectra_obj['class'][0][0][0][0]
    for column, position in (('classes1', 0), ('classes2', 1), ('classes3', 2)):
        other_cols[column] = list(labelings[position][0])

    # one column per property; the matrix is transposed so every entry of
    # ``per_property`` holds the values of a single property
    props_labels = list(props_obj['label'][0][0][1][0])
    per_property = [list(column) for column in props_obj['data'][0][0].T]
    for position, prop_label in enumerate(props_labels):
        other_cols[prop_label] = per_property[position]

    return utils.build_data_set(rows, samples_labels, features_labels,
                                other_cols)

Example #13

0

Show file

File: __init__.py Project: ryuzakyl/data-bloodhound

def load_hplc_oil():
    """Load the HPLC Oil MATLAB file into a data set.

    Everything is read from the ``'HPLCforweb'`` object. Two extra columns
    are added: ``'class'`` with the stored class labels and ``'classid'``
    with those labels translated through entries 1 to 3 of the
    ``'classlookup'`` table.

    Returns:
        The object returned by ``utils.build_data_set``.

    Raises:
        Exception: If the loaded MATLAB object is ``None``.
    """
    mat = sio.loadmat(__data_set_path)

    if mat is None:
        raise Exception('Error while loading HPLC Oil data.')

    hplc_oil = mat['HPLCforweb']

    # samples labels
    samples_labels = list(hplc_oil['label'][0][0][0][0])

    # features labels
    features_labels = list(hplc_oil['include'][0][0][1][0][0])

    # samples data as a list of rows
    rows = [list(row) for row in hplc_oil['data'][0][0]]

    # extra columns, kept in insertion order
    other_cols = OrderedDict()

    # class label of every sample
    class_labels = list(hplc_oil['class'][0][0][0][0][0])
    other_cols['class'] = class_labels

    # lookup table entries 1..3 (1 --> not, 2 --> olive, 3 --> mix, according
    # to the notes left in the original code)
    lookup = hplc_oil['classlookup'][0][0][0][0]
    classid_map = {
        lookup[entry][0][0][0]: lookup[entry][1][0]
        for entry in (1, 2, 3)
    }
    other_cols['classid'] = [classid_map[label] for label in class_labels]

    return utils.build_data_set(rows, samples_labels, features_labels,
                                other_cols)

Example #14

0

Show file

File: __init__.py Project: ryuzakyl/data-bloodhound

def load_nmr_onion():
    """Load the 1H-NMR Onion MATLAB file into a data set.

    Returns:
        The object returned by ``utils.build_data_set``, built from the
        ``'x'`` matrix, the ``'Samples_name'`` labels, the ``'ppm'`` features
        labels and an extra ``'% onion'`` column taken from ``'onion'``.

    Raises:
        Exception: If the loaded MATLAB object is ``None``.
    """
    mat = sio.loadmat(__data_path)

    if mat is None:
        raise Exception('Error while loading 1H-NMR Onion data.')

    # each entry of 'Samples_name' is nested twice around the actual name
    samples_labels = [entry[0][0] for entry in mat['Samples_name']]

    # first row of 'ppm' as a plain list
    features_labels = mat['ppm'][0].tolist()

    # samples data as nested lists
    rows = mat['x'].tolist()

    # first row of 'onion' as a plain list
    extra = {'% onion': mat['onion'][0].tolist()}

    return utils.build_data_set(rows, samples_labels, features_labels, extra)

Example #15

0

Show file

File: __init__.py Project: ryuzakyl/data-bloodhound

def load_nir_corn():
    """Load the NIR Corn MATLAB file as a collection of data sets.

    Six spectral matrices are wrapped with features labelled 1 to 700 and
    samples numbered from 1. The ``'propvals'`` matrix is wrapped with its
    own stripped property names as features labels.

    Returns:
        A dictionary mapping ``'m5_nbs'``, ``'m5_spec'``, ``'mp5_nbs'``,
        ``'mp5_spec'``, ``'mp6_nbs'``, ``'mp6_spec'`` and ``'propvals'`` to
        the objects returned by ``utils.build_data_set``.

    Examples:
        >>> ds = load_nir_corn()
        >>> ds['m5_nbs'].shape
        (3, 700)
        >>> ds['m5_spec'].shape
        (80, 700)
        >>> ds['mp5_nbs'].shape
        (4, 700)
        >>> ds['mp5_spec'].shape
        (80, 700)
        >>> ds['mp6_nbs'].shape
        (4, 700)
        >>> ds['mp6_spec'].shape
        (80, 700)
        >>> ds['propvals'].shape
        (80, 4)

    """
    mat = sio.loadmat(__data_set_path)

    # the same features labels are shared by every spectral data set
    features_labels = list(range(1, 701))

    # (key in the returned dictionary, name of the object in the .mat file)
    spectral_entries = (
        ('m5_nbs', 'm5nbs'),
        ('m5_spec', 'm5spec'),
        ('mp5_nbs', 'mp5nbs'),
        ('mp5_spec', 'mp5spec'),
        ('mp6_nbs', 'mp6nbs'),
        ('mp6_spec', 'mp6spec'),
    )

    ds = {}
    for out_key, mat_key in spectral_entries:
        matrix = mat[mat_key][0][0][7]
        row_labels = list(range(1, matrix.shape[0] + 1))
        ds[out_key] = utils.build_data_set(matrix, row_labels, features_labels)

    # properties values, labelled with their own names
    prop_values = mat['propvals'][0][0][7]
    prop_names = [name.strip() for name in mat['propvals'][0][0][8][1][0]]
    prop_rows = list(range(1, prop_values.shape[0] + 1))
    ds['propvals'] = utils.build_data_set(prop_values, prop_rows, prop_names)

    return ds

Example #16

0

Show file

File: __init__.py Project: ryuzakyl/data-bloodhound

def load_nir_alcohol():
    """Loads the NIR Alcohol data set.

    The first three columns of ``'data_all'`` are properties and the
    remaining ones are spectra. Rows are split into three partitions: rows
    0-26 (``'train'``), rows 27-39 (``'new'``) and rows 40 onwards
    (``'msc'``). Each partition is built with its properties as extra columns
    and the partitions are then concatenated under those three keys.

    Returns:
        The result of ``pd.concat`` over the three partitions, keyed by
        ``'train'``, ``'new'`` and ``'msc'``.

    Examples:
        >>> ds = load_nir_alcohol()
        >>> ds.loc['train'].shape
        (27, 104)
        >>> ds.loc['new'].shape
        (13, 104)
        >>> ds.loc['msc'].shape
        (27, 104)

    """

    # loading matlab data set
    raw_data = sio.loadmat(__data_set_path)

    # getting all variable/features labels
    var_labels_all = raw_data['var_labels_all'].tolist()

    # spectra labels (as int) come after the three properties labels
    var_labels_spectra = [int(l) for l in var_labels_all[3:]]
    var_labels_properties = var_labels_all[:3]

    # getting all data (train-new-msc), split column-wise the same way
    data_all = raw_data['data_all']
    data_spectra = data_all[:, 3:]
    data_properties = data_all[:, :3]

    # (key in the final data set, samples labels entry, rows of data_all)
    partitions = (
        ('train', 'obj_labels_train', slice(None, 27)),
        ('new', 'obj_labels_new', slice(27, 40)),
        ('msc', 'obj_labels_mscorrected', slice(40, None)),
    )

    data_sets = []
    labels = []
    for label, obj_labels_key, rows in partitions:
        obj_labels = raw_data[obj_labels_key].tolist()

        # transposing so that every property yields one extra column
        other_cols = dict(zip(var_labels_properties,
                              data_properties[rows, :].T))

        data_sets.append(utils.build_data_set(data_spectra[rows, :],
                                              obj_labels,
                                              var_labels_spectra,
                                              extra_cols=other_cols))
        labels.append(label)

    # actually building the joint data set
    return pd.concat(data_sets, keys=labels)