def load_ms_glycol():
    """Load the MS Glycol data set.

    Two MATLAB files are read with ``sio.loadmat`` and each of them is
    turned into its own data set through ``utils.build_data_set``.

    Returns:
        A dictionary with the keys ``'glycol1'`` and ``'glycol2'``, each
        holding the data set built from the corresponding file.

    Raises:
        Exception: If either of the loaded objects is ``None``.

    Examples:
        >>> ds = load_ms_glycol()
        >>> ds['glycol1'].shape
        (162, 254)
        >>> ds['glycol2'].shape
        (126, 256)
    """
    first_mat = sio.loadmat(__glycol1_data_set_path)
    second_mat = sio.loadmat(__glycol2_data_set_path)

    # refuse to continue when any of the two files produced nothing
    if any(loaded is None for loaded in (first_mat, second_mat)):
        raise Exception('Error while loading Glycol data set.')

    # glycol1: labels and values are handed over as plain Python lists
    g1_rows = first_mat['obj_labels_all'].tolist()
    g1_cols = first_mat['var_labels_all'].tolist()
    g1_values = first_mat['data_all'].tolist()
    first_ds = utils.build_data_set(g1_values, g1_rows, g1_cols)

    # glycol2: labels and values are handed over exactly as loaded
    g2_rows = second_mat['obj_labels_all']
    g2_cols = second_mat['var_labels_all']
    g2_values = second_mat['data_all']
    second_ds = utils.build_data_set(g2_values, g2_rows, g2_cols)

    return {
        'glycol1': first_ds,
        'glycol2': second_ds,
    }
def load_mvda_soil():
    """Load the MVDA soil data set from its tab-separated text file.

    The first line of the file is the header: its tab-separated fields,
    stripped of surrounding whitespace, become the feature labels. Each
    following line describes one sample: the first field is the sample
    label and the remaining fields are parsed as floats. The soil type of
    a sample is the first match of ``soil_type_regex`` in its label; a
    type of ``'v'`` is recorded as ``'?'``.

    Blank lines (for example a trailing empty line at the end of the
    file) are skipped instead of producing an empty sample.

    Returns:
        The data set produced by ``utils.build_data_set`` with an extra
        ``'type'`` column holding the soil types.

    Raises:
        ValueError: If a value field cannot be parsed as a float.
        IndexError: If ``soil_type_regex`` does not match a sample label.
    """
    samples_labels = []
    data = []
    soil_types = []

    with open(__data_set_path, 'r') as f:
        # the header line carries the feature labels
        header = f.readline()
        features_labels = [h.strip() for h in header.split('\t')]

        for line in f:
            # a blank line carries no sample; parsing it would add an empty
            # label and an empty row, then fail on the regex lookup
            if not line.strip():
                continue

            line_parsed = [s.strip() for s in line.split('\t')]
            sample_name = line_parsed[0]

            samples_labels.append(sample_name)
            data.append([float(s) for s in line_parsed[1:]])

            # soil type is encoded in the sample name; 'v' is stored as '?'
            soil_type = re.findall(soil_type_regex, sample_name)[0]
            soil_types.append(soil_type if soil_type != 'v' else '?')

    return utils.build_data_set(data, samples_labels, features_labels,
                                extra_cols={'type': soil_types})
def load_raman_tablets():
    """Load the Raman tablets data set from its MATLAB file.

    The first two columns of ``'Matrix'`` are exposed as the extra columns
    ``'active (% w/w)'`` and ``'Type'`` (the latter cast to int); the
    remaining columns are the data. The first two entries of
    ``'VarLabels'`` are dropped accordingly and the rest are converted to
    floats to serve as feature labels.

    Returns:
        The data set produced by ``utils.build_data_set``.
    """
    mat = sio.loadmat(__data_set_path)

    row_labels = mat['ObjLabels'].tolist()

    # skip the two leading labels, which belong to the extra columns
    column_labels = [float(label) for label in mat['VarLabels'].tolist()[2:]]

    matrix = mat['Matrix']
    extra = {
        'active (% w/w)': matrix[:, 0].tolist(),
        'Type': matrix[:, 1].astype(int).tolist(),
    }

    return utils.build_data_set(matrix[:, 2:], row_labels, column_labels,
                                extra_cols=extra)
def load_mvda_sucos():
    """Load the MVDA "sucos" data set from its space-separated text file.

    The first line of the file is the header: its fields, stripped of
    surrounding whitespace, become the feature labels. Each following line
    describes one sample: the first field is the sample label and the
    remaining fields are parsed as floats. The class of a sample is its
    label with the last character removed.

    Blank lines (for example a trailing empty line at the end of the
    file) are skipped instead of producing an empty sample.

    Returns:
        The data set produced by ``utils.build_data_set`` with an extra
        ``'class'`` column.

    Raises:
        ValueError: If a value field cannot be parsed as a float.
    """
    samples_labels = []
    data = []

    with open(__data_set_path, 'r') as f:
        # the header line carries the feature labels
        header = f.readline()
        features_labels = [h.strip() for h in header.split(' ')]

        for line in f:
            # a blank line carries no sample; parsing it would add an empty
            # label and an empty data row
            if not line.strip():
                continue

            line_parsed = [s.strip() for s in line.split(' ')]
            samples_labels.append(line_parsed[0])
            data.append([float(s) for s in line_parsed[1:]])

    # the class is the sample name without its trailing character
    classes = [s_name[:-1] for s_name in samples_labels]

    return utils.build_data_set(data, samples_labels, features_labels,
                                extra_cols={'class': classes})
def load_gc_wines():
    """Load the GC-MS Wines data set from its MATLAB v7.3 (HDF5) file.

    The file is opened read-only and closed as soon as every needed value
    has been copied into memory. Sample labels and elution-time labels are
    stored as HDF5 object references, so each one is dereferenced through
    the file before being decoded.

    Returns:
        The data set produced by ``utils.build_data_set`` from the
        transposed ``'Elution_profiles'`` matrix, with an extra
        ``'origin'`` column taken from ``'Class'``.

    Raises:
        OSError: Raised by ``h5py.File`` when the file cannot be opened.
        KeyError: If an expected entry is missing from the file.
    """
    # Explicit read-only mode plus a context manager: the handle is always
    # released, and nothing is ever created or modified on disk.
    with h5py.File(__data_set_path, 'r') as raw_data:
        # https://groups.google.com/forum/#!topic/h5py/FT7nbKnU24s
        # each label is a reference to a dataset of character codes
        hdf5_samples_labels = raw_data['Label_Wine_samples']
        samples_labels = [
            ''.join(chr(c) for c in raw_data[hdf5_samples_labels[0][i]][()])
            for i in range(hdf5_samples_labels.size)
        ]

        # getting class labels; ``[()]`` reads the whole dataset and replaces
        # the ``.value`` attribute that h5py 3.0 removed
        wine_origin = np.squeeze(raw_data['Class'][()]).tolist()

        # loading the GC elution times (also stored as references)
        hdf5_gc_labels = raw_data['Label_Elution_time']
        gc_labels = [
            raw_data[hdf5_gc_labels[i][0]][()][0][0]
            for i in range(hdf5_gc_labels.size)
        ]

        # loading the GC spectral data, fully read before the file closes
        gc_data = raw_data['Elution_profiles'][()].T

    return utils.build_data_set(gc_data, samples_labels, gc_labels,
                                extra_cols={'origin': wine_origin})
def load_nir_tecator():
    """Load the NIR Tecator data set from its two MATLAB files.

    ``'TecatorX'`` provides the data matrix and ``'TecatorY'`` (flattened)
    the fat values. Samples and features are labelled 1..N. A binary class
    is derived from the fat value: 1 when it is at least 20.0, 0 otherwise.

    Returns:
        The data set produced by ``utils.build_data_set`` with the extra
        columns ``'fat'`` and ``'class'``.
    """
    spectra = sio.loadmat(__x_data_path)['TecatorX']
    n_samples, n_features = spectra.shape

    fat = np.ravel(sio.loadmat(__y_data_path)['TecatorY'])

    # samples at or above this fat value form the positive class
    fat_threshold = 20.0
    is_high_fat = fat >= fat_threshold

    return utils.build_data_set(
        data=spectra,
        samples_labels=range(1, n_samples + 1),
        features_labels=range(1, n_features + 1),
        extra_cols={
            'fat': fat,
            'class': is_high_fat.astype(int),
        },
    )
def load_nir_fuel():
    """Load the NIR fuel (diesel) data set from its MATLAB file.

    Spectra come from the ``'diesel_spec'`` entry and property values from
    the ``'diesel_prop'`` entry. Each property becomes one extra column,
    named after its stripped label.

    Returns:
        The data set produced by ``utils.build_data_set``.
    """
    mat = sio.loadmat(__data_set_path)

    # both entries are nested; the useful record sits at [0][0]
    spec = mat['diesel_spec'][0][0]
    prop = mat['diesel_prop'][0][0]

    row_labels = [int(label) for label in spec[8][0][0]]
    column_labels = list(spec[12][1][0][0])
    prop_names = [name.strip() for name in prop[8][1][0]]

    spectra = spec[7]

    # transposed so that each row holds the values of one property
    prop_columns = prop[7].T
    extra = dict(zip(prop_names, prop_columns))

    return utils.build_data_set(spectra, row_labels, column_labels,
                                extra_cols=extra)
def load_nir_sugarcane():
    """Load the NIR sugarcane data set from its MATLAB file.

    The data matrix comes from the ``'X'`` entry; samples are labelled
    1..N and feature labels are converted to int. The ``'Brix'`` and
    ``'pol'`` entries provide the extra columns ``'brix'`` and ``'pol'``.
    Three class columns are added as well, with each class code translated
    through the matching id map stored alongside the data.

    Returns:
        The data set produced by ``utils.build_data_set``.
    """
    mat = sio.loadmat(__data_set_path)
    x_struct = mat['X'][0][0]

    spectra = x_struct[7]
    n_samples, _n_features = spectra.shape

    row_labels = range(1, n_samples + 1)
    column_labels = [int(nm) for nm in x_struct[8][1][0]]

    # regression targets: first element of every row
    brix = [row[0] for row in mat['Brix'][0][0][7]]
    pol = [row[0] for row in mat['pol'][0][0][7]]

    # names of the class columns
    class_names = [entry[0] for entry in x_struct[12][0][1]]

    # class codes per sample, one list for each of the three class columns
    class_codes = [x_struct[12][0][0][k][0].tolist() for k in range(3)]

    # code -> semantic label maps, one for each of the three class columns
    id_maps = [
        {t[0][0][0]: t[1][0] for t in x_struct[14][0][k]}
        for k in range(3)
    ]

    extra = {'brix': brix, 'pol': pol}
    for k, name in enumerate(class_names):
        lookup = id_maps[k]
        extra[name] = [lookup[code] for code in class_codes[k]]

    return utils.build_data_set(spectra, row_labels, column_labels,
                                extra_cols=extra)
def load_nmr_wines():
    """Load the NMR Wines data set.

    The ``'X'`` and ``'Y'`` matrices are stacked side by side, with the
    ``'ppm'`` values followed by the property labels as column labels and
    a 0-based range as row labels. The ``'wine_ints'`` entry is wrapped in
    a separate Pandas DataFrame.

    Returns:
        A dictionary with the keys ``'wine_data'`` (the data set built by
        ``utils.build_data_set``) and ``'wine_ints'`` (a Pandas DataFrame).

    Raises:
        Exception: If the loaded object is ``None``.

    Examples:
        >>> ds = load_nmr_wines()
        >>> ds['wine_data'].shape
        (40, 8729)
        >>> ds['wine_ints'].shape
        (22, 1)
    """
    mat = sio.loadmat(__data_path)

    if mat is None:
        raise Exception('Error while loading 1H-NMR Wines data.')

    ppm_labels = mat['ppm'][0].tolist()
    prop_labels = [entry[0] for entry in mat['Label'][0]]

    # spectra and properties side by side, one row per sample
    combined = np.hstack([mat['X'], mat['Y']])
    wine_ds = utils.build_data_set(combined.tolist(),
                                   range(combined.shape[0]),
                                   ppm_labels + prop_labels)

    wine_ints_ds = pd.DataFrame(mat['wine_ints'][0])

    return {
        'wine_data': wine_ds,
        'wine_ints': wine_ints_ds,
    }
def load_mvda_peas_raw():
    """Load the raw MVDA peas data set from its MATLAB file.

    Feature labels are taken from ``'var_labels_all'`` and the data from
    ``'data_all'``; samples are numbered starting at 1.

    Returns:
        The data set produced by ``utils.build_data_set``.
    """
    mat = sio.loadmat(__data_set_path)

    column_labels = mat['var_labels_all']
    values = mat['data_all']

    # no sample names are read from the file: number the rows 1..N
    row_ids = [position + 1 for position in range(values.shape[0])]

    return utils.build_data_set(values, row_ids, column_labels)
def load_mvda_tea():
    """Load the MVDA tea data set from its space-separated text file.

    The first line of the file is the header: its fields, stripped of
    surrounding whitespace, become the feature labels. Each following line
    describes one sample: the first field is the sample label and the
    remaining fields are parsed as floats. The variety of a sample is the
    first character of its label, and the tea type (Black / Green /
    Oolong) is looked up from the variety.

    Blank lines (for example a trailing empty line at the end of the
    file) are skipped instead of producing an empty sample.

    Returns:
        The data set produced by ``utils.build_data_set`` with the extra
        columns ``'variety'`` and ``'type'``.

    Raises:
        ValueError: If a value field cannot be parsed as a float.
        KeyError: If a variety letter has no known tea type.
    """
    samples_labels = []
    data = []

    with open(__data_set_path, 'r') as f:
        # the header line carries the feature labels
        header = f.readline()
        features_labels = [h.strip() for h in header.split(' ')]

        for line in f:
            # a blank line carries no sample; its empty label would make
            # the variety lookup below fail
            if not line.strip():
                continue

            line_parsed = [s.strip() for s in line.split(' ')]
            samples_labels.append(line_parsed[0])
            data.append([float(s) for s in line_parsed[1:]])

    # the variety is the first letter of the sample name
    varieties = [s_name[0] for s_name in samples_labels]

    # variety letter -> tea type
    var_to_type = {
        # Black tea
        'K': 'Black',
        'F': 'Black',
        # Green tea
        'C': 'Green',
        'H': 'Green',
        # Oolong tea
        'S': 'Oolong',
        'T': 'Oolong',
    }
    types = [var_to_type[v_name] for v_name in varieties]

    return utils.build_data_set(data, samples_labels, features_labels,
                                extra_cols={
                                    'variety': varieties,
                                    'type': types,
                                })
def load_raman_porkfat():
    """Load the Raman Pork Fat data set from its MATLAB file.

    Sample labels, feature labels (converted to strings) and the data
    rows are read from the ``'X'`` entry. Three class labelings from
    ``'X'`` are added as the extra columns ``'classes1'``, ``'classes2'``
    and ``'classes3'``, followed by one extra column for every property
    found in the ``'Y'`` entry.

    Returns:
        The data set produced by ``utils.build_data_set``.

    Raises:
        Exception: If the loaded object is ``None``.
    """
    mat = sio.loadmat(__data_set_path)

    if mat is None:
        raise Exception('Error while loading Raman Pork Fat data.')

    x_struct = mat['X']
    y_struct = mat['Y']

    row_labels = list(x_struct['label'][0][0][0][0])
    column_labels = [str(v) for v in list(x_struct['axisscale'][0][0][1][0][0])]
    rows = [list(row) for row in x_struct['data'][0][0]]

    # insertion order decides the order of the extra columns
    extra = OrderedDict()

    # the three class labelings, in the order they are stored
    labelings = x_struct['class'][0][0][0][0]
    for position, column in enumerate(('classes1', 'classes2', 'classes3')):
        extra[column] = list(labelings[position][0])

    # one extra column per property (properties are stored column-wise)
    prop_names = list(y_struct['label'][0][0][1][0])
    prop_columns = [list(col) for col in y_struct['data'][0][0].T]
    for position, prop_name in enumerate(prop_names):
        extra[prop_name] = prop_columns[position]

    return utils.build_data_set(rows, row_labels, column_labels, extra)
def load_hplc_oil():
    """Load the HPLC Oil data set from its MATLAB file.

    Everything is read from the ``'HPLCforweb'`` entry. Two extra columns
    are added: ``'class'`` with the class codes and ``'classid'`` with the
    names those codes map to, according to entries 1 to 3 of the stored
    class lookup table.

    Returns:
        The data set produced by ``utils.build_data_set``.

    Raises:
        Exception: If the loaded object is ``None``.
        KeyError: If a class code is not one of the three looked-up codes.
    """
    mat = sio.loadmat(__data_set_path)

    if mat is None:
        raise Exception('Error while loading HPLC Oil data.')

    hplc = mat['HPLCforweb']

    row_labels = list(hplc['label'][0][0][0][0])
    column_labels = list(hplc['include'][0][0][1][0][0])
    rows = [list(row) for row in hplc['data'][0][0]]

    # insertion order decides the order of the extra columns
    extra = OrderedDict()

    class_codes = list(hplc['class'][0][0][0][0][0])
    extra['class'] = class_codes

    # lookup entries 1..3 hold (code, name) pairs: 1 -> not, 2 -> olive,
    # 3 -> mix
    lookup = hplc['classlookup'][0][0][0][0]
    code_to_name = {
        lookup[k][0][0][0]: lookup[k][1][0]
        for k in (1, 2, 3)
    }
    extra['classid'] = [code_to_name[code] for code in class_codes]

    return utils.build_data_set(rows, row_labels, column_labels, extra)
def load_nmr_onion():
    """Load the 1H-NMR Onion data set from its MATLAB file.

    Sample names come from ``'Samples_name'``, feature labels from
    ``'ppm'`` and the data from ``'x'``. The ``'onion'`` entry is added as
    the extra column ``'% onion'``.

    Returns:
        The data set produced by ``utils.build_data_set``.

    Raises:
        Exception: If the loaded object is ``None``.
    """
    mat = sio.loadmat(__data_path)

    if mat is None:
        raise Exception('Error while loading 1H-NMR Onion data.')

    # every name is wrapped in two nesting levels
    row_labels = [entry[0][0] for entry in mat['Samples_name']]
    column_labels = mat['ppm'][0].tolist()
    rows = mat['x'].tolist()

    extra = {'% onion': mat['onion'][0].tolist()}

    return utils.build_data_set(rows, row_labels, column_labels, extra)
def load_nir_corn():
    """Load the NIR Corn data set.

    Six spectral matrices and one matrix of property values are read from
    the MATLAB file, and each is turned into its own data set through
    ``utils.build_data_set``. Rows are numbered starting at 1; spectral
    columns are numbered 1..700 and property columns carry the stripped
    property names.

    Returns:
        A dictionary with the keys ``'m5_nbs'``, ``'m5_spec'``,
        ``'mp5_nbs'``, ``'mp5_spec'``, ``'mp6_nbs'``, ``'mp6_spec'`` and
        ``'propvals'``.

    Examples:
        >>> ds = load_nir_corn()
        >>> ds['m5_nbs'].shape
        (3, 700)
        >>> ds['m5_spec'].shape
        (80, 700)
        >>> ds['mp5_nbs'].shape
        (4, 700)
        >>> ds['mp5_spec'].shape
        (80, 700)
        >>> ds['mp6_nbs'].shape
        (4, 700)
        >>> ds['mp6_spec'].shape
        (80, 700)
        >>> ds['propvals'].shape
        (80, 4)
    """
    mat = sio.loadmat(__data_set_path)

    # every spectral matrix shares the same 1..700 column numbering
    spectral_columns = list(range(1, 701))

    def _row_numbers(matrix):
        # 1-based row labels for the given matrix
        return list(range(1, matrix.shape[0] + 1))

    # (key in the returned dict, key in the MATLAB file), in output order
    spectral_entries = (
        ('m5_nbs', 'm5nbs'),
        ('m5_spec', 'm5spec'),
        ('mp5_nbs', 'mp5nbs'),
        ('mp5_spec', 'mp5spec'),
        ('mp6_nbs', 'mp6nbs'),
        ('mp6_spec', 'mp6spec'),
    )

    ds = {}
    for out_key, mat_key in spectral_entries:
        matrix = mat[mat_key][0][0][7]
        ds[out_key] = utils.build_data_set(matrix, _row_numbers(matrix),
                                           spectral_columns)

    # property values use their own (stripped) names as column labels
    prop_struct = mat['propvals'][0][0]
    prop_values = prop_struct[7]
    prop_names = [name.strip() for name in prop_struct[8][1][0]]
    ds['propvals'] = utils.build_data_set(prop_values,
                                          _row_numbers(prop_values),
                                          prop_names)

    return ds
def load_nir_alcohol():
    """Load the NIR Alcohol data set.

    The MATLAB file stores every sample in a single matrix whose first
    three columns are properties and whose remaining columns are spectra.
    The rows are split into three partitions, each built into its own data
    set with the properties attached as extra columns:

    * ``'train'``: rows 0-26, labelled by ``'obj_labels_train'``
    * ``'new'``: rows 27-39, labelled by ``'obj_labels_new'``
    * ``'msc'``: rows 40 onwards, labelled by ``'obj_labels_mscorrected'``

    Returns:
        The three data sets concatenated with ``pd.concat``, keyed by
        ``'train'``, ``'new'`` and ``'msc'``.

    Examples:
        >>> ds = load_nir_alcohol()
        >>> ds.loc['train'].shape
        (27, 104)
        >>> ds.loc['new'].shape
        (13, 104)
        >>> ds.loc['msc'].shape
        (27, 104)
    """
    raw_data = sio.loadmat(__data_set_path)

    # the first three labels name the properties, the rest the spectra
    var_labels_all = raw_data['var_labels_all'].tolist()
    var_labels_spectra = [int(l) for l in var_labels_all[3:]]
    var_labels_properties = var_labels_all[:3]

    # same split for the values: properties first, spectra afterwards
    data_all = raw_data['data_all']
    data_spectra = data_all[:, 3:]
    data_properties = data_all[:, :3]

    # (partition key, name of its sample labels in the file, row range)
    partitions = (
        ('train', 'obj_labels_train', slice(None, 27)),
        ('new', 'obj_labels_new', slice(27, 40)),
        ('msc', 'obj_labels_mscorrected', slice(40, None)),
    )

    labels = []
    data_sets = []
    for label, obj_labels_key, rows in partitions:
        obj_labels = raw_data[obj_labels_key].tolist()

        # transposed so that each row holds the values of one property
        other_cols = dict(zip(var_labels_properties,
                              data_properties[rows, :].T))

        data_sets.append(
            utils.build_data_set(data_spectra[rows, :], obj_labels,
                                 var_labels_spectra, extra_cols=other_cols))
        labels.append(label)

    # joining the partitions under their keys
    return pd.concat(data_sets, keys=labels)