def _load_all_conventional_features(features_path):
    """Merge the features of every descriptor CSV file in *features_path*.

    Each CSV file is read with ``flib.read_feature_csv`` and the values
    found for a key are appended to what earlier files contributed for the
    same key. Files whose name contains 'haddad' or 'saito' are skipped.
    Keys that end up with fewer values than the longest entry (i.e. keys
    missing from at least one file) are removed.

    Returns:
        defaultdict mapping each remaining key to its concatenated array.

    Raises:
        ValueError: if no features at all could be collected.
    """
    merged = defaultdict(lambda: np.array([]))
    for feature_file in glob.glob(os.path.join(features_path, '*.csv')):
        # look at the file name only, so that a parent directory containing
        # one of these words cannot silently exclude every file
        file_name = os.path.basename(feature_file)
        if 'haddad' in file_name or 'saito' in file_name:
            continue
        for key, value in flib.read_feature_csv(feature_file).items():
            merged[key] = np.hstack((merged[key], value))

    if not merged:
        # max() below would fail with a message that hides the real cause
        raise ValueError('no conventional features found in %s' % features_path)

    max_len = max(len(val) for val in merged.values())
    # iterate over a copy of the keys because entries are deleted on the way
    for key in list(merged.keys()):
        if len(merged[key]) < max_len:
            del merged[key]
            print('deleted a molecule because not all features available')
    return merged


def prepare_features(config):
    """Load and prepare the features, either conventional or spectral.

    Args:
        config: nested dict. Keys read here are ``config['data_path']`` and,
            below ``config['features']``: ``type`` ('conventional' or
            'spectral'), ``properties_to_add`` and ``normalize``; for
            conventional features additionally ``descriptor`` ('all' or the
            base name of a CSV file); for spectral features additionally
            ``spec_type``, ``use_intensity``, ``kernel_width`` and
            ``bin_width``.

    Returns:
        The features after ``flib.remove_invalid_features`` and the optional
        property-adding and normalization steps.

    Raises:
        ValueError: if the feature type is unknown or no conventional
            features were found.
        KeyError: if a required config entry is missing.
    """
    feature_config = config['features']
    feature_type = feature_config['type']

    if feature_type == 'conventional':
        features_path = os.path.join(config['data_path'], 'conventional_features')
        if feature_config['descriptor'] == 'all':
            features = _load_all_conventional_features(features_path)
        else:
            feature_file = os.path.join(features_path,
                                        feature_config['descriptor'] + '.csv')
            features = flib.read_feature_csv(feature_file)
    elif feature_type == 'spectral':
        feature_file = os.path.join(config['data_path'], 'spectral_features',
                                    'large_base', 'parsed.pckl')
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        # The with-block closes the handle that was previously leaked.
        with open(feature_file, 'rb') as spectra_file:
            spectra = pickle.load(spectra_file)
        features = flib.get_spectral_features(
            spectra,
            spec_type=feature_config['spec_type'],
            use_intensity=feature_config['use_intensity'],
            kernel_widths=feature_config['kernel_width'],
            bin_width=feature_config['bin_width'])
    else:
        # fail early instead of hitting an UnboundLocalError further down
        raise ValueError('unknown feature type: %r' % (feature_type,))

    features = flib.remove_invalid_features(features)
    if feature_config['properties_to_add']:
        # return value is ignored; the call is made for its effect on features
        flib.add_molecule_properties(features, feature_config['properties_to_add'])
    if feature_config['normalize']:
        features = flib.normalize_features(features)
    return features
# Plot, for a single molecule, a histogram of the 'freq' entries of its
# spectrum (upper panel) and the spectral features computed from all spectra
# (lower panel), then save the figure as eva.png in outpath.
molid = '1'
outpath = '/Users/dedan/projects/master/results/visualization/'
data_path = os.path.join(os.path.dirname(__file__), '..', '..', 'data')
feature_file = os.path.join(data_path, 'spectral_features', 'large_base', 'parsed.pckl')
# NOTE: unpickling can execute arbitrary code; only load trusted files.
# The with-block closes the file handle, which was previously leaked.
with open(feature_file, 'rb') as spectra_file:
    spectra = pickle.load(spectra_file)

fig = plt.figure()

# upper panel: histogram of the raw 'freq' values
ax = fig.add_subplot(211)
ax.hist(spectra[molid]['freq'], bins=4000, edgecolor='0.4', facecolor='0.4')
ax.set_xlim([0, 4000])
ax.set_xticks([0, 4000])
ax.set_xticklabels([0, 4000])
# only label the two ends of the y axis
ax.set_yticks(ax.get_ylim())
utils.simple_axis(ax)

print('loading..')
# k_width is not defined in this block; it has to be set before this point
features = fl.get_spectral_features(spectra, use_intensity=False,
                                    kernel_widths=k_width, bin_width=10)

print('plotting..')
# lower panel: the computed features of the same molecule
ax = fig.add_subplot(212)
# NOTE(review): the label hard-codes 20 while the kernel width comes from
# k_width -- confirm that the two agree
ax.plot(features[molid], color='0.4', label='sigma: 20')
# keep the first and last automatic tick and relabel them as 0 and 4000
ax.set_xticks([ax.get_xticks()[0], ax.get_xticks()[-1]])
ax.set_xticklabels([0, 4000])
utils.simple_axis(ax)
ax.set_yticks(ax.get_ylim())
ax.set_xlabel('wavenumber (1/cm)')
ax.legend(loc='upper left', frameon=False, numpoints=1)
fig.savefig(os.path.join(outpath, 'eva.png'), dpi=300)
# Analysis for one glomerulus: select the usable molecules, then for every
# kernel width run a univariate regression test and a random forest.
print(glom)
glom_idx = glomeruli.index(glom)

# select molecules available for the glomerulus
targets, tmp_cas_numbers = rdl.get_avail_targets_for_glom(rm, cas_numbers, glom_idx)
molids = [door2id[cas_no][0] for cas_no in tmp_cas_numbers]

# for some of them the spectra are not available: determine the positions
# that do have a spectrum once and filter both lists with them
kept_pos = [pos for pos, mol in enumerate(molids) if str(mol) in spectra]
targets = [targets[pos] for pos in kept_pos]
molids = [molids[pos] for pos in kept_pos]

# result container of this glomerulus, filled per kernel width below
glom_res = {'data': {},
            'regression': {},
            'forest': {},
            'oob': {},
            'targets': targets,
            'oob_prediction': {},
            'oob_sel': {}}
res[glom] = glom_res

for i, kernel_width in enumerate(kernel_widths):
    data = features_lib.get_spectral_features(spectra, molids, resolution,
                                              kernel_width=kernel_width)

    # univariate test, stored as the negative decadic logarithm of p
    _, p = f_regression(data, targets)
    glom_res['regression'][kernel_width] = -np.log10(p)

    # random forest regression
    rfr = RandomForestRegressor(n_estimators=n_estimators,
                                compute_importances=True,
                                oob_score=True)
    # TODO: feature selection step
    rfr.fit(data, targets)
    glom_res['forest'][kernel_width] = rfr.feature_importances_
    glom_res['oob'][kernel_width] = rfr.oob_score_
    glom_res['oob_prediction'][kernel_width] = rfr.oob_prediction_