# Validation experiment: how does the SVR generalization score depend on the
# number of cross-validation folds?  For every glomerulus the runner is
# executed `n_repetitions` times per fold count and all scores are collected.
reload(run_lib)  # pick up edits to run_lib without restarting the session
plt.close('all')

n_folds_list = [5, 10, 20, 50]   # fold counts to compare
n_repetitions = 5                # repeated runs per fold count
method = 'svr'
out_path = '/Users/dedan/projects/master/results/validation/gen_score_svr'

base_path = os.path.join(os.path.dirname(__file__), '..')
config = json.load(open(os.path.join(base_path, 'config',
                                     'validate_genscore_svr.json')))
config['data_path'] = os.path.join(base_path, 'data')

# load the features
features = run_lib.prepare_features(config)

used_glomeruli = json.load(open(os.path.join(config['data_path'],
                                             'used_glomeruli.json')))

# res[glomerulus][n_folds] -> list of generalization scores (one per repetition)
res = {g: {nf: [] for nf in n_folds_list} for g in used_glomeruli}
for glom in used_glomeruli:
    print(glom)
    config['glomerulus'] = glom
    data, targets, molids = run_lib.load_data_targets(config, features)
    # use all available features (no selection)
    config['feature_selection']['k_best'] = data.shape[1]
    for i, n_folds in enumerate(n_folds_list):
        print(n_folds)
        config['methods'][method]['n_folds'] = n_folds
        for j in range(n_repetitions):
            run_res = run_lib.run_runner(config, data, targets)
            res[glom][n_folds].append(run_res[method]['gen_score'])

# BUG FIX: json.dump was called with only a (read-mode) file object, so the
# collected results were never written and the call raised TypeError.
with open(os.path.join(out_path, 'res.json'), 'w') as f:
    json.dump(res, f)
# Compare several feature configurations on the same glomeruli: prepare the
# features for every entry of `feat_config` once, cache them, then load the
# per-glomerulus data for each configuration.
# (Fragment: the per-model fitting loop continues beyond this chunk.)
cache = {}
for k, config in feat_config.items():
    base_config['features'].update(config)
    cache[k] = {"features": rl.prepare_features(base_config)}

# molecules available in every cached feature set
all_mols = [r['features'].keys() for r in cache.values()]
mol_intersection = set(all_mols[0]).intersection(*all_mols[1:])

# res[feature_config_name][glomerulus] -> result dict (filled further below)
res = {n: {g: {} for g in gloms} for n in feat_config}
for glom in gloms:
    print('{}\n'.format(glom))
    base_config.update({'glomerulus': glom, 'data_path': data_path})

    # data/targets/molids for each feature configuration of this glomerulus
    dtm = {}
    for name, config in feat_config.items():
        base_config['features'].update(config)
        data, targets, molids = rl.load_data_targets(base_config,
                                                     cache[name]['features'])
        dtm[name] = {
            'data': data, 'targets': targets, 'molids': molids
        }

    # select molecules that none of the models will be trained on
    # NOTE(review): dtm.values()[0] relies on Python 2 dict.values() returning
    # a list; under Python 3 this raises TypeError -- confirm the target
    # interpreter (the py2 print statements elsewhere in this file suggest 2.x).
    all_trained = set(dtm.values()[0]['molids']).union(*[m['molids'] for m in dtm.values()[1:]])
    to_predict_molids = mol_intersection - all_trained

    for name, data in dtm.items():
        # fit model
        print('working on model: {}'.format(name))
        base_config['feature_selection']['k_best'] = data['data'].shape[1]
"n_folds": 50 } }, "randomization_test": False } used_gloms = json.load(open(os.path.join(config['data_path'], 'used_glomeruli.json'))) alone_haddad, alone_vib, together = [], [], [] for glom in used_gloms: config['glomerulus'] = glom # prepare haddad features features_h = run_lib.prepare_features(config) data_h, targets_h, molids_h = run_lib.load_data_targets(config, features_h) config['feature_selection']['k_best'] = data_h.shape[1] tmp = run_lib.run_runner(config, data_h, targets_h) print glom, tmp alone_haddad.append(tmp['svr']['gen_score']) # prepare vib100 config_spec = copy.deepcopy(config) config_spec['features']['type'] = 'spectral' config_spec['features']['kernel_width'] = 100 config_spec['features']['bin_width'] = 150 config_spec['features']['use_intensity'] = False config_spec['features']['spec_type'] = 'ir' features_v = run_lib.prepare_features(config_spec) data_v, targets_v, molids_v = run_lib.load_data_targets(config_spec, features_v)
}, "methods": { "svr": { "cross_val": True, "C": 1.0, "n_folds": 10 } }, "data_path": "/Users/dedan/projects/master/data", "glomerulus": "Or22a", "randomization_test": False } # get the molids spectra = pickle.load(open('/Users/dedan/projects/master/data/spectral_features/large_base/parsed.pckl')) _, targets, molids = run_lib.load_data_targets(config, {str(i): [] for i in spectra.keys()}) # some molids map to two CAS numbers for some molecules, use only first first_molids_idx = sorted([molids.index(m) for m in set(molids)]) targets = targets[first_molids_idx] molids = [molids[i] for i in first_molids_idx] # place the intensity values in a high-resolution vector freqs = {k: v for k, v in spectra.items() if k in molids} data_orig = flib._place_waves_in_vector(freqs, 0.01, True, 'ir') assert len(molids) == len(targets) == data_orig.shape[0] # randomization np.random.seed() map(np.random.shuffle, data_orig.T)
import os import json from master.libs import run_lib import numpy as np import pylab as plt config = { "data_path": os.path.join(os.path.dirname(__file__), "..", "data"), "features": {"type": "conventional", "descriptor": "all", "normalize": True, "properties_to_add": []}, "feature_selection": {"method": "linear"}, "methods": {"svr": {"C": 1.0, "n_folds": 50}}, "randomization_test": False, } used_gloms = json.load(open(os.path.join(config["data_path"], "used_glomeruli.json"))) for glom in used_gloms: config["glomerulus"] = glom features = run_lib.prepare_features(config) data_all, targets, _ = run_lib.load_data_targets(config, features) config["features"]["descriptor"] = "saito_desc" data_saito, _, _ = run_lib.load_data_targets(config, features) np.random.seed() # map(np.random.shuffle, data_all.T) new_data = np.hstack((data_saito, data_all)) config["feature_selection"]["k_best"] = data_all.shape[1] tmp = run_lib.run_runner(config, data_all, targets, False, False) print glom, tmp["svr"]["gen_score"]