}
    },
    "data_path": "/Users/dedan/projects/master/data",
    "glomerulus": "Or22a",
    "randomization_test": False
}

# get the molids
# NOTE(review): unpickling can execute arbitrary code; only load a trusted,
# locally generated file here.
# Binary mode plus a context manager: the handle is closed deterministically
# and the pickle stream is never subject to newline translation.
with open('/Users/dedan/projects/master/data/spectral_features/large_base/parsed.pckl', 'rb') as spectra_file:
    spectra = pickle.load(spectra_file)
# the second argument only carries the spectra keys (as strings); every value
# is an empty list
_, targets, molids = run_lib.load_data_targets(config, {str(i): [] for i in spectra.keys()})

# some molids occur more than once (two CAS numbers for the same molecule);
# keep only the first occurrence of each, in their original order
first_occurrence = {}
for position, molid in enumerate(molids):
    # setdefault never overwrites, so the smallest index per molid is kept
    first_occurrence.setdefault(molid, position)
first_molids_idx = sorted(first_occurrence.values())
targets = targets[first_molids_idx]
molids = [molids[i] for i in first_molids_idx]

# place the intensity values in a high-resolution vector
known_molids = set(molids)  # constant-time membership test for the filter below
freqs = {k: v for k, v in spectra.items() if k in known_molids}
data_orig = flib._place_waves_in_vector(freqs, 0.01, True, 'ir')
# sanity check: one target and one data row per remaining molid
assert len(molids) == len(targets) == data_orig.shape[0]

# randomization: shuffle every column of data_orig independently, in place
# NOTE(review): this runs unconditionally, regardless of the
# "randomization_test" entry in the config -- confirm that this is intended.
np.random.seed()  # called without a seed, so the shuffle is not reproducible
# An explicit loop instead of map(): map() used only for its side effects
# builds a throwaway list under Python 2 and is lazy (i.e. never runs the
# shuffle) under Python 3.
# assumes data_orig is a numpy ndarray, so each row of data_orig.T is a view
# on one column and the in-place shuffle modifies data_orig -- TODO confirm
for feature_column in data_orig.T:
    np.random.shuffle(feature_column)

# fit model
# 1) score the columns of the full-resolution matrix against the targets
sel_scores = run_lib.get_selection_score(config, data_orig, targets)
# 2) reduce the matrix with those scores; 512 == 2**9
#    (presumably the number of features that select_k_best keeps -- confirm)
data = flib.select_k_best(data_orig, sel_scores, 512)
# 3) run the configured runner and ask for the fitted models as well
tmp_res = run_lib.run_runner(config, data, targets, get_models=True)
# NOTE(review): oob_score_ is an attribute of scikit-learn ensemble/forest
# estimators; the model stored under the 'svr' key presumably is one -- confirm
# a single parenthesised argument prints identically with the Python 2
# print statement and with the print function
print(tmp_res['svr']['model'].oob_score_)
# one figure with three stacked histograms, saved as distributions.png
fig = plt.figure()

# top panel: every frequency of every molecule, 400 bins over [0, 4000]
all_freqs = list(it.chain.from_iterable(freqs.values()))
ax = fig.add_subplot(3, 1, 1)
ax.hist(all_freqs, bins=400, range=(0, 4000))
ax.set_xlabel("histogram of all frequencies")

# middle panel: distribution of the target values
ax = fig.add_subplot(3, 1, 2)
ax.hist(targets)
ax.set_xlabel("target value histogram")

# frequency distribution of active targets
# a molecule counts as active when its target exceeds active_thresh
active = []
for i, m in enumerate(molids):
    if targets[i] > active_thresh:
        active.append(m)
active_lookup = set(active)
act_freqs = list(it.chain.from_iterable([v for k, v in freqs.items() if k in active_lookup]))
# bottom panel: 40000 bins over [0, 4000], i.e. a bin width of 0.1
ax = fig.add_subplot(3, 1, 3)
ax.hist(act_freqs, bins=40000, range=(0, 4000))
ax.set_xlabel("do ligand share bins for resolution of 0.1?")
fig.savefig(os.path.join(outpath, "distributions.png"))

# look at the relation between data and target inner distances:
# scatter the pairwise feature distance against the pairwise target distance
fig = plt.figure()
ma = flib._place_waves_in_vector(freqs, 0.1, True, "ir")
target_distances, ligands = pdist_1d(targets)

# feature selection for this plot uses its own minimal config ("linear" method)
f_select_config = {}
f_select_config["feature_selection"] = {"method": "linear"}
sel_scores = run_lib.get_selection_score(f_select_config, ma, targets)
ma_sel = flib.select_k_best(ma, sel_scores, 2048)  # 2048 == 2 ** 11

# pdist is presumably scipy.spatial.distance.pdist (condensed pairwise
# cosine distances) -- confirm against the imports of this file
feature_distances_sel = pdist(ma_sel, "cosine")

ax = fig.add_subplot(1, 1, 1)
# pairs not flagged in `ligands` are drawn first as blue crosses, the
# flagged pairs afterwards as red crosses
for pair_mask, line_format in ((~ligands, "xb"), (ligands, "xr")):
    ax.plot(feature_distances_sel[pair_mask], target_distances[pair_mask], line_format)
fig.savefig(os.path.join(outpath, "distance_relation.png"))